PKUHPC · huerni · Oct 16, 2025 · Nov 19, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/docs/en/reference/keepalived.md b/docs/en/reference/keepalived.md
@@ -0,0 +1,194 @@
+# Keepalived Configuration
+
+The system adopts a high-availability management solution based on the
+VRRP (Virtual Router Redundancy Protocol).
+It enables master–backup failover within seconds and transparent service migration,
+significantly improving system disaster recovery capability and availability.
+
+By automatically detecting failures of the master node and switching the
+Virtual IP (VIP) to the backup node, the system can quickly restore services
+and ensure high availability.
+In addition, the health check module supports custom monitoring logic,
+providing application-level high-availability scheduling and further enhancing
+system stability and reliability.
+
+## Installation
+```shell
+dnf install keepalived
+```
+## Configuration Files
+
+### Crane Config (/etc/crane/config.yaml)
+```yaml
+Keepalived:
+  # the base directory of NFS storage
+  CraneNFSBaseDir: /var/crane/
+  # file path of cranectld alive file (relative to CraneNFSBaseDir)
+  CraneCtldAliveFile: cranectld/cranectld.alive
+
+```
+
+### Master Node (/etc/keepalived/keepalived.conf)
+```bash
+global_defs {
+   script_user root # user used to execute scripts
+   enable_script_security
+}
+
+vrrp_script chk_cranectld {
+    script "/etc/keepalived/check_and_failover.sh"
+    interval 1   # script execution interval, in seconds
+    fall 2       # mark as failed only after 2 consecutive failures (~2 seconds with interval 1)
+    weight -20   # reduce this node's priority by 20 while the script fails (100 - 20 = 80, below the backup's 90)
+}
+
+vrrp_instance VI_1 {
+    state MASTER               # use BACKUP on standby nodes
+    interface ens33            # replace with your actual NIC name
+    virtual_router_id 51
+    priority 100               # use 90 or lower on backup nodes
+    advert_int 1
+    authentication {
+        auth_type PASS
+        auth_pass 1111
+    }
+    virtual_ipaddress {
+        192.168.1.211          # replace with your VIP
+    }
+    track_script {
+        chk_cranectld
+    }
+    notify_master "/etc/keepalived/on_master.sh"   # optional, executed when becoming MASTER
+    # notify_backup "/etc/keepalived/on_backup.sh" # optional, executed when becoming BACKUP
+    # notify_fault  "/etc/keepalived/on_fault.sh"  # optional, executed on fault
+}
+```
+
+### Backup Node (/etc/keepalived/keepalived.conf)
+```bash
+global_defs {
+   script_user root # user used to execute scripts
+   enable_script_security
+}
+
+vrrp_instance VI_1 {
+    state BACKUP
+    interface ens33            # replace with your actual NIC name
+    virtual_router_id 51
+    priority 90                # must stay below the master's priority (100)
+    advert_int 1
+    authentication {
+        auth_type PASS
+        auth_pass 1111
+    }
+    virtual_ipaddress {
+        192.168.1.211          # replace with your VIP
+    }
+    # No track_script block here: this file defines no vrrp_script, and keepalived
+    # reports a configuration error and ignores a track_script entry that names an
+    # undefined script.
+    # NOTE(review): if health tracking is also wanted on the backup node, define a
+    # vrrp_script block in this file first, and confirm the check does not lower the
+    # backup's priority while it is idle (no local cranectld running).
+    notify_master "/etc/keepalived/on_master.sh"   # optional, executed when switching to MASTER
+    notify_backup "/etc/keepalived/on_backup.sh"   # optional, executed when switching to BACKUP
+    # notify_fault  "/etc/keepalived/on_fault.sh"  # optional, executed on fault
+}
+```
+
+## Scripts
+
+**_All scripts, every directory in their paths, and the root (/) directory
+must be writable only by root and owned by root.
+Otherwise, keepalived will refuse to execute the scripts._**
+
+### Health Check Script (/etc/keepalived/check_and_failover.sh)
+```bash
+#!/bin/bash
+
+set -e
+
+SCRIPT_NAME=$(basename "$0")
+PROC_NAME="cranectld"
+ALIVE_FILE="/var/crane/cranectld/cranectld.alive"
+
+if ! pgrep -f "$PROC_NAME" > /dev/null 2>&1 && [ -f "$ALIVE_FILE" ]; then
+    echo "[$(date)] [$SCRIPT_NAME] $PROC_NAME not running, $ALIVE_FILE exists, triggering failover..."
+    exit 1
+else
+    echo "[$(date)] [$SCRIPT_NAME] health check passed."
+    exit 0
+fi
+```
+
+### OnMaster Script (/etc/keepalived/on_master.sh)
+```bash
+#!/bin/bash
+#
+# keepalived notify_master hook: executed when this node becomes MASTER.
+# Restarts cranectld unless another instance still holds the cranectld lock file.
+
+# Must point at the lock file cranectld itself uses: CraneCtldMutexFilePath in
+# /etc/crane/config.yaml, resolved against CraneNFSBaseDir when Keepalived is set.
+LOCK_FILE="/var/crane/cranectld/cranectld.lock"
+LOG_FILE="/tmp/on_master.log"
+# Maximum time to wait for the lock, in seconds
+LOCK_WAIT_SECS=2
+
+echo "$(date) on_master execute" >> "$LOG_FILE"
+
+# Open file descriptor 9 on the lock file
+exec 9>"$LOCK_FILE"
+
+# Try to acquire an exclusive lock, waiting at most LOCK_WAIT_SECS seconds
+if flock -x -w "$LOCK_WAIT_SECS" 9; then
+  echo "Lock is NOT held by another CraneCtld instance (acquired)." >> "$LOG_FILE"
+  # Release the lock without deleting the file, then close fd 9
+  flock -u 9
+  exec 9>&-
+
+  systemctl restart cranectld >> "$LOG_FILE" 2>&1
+else
+  echo "Could not acquire lock within ${LOCK_WAIT_SECS} seconds. Assuming lock is held by another instance." >> "$LOG_FILE"
+  # Close fd 9
+  exec 9>&-
+fi
+
+# Email notifications can be configured here if needed
+```
+
+### OnBackup Script (/etc/keepalived/on_backup.sh)
+```bash
+#!/bin/bash
+#
+# keepalived notify_backup hook: executed when this node switches to BACKUP.
+# Records the transition and stops the local cranectld service.
+
+LOG_FILE=/tmp/on_backup.log
+
+echo "on_backup execute" >> "$LOG_FILE"
+
+# Both stdout and stderr of systemctl go to the log file
+systemctl stop cranectld >> "$LOG_FILE" 2>&1
+
+# Email notifications can be configured here if needed
+```
+
+## Startup
+
+1. Deploy **ctld**: only start `ctld` on the **master** node.  
+   Starting `ctld` manually on the **backup** node is strictly prohibited.
+2. Start **keepalived**:
+```bash
+systemctl enable keepalived
+systemctl start keepalived
+```
+
+**_Notes_**
+
+1. If `journalctl -u keepalived` does **not** show 
+   `Unsafe permission found for script 'xxx.sh' -disabling`, 
+   the script permissions are configured correctly.
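+2. If the **onMaster** or **onBackup** scripts fail to execute with a permission error,
+   SELinux may be blocking them; temporarily switch SELinux to permissive mode with
+   `sudo setenforce 0` (the setting is reset on reboot).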
+3. If email notifications are required, configure them in the onmaster, onbackup, and onfault scripts.
+4. Do not manually start ctld on the backup node, as this may prevent ctld from starting on the master node.
+5. If ctld fails to start on the master node with an error indicating that another instance already exists,
+   you can delete the file `/var/crane/cranectld/cranectld.alive`. After deletion, the master node will be
+   re-elected automatically, ctld will start running on the master node, and ctld on the backup node will be
+   stopped automatically. If the master node becomes unavailable, the administrator should restore the
+   master node as soon as possible.
+
+
+```bash
+# /etc/keepalived/on_master.sh: line 13: /usr/bin/systemctl: Permission denied
+
+# When SELinux is in Enforcing mode, it may prevent keepalived
+# from executing systemctl as root.
+# Solution:
+# Check SELinux status:
+getenforce
+# If the output is Enforcing, try switching to permissive mode temporarily (reset on reboot):
+setenforce 0
+```
diff --git a/docs/zh/reference/keepalived.md b/docs/zh/reference/keepalived.md
@@ -0,0 +1,174 @@
+# keepalived 配置
+
+系统采用基于 VRRP（Virtual Router Redundancy Protocol） 协议的高可用管理方案，
+实现秒级主备切换和业务无感知迁移，大幅提升了系统的容灾能力和可用性。通过自动检测主节点故障并将虚拟 IP（VIP）切换到备节点，
+系统可快速恢复业务，确保高可用性。此外，健康检查模块支持自定义监控逻辑，
+提供应用层级的高可用调度，进一步增强系统的稳定性与可靠性。
+
+## 安装
+```shell
+dnf install keepalived
+```
+## 配置文件
+### Crane Config 配置（/etc/crane/config.yaml）
+```yaml
+Keepalived:
+  # the base directory of NFS storage
+  CraneNFSBaseDir: /var/crane/
+  # file path of cranectld alive file (relative to CraneNFSBaseDir)
+  CraneCtldAliveFile: cranectld/cranectld.alive
+```
+### Master 节点（ /etc/keepalived/keepalived.conf）
+```bash
+global_defs {
+   script_user root # user used to execute scripts
+   enable_script_security
+}
+
+vrrp_script chk_cranectld {
+    script "/etc/keepalived/check_and_failover.sh"
+    interval 1 # script execution interval, in seconds
+    fall 2 # mark as failed only after 2 consecutive failures (~2 seconds with interval 1)
+    weight -20 # reduce this node's priority by 20 while the script fails (100 - 20 = 80, below the backup's 90)
+}
+
+vrrp_instance VI_1 {
+    state MASTER               # use BACKUP on standby nodes
+    interface ens33            # replace with your actual NIC name
+    virtual_router_id 51
+    priority 100               # use 90 or lower on backup nodes
+    advert_int 1
+    authentication {
+        auth_type PASS
+        auth_pass 1111
+    }
+    virtual_ipaddress {
+        192.168.1.211          # replace with your VIP
+    }
+    track_script {
+        chk_cranectld
+    }
+    notify_master "/etc/keepalived/on_master.sh"   # optional, executed when becoming MASTER
+    # notify_backup "/etc/keepalived/on_backup.sh"   # optional, executed when becoming BACKUP
+    # notify_fault  "/etc/keepalived/on_fault.sh"    # optional, executed on fault
+}
+
+```
+
+### Backup 节点（/etc/keepalived/keepalived.conf）
+```bash
+global_defs {
+   script_user root # user used to execute scripts
+   enable_script_security
+}
+
+vrrp_instance VI_1 {
+    state BACKUP
+    interface ens33            # replace with your actual NIC name
+    virtual_router_id 51
+    priority 90                # must stay below the master's priority (100)
+    advert_int 1
+    authentication {
+        auth_type PASS
+        auth_pass 1111
+    }
+    virtual_ipaddress {
+        192.168.1.211          # replace with your VIP
+    }
+    # No track_script block here: this file defines no vrrp_script, and keepalived
+    # reports a configuration error and ignores a track_script entry that names an
+    # undefined script.
+    # NOTE(review): if health tracking is also wanted on the backup node, define a
+    # vrrp_script block in this file first, and confirm the check does not lower the
+    # backup's priority while it is idle (no local cranectld running).
+    notify_master "/etc/keepalived/on_master.sh"   # optional, executed when switching to MASTER
+    notify_backup "/etc/keepalived/on_backup.sh"   # optional, executed when switching to BACKUP
+    # notify_fault  "/etc/keepalived/on_fault.sh"    # optional, executed on fault
+}
+```
+
+## 脚本
+**_脚本、脚本路径每一层、以及`/`目录，需设置只有root可写，且属主为root，
+否则keepalived不会启用脚本。_**
+
+### 检查脚本（/etc/keepalived/check_and_failover.sh）
+```bash
+#!/bin/bash
+
+set -e
+
+SCRIPT_NAME=$(basename "$0")
+PROC_NAME="cranectld"
+ALIVE_FILE="/var/crane/cranectld/cranectld.alive"
+
+if ! pgrep -f "$PROC_NAME" > /dev/null 2>&1 && [ -f "$ALIVE_FILE" ]; then
+    echo "[$(date)] [$SCRIPT_NAME] $PROC_NAME not running, $ALIVE_FILE exists, triggering failover..."
+    exit 1
+else
+    echo "[$(date)] [$SCRIPT_NAME] health check passed."
+    exit 0
+fi
+```
+
+### OnMaster 脚本（/etc/keepalived/on_master.sh）
+```bash
+#!/bin/bash
+#
+# keepalived notify_master hook: executed when this node becomes MASTER.
+# Restarts cranectld unless another instance still holds the cranectld lock file.
+
+# Must point at the lock file cranectld itself uses: CraneCtldMutexFilePath in
+# /etc/crane/config.yaml, resolved against CraneNFSBaseDir when Keepalived is set.
+LOCK_FILE="/var/crane/cranectld/cranectld.lock"
+LOG_FILE="/tmp/on_master.log"
+# Maximum time to wait for the lock, in seconds
+LOCK_WAIT_SECS=2
+
+echo "$(date) on_master execute" >> "$LOG_FILE"
+
+# Open file descriptor 9 on the lock file
+exec 9>"$LOCK_FILE"
+
+# Try to acquire an exclusive lock, waiting at most LOCK_WAIT_SECS seconds
+if flock -x -w "$LOCK_WAIT_SECS" 9; then
+  echo "Lock is NOT held by another CraneCtld instance (acquired)." >> "$LOG_FILE"
+  # Release the lock without deleting the file, then close fd 9
+  flock -u 9
+  exec 9>&-
+
+  systemctl restart cranectld >> "$LOG_FILE" 2>&1
+else
+  echo "Could not acquire lock within ${LOCK_WAIT_SECS} seconds. Assuming lock is held by another instance." >> "$LOG_FILE"
+  # Close fd 9
+  exec 9>&-
+fi
+
+# Email notifications can be configured here if needed
+```
+
+### OnBackup 脚本（/etc/keepalived/on_backup.sh）
+```bash
+#!/bin/bash
+#
+# keepalived notify_backup hook: executed when this node switches to BACKUP.
+# Records the transition and stops the local cranectld service.
+
+LOG_FILE=/tmp/on_backup.log
+
+echo "on_backup execute" >> "$LOG_FILE"
+
+# Both stdout and stderr of systemctl go to the log file
+systemctl stop cranectld >> "$LOG_FILE" 2>&1
+
+# Email notifications can be configured here if needed
+```
+
+## 启动
+1. 部署 ctld：只在 master 节点启动 ctld，禁止在 backup 节点手动启动 ctld
+2. 启动keepalived
+```bash
+   systemctl enable keepalived
+   systemctl start keepalived 
+```
+
+**_注意事项_**
+
+1. `journalctl -u keepalived`未出现`Unsafe permission found for script 'xxx.sh' -disabling`，则权限设置正确
+2. 如果 onMaster 脚本或 onBackup 脚本因权限问题执行失败，可临时将 SELinux 切换为 Permissive 模式：`sudo setenforce 0`（重启后失效）
+3. 如需配置邮件通知，可在onmaster、onbackup和onfault脚本内设置。
+4. 禁止在backup节点手动启动ctld，有可能导致master节点ctld无法启动
+5. 当出现在master节点启动ctld报错有其他实例存在的情况，可删除/var/crane/cranectld/cranectld.alive文件，此时master节点会重新当选，ctld会自动启动运行，backup节点ctld会自动关闭。
+   当master节点不可用时，管理员应尽快重新启用master节点
+
+```bash
+# /etc/keepalived/on_master.sh: line 13: /usr/bin/systemctl: Permission denied
+
+# SELinux 在 Enforcing 模式下，可能阻止 keepalived 以 root 执行 systemctl
+# 解决办法：
+# 查看 SELinux 状态：
+getenforce
+# 如果输出 Enforcing，临时关闭试试：
+setenforce 0
+```
diff --git a/etc/config.yaml b/etc/config.yaml
@@ -31,7 +31,7 @@ CraneCtldForInternalListenPort: 10013
 CraneCtldDebugLevel: trace
 # file path of cranectld log file (relative to CraneBaseDir)
 CraneCtldLogFile: cranectld/cranectld.log
-# file path of cranectld lock file (relative to CraneBaseDir)
+# file path of cranectld lock file (relative to CraneBaseDir; when Keepalived is configured, relative to CraneNFSBaseDir)
 CraneCtldMutexFilePath: cranectld/cranectld.lock
 # whether the cranectld is running in the background
 CraneCtldForeground: true
@@ -43,6 +43,12 @@ CraneCtld:
   # max files of cranectld log file
   MaxLogFileNum: 3
 
+Keepalived:
+  # the base directory of NFS storage
+  CraneNFSBaseDir: /var/crane/
+  # file path of cranectld alive file (relative to CraneNFSBaseDir)
+  CraneCtldAliveFile: cranectld/cranectld.alive
+
 # Craned settings
 # the listening address of control machine
 CranedListenAddr: 0.0.0.0