Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ KVCM server可识别的配置参数列表如下。可通过配置文件、启动
# 指定主节点租约时间。仅单节点的部署模式下建议配置一个很大的值。
# kvcm.leader_elector.lease_ms=600000
# 多节点的部署模式下建议配置一个较低的值。
# kvcm.leader_elector.lease_ms=100
# kvcm.leader_elector.lease_ms=10000

# 指定选主逻辑后台循环间隔时间。仅单节点的部署模式下建议配置一个较大的值。
# kvcm.leader_elector.loop_interval_ms=10000
# 多节点的部署模式下建议配置一个较低的值。建议低于kvcm.leader_elector.lease_ms/10。
# kvcm.leader_elector.loop_interval_ms=10
# kvcm.leader_elector.loop_interval_ms=100

# 额外指定日志级别,覆盖日志配置文件中的设置,方便进行动态调整
# 0: auto, 1: fatal, 2: error, 3: warn, 4: info, 5: debug
Expand Down
26 changes: 23 additions & 3 deletions integration_test/admin_service/admin_interface_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,14 +472,34 @@ def prepare_test_resource(self, worker_num, work_dir=None, worker_mode='normal')
def start_worker(self, **kwargs):
    """Start workers via ``worker_manager.start_all`` with lock settings applied.

    Args:
        **kwargs: Extra worker parameters forwarded to ``start_all``. The
            ``kvcm.distributed_lock.uri`` and ``kvcm.leader_elector.lease_ms``
            entries are always overwritten here.

    The test fails (through ``assertTrue``) when ``start_all`` returns a
    falsy value.
    """
    # Distributed-lock settings; plain string keys, no interpolation needed.
    kwargs['kvcm.distributed_lock.uri'] = self._lock_uri
    kwargs['kvcm.leader_elector.lease_ms'] = 2000
    self.assertTrue(self.worker_manager.start_all(**kwargs))

def start_worker_by_id(self, worker_id, **kwargs):
    """Start one worker via ``worker_manager.start_worker`` with lock settings.

    Args:
        worker_id: Identifier passed through to ``start_worker``.
        **kwargs: Extra worker parameters forwarded to ``start_worker``. The
            ``kvcm.distributed_lock.uri`` and ``kvcm.leader_elector.lease_ms``
            entries are always overwritten here.

    The test fails (through ``assertTrue``) when ``start_worker`` returns a
    falsy value.
    """
    # Plain string keys: the original f-strings had nothing to interpolate.
    kwargs['kvcm.distributed_lock.uri'] = self._lock_uri
    kwargs['kvcm.leader_elector.lease_ms'] = 2000
    self.assertTrue(self.worker_manager.start_worker(worker_id, **kwargs))

def clean_test_resource(self):
    """No-op hook: performs no cleanup work and returns None."""
    return None

def stop_worker(self):
    """Stop workers by delegating to ``worker_manager.stop_all``."""
    manager = self.worker_manager
    manager.stop_all()

def _wait_for_healthy(self, client, worker_id, timeout=30, interval=1):
"""等待指定worker启动完成"""
start_time = time.time()
while time.time() - start_time < timeout:
try:
req = {"trace_id": f"trace_wait_{worker_id}_{int(time.time())}"}
resp = client.check_health(req, check_response=False)
if resp.get("header", {}).get("status", {}).get("code") == "OK" and resp.get("is_health") == True:
return True
except Exception as e:
logging.warning(f"Worker {worker_id} check health failed: {e}")
time.sleep(interval)
return False

def _wait_for_leader(self, client, worker_id, timeout=30, interval=1):
"""等待指定worker成为leader"""
start_time = time.time()
Expand Down Expand Up @@ -636,13 +656,13 @@ def test_leader_switch_when_worker_stops(self):
self.assertEqual(new_cluster_resp["leader_node_id"], new_cluster_resp["self_node_id"], "leader节点ID和leader自身应该一致")

# 重新启动之前停止的worker
self.worker_manager.start_worker(leader_id)
self.start_worker_by_id(leader_id)

# 等待重新启动的worker完成初始化
time.sleep(2)

# 重新创建客户端连接
restarted_client = self._get_manager_client(leader_id)
self._wait_for_healthy(restarted_client, leader_id)

restarted_req = {"trace_id": "trace_restarted_worker"}
restarted_resp = restarted_client.check_health(restarted_req, check_response=True)
logging.info(f"重新启动的Worker {leader_id} 状态: {restarted_resp}")
Expand Down
4 changes: 0 additions & 4 deletions integration_test/testlib/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,6 @@ def set_port(self, rpc_port, http_port, admin_rpc_port, admin_http_port):
logging.info("rpc_port = %s, http_port = %s" % (self.rpc_port, self.http_port))

def update_parameters(self, **kwargs):
    """Register worker parameters through ``add_env_parameter``.

    Four fixed entries (``rpc_thread_num``, ``rpc_queue_size``, ``ip``,
    ``debug_mode``) are added first, followed by every keyword argument.
    """
    # NOTE(review): the diff header for this file reports 0 additions and
    # 4 deletions; these four fixed entries look like the removed lines.
    # Confirm against the merged file before relying on them.
    self.add_env_parameter('rpc_thread_num', 4)
    self.add_env_parameter('rpc_queue_size', 100)
    self.add_env_parameter('ip', self.ip)
    self.add_env_parameter('debug_mode', self.debug_mode)
    # Caller-supplied parameters are added after the fixed entries.
    for k, v in kwargs.items():
        self.add_env_parameter(k, v)

Expand Down
4 changes: 2 additions & 2 deletions kv_cache_manager/service/server_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,8 @@ bool ServerConfig::Parse(const std::string &config_file, const EnvironMap &envir

// Assigns the built-in default values for the metrics report interval and the
// leader-elector lease / loop interval.
void ServerConfig::UpdateDefaultConfig() {
    metrics_report_interval_ms_ = 20000;
    // Each field is stored exactly once. The loop interval stays below
    // lease / 10, matching the recommendation in the configuration comments.
    leader_elector_lease_ms_ = 10000;
    leader_elector_loop_interval_ms_ = 100;
}

bool ServerConfig::ParseFromFile(const std::string &config_file) {
Expand Down
4 changes: 2 additions & 2 deletions package/etc/default_server_config.conf
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@
# 指定主节点租约时间。仅单节点的部署模式下建议配置一个很大的值。
# kvcm.leader_elector.lease_ms=600000
# 多节点的部署模式下建议配置一个较低的值。
# kvcm.leader_elector.lease_ms=100
# kvcm.leader_elector.lease_ms=10000

# 指定选主逻辑后台循环间隔时间。仅单节点的部署模式下建议配置一个较大的值。
# kvcm.leader_elector.loop_interval_ms=10000
# 多节点的部署模式下建议配置一个较低的值。建议低于kvcm.leader_elector.lease_ms/10。
# kvcm.leader_elector.loop_interval_ms=10
# kvcm.leader_elector.loop_interval_ms=100

# 额外指定日志级别,覆盖日志配置文件中的设置,方便进行动态调整
# 0: auto, 1: fatal, 2: error, 3: warn, 4: info, 5: debug
Expand Down