Skip to content

Commit 969e895

Browse files
authored
fix(catchup): tgt wait for extra 5s to recv response from all replicas (#189)
Signed-off-by: Vishnu Itta <[email protected]>
1 parent 194a858 commit 969e895

File tree

2 files changed

+50
-7
lines changed

2 files changed

+50
-7
lines changed

src/istgt.c

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
ISTGT g_istgt;
9696
#ifdef REPLICATION
9797
extern int replica_timeout;
98+
extern int extraWait;
9899
extern rte_smempool_t rcmd_mempool;
99100
extern rte_smempool_t rcommon_cmd_mempool;
100101
#endif
@@ -2812,14 +2813,31 @@ void *timerfn(void
28122813
istgt_queue_enqueue(&closedconns, conn);
28132814

28142815
#ifdef REPLICATION
2816+
const char *s_extra_wait_time = getenv("extraWait");
2817+
int extra_wait = -1;
2818+
if (s_extra_wait_time != NULL)
2819+
extra_wait = (int)strtol(s_extra_wait_time,
2820+
NULL, 10);
2821+
if ((extra_wait >= 0) && (extra_wait != extraWait)) {
2822+
ISTGT_NOTICELOG("changing extraWait time from %d to "
2823+
"%d\n", extraWait, extra_wait);
2824+
extraWait = extra_wait;
2825+
}
2826+
28152827
const char *s_replica_timeout = getenv("replicaTimeout");
2816-
unsigned int rep_timeout = 0;
2828+
int rep_timeout = 0;
28172829
if (s_replica_timeout != NULL)
2818-
rep_timeout = (unsigned int)strtol(s_replica_timeout, NULL, 10);
2819-
if (rep_timeout > 30)
2830+
rep_timeout = (int)strtol(s_replica_timeout,
2831+
NULL, 10);
2832+
if ((rep_timeout > 30) && (rep_timeout != replica_timeout)) {
2833+
ISTGT_NOTICELOG("changing replica timeout "
2834+
"from %d to %d", replica_timeout,
2835+
rep_timeout);
28202836
replica_timeout = rep_timeout;
2837+
}
28212838
int check_interval = (replica_timeout / 4) * 1000;
28222839

2840+
28232841
clock_gettime(clockid, &now);
28242842
timesdiff(clockid, last_check, now, diff);
28252843
ms = diff.tv_sec * 1000;

src/replication.c

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "istgt_scsi.h"
2525
#include "assert.h"
2626

27+
int extraWait = 5;
2728
extern int replica_timeout;
2829
cstor_conn_ops_t cstor_ops = {
2930
.conn_listen = replication_listen,
@@ -2560,11 +2561,13 @@ replicate(ISTGT_LU_DISK *spec, ISTGT_LU_CMD_Ptr cmd, uint64_t offset, uint64_t n
25602561
rcmd_t *rcmd = NULL;
25612562
int iovcnt = cmd->iobufindx + 1;
25622563
bool replica_choosen = false;
2563-
struct timespec abstime, now;
2564+
struct timespec abstime, now, extra_wait;
25642565
int nsec, err_num = 0;
25652566
int skip_count = 0;
25662567
uint64_t num_read_ios = 0;
25672568
uint64_t inflight_read_ios = 0;
2569+
int count =0 ;
2570+
bool success_resp = false;
25682571

25692572
(void) cmd_read;
25702573
CHECK_IO_TYPE(cmd, cmd_read, cmd_write, cmd_sync);
@@ -2655,23 +2658,44 @@ replicate(ISTGT_LU_DISK *spec, ISTGT_LU_CMD_Ptr cmd, uint64_t offset, uint64_t n
26552658

26562659
MTX_UNLOCK(&spec->rq_mtx);
26572660

2661+
extra_wait.tv_sec = extra_wait.tv_nsec = 0;
26582662
// now wait for command to complete
26592663
while (1) {
26602664
// check for status of rcomm_cmd
26612665
rc = check_for_command_completion(spec, rcomm_cmd, cmd);
26622666
if (rc) {
2667+
success_resp = false;
26632668
if (rc == 1) {
26642669
rc = cmd->data_len = rcomm_cmd->data_len;
2670+
success_resp = true;
26652671
} else if (rcomm_cmd->opcode == ZVOL_OPCODE_READ &&
26662672
rcomm_cmd->copies_sent == 1) {
26672673
rcomm_cmd->copies_sent = 0;
2668-
memset(rcomm_cmd->resp_list, 0, sizeof (rcomm_cmd->resp_list));
2674+
memset(rcomm_cmd->resp_list, 0,
2675+
sizeof (rcomm_cmd->resp_list));
26692676
MTX_LOCK(&spec->rq_mtx);
2670-
TAILQ_REMOVE(&spec->rcommon_waitq, rcomm_cmd, wait_cmd_next);
2677+
TAILQ_REMOVE(&spec->rcommon_waitq, rcomm_cmd,
2678+
wait_cmd_next);
26712679
goto retry_read;
26722680
}
2673-
rcomm_cmd->state = CMD_EXECUTION_DONE;
26742681

2682+
count = 0;
2683+
if (success_resp == true) {
2684+
if (extra_wait.tv_sec == 0)
2685+
clock_gettime(CLOCK_REALTIME,
2686+
&extra_wait);
2687+
clock_gettime(CLOCK_REALTIME, &now);
2688+
for (i = 0; i < rcomm_cmd->copies_sent; i++) {
2689+
if (rcomm_cmd->resp_list[i].status &
2690+
(RECEIVED_OK|RECEIVED_ERR))
2691+
count++;
2692+
}
2693+
if ((now.tv_sec - extra_wait.tv_sec) < extraWait) {
2694+
if (count != rcomm_cmd->copies_sent)
2695+
goto wait_for_other_responses;
2696+
}
2697+
}
2698+
rcomm_cmd->state = CMD_EXECUTION_DONE;
26752699
#ifdef DEBUG
26762700
/*
26772701
* NOTE: This is for debugging purpose only
@@ -2690,6 +2714,7 @@ replicate(ISTGT_LU_DISK *spec, ISTGT_LU_CMD_Ptr cmd, uint64_t offset, uint64_t n
26902714
break;
26912715
}
26922716

2717+
wait_for_other_responses:
26932718
/* wait for 500 ms(500000000 ns) */
26942719
clock_gettime(CLOCK_REALTIME, &now);
26952720
nsec = 1000000000 - now.tv_nsec;

0 commit comments

Comments
 (0)