@@ -271,6 +271,17 @@ struct iter_obj_arg {
271271 uint32_t generation ;
272272};
273273
/*
 * MIGR_RETRY_WAIT_WARN - rate-limited "retry" warning followed by a fixed sleep.
 *
 * tls      - passed to DP_RB_MPT() to build the log prefix.
 * oid      - passed to DP_UOID(); the object being retried.
 * rc       - error code handed to DL_WARN().
 * tried    - caller-owned retry counter. It is modified in place (incremented,
 *            and possibly reset), so it must be a modifiable lvalue; it is
 *            evaluated several times, so it must have no side effects.
 * duration - value printed with DF_U64 and labelled "seconds" in the message;
 *            the visible callers pass (now - then).
 *
 * Behaviour per invocation:
 *  1. tried is incremented; once it reaches 4096 it is set back to 2048, so
 *     the counter stays bounded.
 *  2. The warning is emitted only when tried is a power of two
 *     ((tried & (tried - 1)) == 0), i.e. at 1, 2, 4, ..., 2048. Because of the
 *     reset above, every later wrap also logs, and reports tried as 2048
 *     rather than the true number of attempts.
 *  3. dss_sleep(1000) is called unconditionally, whether or not a warning was
 *     logged. NOTE(review): the argument is presumably in milliseconds (another
 *     retry path sleeps 5000 and accounts for it as 5 seconds) - confirm
 *     against dss_sleep().
 *
 * The macro does not retry by itself; the caller is responsible for jumping
 * back / continuing its loop afterwards.
 */
274+ #define MIGR_RETRY_WAIT_WARN (tls , oid , rc , tried , duration ) \
275+ do { \
276+ tried++; \
277+ if (tried >= 4096) \
278+ tried = 2048; \
279+ if ((tried & (tried - 1)) == 0) \
280+ DL_WARN(rc, DF_RB ": retry " DF_UOID ", tried[%d] " DF_U64 " seconds ", \
281+ DP_RB_MPT(tls), DP_UOID(oid), tried, (duration)); \
282+ dss_sleep(1000); \
283+ } while (0)
284+
274285static int
275286migrate_try_obj_insert (struct migrate_pool_tls * tls , uuid_t co_uuid , daos_unit_oid_t oid ,
276287 daos_epoch_t epoch , daos_epoch_t punched_epoch , unsigned int shard ,
@@ -778,6 +789,7 @@ mrone_obj_fetch_internal(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_
778789 uint64_t now ;
779790 int rc ;
780791 int wait = MEM_NO_WAIT ;
792+ int tried = 0 ;
781793
782794 /* pass rebuild epoch by extra_arg */
783795 if (flags & DIOF_FETCH_EPOCH_EC_AGG_BOUNDARY ) {
@@ -789,7 +801,8 @@ mrone_obj_fetch_internal(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_
789801retry :
790802 rc = dsc_obj_fetch (oh , eph , & mrone -> mo_dkey , iod_num , iods , sgls , NULL , flags , extra_arg ,
791803 csum_iov_fetch );
792- if ((rc == - DER_TIMEDOUT || rc == - DER_FETCH_AGAIN || rc == - DER_NOMEM ) &&
804+ if ((rc == - DER_TIMEDOUT || rc == - DER_FETCH_AGAIN || rc == - DER_NOMEM ||
805+ daos_crt_network_error (rc )) &&
793806 tls -> mpt_version + 1 >= tls -> mpt_pool -> spc_map_version ) {
794807 if (tls -> mpt_fini ) {
795808 DL_ERROR (rc , DF_RB ": dsc_obj_fetch " DF_UOID "failed when mpt_fini" ,
@@ -799,19 +812,25 @@ mrone_obj_fetch_internal(struct migrate_one *mrone, daos_handle_t oh, d_sg_list_
799812 /* If pool map does not change, then let's retry for timeout, instead of
800813 * fail out.
801814 */
815+ now = daos_gettime_coarse ();
816+ if (then == 0 )
817+ then = now ;
818+
802819 if (rc != - DER_NOMEM ) {
803- DL_WARN (rc , DF_RB ": retry " DF_UOID , DP_RB_MPT (tls ),
804- DP_UOID (mrone -> mo_oid ));
805- dss_sleep (1000 );
806- D_GOTO (retry , rc );
820+ if (rc == - DER_TIMEDOUT || rc == - DER_FETCH_AGAIN || now - then < 600 ) {
821+ MIGR_RETRY_WAIT_WARN (tls , mrone -> mo_oid , rc , tried , now - then );
822+ D_GOTO (retry , rc );
823+ }
824+ /* waited for too long, return error and restart rebuild */
825+ DL_ERROR (rc , DF_RB " waited for over 10 minutes due to network error" ,
826+ DP_RB_MRO (mrone ));
827+ return rc ;
807828 }
808829
809- now = daos_gettime_coarse ();
810830 if (wait == MEM_NO_WAIT ) {
811831 wait = MEM_WAIT ;
812832 res -> res_data .mem_waiting ++ ;
813833 res -> res_data .mem_err ++ ;
814- then = now ;
815834 }
816835 /* sleep a few seconds before retry, give other layers a chance to
817836 * release resources.
@@ -3081,7 +3100,9 @@ migrate_obj_epoch(struct migrate_pool_tls *tls, struct iter_obj_arg *arg, daos_e
30813100 uint32_t minimum_nr ;
30823101 uint32_t enum_flags ;
30833102 uint32_t num ;
3084- int waited = 0 ;
3103+ uint64_t now ;
3104+ uint64_t then = 0 ;
3105+ int tried = 0 ;
30853106 int rc = 0 ;
30863107
30873108 D_DEBUG (DB_REBUILD , "migrate obj " DF_UOID " for shard %u eph "
@@ -3216,23 +3237,29 @@ migrate_obj_epoch(struct migrate_pool_tls *tls, struct iter_obj_arg *arg, daos_e
32163237 /* -DER_UPDATE_AGAIN means the remote target does not parse EC
32173238 * aggregation yet, so let's retry.
32183239 */
3219- waited ++ ;
3240+ now = daos_gettime_coarse ();
3241+ if (then == 0 )
3242+ then = now ;
32203243 dss_sleep (5000 );
32213244 D_DEBUG (DB_REBUILD , DF_UOID "retry %d secs with %d \n" , DP_UOID (arg -> oid ),
3222- waited * 5 , rc );
3245+ ( int )( now + 5 - then ) , rc );
32233246 rc = 0 ;
32243247 continue ;
32253248 } else if (rc ) {
32263249 /* To avoid reclaim and retry rebuild, let's retry until the pool map
32273250 * being changed due to further failure.
32283251 */
3229- if (rc == - DER_TIMEDOUT &&
3252+ if (( rc == - DER_TIMEDOUT || daos_crt_network_error ( rc )) &&
32303253 tls -> mpt_version + 1 >= tls -> mpt_pool -> spc_map_version ) {
3231- D_WARN (DF_UUID " retry " DF_UOID " " DF_RC "\n" ,
3232- DP_UUID (tls -> mpt_pool_uuid ), DP_UOID (arg -> oid ),
3233- DP_RC (rc ));
3234- rc = 0 ;
3235- continue ;
3254+ now = daos_gettime_coarse ();
3255+ if (then == 0 )
3256+ then = now ;
3257+ if (rc == - DER_TIMEDOUT || now - then < 600 ) {
3258+ MIGR_RETRY_WAIT_WARN (tls , arg -> oid , rc , tried , now - then );
3259+ rc = 0 ;
3260+ continue ;
3261+ }
3262+ /* fall through and fail rebuild */
32363263 }
32373264
32383265 /* container might have been destroyed. Or there is
0 commit comments