# Patch for radosgw_agent/worker.py (blob 0087da6 -> 6d8a793).
#
# NOTE(review): the whole patch sits on one physical line below; the original
# line breaks and indentation appear to have been lost, so the hunks probably
# need their newlines restored before a patch tool can read them -- confirm
# against the original commit.
#
# Hunk 1 (hunk header names sync_object):
#   After the client.remove_op_state(...) call under `if found:`, adds a debug
#   message saying the op state ID was removed, plus an `else:` branch that
#   logs that it was not removed.  Presumably the `else` pairs with the
#   visible `if found:` -- indentation is not recoverable here, verify.
#   NOTE(review): both new log.debug calls format eagerly with `%` instead of
#   passing local_op_id as a logging argument.
#
# Hunks 2-3 (wait_for_object):
#   * Adds a keyword parameter `error_max_count=10` and a local counter
#     `err_counts`, initialised to 0 before the `while time.time() < until:`
#     polling loop.
#   * Wraps the `state[0]['state']` lookup and the 'complete' / 'in-progress'
#     checks in `try: ... except IndexError:`, which raises SyncFailed.
#     NOTE(review): the message talks about a "bad element key", but a missing
#     'state' key would raise KeyError, which this clause does not catch; it
#     would fall through to the generic `except Exception` retry path.
#   * In `except Exception as e:` replaces the single log.debug call with:
#     increment err_counts; if err_counts > error_max_count, log an error and
#     raise SyncFailed; otherwise log via log.exception and fall through to
#     the existing time.sleep(1) retry.
#     NOTE(review): the error text prints ">= %d" while the test is `>`, so
#     the failure actually triggers on error number error_max_count + 1.
#     NOTE(review): log.exception already records the traceback, so the
#     explicit exc_info=True on that call is redundant.
#     NOTE(review): "geting" and "infinitely loop" are typos in runtime
#     strings; left untouched here because they are part of the patch content.
#   * The trailing `raise SyncTimedOut()` after the loop is unchanged context.
diff --git a/radosgw_agent/worker.py b/radosgw_agent/worker.py index 0087da6..6d8a793 100644 --- a/radosgw_agent/worker.py +++ b/radosgw_agent/worker.py @@ -256,6 +256,9 @@ def sync_object(self, bucket, obj): if found: client.remove_op_state(self.dest_conn, self.daemon_id, local_op_id, bucket, obj) + log.debug('op state ID: "%s" removed' % local_op_id) + else: + log.debug('op state ID: "%s" not removed' % local_op_id) except NotFound: log.debug('op state already gone') except Exception: @@ -264,7 +267,8 @@ def sync_object(self, bucket, obj): return True - def wait_for_object(self, bucket, obj, until, local_op_id): + def wait_for_object(self, bucket, obj, until, local_op_id, error_max_count=10): + err_counts = 0 while time.time() < until: try: state = client.get_op_state(self.dest_conn, @@ -272,18 +276,29 @@ def wait_for_object(self, bucket, obj, until, local_op_id): local_op_id, bucket, obj) log.debug('op state is %s', state) - state = state[0]['state'] - if state == 'complete': - return - elif state != 'in-progress': - raise SyncFailed('state is {0}'.format(state)) + try: + state = state[0]['state'] + if state == 'complete': + return + elif state != 'in-progress': + raise SyncFailed('state is {0}'.format(state)) + except IndexError: + raise SyncFailed('client.get_op_state() returns bad element key') + time.sleep(1) except SyncFailed: raise except NotFound: raise SyncFailed('object copy state not found') except Exception as e: - log.debug('error geting op state: %s', e, exc_info=True) + err_counts += 1 + if err_counts > error_max_count: + log.error('error counter >= %d - possible infinitely loop !', + error_max_count, exc_info=True) + raise SyncFailed('Infinitely loop ! Check connections quality between clusters !') + else: + log.exception('error (count: %d) when geting op state: %s', + err_counts, e, exc_info=True) time.sleep(1) # timeout expired raise SyncTimedOut()