
Commit d28945a

Merge pull request #622 from Aiven-Open/sebinsunny-refactor-pg-basebackup-metric
refactor progress based basebackup metrics
2 parents a1da446 + 649d80e commit d28945a

File tree: 5 files changed (+25 lines, −55 lines)

pghoard/basebackup/base.py (2 additions, 2 deletions)

@@ -564,8 +564,6 @@ def run_local_tar_basebackup(self, delta: bool = False, with_delta_stats: bool =
             db_conn.commit()

             self.log.info("Starting to backup %r and %r tablespaces to %r", pgdata, len(tablespaces), compressed_base)
-            progress_instance = PersistedProgress()
-            progress_instance.reset_all(metrics=self.metrics)
             start_time = time.monotonic()

             if delta:
@@ -686,6 +684,8 @@ def run_local_tar_basebackup(self, delta: bool = False, with_delta_stats: bool =
                 "%r byte input, %r byte output, took %r seconds, waiting to upload", total_file_count, chunks_count,
                 total_size_plain, total_size_enc, backup_time
             )
+            progress_instance = PersistedProgress()
+            progress_instance.reset_all(metrics=self.metrics)

         finally:
             db_conn.rollback()
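This hunk moves the persisted-progress reset from before the backup starts to after it finishes, so an interrupted run no longer wipes the progress recorded for the previous backup. A minimal sketch of the assumed PersistedProgress shape (the real class lives elsewhere in pghoard and may differ; the names and file path here are illustrative assumptions):

import json
import time
from dataclasses import dataclass, field
from typing import Dict

PROGRESS_FILE = "/tmp/persisted_progress.json"  # hypothetical path, not pghoard's default

@dataclass
class ProgressInfo:
    current_progress: float = 0.0
    last_updated_time: float = 0.0

    def update(self, value: float) -> None:
        self.current_progress = value
        self.last_updated_time = time.time()

@dataclass
class PersistedProgress:
    progress: Dict[str, ProgressInfo] = field(default_factory=dict)

    def reset_all(self, metrics=None) -> None:
        # Drop every tracked entry and persist the empty state, so the next
        # backup's stall detection starts from a clean slate.
        self.progress.clear()
        self.write(metrics=metrics)

    def write(self, metrics=None) -> None:
        data = {"progress": {key: vars(info) for key, info in self.progress.items()}}
        with open(PROGRESS_FILE, "w") as f:
            json.dump(data, f)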

pghoard/basebackup/delta.py (4 additions, 0 deletions)

@@ -94,6 +94,10 @@ def progress_callback(progress_step: ProgressStep, progress_data: ProgressMetric
                 persisted_progress.write(self.metrics)
                 self.last_flush_time = time.monotonic()
                 self.metrics.gauge("pghoard.seconds_since_backup_progress_stalled", 0, tags=tags)
+                self.log.info(
+                    "Updated snapshot progress for %s to %d files; elapsed time since last check: %.2f seconds.",
+                    progress_step.value, progress_data["handled"], elapsed
+                )
             else:
                 stalled_age = progress_info.age
                 self.metrics.gauge("pghoard.seconds_since_backup_progress_stalled", stalled_age, tags=tags)

pghoard/transfer.py (11 additions, 34 deletions)

@@ -146,7 +146,6 @@ def untrack_upload_event(self, file_key: str) -> None:
         self._tracked_events.pop(file_key)

     def increment(self, file_key: str, total_bytes_uploaded: float) -> None:
-        metric_data = {}
         persisted_progress = PersistedProgress.read(metrics=self.metrics)

         with self._tracked_events_lock:
@@ -160,9 +159,11 @@ def increment(self, file_key: str, total_bytes_uploaded: float) -> None:
                 FileType.Basebackup_delta,
                 FileType.Basebackup_delta_chunk,
             ):
-                progress_info = persisted_progress.get(file_key)
-                if total_bytes_uploaded > progress_info.current_progress:
-                    progress_info.update(total_bytes_uploaded)
+                progress_info = persisted_progress.get("total_bytes_uploaded")
+                updated_total_bytes_uploaded = progress_info.current_progress + total_bytes_uploaded
+
+                if updated_total_bytes_uploaded > progress_info.current_progress:
+                    progress_info.update(updated_total_bytes_uploaded)
                     persisted_progress.write(metrics=self.metrics)
                     self.metrics.gauge("pghoard.seconds_since_backup_progress_stalled", 0)
                 else:
@@ -174,19 +175,7 @@ def increment(self, file_key: str, total_bytes_uploaded: float) -> None:
                         file_key,
                         stalled_age,
                     )
-                metric_data = {
-                    "metric": "pghoard.basebackup_bytes_uploaded",
-                    "inc_value": total_bytes_uploaded,
-                    "tags": {
-                        "delta": file_type in (FileType.Basebackup_delta, FileType.Basebackup_delta_chunk)
-                    },
-                }
-            elif file_type in (FileType.Wal, FileType.Timeline):
-                metric_data = {"metric": "pghoard.compressed_file_upload", "inc_value": total_bytes_uploaded}
-
             self._tracked_events[file_key].increments.append(TransferIncrement(total_bytes_uploaded=total_bytes_uploaded))
-            if metric_data:
-                self.metrics.increase(**metric_data)

     def reset(self) -> None:
         with self._tracked_events_lock:
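The key change here: instead of keying persisted progress by file path, uploads now feed a single cumulative "total_bytes_uploaded" counter, so each increment adds to a running total rather than replacing a per-file high-water mark (for any positive increment the guard is then always true). A minimal sketch of the accumulation, reusing the assumed ProgressInfo shape from the sketch above:

def record_uploaded_bytes(persisted_progress, uploaded: float, metrics) -> None:
    # One shared counter replaces the per-file progress keys.
    info = persisted_progress.progress.setdefault("total_bytes_uploaded", ProgressInfo())
    new_total = info.current_progress + uploaded
    if new_total > info.current_progress:  # holds whenever uploaded > 0
        info.update(new_total)
        persisted_progress.write(metrics=metrics)
        metrics.gauge("pghoard.seconds_since_backup_progress_stalled", 0)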
@@ -204,7 +193,7 @@ def run_safe(self):
                 time.sleep(self.CHECK_FREQUENCY)
             except Exception:  # pylint: disable=broad-except
                 self.log.exception("Failed to update transfer rate %s", "pghoard.compressed_file_upload")
-                self.metrics.increase("pghoard.transfer_operation.errors")
+                self.metrics.increase("pghoard.transfer_operation_errors")
                 self.reset()
                 self.stop()

@@ -423,18 +412,11 @@ def run_safe(self):
             if file_to_transfer.callback_queue:
                 file_to_transfer.callback_queue.put(result)

-            self.log.info(
-                "%r %stransfer of key: %r, size: %r, took %.3fs", oper, "FAILED " if not result.success else "", key,
-                oper_size,
-                time.monotonic() - start_time
-            )
-
-            if file_to_transfer.operation in {TransferOperation.Upload} and filetype in (
-                FileType.Basebackup, FileType.Basebackup_chunk, FileType.Basebackup_delta, FileType.Basebackup_delta_chunk
-            ):
-                if result.success:
-                    persisted_progress = PersistedProgress.read(metrics=self.metrics)
-                    persisted_progress.reset(key, metrics=self.metrics)
+            operation_type = file_to_transfer.operation
+            status = "FAILED" if not result.success else "successfully"
+            log_msg = f"{operation_type.capitalize()} of key: {key}, " \
+                      f"size: {oper_size}, {status} in {time.monotonic() - start_time:.3f}s"
+            self.log.info(log_msg)

         self.fetch_manager.stop()
         self.log.debug("Quitting TransferAgent")
@@ -539,10 +521,5 @@ def handle_upload(self, site, key, file_to_transfer: UploadEvent):

             # Sleep for a bit to avoid busy looping. Increase sleep time if the op fails multiple times
             self.sleep(min(0.5 * 2 ** (file_to_transfer.retry_number - 1), 20))
-            if file_to_transfer.file_type in (
-                FileType.Basebackup, FileType.Basebackup_chunk, FileType.Basebackup_delta, FileType.Basebackup_delta_chunk
-            ) and file_to_transfer.retry_number < 2:
-                persisted_progress = PersistedProgress.read(metrics=self.metrics)
-                persisted_progress.reset(key, metrics=self.metrics)
             self.transfer_queue.put(file_to_transfer)
             return None
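The retained sleep line implements capped exponential backoff: 0.5 s on the first retry, doubling each attempt, never more than 20 s. A quick check of the schedule:

for retry_number in range(1, 9):
    delay = min(0.5 * 2 ** (retry_number - 1), 20)
    print(retry_number, delay)
# retries 1..8 -> 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 20, 20 (cap reached at the 7th retry)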

test/test_common.py (7 additions, 18 deletions)

@@ -92,36 +92,25 @@ def test_json_serialization(self, tmpdir):
     def test_persisted_progress(self, mocker, tmp_path):
         test_progress_file = tmp_path / "test_progress.json"
         original_time = 1625072042.123456
-        test_data = {
-            "progress": {
-                "0af668268d0fe14c6e269760b08d80a634c421b8381df25f31fbed5e8a8c8d8b": {
-                    "current_progress": 100,
-                    "last_updated_time": original_time
-                }
-            }
-        }
+        test_data = {"progress": {"total_bytes_uploaded": {"current_progress": 100, "last_updated_time": original_time}}}

         with open(test_progress_file, "w") as file:
             json.dump(test_data, file)

         mocker.patch("pghoard.common.PROGRESS_FILE", test_progress_file)
         persisted_progress = PersistedProgress.read(metrics=metrics.Metrics(statsd={}))
-        assert "0af668268d0fe14c6e269760b08d80a634c421b8381df25f31fbed5e8a8c8d8b" in persisted_progress.progress
-        assert persisted_progress.progress["0af668268d0fe14c6e269760b08d80a634c421b8381df25f31fbed5e8a8c8d8b"
-                                           ].current_progress == 100
-        assert persisted_progress.progress["0af668268d0fe14c6e269760b08d80a634c421b8381df25f31fbed5e8a8c8d8b"
-                                           ].last_updated_time == 1625072042.123456
+        assert "total_bytes_uploaded" in persisted_progress.progress
+        assert persisted_progress.progress["total_bytes_uploaded"].current_progress == 100
+        assert persisted_progress.progress["total_bytes_uploaded"].last_updated_time == 1625072042.123456

         new_progress = 200
-        progress_info = persisted_progress.get("0af668268d0fe14c6e269760b08d80a634c421b8381df25f31fbed5e8a8c8d8b")
+        progress_info = persisted_progress.get("total_bytes_uploaded")
         progress_info.update(new_progress)
         persisted_progress.write(metrics=metrics.Metrics(statsd={}))

         updated_progress = PersistedProgress.read(metrics=metrics.Metrics(statsd={}))
-        assert updated_progress.progress["0af668268d0fe14c6e269760b08d80a634c421b8381df25f31fbed5e8a8c8d8b"
-                                         ].current_progress == new_progress
-        assert updated_progress.progress["0af668268d0fe14c6e269760b08d80a634c421b8381df25f31fbed5e8a8c8d8b"
-                                         ].last_updated_time > original_time
+        assert updated_progress.progress["total_bytes_uploaded"].current_progress == new_progress
+        assert updated_progress.progress["total_bytes_uploaded"].last_updated_time > original_time

     def test_default_persisted_progress_creation(self, mocker, tmp_path):
         tmp_file = tmp_path / "non_existent_progress.json"

test/test_transferagent.py (1 addition, 1 deletion)

@@ -337,6 +337,6 @@ def test_handle_upload_with_persisted_progress(self, mocker, tmp_path):
         self.transfer_agent.handle_upload("test_site", self.foo_basebackup_path, upload_event)
         updated_progress = PersistedProgress.read(metrics=metrics.Metrics(statsd={}))
         assert temp_progress_file.exists()
-        assert updated_progress.progress[self.foo_basebackup_path].current_progress == 3
+        assert updated_progress.progress["total_bytes_uploaded"].current_progress == 3
         if temp_progress_file.exists():
             temp_progress_file.unlink()
