Skip to content

Commit 39bc4ea

Browse files
authored
[BugFix] Fix merge commit latency metrics overflow (#67168)
Signed-off-by: PengFei Li <[email protected]>
1 parent 768d46e commit 39bc4ea

File tree

4 files changed

+314
-32
lines changed

4 files changed

+314
-32
lines changed

be/src/runtime/batch_write/isomorphic_batch_write.cpp

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,18 @@ bvar::Adder<int64_t> g_mc_send_rpc_total("merge_commit", "send_rpc_total");
5454
bvar::Adder<int64_t> g_mc_register_pipe_total("merge_commit", "register_pipe_total");
5555
// Counter for stream load pipes unregistered for merge commit operations
5656
bvar::Adder<int64_t> g_mc_unregister_pipe_total("merge_commit", "unregister_pipe_total");
57-
// Latency recorder for end-to-end merge commit processing time (nanoseconds)
58-
bvar::LatencyRecorder g_mc_total_latency_ns("merge_commit", "reqeust");
59-
// Latency recorder for time tasks spend in pending queue before execution (nanoseconds)
60-
bvar::LatencyRecorder g_mc_pending_latency_ns("merge_commit", "pending");
61-
// Latency recorder for combined RPC request time and pipe availability wait time (nanoseconds)
62-
bvar::LatencyRecorder g_mc_wait_plan_latency_ns("merge_commit", "wait_plan");
63-
// Latency recorder for time spent appending data to stream load pipes (nanoseconds)
64-
bvar::LatencyRecorder g_mc_append_pipe_latency_ns("merge_commit", "append_pipe");
65-
// Latency recorder for time spent waiting for load operations to complete (nanoseconds)
66-
bvar::LatencyRecorder g_mc_wait_finish_latency_ns("merge_commit", "wait_finish");
57+
// Latency recorder for end-to-end merge commit processing time (microseconds)
58+
bvar::LatencyRecorder g_mc_total_latency_us("merge_commit", "request");
59+
// Latency recorder for time tasks spend in pending queue before execution (microseconds)
60+
bvar::LatencyRecorder g_mc_pending_latency_us("merge_commit", "pending");
61+
// Latency recorder for combined RPC request time and pipe availability wait time (microseconds)
62+
bvar::LatencyRecorder g_mc_wait_plan_latency_us("merge_commit", "wait_plan");
63+
// Latency recorder for time spent appending data to stream load pipes (microseconds)
64+
bvar::LatencyRecorder g_mc_append_pipe_latency_us("merge_commit", "append_pipe");
65+
// Latency recorder for time spent waiting for load operations to complete (microseconds)
66+
bvar::LatencyRecorder g_mc_wait_finish_latency_us("merge_commit", "wait_finish");
67+
68+
#define NS_TO_US(x) ((x) / 1000)
6769

6870
class AsyncAppendDataContext {
6971
public:
@@ -259,7 +261,7 @@ Status IsomorphicBatchWrite::append_data(StreamLoadContext* data_ctx) {
259261
} else {
260262
g_mc_fail_total << 1;
261263
}
262-
g_mc_total_latency_ns << (MonotonicNanos() - start_time_ns);
264+
g_mc_total_latency_us << NS_TO_US(MonotonicNanos() - start_time_ns);
263265
});
264266
if (_stopped.load(std::memory_order_acquire)) {
265267
return Status::ServiceUnavailable("Batch write is stopped");
@@ -296,14 +298,14 @@ Status IsomorphicBatchWrite::_create_and_wait_async_task(starrocks::StreamLoadCo
296298
async_ctx->total_cost_ns.store(MonotonicNanos() - async_ctx->create_time_ts);
297299
TRACE_BATCH_WRITE << "wait async finish, " << _batch_write_id << ", user label: " << async_ctx->data_ctx()->label
298300
<< ", user ip: " << data_ctx->auth.user_ip << ", data size: " << data_ctx->receive_bytes
299-
<< ", total_cost: " << (async_ctx->total_cost_ns / 1000)
300-
<< "us, total_async_cost: " << (async_ctx->total_async_cost_ns / 1000)
301-
<< "us, task_pending_cost: " << (async_ctx->task_pending_cost_ns / 1000)
302-
<< "us, append_pipe_cost: " << (async_ctx->append_pipe_cost_ns / 1000)
303-
<< "us, rpc_cost: " << (async_ctx->rpc_cost_ns / 1000)
304-
<< "us, wait_pipe_cost: " << (async_ctx->wait_pipe_cost_ns / 1000)
301+
<< ", total_cost: " << NS_TO_US(async_ctx->total_cost_ns)
302+
<< "us, total_async_cost: " << NS_TO_US(async_ctx->total_async_cost_ns)
303+
<< "us, task_pending_cost: " << NS_TO_US(async_ctx->task_pending_cost_ns)
304+
<< "us, append_pipe_cost: " << NS_TO_US(async_ctx->append_pipe_cost_ns)
305+
<< "us, rpc_cost: " << NS_TO_US(async_ctx->rpc_cost_ns)
306+
<< "us, wait_pipe_cost: " << NS_TO_US(async_ctx->wait_pipe_cost_ns)
305307
<< "us, num retries: " << async_ctx->num_retries
306-
<< ", pipe_left_active: " << (async_ctx->pipe_left_active_ns / 1000)
308+
<< ", pipe_left_active: " << NS_TO_US(async_ctx->pipe_left_active_ns)
307309
<< ", async_status: " << async_ctx->get_status() << ", txn_id: " << async_ctx->txn_id()
308310
<< ", label: " << async_ctx->label();
309311
data_ctx->txn_id = async_ctx->txn_id();
@@ -327,20 +329,20 @@ int IsomorphicBatchWrite::_execute_tasks(void* meta, bthread::TaskIterator<Task>
327329
ctx->task_pending_cost_ns.store(MonotonicNanos() - ctx->create_time_ts);
328330
g_mc_pending_total << -1;
329331
g_mc_pending_bytes << -ctx->data_ctx()->receive_bytes;
330-
g_mc_pending_latency_ns << ctx->task_pending_cost_ns;
332+
g_mc_pending_latency_us << NS_TO_US(ctx->task_pending_cost_ns);
331333
auto st = batch_write->_execute_write(ctx);
332334
ctx->finish_async(st);
333335
ctx->total_async_cost_ns.store(MonotonicNanos() - start_ts);
334336
TRACE_BATCH_WRITE << "async task finish, " << batch_write->_batch_write_id
335337
<< ", user label: " << ctx->data_ctx()->label
336338
<< ", data size: " << ctx->data_ctx()->receive_bytes
337-
<< ", total_async_cost: " << (ctx->total_async_cost_ns / 1000)
338-
<< "us, task_pending_cost: " << (ctx->task_pending_cost_ns / 1000)
339-
<< "us, append_pipe_cost: " << (ctx->append_pipe_cost_ns / 1000)
340-
<< "us, rpc_cost: " << (ctx->rpc_cost_ns / 1000)
341-
<< "us, wait_pipe_cost: " << (ctx->wait_pipe_cost_ns / 1000)
339+
<< ", total_async_cost: " << NS_TO_US(ctx->total_async_cost_ns)
340+
<< "us, task_pending_cost: " << NS_TO_US(ctx->task_pending_cost_ns)
341+
<< "us, append_pipe_cost: " << NS_TO_US(ctx->append_pipe_cost_ns)
342+
<< "us, rpc_cost: " << NS_TO_US(ctx->rpc_cost_ns)
343+
<< "us, wait_pipe_cost: " << NS_TO_US(ctx->wait_pipe_cost_ns)
342344
<< "us, num retries: " << ctx->num_retries
343-
<< ", pipe_left_active: " << (ctx->pipe_left_active_ns / 1000) << "us, status: " << st
345+
<< ", pipe_left_active: " << NS_TO_US(ctx->pipe_left_active_ns) << "us, status: " << st
344346
<< ", txn_id: " << ctx->txn_id() << ", label: " << ctx->label();
345347
;
346348
AsyncAppendDataContext::release(ctx);
@@ -387,13 +389,13 @@ Status IsomorphicBatchWrite::_execute_write(AsyncAppendDataContext* async_ctx) {
387389
async_ctx->rpc_cost_ns.store(rpc_cost_ns);
388390
async_ctx->wait_pipe_cost_ns.store(wait_pipe_cost_ns);
389391
async_ctx->num_retries.store(num_retries);
390-
g_mc_append_pipe_latency_ns << write_data_cost_ns;
391-
g_mc_wait_plan_latency_ns << (rpc_cost_ns + wait_pipe_cost_ns);
392+
g_mc_append_pipe_latency_us << NS_TO_US(write_data_cost_ns);
393+
g_mc_wait_plan_latency_us << NS_TO_US(rpc_cost_ns + wait_pipe_cost_ns);
392394
if (!st.ok()) {
393395
std::stringstream stream;
394396
stream << "Failed to write data to stream load pipe, num retry: " << num_retries
395-
<< ", write_data: " << (write_data_cost_ns / 1000) << " us, rpc: " << (rpc_cost_ns / 1000)
396-
<< "us, wait_pipe: " << (wait_pipe_cost_ns / 1000) << " us, last error: " << st;
397+
<< ", write_data: " << NS_TO_US(write_data_cost_ns) << " us, rpc: " << NS_TO_US(rpc_cost_ns)
398+
<< "us, wait_pipe: " << NS_TO_US(wait_pipe_cost_ns) << " us, last error: " << st;
397399
st = Status::InternalError(stream.str());
398400
}
399401
return st;
@@ -464,7 +466,7 @@ Status IsomorphicBatchWrite::_send_rpc_request(StreamLoadContext* data_ctx) {
464466
config::merge_commit_rpc_reqeust_timeout_ms);
465467
TRACE_BATCH_WRITE << "receive requestBatchWrite response, " << _batch_write_id
466468
<< ", user label: " << data_ctx->label << ", master: " << master_addr
467-
<< ", cost: " << ((MonotonicNanos() - start_ts) / 1000) << "us, status: " << st
469+
<< ", cost: " << NS_TO_US(MonotonicNanos() - start_ts) << "us, status: " << st
468470
<< ", response: " << response;
469471
#else
470472
TEST_SYNC_POINT_CALLBACK("IsomorphicBatchWrite::send_rpc_request::request", &request);
@@ -490,10 +492,10 @@ Status IsomorphicBatchWrite::_wait_for_load_finish(StreamLoadContext* data_ctx)
490492
int64_t start_ts = MonotonicNanos();
491493
StatusOr<TxnState> status_or = subscriber->wait_finished_state(left_timeout_ms * 1000);
492494
data_ctx->mc_wait_finish_cost_nanos = MonotonicNanos() - start_ts;
493-
g_mc_wait_finish_latency_ns << data_ctx->mc_wait_finish_cost_nanos;
495+
g_mc_wait_finish_latency_us << NS_TO_US(data_ctx->mc_wait_finish_cost_nanos);
494496
TRACE_BATCH_WRITE << "finish to wait load, " << _batch_write_id << ", user label: " << data_ctx->label
495497
<< ", txn_id: " << data_ctx->txn_id << ", load label: " << data_ctx->batch_write_label
496-
<< ", cost: " << (data_ctx->mc_wait_finish_cost_nanos / 1000)
498+
<< ", cost: " << NS_TO_US(data_ctx->mc_wait_finish_cost_nanos)
497499
<< "us, wait status: " << status_or.status() << ", "
498500
<< (status_or.ok() ? status_or.value() : subscriber->current_state());
499501
if (!status_or.ok()) {

docs/en/administration/management/monitoring/metrics.md

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1854,3 +1854,95 @@ All transaction metrics share the following labels:
18541854
- Unit: ms
18551855
- Type: Summary
18561856
- Description: The final acknowledgment latency, from when the `publish` task finishes to the final `finish` time when the transaction is marked as `VISIBLE`. This metric includes any final steps or acknowledgments required.
1857+
1858+
### Merge Commit BE Metrics
1859+
1860+
#### merge_commit_request_total
1861+
1862+
- Unit: Count
1863+
- Type: Cumulative
1864+
- Description: Total number of merge commit requests received by BE.
1865+
1866+
#### merge_commit_request_bytes
1867+
1868+
- Unit: Bytes
1869+
- Type: Cumulative
1870+
- Description: Total bytes of data received across merge commit requests.
1871+
1872+
#### merge_commit_success_total
1873+
1874+
- Unit: Count
1875+
- Type: Cumulative
1876+
- Description: Merge commit requests that finished successfully.
1877+
1878+
#### merge_commit_fail_total
1879+
1880+
- Unit: Count
1881+
- Type: Cumulative
1882+
- Description: Merge commit requests that failed.
1883+
1884+
#### merge_commit_pending_total
1885+
1886+
- Unit: Count
1887+
- Type: Instantaneous
1888+
- Description: Merge commit tasks currently waiting in the execution queue.
1889+
1890+
#### merge_commit_pending_bytes
1891+
1892+
- Unit: Bytes
1893+
- Type: Instantaneous
1894+
- Description: Total bytes of data held by pending merge commit tasks.
1895+
1896+
#### merge_commit_send_rpc_total
1897+
1898+
- Unit: Count
1899+
- Type: Cumulative
1900+
- Description: RPC requests sent to FE for starting merge commit operations.
1901+
1902+
#### merge_commit_register_pipe_total
1903+
1904+
- Unit: Count
1905+
- Type: Cumulative
1906+
- Description: Stream load pipes registered for merge commit operations.
1907+
1908+
#### merge_commit_unregister_pipe_total
1909+
1910+
- Unit: Count
1911+
- Type: Cumulative
1912+
- Description: Stream load pipes unregistered from merge commit operations.
1913+
1914+
Latency metrics expose percentile series such as `merge_commit_request_latency_99` and `merge_commit_request_latency_90`, reported in microseconds. The end-to-end latency obeys:
1915+
1916+
`merge_commit_request = merge_commit_pending + merge_commit_wait_plan + merge_commit_append_pipe + merge_commit_wait_finish`
1917+
1918+
> **Note**: Before v3.4.11, v3.5.12, and v4.0.4, these latency metrics were reported in nanoseconds.
1919+
1920+
#### merge_commit_request
1921+
1922+
- Unit: microsecond
1923+
- Type: Summary
1924+
- Description: End-to-end processing latency for merge commit requests.
1925+
1926+
#### merge_commit_pending
1927+
1928+
- Unit: microsecond
1929+
- Type: Summary
1930+
- Description: Time merge commit tasks spend waiting in the pending queue before execution.
1931+
1932+
#### merge_commit_wait_plan
1933+
1934+
- Unit: microsecond
1935+
- Type: Summary
1936+
- Description: Combined latency for the RPC request and waiting for the stream load pipe to become available.
1937+
1938+
#### merge_commit_append_pipe
1939+
1940+
- Unit: microsecond
1941+
- Type: Summary
1942+
- Description: Time spent appending data to the stream load pipe during merge commit.
1943+
1944+
#### merge_commit_wait_finish
1945+
1946+
- Unit: microsecond
1947+
- Type: Summary
1948+
- Description: Time spent waiting for merge commit load operations to finish.

docs/ja/administration/management/monitoring/metrics.md

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1849,3 +1849,97 @@ StarRocks クラスタのモニタリングサービスの構築方法につい
18491849
- 単位: ms
18501850
- タイプ: Summary
18511851
- 説明: 最終的な確認遅延。`publish` タスクが完了してから、トランザクションが `VISIBLE` としてマークされる最終的な `finish` 時点までの時間。このメトリックには、必要な最終的なステップや確認が含まれます。
1852+
1853+
### Merge Commit メトリクス
1854+
1855+
これらのメトリクスは、等型バッチ書き込み経路での merge commit 処理を追跡します。
1856+
1857+
#### merge_commit_request_total
1858+
1859+
- 単位: Count
1860+
- タイプ: Cumulative
1861+
- 説明: BE が受信した merge commit リクエストの総数。
1862+
1863+
#### merge_commit_request_bytes
1864+
1865+
- 単位: Bytes
1866+
- タイプ: Cumulative
1867+
- 説明: merge commit リクエストで受信したデータ総量。
1868+
1869+
#### merge_commit_success_total
1870+
1871+
- 単位: Count
1872+
- タイプ: Cumulative
1873+
- 説明: 正常に完了した merge commit リクエスト数。
1874+
1875+
#### merge_commit_fail_total
1876+
1877+
- 単位: Count
1878+
- タイプ: Cumulative
1879+
- 説明: 失敗した merge commit リクエスト数。
1880+
1881+
#### merge_commit_pending_total
1882+
1883+
- 単位: Count
1884+
- タイプ: Instantaneous
1885+
- 説明: 実行キューで待機している merge commit タスク数。
1886+
1887+
#### merge_commit_pending_bytes
1888+
1889+
- 単位: Bytes
1890+
- タイプ: Instantaneous
1891+
- 説明: 待機中の merge commit タスクが保持するデータ総量。
1892+
1893+
#### merge_commit_send_rpc_total
1894+
1895+
- 単位: Count
1896+
- タイプ: Cumulative
1897+
- 説明: merge commit を開始するために送信した RPC リクエスト数。
1898+
1899+
#### merge_commit_register_pipe_total
1900+
1901+
- 単位: Count
1902+
- タイプ: Cumulative
1903+
- 説明: merge commit 用に登録された stream load pipe の数。
1904+
1905+
#### merge_commit_unregister_pipe_total
1906+
1907+
- 単位: Count
1908+
- タイプ: Cumulative
1909+
- 説明: merge commit 用に登録解除された stream load pipe の数。
1910+
1911+
レイテンシメトリクスは、`merge_commit_request_latency_99``merge_commit_request_latency_90` などのパーセンタイル系列をマイクロ秒単位で出力します。エンドツーエンドのレイテンシは以下の式に従います:
1912+
1913+
`merge_commit_request = merge_commit_pending + merge_commit_wait_plan + merge_commit_append_pipe + merge_commit_wait_finish`
1914+
1915+
> **注意**: v3.4.11、v3.5.12、および v4.0.4 より前のバージョンでは、これらのレイテンシメトリクスはナノ秒単位で報告されていました。
1916+
1917+
#### merge_commit_request
1918+
1919+
- 単位: microsecond
1920+
- タイプ: Summary
1921+
- 説明: merge commit リクエストのエンドツーエンド処理レイテンシ。
1922+
1923+
#### merge_commit_pending
1924+
1925+
- 単位: microsecond
1926+
- タイプ: Summary
1927+
- 説明: 実行前に pending キューで待機する時間。
1928+
1929+
#### merge_commit_wait_plan
1930+
1931+
- 単位: microsecond
1932+
- タイプ: Summary
1933+
- 説明: RPC リクエストと stream load pipe が利用可能になるまでの待機を合わせた時間。
1934+
1935+
#### merge_commit_append_pipe
1936+
1937+
- 単位: microsecond
1938+
- タイプ: Summary
1939+
- 説明: merge commit 中に stream load pipe へデータを追加する時間。
1940+
1941+
#### merge_commit_wait_finish
1942+
1943+
- 単位: microsecond
1944+
- タイプ: Summary
1945+
- 説明: merge commit のロード操作が完了するまで待機する時間。

0 commit comments

Comments
 (0)