Skip to content

Commit b4fb86a

Browse files
committed
Don't crash on pauses greater than 10sec
1 parent 3a7ff18 commit b4fb86a

3 files changed

Lines changed: 58 additions & 10 deletions

File tree

lib/olly_gc_stats/olly_gc_stats.5.0.ml

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
include Olly_gc_stats_common
22

3-
let print_percentiles json output hist =
3+
let print_percentiles json output hist outliers =
44
let to_sec x = float_of_int x /. 1_000_000_000. in
55
let ms ns = ns /. 1_000_000. in
6+
let outlier_mean_ms =
7+
if outliers.count = 0 then 0.
8+
else float_of_int outliers.total /. float_of_int outliers.count |> ms
9+
in
610

711
let mean_latency = H.mean hist |> ms
812
and max_latency = float_of_int (H.max hist) |> ms in
@@ -82,10 +86,12 @@ let print_percentiles json output hist =
8286
Buffer.contents buf
8387
in
8488
Printf.fprintf oc
85-
{|{"version": 1, "wall_time": %.2f, "cpu_time": %.2f, "gc_time": %.2f, "gc_overhead": %.2f, "max_rss_kb": %d, "domain_stats": {%s}, "mean_latency": %f, "stddev_latency": %f, "min_latency": %.2f, "max_latency": %f, "distr_latency": {%s}, "allocations": {"total_heap": %.0f, "minor_heap": %.0f, "promoted_words": %.0f, "promoted_pct": %.2f}, "collections": {"minor": %i, "major": %i, "forced_major": %i, "compactions": %i}}|}
89+
{|{"version": 2, "wall_time": %.2f, "cpu_time": %.2f, "gc_time": %.2f, "gc_overhead": %.2f, "max_rss_kb": %d, "domain_stats": {%s}, "mean_latency": %f, "stddev_latency": %f, "min_latency": %.2f, "max_latency": %f, "distr_latency": {%s}, "outliers": {"count": %d, "mean_latency": %f, "max_latency": %f}, "allocations": {"total_heap": %.0f, "minor_heap": %.0f, "promoted_words": %.0f, "promoted_pct": %.2f}, "collections": {"minor": %i, "major": %i, "forced_major": %i, "compactions": %i}}|}
8690
real_time !total_cpu_time total_gc_time gc_overhead
8791
(Olly_common.Max_rss.max_rss_kb rss_collector)
8892
domain_stats mean_latency stddev_latency min_latency max_latency distribs
93+
outliers.count outlier_mean_ms
94+
(float_of_int outliers.max |> ms)
8995
total_heap !minor_words !promoted_words promoted_pct !minor_collections
9096
!major_collections !forced_major_collections !compactions
9197
else (
@@ -127,6 +133,13 @@ let print_percentiles json output hist =
127133
Fun.flip Array.iter percentiles (fun p ->
128134
Printf.fprintf oc "%.4f \t %.2f\n" p
129135
(float_of_int (H.value_at_percentile hist p) |> ms));
136+
if outliers.count > 0 then
137+
Printf.fprintf oc
138+
"#[Beyond histogram (> %.0f ms):\t%d events,\t mean (ms):\t%.2f,\t \
139+
max (ms):\t%.2f]\n"
140+
(float_of_int highest_trackable_value |> ms)
141+
outliers.count outlier_mean_ms
142+
(float_of_int outliers.max |> ms);
130143
Printf.fprintf oc "\n";
131144
Printf.fprintf oc "GC allocations (in words): \n";
132145
Printf.fprintf oc "Total heap:\t %.0f\n" total_heap;
@@ -143,9 +156,10 @@ let gc_stats poll_sleep json output runtime_events_dir runtime_events_log_wsize
143156
exec_args =
144157
let current_event = Hashtbl.create 13 in
145158
let hist =
146-
H.init ~lowest_discernible_value:10 ~highest_trackable_value:10_000_000_000
159+
H.init ~lowest_discernible_value:10 ~highest_trackable_value
147160
~significant_figures:3
148161
in
162+
let outliers = make_outliers () in
149163
let is_gc_phase phase =
150164
match phase with
151165
| Runtime_events.EV_MAJOR | Runtime_events.EV_STW_LEADER
@@ -182,7 +196,7 @@ let gc_stats poll_sleep json output runtime_events_dir runtime_events_log_wsize
182196
| Some (saved_phase, saved_ts) when saved_phase = phase ->
183197
Hashtbl.remove current_event ring_id;
184198
let latency = Int64.to_int (Int64.sub (Ts.to_int64 ts) saved_ts) in
185-
assert (H.record_value hist latency);
199+
record_latency hist outliers latency;
186200
domain_gc_times.(ring_id) <- domain_gc_times.(ring_id) + latency
187201
| _ -> ()
188202
in
@@ -204,7 +218,7 @@ let gc_stats poll_sleep json output runtime_events_dir runtime_events_log_wsize
204218
in
205219

206220
let init = Fun.id in
207-
let cleanup () = print_percentiles json output hist in
221+
let cleanup () = print_percentiles json output hist outliers in
208222
let on_poll = Olly_common.Max_rss.sample rss_collector in
209223
let open Olly_common.Launch in
210224
try

lib/olly_gc_stats/olly_gc_stats.5.3.ml

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,13 @@ let print_per_domain_stats oc =
5252
|> Array.combine domain_major_words);
5353
print_table oc !data
5454

55-
let print_percentiles json output hist =
55+
let print_percentiles json output hist outliers =
5656
let to_sec x = float_of_int x /. 1_000_000_000. in
5757
let ms ns = ns /. 1_000_000. in
58+
let outlier_mean_ms =
59+
if outliers.count = 0 then 0.
60+
else float_of_int outliers.total /. float_of_int outliers.count |> ms
61+
in
5862

5963
let mean_latency = H.mean hist |> ms
6064
and max_latency = float_of_int (H.max hist) |> ms in
@@ -155,10 +159,12 @@ let print_percentiles json output hist =
155159
Buffer.contents buf
156160
in
157161
Printf.fprintf oc
158-
{|{"version": 1, "wall_time": %.2f, "cpu_time": %.2f, "gc_time": %.2f, "gc_overhead": %.2f, "max_rss_kb": %d, "domain_stats": {%s}, "mean_latency": %f, "stddev_latency": %f, "min_latency": %.2f, "max_latency": %f, "distr_latency": {%s}, "allocations": {"total_heap": %.0f, "minor_heap": %.0f, "major_heap": %.0f, "promoted_words": %.0f, "promoted_pct": %.2f}, "domain_alloc_stats": {%s}, "collections": {"minor": %i, "major": %i, "forced_major": %i, "compactions": %i}}|}
162+
{|{"version": 2, "wall_time": %.2f, "cpu_time": %.2f, "gc_time": %.2f, "gc_overhead": %.2f, "max_rss_kb": %d, "domain_stats": {%s}, "mean_latency": %f, "stddev_latency": %f, "min_latency": %.2f, "max_latency": %f, "distr_latency": {%s}, "outliers": {"count": %d, "mean_latency": %f, "max_latency": %f}, "allocations": {"total_heap": %.0f, "minor_heap": %.0f, "major_heap": %.0f, "promoted_words": %.0f, "promoted_pct": %.2f}, "domain_alloc_stats": {%s}, "collections": {"minor": %i, "major": %i, "forced_major": %i, "compactions": %i}}|}
159163
real_time !total_cpu_time total_gc_time gc_overhead
160164
(Olly_common.Max_rss.max_rss_kb rss_collector)
161165
domain_stats mean_latency stddev_latency min_latency max_latency distribs
166+
outliers.count outlier_mean_ms
167+
(float_of_int outliers.max |> ms)
162168
total_heap !minor_words !major_words !promoted_words promoted_pct
163169
domain_alloc_stats !minor_collections !major_collections
164170
!forced_major_collections !compactions
@@ -201,6 +207,13 @@ let print_percentiles json output hist =
201207
Fun.flip Array.iter percentiles (fun p ->
202208
Printf.fprintf oc "%.4f \t %.2f\n" p
203209
(float_of_int (H.value_at_percentile hist p) |> ms));
210+
if outliers.count > 0 then
211+
Printf.fprintf oc
212+
"#[Beyond histogram (> %.0f ms):\t%d events,\t mean (ms):\t%.2f,\t \
213+
max (ms):\t%.2f]\n"
214+
(float_of_int highest_trackable_value |> ms)
215+
outliers.count outlier_mean_ms
216+
(float_of_int outliers.max |> ms);
204217
Printf.fprintf oc "\n";
205218
print_global_allocation_stats oc;
206219
print_per_domain_stats oc;
@@ -213,9 +226,10 @@ let gc_stats poll_sleep json output runtime_events_dir runtime_events_log_wsize
213226
exec_args =
214227
let current_event = Hashtbl.create 13 in
215228
let hist =
216-
H.init ~lowest_discernible_value:10 ~highest_trackable_value:10_000_000_000
229+
H.init ~lowest_discernible_value:10 ~highest_trackable_value
217230
~significant_figures:3
218231
in
232+
let outliers = make_outliers () in
219233
let is_gc_phase phase =
220234
match phase with
221235
| Runtime_events.EV_MAJOR | Runtime_events.EV_STW_LEADER
@@ -252,7 +266,7 @@ let gc_stats poll_sleep json output runtime_events_dir runtime_events_log_wsize
252266
| Some (saved_phase, saved_ts) when saved_phase = phase ->
253267
Hashtbl.remove current_event ring_id;
254268
let latency = Int64.to_int (Int64.sub (Ts.to_int64 ts) saved_ts) in
255-
assert (H.record_value hist latency);
269+
record_latency hist outliers latency;
256270
domain_gc_times.(ring_id) <- domain_gc_times.(ring_id) + latency
257271
| _ -> ()
258272
in
@@ -278,7 +292,7 @@ let gc_stats poll_sleep json output runtime_events_dir runtime_events_log_wsize
278292
in
279293

280294
let init = Fun.id in
281-
let cleanup () = print_percentiles json output hist in
295+
let cleanup () = print_percentiles json output hist outliers in
282296
let on_poll = Olly_common.Max_rss.sample rss_collector in
283297
let open Olly_common.Launch in
284298
try

lib/olly_gc_stats/olly_gc_stats_common.ml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ type ts = { mutable start_time : float; mutable end_time : float }
99
This can be user configurable with OCAMLRUNPARAM=d=XXX
1010
*)
1111
let number_domains = 128
12+
13+
(* Largest GC pause (in nanoseconds) the latency histogram can track. Pauses
14+
beyond this are summarised separately. *)
15+
let highest_trackable_value = 10_000_000_000
16+
1217
let wall_time = { start_time = 0.; end_time = 0. }
1318
let rss_collector = Olly_common.Max_rss.create ()
1419
let domain_elapsed_times = Array.make number_domains 0.
@@ -20,6 +25,21 @@ let major_collections = ref 0
2025
let forced_major_collections = ref 0
2126
let compactions = ref 0
2227

28+
(* Running summary of GC pauses that fall outside the histogram's range. *)
29+
type outliers = { mutable count : int; mutable total : int; mutable max : int }
30+
31+
let make_outliers () = { count = 0; total = 0; max = 0 }
32+
33+
(* Record [latency] into [hist], or, if it is too large for the histogram,
34+
fold it into [outliers]. [H.record_value] returns [false] for out-of-range
35+
values; a non-positive latency is a spurious measurement and is dropped. *)
36+
let record_latency hist outliers latency =
37+
if (not (H.record_value hist latency)) && latency > highest_trackable_value
38+
then (
39+
outliers.count <- outliers.count + 1;
40+
outliers.total <- outliers.total + latency;
41+
if latency > outliers.max then outliers.max <- latency)
42+
2343
let lifecycle domain_id ts lifecycle_event _data =
2444
let ts = float_of_int Int64.(to_int @@ Ts.to_int64 ts) /. 1_000_000_000. in
2545
match lifecycle_event with

0 commit comments

Comments
 (0)