Skip to content

Commit f70688a

Browse files
authored
Collect chef metrics (#34)
* Collect chef metrics * Make lintastic * Change service name to chef_metrics_collector * Disable metrics collection in taste tester mode by default Foo Foo 2 * Make lintastic
1 parent d60955e commit f70688a

File tree

5 files changed

+196
-2
lines changed

5 files changed

+196
-2
lines changed
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
require 'chef/handler'
2+
require 'time'
3+
4+
module Boxcutter
  # Chef handler that persists a small JSON summary of each client run so an
  # external collector can turn it into metrics.
  #
  # On a successful run the current time is recorded as the last success time;
  # on a failed run the value from the previously written file is carried over.
  class MetricsHandler < ::Chef::Handler
    # Touch this file to write metrics even when running in taste tester mode.
    FORCE_METRICS_COLLECTOR = '/var/chef/chef-force-metrics-collector'.freeze

    # @param path [String] directory the metrics file is written to
    # @param filename [String] name of the JSON metrics file inside +path+
    def initialize(path: '/var/chef/reports', filename: 'chef-run-metrics.json')
      @path = path
      @filename = filename
    end

    # Entry point for report handling.
    # Will get called here on compile errors instead of exception.
    def report
      Chef::Log.info('boxcutter_chef: metrics_handler - Entering report handler')
      write_metrics
    end

    # Entry point for exception handling.
    # NOTE(review): Chef::Handler appears to invoke #report for exception
    # handlers as well - confirm whether this method is ever called.
    def exception
      Chef::Log.info('boxcutter_chef: metrics_handler - Entering exception handler')
      write_metrics
    end

    private

    # Full path of the JSON metrics file. Does not touch the filesystem.
    def metrics_path
      File.join(@path, @filename)
    end

    # Creates the reports directory if needed and restricts it to its owner.
    def ensure_metrics_dir
      FileUtils.mkdir_p(@path)
      File.chmod(0o700, @path)
    end

    # Loads the metrics written by the previous run.
    #
    # @return [Hash] the parsed file, or an empty hash when the file is
    #   missing, unreadable, not valid JSON, or not a JSON object
    def load_previous
      data = JSON.parse(File.read(metrics_path))
      data.is_a?(Hash) ? data : {}
    rescue JSON::ParserError, SystemCallError
      {}
    end

    # Builds the metrics document for the current run.
    #
    # @return [Hash] stable schema keys (don't rename lightly)
    def build_metrics
      # Ruby to_s is close to ISO 8601, but not quite - use ISO 8601 instead.
      now = Time.now.iso8601
      success = run_status.success?
      # run_status.elapsed_time is end_time - start_time as a float in seconds.
      elapsed = run_status.elapsed_time

      {
        'report_time_iso8601' => now,
        'success' => success ? 1 : 0,
        'last_success_time_iso8601' => success ? now : load_previous['last_success_time_iso8601'],
        'start_time_iso8601' => run_status.start_time&.iso8601,
        'end_time_iso8601' => run_status.end_time&.iso8601,
        # Convert to ms as integer (no way resolution is more granular than ms)
        'elapsed_time_ms' => elapsed && (elapsed * 1000).round,
        # Array() tolerates a nil collection (presumably possible when a run
        # fails very early - confirm against Chef::RunStatus).
        'all_resources_count' => Array(run_status.all_resources).count,
        'updated_resources_count' => Array(run_status.updated_resources).count,
      }
    end

    # Writes +contents+ to +target+ via a temporary file and rename, so a
    # concurrent reader never sees a partially written file.
    def write_atomically(target, contents)
      staging = "#{target}.tmp.#{Process.pid}"
      File.write(staging, contents)
      File.rename(staging, target)
    ensure
      # No-op after a successful rename; cleans up if the write or rename failed.
      FileUtils.rm_f(staging) if staging
    end

    # Writes the metrics file unless running in taste tester mode without the
    # force override. The check comes first so a skipped run has no filesystem
    # side effects.
    def write_metrics
      if taste_tester? && !force_metrics_collector?
        Chef::Log.info(
          'boxcutter_chef: metrics_handler - In taste tester mode. Metrics collector not running. ' \
          "To override, touch #{FORCE_METRICS_COLLECTOR}",
        )
        return
      end

      ensure_metrics_dir
      target = metrics_path
      write_atomically(target, JSON.pretty_generate(build_metrics))
      Chef::Log.info("boxcutter_chef: metrics_handler - wrote chef run metrics to #{target}")
    end

    # True when the configured chef_server_url is not a chefzero:// URL.
    # NOTE(review): assumes regular (non taste tester) runs always use a
    # chefzero:// URL - confirm.
    def taste_tester?
      !Chef::Config[:chef_server_url].start_with?('chefzero://')
    end

    # True when the override marker file exists.
    def force_metrics_collector?
      File.exist?(FORCE_METRICS_COLLECTOR)
    end
  end
end
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/bin/bash
2+
set -euo pipefail
3+
4+
# Adjust as needed.
5+
# JSON config file; its `.tier` value (if present) is used for the `tier` label on the metrics.
CONFIG_FILE="/etc/boxcutter-config.json"
6+
# Directory the generated chef_metrics.prom file is written to.
TEXTFILE_COLLECTOR_DIR=/var/lib/node_exporter/textfile/
7+
# JSON file with the Chef run metrics that this script reads.
INPUT_JSON='/var/chef/reports/chef-run-metrics.json'
8+
9+
# Convert an ISO 8601 timestamp to Unix epoch seconds.
#
# Arguments:
#   $1 - timestamp string
# Outputs:
#   Epoch seconds on stdout; 0 when the argument is empty, is the literal
#   string "null", or cannot be parsed by date.
iso_to_epoch() {
  local stamp="$1"

  if [[ -z "$stamp" || "$stamp" == "null" ]]; then
    echo "0"
    return
  fi

  # GNU date handles ISO 8601 natively
  date -d "$stamp" +%s 2>/dev/null || echo "0"
}
17+
18+
# Print every collected metric in human-readable form, one per line.
# Reads the global variables populated from the Chef run metrics JSON.
metrics_to_stdout() {
  cat << EOF
Report time (ISO 8601): ${report_time_iso8601}
Report time (Unix epoch): ${report_time_unix_epoch}
Success: ${success}
Last success time (ISO 8601): ${last_success_time_iso8601}
Last success time (Unix epoch): ${last_success_unix_epoch}
Start time (ISO 8601): ${start_time_iso8601}
Start time (Unix epoch): ${start_time_unix_epoch}
End time (ISO 8601): ${end_time_iso8601}
End time (Unix epoch): ${end_time_unix_epoch}
Elapsed time (ms): ${elapsed_time_ms}
All resources: ${all_resources_count}
Updated resources: ${updated_resources_count}
EOF
}
32+
33+
# Write the collected metrics in Prometheus text format to
# $TEXTFILE_COLLECTOR_DIR/chef_metrics.prom.
#
# Arguments:
#   $1 - label set appended to every metric name, e.g. {tier="default"}
# Globals read:
#   TEXTFILE_COLLECTOR_DIR, success, last_success_unix_epoch, elapsed_seconds,
#   all_resources_count, updated_resources_count
metrics_to_prometheus() {
  local tags="$1"
  local target="$TEXTFILE_COLLECTOR_DIR/chef_metrics.prom"
  local staging="$target.$$"

  # Each HELP/TYPE pair must name exactly the metric of the sample that follows
  # it; otherwise the sample is exported without its help text and type. The
  # resource-count metadata is aligned to the sample names so the emitted
  # series names stay unchanged.
  cat << EOF > "$staging"
# HELP chef_client_run_success Whether the last Chef run succeeded (1) or failed (0).
# TYPE chef_client_run_success gauge
chef_client_run_success${tags} ${success}
# HELP chef_client_run_last_success_timestamp_seconds Unix timestamp of the most recent successful Chef run.
# TYPE chef_client_run_last_success_timestamp_seconds gauge
chef_client_run_last_success_timestamp_seconds${tags} ${last_success_unix_epoch}
# HELP chef_client_run_duration_seconds Duration of the last Chef run in seconds.
# TYPE chef_client_run_duration_seconds gauge
chef_client_run_duration_seconds${tags} ${elapsed_seconds}
# HELP chef_client_resources_total Total resources in the last Chef run.
# TYPE chef_client_resources_total gauge
chef_client_resources_total${tags} ${all_resources_count}
# HELP chef_client_updated_resources_total Updated resources in the last Chef run.
# TYPE chef_client_updated_resources_total gauge
chef_client_updated_resources_total${tags} ${updated_resources_count}
EOF

  # Rename the temporary file atomically.
  # This avoids the node exporter seeing half a file.
  mv "$staging" "$target"
}
59+
60+
# Extract fields. Timestamp strings use `// empty` so a missing or null key
# yields an empty string (which iso_to_epoch maps to 0). Numeric fields default
# to 0 so the arithmetic below and the generated metrics file stay valid.
report_time_iso8601="$(jq -r '.report_time_iso8601 // empty' "$INPUT_JSON")"
success="$(jq -r '.success // 0' "$INPUT_JSON")"
last_success_time_iso8601="$(jq -r '.last_success_time_iso8601 // empty' "$INPUT_JSON")"
start_time_iso8601="$(jq -r '.start_time_iso8601 // empty' "$INPUT_JSON")"
end_time_iso8601="$(jq -r '.end_time_iso8601 // empty' "$INPUT_JSON")"
elapsed_time_ms="$(jq -r '.elapsed_time_ms // 0' "$INPUT_JSON")"
all_resources_count="$(jq -r '.all_resources_count // 0' "$INPUT_JSON")"
updated_resources_count="$(jq -r '.updated_resources_count // 0' "$INPUT_JSON")"

report_time_unix_epoch="$(iso_to_epoch "$report_time_iso8601")"
last_success_unix_epoch="$(iso_to_epoch "$last_success_time_iso8601")"
start_time_unix_epoch="$(iso_to_epoch "$start_time_iso8601")"
end_time_unix_epoch="$(iso_to_epoch "$end_time_iso8601")"

# Milliseconds -> seconds for the duration gauge.
elapsed_seconds="$(jq -nr --arg v "$elapsed_time_ms" '($v|tonumber) / 1000.0')"

# Set the default first: with `set -u` active, referencing an unset $tier below
# would abort the script whenever the config file is missing or unreadable.
tier="default"
if [[ -r "$CONFIG_FILE" ]]; then
  tier="$(jq -r '.tier // empty' "$CONFIG_FILE")"
  [[ -n "$tier" ]] || tier="default"
fi

tags="{tier=\"$tier\"}"
metrics_to_prometheus "$tags"

cookbooks/boxcutter_chef/files/taste-tester/taste-tester-plugin.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@ def self.test_remote_client_rb_extra_code(_hostname)
77
local_key_generation true
88
json_attribs '/etc/cinc/run-list.json'
99
%w(
10-
attribute-changed-handler.rb
11-
resource-updated-handler.rb
10+
attribute_changed_handler.rb
11+
metrics_handler.rb
1212
).each do |handler|
1313
handler_file = File.join('/etc/cinc/handlers', handler)
1414
if File.exist?(handler_file)
1515
require handler_file
1616
end
1717
end
18+
report_handlers << Boxcutter::MetricsHandler.new()
19+
exception_handlers << Boxcutter::MetricsHandler.new()
1820
ohai.critical_plugins ||= []
1921
ohai.critical_plugins += [:Passwd]
2022
ohai.critical_plugins += [:ShardSeed]

cookbooks/boxcutter_chef/recipes/cinc_client.rb

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,13 @@
118118
mode '0644'
119119
end
120120

121+
# Install the metrics handler source into the handlers directory under
# config_dir, owned by root and world-readable.
cookbook_file "#{config_dir}/handlers/metrics_handler.rb" do
122+
source 'config/metrics_handler.rb'
123+
owner 'root'
124+
group 'root'
125+
mode '0644'
126+
end
127+
121128
template "#{config_dir}/client-prod.rb" do
122129
source 'client-prod.rb.erb'
123130
cookbook 'boxcutter_chef'
@@ -148,3 +155,16 @@
148155
group 'root'
149156
mode '0644'
150157
end
158+
159+
# Install the metrics collector script as a root-owned executable.
cookbook_file '/usr/local/sbin/chef_metrics_collector.sh' do
160+
source 'metrics/chef_metrics_collector.sh'
161+
owner 'root'
162+
group 'root'
163+
mode '0755'
164+
end
165+
166+
# Register an fb_timers job that runs the collector script on a 2-minute
# calendar; the job carries an only_if that checks for the node_exporter
# textfile directory.
node.default['fb_timers']['jobs']['chef_metrics_collector'] = {
167+
'calendar' => FB::Systemd::Calendar.every(2).minutes,
168+
'command' => '/usr/local/sbin/chef_metrics_collector.sh',
169+
'only_if' => proc { File.exist?('/var/lib/node_exporter/textfile') },
170+
}

cookbooks/boxcutter_chef/templates/client-prod.rb.erb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,15 @@ local_key_generation true
1111
json_attribs '/etc/cinc/run-list.json'
1212
%w(
1313
attribute_changed_handler.rb
14+
metrics_handler.rb
1415
).each do |handler|
1516
handler_file = File.join('/etc/cinc/handlers', handler)
1617
if File.exist?(handler_file)
1718
require handler_file
1819
end
1920
end
21+
# The handler file above is only required when it exists, so the constant may
# be undefined here. Register the handler only when it was actually loaded;
# an unconditional reference would raise NameError and break config loading.
if defined?(Boxcutter::MetricsHandler)
  report_handlers << Boxcutter::MetricsHandler.new
  exception_handlers << Boxcutter::MetricsHandler.new
end
2023
ohai.critical_plugins ||= []
2124
ohai.critical_plugins += [:Passwd]
2225
ohai.critical_plugins += [:ShardSeed]

0 commit comments

Comments
 (0)