Skip to content

Commit 5e2760d

Browse files
committed
Merge pull request #1566 from DataDog/yann/cherry-picked-jmx
[jmxfetch] fix windows bootloop 🐛
2 parents 0977b84 + 8d68061 commit 5e2760d

File tree

7 files changed

+139
-61
lines changed

7 files changed

+139
-61
lines changed

CHANGELOG.md

+12-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
Changes
22
=======
33

4+
# 5.3.1 / 04-21-2015
5+
**Windows only**
6+
7+
### Details
8+
https://github.com/DataDog/dd-agent/compare/5.3.0...5.3.1
9+
10+
### Changes
11+
* [BUGFIX] JMXFetch: Fix bootloop issue when no JMX integration is set. See [#1561][]
12+
13+
414
# 5.3.0 / 04-16-2015
515
### Details
616
https://github.com/DataDog/dd-agent/compare/5.2.2...5.3.0
@@ -1558,6 +1568,7 @@ If you use ganglia, you want this version.
15581568
[#1511]: https://github.com/DataDog/dd-agent/issues/1511
15591569
[#1512]: https://github.com/DataDog/dd-agent/issues/1512
15601570
[#1518]: https://github.com/DataDog/dd-agent/issues/1518
1571+
[#1561]: https://github.com/DataDog/dd-agent/issues/1561
15611572
[@AirbornePorcine]: https://github.com/AirbornePorcine
15621573
[@CaptTofu]: https://github.com/CaptTofu
15631574
[@Osterjour]: https://github.com/Osterjour
@@ -1613,4 +1624,4 @@ If you use ganglia, you want this version.
16131624
[@stefan-mees]: https://github.com/stefan-mees
16141625
[@takus]: https://github.com/takus
16151626
[@tomduckering]: https://github.com/tomduckering
1616-
[@walkeran]: https://github.com/walkeran
1627+
[@walkeran]: https://github.com/walkeran

checks/check_status.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,8 @@ def to_dict(self):
517517
status_info['checks'][cs.name] = {'instances': {}}
518518
if cs.init_failed_error:
519519
status_info['checks'][cs.name]['init_failed'] = True
520-
status_info['checks'][cs.name]['traceback'] = cs.init_failed_traceback
520+
status_info['checks'][cs.name]['traceback'] = \
521+
cs.init_failed_traceback or cs.init_failed_error
521522
else:
522523
status_info['checks'][cs.name] = {'instances': {}}
523524
status_info['checks'][cs.name]['init_failed'] = False

jmxfetch.py

+62-24
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# std
2-
import os
3-
import logging
42
import glob
3+
import logging
4+
import os
55
import subprocess
6-
import time
76
import sys
87
import signal
8+
import time
99

1010
# datadog
1111
from util import get_os, yLoader, yDumper
@@ -68,6 +68,7 @@ def __init__(self, confd_path, agentConfig):
6868
self.check_frequency = DEFAULT_CHECK_FREQUENCY
6969

7070
self.jmx_process = None
71+
self.jmx_checks = None
7172

7273
def terminate(self):
7374
self.jmx_process.terminate()
@@ -77,35 +78,63 @@ def _handle_sigterm(self, signum, frame):
7778
log.debug("Caught sigterm. Stopping subprocess.")
7879
self.jmx_process.terminate()
7980

80-
def initialize(self):
81-
# Gracefully exit on sigterm.
82-
signal.signal(signal.SIGTERM, self._handle_sigterm)
81+
def register_signal_handlers(self):
82+
"""
83+
Enable SIGTERM and SIGINT handlers
84+
"""
85+
try:
86+
# Gracefully exit on sigterm
87+
signal.signal(signal.SIGTERM, self._handle_sigterm)
8388

84-
# Handle Keyboard Interrupt
85-
signal.signal(signal.SIGINT, self._handle_sigterm)
89+
# Handle Keyboard Interrupt
90+
signal.signal(signal.SIGINT, self._handle_sigterm)
8691

87-
self.run()
92+
except ValueError:
93+
log.exception("Unable to register signal handlers.")
94+
95+
def configure(self, check_list=None):
96+
"""
97+
Instantiate JMXFetch parameters.
98+
"""
99+
self.jmx_checks, self.invalid_checks, self.java_bin_path, self.java_options, self.tools_jar_path = \
100+
self.get_configuration(check_list)
101+
102+
def should_run(self):
103+
"""
104+
Should JMXFetch run ?
105+
"""
106+
return self.jmx_checks is not None and self.jmx_checks != []
107+
108+
def run(self, command=None, check_list=None, reporter=None):
109+
110+
if check_list or self.jmx_checks is None:
111+
# (Re)set/(re)configure JMXFetch parameters when `check_list` is specified or
112+
# no configuration was found
113+
self.configure(check_list)
88114

89-
def run(self, command=None, checks_list=None, reporter=None):
90115
try:
91116
command = command or JMX_COLLECT_COMMAND
92-
jmx_checks, invalid_checks, java_bin_path, java_options, tools_jar_path = \
93-
self._should_run(checks_list)
94-
if len(invalid_checks) > 0:
117+
118+
if len(self.invalid_checks) > 0:
95119
try:
96-
self._write_status_file(invalid_checks)
120+
self._write_status_file(self.invalid_checks)
97121
except Exception:
98122
log.exception("Error while writing JMX status file")
99123

100-
if len(jmx_checks) > 0:
101-
self._start(
102-
java_bin_path, java_options, jmx_checks, command, reporter, tools_jar_path)
103-
return True
124+
if len(self.jmx_checks) > 0:
125+
return self._start(self.java_bin_path, self.java_options, self.jmx_checks,
126+
command, reporter, self.tools_jar_path)
127+
else:
128+
# We're exiting purposefully, so exit with zero (supervisor's expected
129+
# code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
130+
# and thus can exit cleanly.
131+
time.sleep(4)
132+
log.info("No valid JMX integration was found. Exiting ...")
104133
except Exception:
105134
log.exception("Error while initiating JMXFetch")
106135
raise
107136

108-
def _should_run(self, checks_list):
137+
def get_configuration(self, checks_list=None):
109138
"""
110139
Return a tuple (jmx_checks, invalid_checks, java_bin_path, java_options)
111140
@@ -171,7 +200,6 @@ def _should_run(self, checks_list):
171200
return (jmx_checks, invalid_checks, java_bin_path, java_options, tools_jar_path)
172201

173202
def _start(self, path_to_java, java_run_opts, jmx_checks, command, reporter, tools_jar_path):
174-
175203
statsd_port = self.agentConfig.get('dogstatsd_port', "8125")
176204
if reporter is None:
177205
reporter = "statsd:%s" % str(statsd_port)
@@ -219,12 +247,24 @@ def _start(self, path_to_java, java_run_opts, jmx_checks, command, reporter, too
219247
log.info("Running %s" % " ".join(subprocess_args))
220248
jmx_process = subprocess.Popen(subprocess_args, close_fds=True)
221249
self.jmx_process = jmx_process
250+
251+
# Register SIGINT and SIGTERM signal handlers
252+
self.register_signal_handlers()
253+
254+
# Wait for JMXFetch to return
222255
jmx_process.wait()
223256

224257
return jmx_process.returncode
225258

226259
except OSError:
227-
log.exception("Couldn't launch JMXTerm. Is java in your PATH?")
260+
java_path_msg = "Couldn't launch JMXTerm. Is Java in your PATH ?"
261+
log.exception(java_path_msg)
262+
invalid_checks = {}
263+
for check in jmx_checks:
264+
check_name = check.split('.')[0]
265+
check_name = check_name.encode('ascii', 'ignore')
266+
invalid_checks[check_name] = java_path_msg
267+
self._write_status_file(invalid_checks)
228268
raise
229269
except Exception:
230270
log.exception("Couldn't launch JMXFetch")
@@ -361,9 +401,7 @@ def main(config_path=None):
361401
confd_path, agentConfig = init(config_path)
362402

363403
jmx = JMXFetch(confd_path, agentConfig)
364-
jmx.initialize()
365-
366-
return 0
404+
return jmx.run()
367405

368406
if __name__ == '__main__':
369407
sys.exit(main())

packaging/datadog-agent/source/supervisord.conf

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ command=python agent/jmxfetch.py
4545
stdout_logfile=supervisord/logs/jmxfetch.log
4646
redirect_stderr=true
4747
priority=999
48-
startsecs=0
48+
startsecs=3
4949

5050
[group:datadog-agent]
5151
programs=forwarder,collector,dogstatsd,jmxfetch

packaging/supervisor.conf

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ stdout_logfile=NONE
5353
stderr_logfile=NONE
5454
redirect_stderr=true
5555
priority=999
56-
startsecs=0
56+
startsecs=3
5757

5858
[group:datadog-agent]
5959
programs=forwarder,collector,dogstatsd,jmxfetch

supervisord.dev.conf

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ command=python jmxfetch.py
4343
stdout_logfile=jmxfetch.log
4444
redirect_stderr=true
4545
priority=999
46-
startsecs=0
46+
startsecs=3
4747

4848
[group:datadog-agent]
4949
programs=forwarder,collector,dogstatsd,jmxfetch

win32/agent.py

+60-32
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@
3434
log = logging.getLogger(__name__)
3535

3636
SERVICE_SLEEP_INTERVAL = 1
37-
MAX_FAILED_HEARTBEATS = 8 # runs of collector
37+
MAX_FAILED_HEARTBEATS = 8 # runs of collector
38+
3839

3940
class AgentSvc(win32serviceutil.ServiceFramework):
4041
_svc_name_ = "DatadogAgent"
@@ -63,14 +64,18 @@ def __init__(self, args):
6364
self._max_failed_heartbeats = \
6465
MAX_FAILED_HEARTBEATS * agentConfig['check_freq'] / SERVICE_SLEEP_INTERVAL
6566

67+
# Watch JMXFetch restarts
68+
self._MAX_JMXFETCH_RESTARTS = 3
69+
self._count_jmxfetch_restarts = 0
70+
6671
# Keep a list of running processes so we can start/end as needed.
6772
# Processes will start started in order and stopped in reverse order.
6873
self.procs = {
69-
'forwarder': DDForwarder(config, self.hostname),
70-
'collector': DDAgent(agentConfig, self.hostname,
71-
heartbeat=self._collector_send_heartbeat),
72-
'dogstatsd': DogstatsdProcess(config, self.hostname),
73-
'jmxfetch': JMXFetchProcess(config, self.hostname),
74+
'forwarder': ProcessWatchDog("forwarder", DDForwarder(config, self.hostname)),
75+
'collector': ProcessWatchDog("collector", DDAgent(agentConfig, self.hostname,
76+
heartbeat=self._collector_send_heartbeat)),
77+
'dogstatsd': ProcessWatchDog("dogstatsd", DogstatsdProcess(config, self.hostname)),
78+
'jmxfetch': ProcessWatchDog("jmxfetch", JMXFetchProcess(config, self.hostname), 3),
7479
}
7580

7681
def SvcStop(self):
@@ -99,9 +104,9 @@ def SvcDoRun(self):
99104
while self.running:
100105
# Restart any processes that might have died.
101106
for name, proc in self.procs.iteritems():
102-
if not proc.is_alive() and proc.is_enabled:
103-
servicemanager.LogInfoMsg("%s has died. Restarting..." % proc.name)
104-
self._restart_proc(name)
107+
if not proc.is_alive() and proc.is_enabled():
108+
servicemanager.LogInfoMsg("%s has died. Restarting..." % name)
109+
proc.restart()
105110

106111
self._check_collector_blocked()
107112

@@ -117,25 +122,49 @@ def _check_collector_blocked(self):
117122
if self._collector_failed_heartbeats > self._max_failed_heartbeats:
118123
servicemanager.LogInfoMsg(
119124
"%s was unresponsive for too long. Restarting..." % 'collector')
120-
self._restart_proc('collector')
125+
self.procs['collector'].restart()
121126
self._collector_failed_heartbeats = 0
122127

123-
def _restart_proc(self, proc_name):
128+
129+
class ProcessWatchDog(object):
130+
"""
131+
Monitor the attached process.
132+
Restarts when it exits until the limit set is reached.
133+
"""
134+
def __init__(self, name, process, max_restarts=5):
135+
self._name = name
136+
self._process = process
137+
self._count_restarts = 0
138+
self._MAX_RESTARTS = max_restarts
139+
140+
def start(self):
141+
return self._process.start()
142+
143+
def terminate(self):
144+
return self._process.terminate()
145+
146+
def is_alive(self):
147+
return self._process.is_alive()
148+
149+
def is_enabled(self):
150+
return self._process.is_enabled
151+
152+
def restart(self):
153+
self._count_restarts += 1
154+
if self._count_restarts >= self._MAX_RESTARTS:
155+
servicemanager.LogInfoMsg(
156+
"%s reached the limit of restarts. Not restarting..." % self._name)
157+
self._process.is_enabled = False
158+
return
159+
124160
# Make a new proc instances because multiprocessing
125161
# won't let you call .start() twice on the same instance.
126-
old_proc = self.procs[proc_name]
127-
if proc_name == 'collector':
128-
new_proc = old_proc.__class__(
129-
old_proc.config, self.hostname, heartbeat=self._collector_send_heartbeat)
130-
else:
131-
new_proc = old_proc.__class__(old_proc.config, self.hostname)
162+
if self._process.is_alive():
163+
self._process.terminate()
132164

133-
if old_proc.is_alive():
134-
old_proc.terminate()
135-
del self.procs[proc_name]
165+
self._process = self._process.__class__(self._process.config, self._process.hostname)
166+
self._process.start()
136167

137-
new_proc.start()
138-
self.procs[proc_name] = new_proc
139168

140169
class DDAgent(multiprocessing.Process):
141170
def __init__(self, agentConfig, hostname, heartbeat=None):
@@ -240,25 +269,24 @@ class JMXFetchProcess(multiprocessing.Process):
240269
def __init__(self, agentConfig, hostname):
241270
multiprocessing.Process.__init__(self, name='jmxfetch')
242271
self.config = agentConfig
243-
self.is_enabled = True
244272
self.hostname = hostname
245273

246-
osname = get_os()
247274
try:
275+
osname = get_os()
248276
confd_path = get_confd_path(osname)
249-
except PathNotFound, e:
250-
log.error("No conf.d folder found at '%s' or in the directory where"
251-
"the Agent is currently deployed.\n" % e.args[0])
277+
self.jmx_daemon = JMXFetch(confd_path, agentConfig)
278+
self.jmx_daemon.configure()
279+
self.is_enabled = self.jmx_daemon.should_run()
252280

253-
self.jmx_daemon = JMXFetch(confd_path, agentConfig)
281+
except PathNotFound:
282+
self.is_enabled = False
254283

255284
def run(self):
256-
log.debug("Windows Service - Starting JMXFetch")
257-
self.jmx_daemon.run()
285+
if self.is_enabled:
286+
self.jmx_daemon.run()
258287

259288
def stop(self):
260-
log.debug("Windows Service - Stopping JMXFetch")
261-
self.jmx_daemon.terminate()
289+
pass
262290

263291

264292
if __name__ == '__main__':

0 commit comments

Comments
 (0)