Skip to content

Commit 334c607

Browse files
Emit periodic keepalive events from Worker (#1191) (#1201)
* new CLI arg and envvar for `Worker` mode to optionally emit regular keepalive events; fixes issues with container runtimes that assume long-silent stdout == hung process --------- Co-authored-by: Alan Rominger <arominge@redhat.com> (cherry picked from commit fd9d67a)
1 parent 22c4555 commit 334c607

File tree

4 files changed

+248
-48
lines changed

4 files changed

+248
-48
lines changed

ansible_runner/__main__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,15 @@ def main(sys_args=None):
611611
"Using this will also assure that the directory is deleted when the job finishes."
612612
)
613613
)
614+
worker_subparser.add_argument(
615+
"--keepalive-seconds",
616+
dest="keepalive_seconds",
617+
default=None,
618+
type=int,
619+
help=(
620+
"Emit a synthetic keepalive event every N seconds of idle. (default=0, disabled)"
621+
)
622+
)
614623
process_subparser = subparser.add_parser(
615624
'process',
616625
help="Receive the output of remote ansible-runner work and distribute the results"
@@ -859,6 +868,7 @@ def main(sys_args=None):
859868
limit=vargs.get('limit'),
860869
streamer=streamer,
861870
suppress_env_files=vargs.get("suppress_env_files"),
871+
keepalive_seconds=vargs.get("keepalive_seconds"),
862872
)
863873
try:
864874
res = run(**run_options)
@@ -887,3 +897,7 @@ def main(sys_args=None):
887897
return 0
888898
except OSError:
889899
return 1
900+
901+
902+
if __name__ == '__main__':
903+
sys.exit(main())

ansible_runner/config/_base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def __init__(self,
6767
process_isolation=False, process_isolation_executable=None,
6868
container_image=None, container_volume_mounts=None, container_options=None, container_workdir=None, container_auth_data=None,
6969
ident=None, rotate_artifacts=0, timeout=None, ssh_key=None, quiet=False, json_mode=False,
70-
check_job_event_data=False, suppress_env_files=False):
70+
check_job_event_data=False, suppress_env_files=False, keepalive_seconds=None):
7171
# common params
7272
self.host_cwd = host_cwd
7373
self.envvars = envvars
@@ -95,6 +95,8 @@ def __init__(self,
9595
self.timeout = timeout
9696
self.check_job_event_data = check_job_event_data
9797
self.suppress_env_files = suppress_env_files
98+
# ignore this for now since it's worker-specific and would just trip up old runners
99+
# self.keepalive_seconds = keepalive_seconds
98100

99101
# setup initial environment
100102
if private_data_dir:

ansible_runner/streaming.py

Lines changed: 134 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations # allow newer type syntax until 3.10 is our minimum
2+
13
import codecs
24
import json
35
import os
@@ -6,17 +8,16 @@
68
import tempfile
79
import uuid
810
import traceback
9-
try:
10-
from collections.abc import Mapping
11-
except ImportError:
12-
from collections import Mapping
1311

1412
import ansible_runner
1513
from ansible_runner.exceptions import ConfigurationError
1614
from ansible_runner.loader import ArtifactLoader
1715
import ansible_runner.plugins
1816
from ansible_runner.utils import register_for_cleanup
1917
from ansible_runner.utils.streaming import stream_dir, unstream_dir
18+
from collections.abc import Mapping
19+
from functools import wraps
20+
from threading import Event, RLock, Thread
2021

2122

2223
class UUIDEncoder(json.JSONEncoder):
@@ -38,6 +39,9 @@ def __init__(self, _output=None, **kwargs):
3839
self._output = _output
3940
self.private_data_dir = os.path.abspath(kwargs.pop('private_data_dir'))
4041
self.only_transmit_kwargs = kwargs.pop('only_transmit_kwargs', False)
42+
if 'keepalive_seconds' in kwargs:
43+
kwargs.pop('keepalive_seconds') # don't confuse older runners with this Worker-only arg
44+
4145
self.kwargs = kwargs
4246

4347
self.status = "unstarted"
@@ -60,12 +64,22 @@ def run(self):
6064
return self.status, self.rc
6165

6266

63-
class Worker(object):
64-
def __init__(self, _input=None, _output=None, **kwargs):
67+
class Worker:
68+
def __init__(self, _input=None, _output=None, keepalive_seconds: float | None = None, **kwargs):
6569
if _input is None:
6670
_input = sys.stdin.buffer
6771
if _output is None:
6872
_output = sys.stdout.buffer
73+
74+
if keepalive_seconds is None: # if we didn't get an explicit int value, fall back to envvar
75+
# FIXME: emit/log a warning and silently continue if this value won't parse
76+
keepalive_seconds = float(os.environ.get('ANSIBLE_RUNNER_KEEPALIVE_SECONDS', 0))
77+
78+
self._keepalive_interval_sec = keepalive_seconds
79+
self._keepalive_thread: Thread | None = None
80+
self._output_event = Event()
81+
self._output_lock = RLock()
82+
6983
self._input = _input
7084
self._output = _output
7185

@@ -81,6 +95,64 @@ def __init__(self, _input=None, _output=None, **kwargs):
8195
self.status = "unstarted"
8296
self.rc = None
8397

98+
def _begin_keepalive(self):
99+
"""Starts a keepalive thread at most once"""
100+
if not self._keepalive_thread:
101+
self._keepalive_thread = Thread(target=self._keepalive_loop, daemon=True)
102+
self._keepalive_thread.start()
103+
104+
def _end_keepalive(self):
105+
"""Disable the keepalive interval and notify the keepalive thread to shut down"""
106+
self._keepalive_interval_sec = 0
107+
self._output_event.set()
108+
109+
def _keepalive_loop(self):
110+
"""Main loop for keepalive injection thread; exits when keepalive interval is <= 0"""
111+
while self._keepalive_interval_sec > 0:
112+
# block until output has occurred or keepalive interval elapses
113+
if self._output_event.wait(timeout=self._keepalive_interval_sec):
114+
# output was sent before keepalive timeout; reset the event and start waiting again
115+
self._output_event.clear()
116+
continue
117+
118+
# keepalive interval elapsed; try to send a keepalive...
119+
# pre-acquire the output lock without blocking
120+
if not self._output_lock.acquire(blocking=False):
121+
# something else has the lock; output is imminent, so just skip this keepalive
122+
# NB: a long-running operation under an event handler that's holding this lock but not actually moving
123+
# output could theoretically block keepalives long enough to cause problems, but it's probably not
124+
# worth the added locking hassle to be pedantic about it
125+
continue
126+
127+
try:
128+
# were keepalives recently disabled?
129+
if self._keepalive_interval_sec <= 0:
130+
# we're probably shutting down; don't risk corrupting output by writing now, just bail out
131+
return
132+
# output a keepalive event
133+
# FIXME: this could be a lot smaller (even just `{}`) if a short-circuit discard was guaranteed in
134+
# Processor or if other layers were more defensive about missing event keys and/or unknown dictionary
135+
# values...
136+
self.event_handler(dict(event='keepalive', counter=0, uuid=0))
137+
finally:
138+
# always release the output lock (
139+
self._output_lock.release()
140+
141+
def _synchronize_output_reset_keepalive(wrapped_method):
142+
"""
143+
Utility decorator to synchronize event writes and flushes to avoid keepalives splatting in the middle of
144+
mid-write events, and reset keepalive interval on write completion.
145+
"""
146+
@wraps(wrapped_method)
147+
def wrapper(self, *args, **kwargs):
148+
with self._output_lock:
149+
ret = wrapped_method(self, *args, **kwargs)
150+
# signal the keepalive thread last, so the timeout restarts after the last write, not before the first
151+
self._output_event.set()
152+
return ret
153+
154+
return wrapper
155+
84156
def update_paths(self, kwargs):
85157
if kwargs.get('envvars'):
86158
if 'ANSIBLE_ROLES_PATH' in kwargs['envvars']:
@@ -93,63 +165,72 @@ def update_paths(self, kwargs):
93165
return kwargs
94166

95167
def run(self):
96-
while True:
97-
try:
98-
line = self._input.readline()
99-
data = json.loads(line)
100-
except (json.decoder.JSONDecodeError, IOError):
101-
self.status_handler({'status': 'error', 'job_explanation': 'Failed to JSON parse a line from transmit stream.'}, None)
102-
self.finished_callback(None) # send eof line
103-
return self.status, self.rc
104-
105-
if 'kwargs' in data:
106-
self.job_kwargs = self.update_paths(data['kwargs'])
107-
elif 'zipfile' in data:
168+
self._begin_keepalive()
169+
try:
170+
while True:
108171
try:
109-
unstream_dir(self._input, data['zipfile'], self.private_data_dir)
110-
except Exception:
111-
self.status_handler({
112-
'status': 'error',
113-
'job_explanation': 'Failed to extract private data directory on worker.',
114-
'result_traceback': traceback.format_exc()
115-
}, None)
172+
line = self._input.readline()
173+
data = json.loads(line)
174+
except (json.decoder.JSONDecodeError, IOError):
175+
self.status_handler({'status': 'error', 'job_explanation': 'Failed to JSON parse a line from transmit stream.'}, None)
116176
self.finished_callback(None) # send eof line
117177
return self.status, self.rc
118-
elif 'eof' in data:
119-
break
120178

121-
self.kwargs.update(self.job_kwargs)
122-
self.kwargs['quiet'] = True
123-
self.kwargs['suppress_ansible_output'] = True
124-
self.kwargs['private_data_dir'] = self.private_data_dir
125-
self.kwargs['status_handler'] = self.status_handler
126-
self.kwargs['event_handler'] = self.event_handler
127-
self.kwargs['artifacts_handler'] = self.artifacts_handler
128-
self.kwargs['finished_callback'] = self.finished_callback
129-
130-
r = ansible_runner.interface.run(**self.kwargs)
131-
self.status, self.rc = r.status, r.rc
132-
133-
# FIXME: do cleanup on the tempdir
179+
if 'kwargs' in data:
180+
self.job_kwargs = self.update_paths(data['kwargs'])
181+
elif 'zipfile' in data:
182+
try:
183+
unstream_dir(self._input, data['zipfile'], self.private_data_dir)
184+
except Exception:
185+
self.status_handler({
186+
'status': 'error',
187+
'job_explanation': 'Failed to extract private data directory on worker.',
188+
'result_traceback': traceback.format_exc()
189+
}, None)
190+
self.finished_callback(None) # send eof line
191+
return self.status, self.rc
192+
elif 'eof' in data:
193+
break
194+
195+
self.kwargs.update(self.job_kwargs)
196+
self.kwargs['quiet'] = True
197+
self.kwargs['suppress_ansible_output'] = True
198+
self.kwargs['private_data_dir'] = self.private_data_dir
199+
self.kwargs['status_handler'] = self.status_handler
200+
self.kwargs['event_handler'] = self.event_handler
201+
self.kwargs['artifacts_handler'] = self.artifacts_handler
202+
self.kwargs['finished_callback'] = self.finished_callback
203+
204+
r = ansible_runner.interface.run(**self.kwargs)
205+
self.status, self.rc = r.status, r.rc
206+
207+
# FIXME: do cleanup on the tempdir
208+
finally:
209+
self._end_keepalive()
134210

135211
return self.status, self.rc
136212

213+
@_synchronize_output_reset_keepalive
137214
def status_handler(self, status_data, runner_config):
138215
self.status = status_data['status']
139216
self._output.write(json.dumps(status_data).encode('utf-8'))
140217
self._output.write(b'\n')
141218
self._output.flush()
142219

220+
@_synchronize_output_reset_keepalive
143221
def event_handler(self, event_data):
144222
self._output.write(json.dumps(event_data).encode('utf-8'))
145223
self._output.write(b'\n')
146224
self._output.flush()
147225

226+
@_synchronize_output_reset_keepalive
148227
def artifacts_handler(self, artifact_dir):
149228
stream_dir(artifact_dir, self._output)
150229
self._output.flush()
151230

231+
@_synchronize_output_reset_keepalive
152232
def finished_callback(self, runner_obj):
233+
self._end_keepalive() # ensure that we can't splat a keepalive event after the eof event
153234
self._output.write(json.dumps({'eof': True}).encode('utf-8'))
154235
self._output.write(b'\n')
155236
self._output.flush()
@@ -210,10 +291,18 @@ def status_callback(self, status_data):
210291
self.status_handler(status_data, runner_config=self.config)
211292

212293
def event_callback(self, event_data):
294+
# FIXME: this needs to be more defensive to not blow up on "malformed" events or new values it doesn't recognize
295+
counter = event_data.get('counter')
296+
uuid = event_data.get('uuid')
297+
298+
if not counter or not uuid:
299+
# FIXME: log a warning about a malformed event?
300+
return
301+
213302
full_filename = os.path.join(self.artifact_dir,
214303
'job_events',
215-
'{}-{}.json'.format(event_data['counter'],
216-
event_data['uuid']))
304+
f'{counter}-{uuid}.json')
305+
217306
if not self.quiet and 'stdout' in event_data:
218307
print(event_data['stdout'])
219308

@@ -254,6 +343,9 @@ def run(self):
254343
self.artifacts_callback(data)
255344
elif 'eof' in data:
256345
break
346+
elif data.get('event') == 'keepalive':
347+
# just ignore keepalives
348+
continue
257349
else:
258350
self.event_callback(data)
259351

0 commit comments

Comments
 (0)