
Commit b9e2536

Merge branch 'fix/autoheal' into 'master'

Restart SNMP collector if BrokenProcessPool exception is caught

See merge request grafolean/grafolean-collector-snmp!10

2 parents: a288369 + 8a2e75a

File tree

4 files changed, +52 -3 lines:

  Dockerfile
  collector.py
  docker-compose.dev.yml
  docker-compose.yml


Dockerfile

+1
@@ -40,4 +40,5 @@ RUN \
     echo "alias l='ls -altr'" >> /root/.bashrc
 COPY --from=build-backend /snmpcollector/ /snmpcollector/
 WORKDIR /snmpcollector
+HEALTHCHECK --interval=10s --retries=1 CMD /bin/bash -c "[ ! -f /tmp/fail_health_check ]"
 CMD ["python", "-m", "snmpcollector"]

collector.py

+26
@@ -154,6 +154,32 @@ def run_job(job, jobstore_alias, run_times, logger_name):
 
         return events
 
+    def _run_job_error(self, job_id, exc, traceback=None):
+        """
+            Called by the executor with the exception if there is an error calling `run_job`.
+
+            Sometimes we start getting this traceback, after which the collector no longer works:
+            -----
+            2019-10-04 19:45:38 | ERR | Error submitting job "SNMPCollector.do_snmp (trigger: <collector.MultipleIntervalsTrigger object at 0x7fd866b9aee8>, next run at: 2019-10-04 19:45:38 UTC)" to executor "iaexecutor"
+            Traceback (most recent call last):
+              File "/usr/local/lib/python3.6/site-packages/apscheduler/schedulers/base.py", line 974, in _process_jobs
+                executor.submit_job(job, run_times)
+              File "/usr/local/lib/python3.6/site-packages/apscheduler/executors/base.py", line 71, in submit_job
+                self._do_submit_job(job, run_times)
+              File "./collector.py", line 92, in _do_submit_job
+              File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 452, in submit
+                raise BrokenProcessPool('A child process terminated '
+            concurrent.futures.process.BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore
+            -----
+
+            The idea is that we remember that we are in this state, so that we can make the Docker health check fail.
+        """
+        super()._run_job_error(job_id, exc, traceback)
+
+        if 'BrokenProcessPool' in exc.__class__.__name__:
+            # this file is checked by the Docker health check; if it exists, the container should be restarted:
+            open('/tmp/fail_health_check', 'a').close()
+
 
 class Collector(object):
     __slots__ = 'backend_url', 'bot_token', 'scheduler', 'known_jobs', 'jobs_refresh_interval'
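To see why the commit reaches for a container restart rather than trying to recover in-process: once one worker of a ProcessPoolExecutor dies abruptly, the whole pool is permanently broken and every later submit() raises BrokenProcessPool. The following is a standalone sketch of that failure mode (not code from this repository; it only reuses the sentinel-file convention introduced here):

import os
import concurrent.futures
from concurrent.futures.process import BrokenProcessPool


def crash():
    # simulate a worker that is killed abruptly (e.g. OOM-killed)
    os._exit(1)


def ping():
    return "pong"


if __name__ == "__main__":
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=1)

    try:
        pool.submit(crash).result()
    except BrokenProcessPool as exc:
        # the state that _run_job_error() detects; mark it the same way the commit does,
        # so that the Docker HEALTHCHECK starts failing:
        open('/tmp/fail_health_check', 'a').close()
        print('pool is broken:', exc)

    try:
        # the pool never recovers on its own - this fails immediately, just like the
        # "Error submitting job" lines quoted in the docstring above:
        pool.submit(ping).result()
    except BrokenProcessPool:
        print('still broken; restarting the container is the chosen recovery')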

docker-compose.dev.yml

+13 -3

@@ -32,11 +32,21 @@ services:
     # so that Docker networking is bypassed.
     network_mode: "host"
 
+
   redis:
     image: redis:5-alpine
     container_name: grafolean-collector-snmp-redis
     ports:
-      - "127.0.0.1:6379:6379"
-    # We advise not to use `network_mode: "host"` in production, because it would expose Redis to host network
-    # (even if access is limited to 127.0.0.1).
+      - "6379:6379"
+    # We advise not to use `network_mode: "host"` in production, because it would expose Redis to the network.
     network_mode: "host"
+
+
+  autoheal:
+    image: willfarrell/autoheal
+    container_name: autoheal-snmp
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=all
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    restart: always

docker-compose.yml

+12
@@ -26,7 +26,19 @@ services:
       - REDIS_HOST=redis
     restart: always
 
+
   redis:
     image: redis:5-alpine
     container_name: grafolean-collector-snmp-redis
     restart: always
+
+
+  autoheal:
+    # This container automatically restarts any container that fails its health check. Not a bullet-proof solution, but better than nothing.
+    image: willfarrell/autoheal
+    container_name: autoheal-snmp
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=all
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    restart: always
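The autoheal service is what turns a failing health check into an actual restart: through the mounted /var/run/docker.sock it watches container health and restarts anything Docker reports as unhealthy, and AUTOHEAL_CONTAINER_LABEL=all widens that to every container rather than only labelled ones. A rough Python sketch of that loop, purely illustrative (this is not autoheal's actual implementation; it assumes the docker SDK, pip install docker):

import time

import docker

client = docker.from_env()  # connects to /var/run/docker.sock by default

while True:
    # find containers whose HEALTHCHECK currently reports "unhealthy" and restart them
    for container in client.containers.list(filters={'health': 'unhealthy'}):
        print('restarting unhealthy container:', container.name)
        container.restart()
    time.sleep(10)  # roughly the cadence of the HEALTHCHECK interval above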
