@@ -154,6 +154,32 @@ def run_job(job, jobstore_alias, run_times, logger_name):
154
154
155
155
return events
156
156
157
def _run_job_error(self, job_id, exc, traceback=None):
    """Forward a job failure to the parent class and flag a broken process pool.

    Per the upstream description, the executor calls this hook with the
    exception if there is an error calling `run_job`.

    Background: every now and then job submission starts failing with a
    traceback that ends in

        concurrent.futures.process.BrokenProcessPool: A child process
        terminated abruptly, the process pool is not usable anymore

    (logged as 'Error submitting job "..." to executor "iaexecutor"'), after
    which the collector no longer works. To remember that we are in this
    state, a marker file is created so that the Docker health check fails.
    """
    # Delegate to the parent implementation first.
    super()._run_job_error(job_id, exc, traceback)

    exc_type_name = exc.__class__.__name__
    if 'BrokenProcessPool' not in exc_type_name:
        return

    # The Docker health check looks for this file; if it exists, the
    # container should be restarted. Append mode creates the file when it is
    # missing and leaves any existing content untouched.
    with open('/tmp/fail_health_check', 'a'):
        pass
182
+
157
183
158
184
class Collector (object ):
159
185
__slots__ = 'backend_url' , 'bot_token' , 'scheduler' , 'known_jobs' , 'jobs_refresh_interval'
0 commit comments