|
139 | 139 | LVL_SKILL = [-4, 0, 3, 6, 10, 14, 16, 18, 20] |
140 | 140 | LVL_MOVETIMES = [50, 50, 100, 150, 200, 300, 400, 500, 1000] |
141 | 141 | LVL_DEPTHS = [1, 1, 1, 2, 3, 5, 8, 13, 22] |
| 142 | +ABORT_REASON_ENGINE_CRASH = "engine_crash" |
142 | 143 |
|
143 | 144 | NNUE_NET = {} |
144 | 145 |
|
@@ -764,14 +765,23 @@ def run_inner(self): |
764 | 765 |
|
765 | 766 | # Do the next work unit |
766 | 767 | path, request = self.work() |
767 | | - except DEAD_ENGINE_ERRORS: |
| 768 | + except DEAD_ENGINE_ERRORS as err: |
768 | 769 | alive = self.is_alive() |
| 770 | + error = { |
| 771 | + "reason": ABORT_REASON_ENGINE_CRASH, |
| 772 | + "kind": err.__class__.__name__, |
| 773 | + } |
| 774 | + if self.stockfish: |
| 775 | + returncode = self.stockfish.poll() |
| 776 | + if returncode is not None: |
| 777 | + error["engine_returncode"] = returncode |
769 | 778 | if alive: |
770 | 779 | t = next(self.backoff) |
771 | 780 | logging.exception("Engine process has died. Backing off %0.1fs", t) |
772 | 781 |
|
773 | | - # Abort current job |
774 | | - self.abort_job() |
| 782 | + # Tell server this abort is from an engine crash so it can cap retries |
| 783 | + # and avoid rescheduling the same crashing position forever. |
| 784 | + self.abort_job(error=error) |
775 | 785 |
|
776 | 786 | if alive: |
777 | 787 | self.sleep.wait(t) |
@@ -825,15 +835,18 @@ def run_inner(self): |
825 | 835 | logging.error("Unexpected HTTP status for acquire: %d", response.status_code) |
826 | 836 | self.sleep.wait(t) |
827 | 837 |
|
828 | | - def abort_job(self): |
| 838 | + def abort_job(self, error=None): |
829 | 839 | if self.job is None: |
830 | 840 | return |
831 | 841 |
|
832 | 842 | logging.debug("Aborting job %s", self.job["work"]["id"]) |
| 843 | + request = self.make_request() |
| 844 | + if error is not None: |
| 845 | + request["error"] = error |
833 | 846 |
|
834 | 847 | try: |
835 | 848 | response = requests.post(get_endpoint(self.conf, "abort/%s" % self.job["work"]["id"]), |
836 | | - data=json.dumps(self.make_request()), |
| 849 | + data=json.dumps(request), |
837 | 850 | timeout=HTTP_TIMEOUT) |
838 | 851 | if response.status_code == 204: |
839 | 852 | logging.info("Aborted job %s", self.job["work"]["id"]) |
|
0 commit comments