Skip to content

Commit 73ed5f7

Browse files
Alxiicecbentejac
authored andcommitted
[submitter] Add tools to avoid autoretry on farm
1 parent 4ec805c commit 73ed5f7

File tree

4 files changed

+56
-7
lines changed

4 files changed

+56
-7
lines changed

bin/meshroom_compute

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import argparse
33
import logging
44
import os
55
import sys
6+
from typing import NoReturn
67

78
try:
89
import meshroom
@@ -63,12 +64,28 @@ else:
6364

6465
meshroom.core.initPlugins()
6566
meshroom.core.initNodes()
67+
meshroom.core.initSubmitters()
6668

6769
graph = meshroom.core.graph.loadGraph(args.graphFile)
6870
if args.cache:
6971
graph.cacheDir = args.cache
7072
graph.update()
7173

74+
75+
def killRunningJob(node) -> NoReturn:
    """ Stop the current job and try to keep the farm from restarting it.

    The name of the submitter that launched the job is read from the
    "submitterName" entry of `node.nodeStatus.jobInfos`. When a submitter is
    registered under that name, its `killRunningJob()` hook is called first.
    In every case the process then exits with
    `MeshroomExitStatus.ERROR_NO_RETRY`, so this function never returns.
    """
    launcherName = node.nodeStatus.jobInfos.get("submitterName")
    if launcherName:
        # Imported only once a submitter name is known, as the lookup is
        # pointless otherwise.
        from meshroom.core import submitters
        if launcherName in submitters:
            # Give the farm-specific submitter a chance to cancel the job
            # on its side before this process exits.
            submitters[launcherName].killRunningJob()
    sys.exit(meshroom.MeshroomExitStatus.ERROR_NO_RETRY)
87+
88+
7289
if args.node:
7390
# Execute the node
7491
node = graph.findNode(args.node)
@@ -109,8 +126,14 @@ if args.node:
109126
node.preprocess()
110127
if args.iteration != -1:
111128
chunk = node.chunks[args.iteration]
129+
if chunk._status.status == Status.STOPPED:
130+
print(f"Chunk {chunk} : status is STOPPED")
131+
killRunningJob(node)
112132
chunk.process(args.forceCompute, args.inCurrentEnv)
113133
else:
134+
if node.nodeStatus.status == Status.STOPPED:
135+
print(f"Node {node} : status is STOPPED")
136+
killRunningJob(node)
114137
node.process(args.forceCompute, args.inCurrentEnv)
115138
node.postprocess()
116139
node.restoreLogger()

bin/meshroom_createChunks

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,19 +77,20 @@ graph.update()
7777
node = graph.findNode(args.node)
7878
submittedStatuses = [Status.RUNNING]
7979

80-
if node._nodeStatus.status in (Status.STOPPED, Status.KILLED):
81-
logging.error("Node status is STOPPED or KILLED.")
82-
sys.exit(666)
83-
84-
85-
# Find job
80+
# Find submitter
8681
submitter = None
8782
# It's required if we want to spool chunks on different machines
8883
for subName, sub in submitters.items():
8984
if args.submitter == subName:
9085
submitter = sub
9186
break
9287

88+
if node._nodeStatus.status in (Status.STOPPED, Status.KILLED):
89+
logging.error("Node status is STOPPED or KILLED.")
90+
if submitter:
91+
submitter.killRunningJob()
92+
sys.exit(meshroom.MeshroomExitStatus.ERROR_NO_RETRY)
93+
9394
if not node._chunksCreated:
9495
# Create node chunks
9596
# Once created we don't have to do it again even if we relaunch the job

meshroom/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from distutils import util
2-
from enum import Enum
2+
from enum import Enum, IntEnum
33
import logging
44
import os
55
import sys
@@ -72,6 +72,18 @@ def logToRoot(message, *args, **kwargs):
7272
logging.getLogger().setLevel(logStringToPython[os.environ.get('MESHROOM_VERBOSE', 'warning')])
7373

7474

75+
class MeshroomExitStatus(IntEnum):
    """ Exit statuses used by Meshroom processes, in case we want to catch
    some special case from the parent process.
    We could use 3-125 for custom exit codes:
    https://tldp.org/LDP/abs/html/exitcodes.html
    """
    SUCCESS = 0
    ERROR = 1
    # In some farm tools, jobs are automatically retried;
    # using ERROR_NO_RETRY will try to prevent that.
    ERROR_NO_RETRY = -999  # It's actually -999 % 256 => 25
85+
86+
7587
def setupEnvironment(backend=Backend.STANDALONE):
7688
"""
7789
Setup environment for Meshroom to work in a prebuilt, standalone configuration.

meshroom/core/submitter.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
#!/usr/bin/env python
22

3+
import sys
34
import logging
45
import operator
56

67
from enum import IntFlag, auto
78
from typing import Optional
89
from itertools import accumulate
910

11+
import meshroom
1012
from meshroom.common import BaseObject, Property
1113

1214

@@ -225,4 +227,15 @@ def submit(self, nodes, edges, filepath, submitLabel="{projectName}") -> BaseSub
225227
return None
226228
return job
227229

230+
@staticmethod
def killRunningJob():
    """ Make sure the current job will not be restarted by the farm.

    Some farms automatically retry a job once, in case it was killed by a
    user who does not want their machine to be used. Unfortunately, this
    means jobs are launched twice even when they failed for a good reason.

    Note: the ERROR_NO_RETRY exit status does nothing by itself. This
    function must be implemented on a case-by-case basis for each possible
    farm system. This implementation only exits the process with that status.
    """
    sys.exit(meshroom.MeshroomExitStatus.ERROR_NO_RETRY)
240+
228241
name = Property(str, lambda self: self._name, constant=True)

0 commit comments

Comments
 (0)