diff --git a/.codecov.yml b/.codecov.yml index dc610023ab..70a09b8973 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -50,3 +50,4 @@ ignore: - "docs/" - "scripts/" - "bin/" + - "localFarm/" # For now ignore localFarm as it has no coverage yet diff --git a/bin/meshroom_compute b/bin/meshroom_compute index 2cbfe25e8b..86a9db793e 100755 --- a/bin/meshroom_compute +++ b/bin/meshroom_compute @@ -3,6 +3,7 @@ import argparse import logging import os import sys +from typing import NoReturn try: import meshroom @@ -16,7 +17,7 @@ meshroom.setupEnvironment() import meshroom.core import meshroom.core.graph -from meshroom.core.node import Status, ExecMode +from meshroom.core.node import Status parser = argparse.ArgumentParser(description='Execute a Graph of processes.') @@ -63,20 +64,46 @@ else: meshroom.core.initPlugins() meshroom.core.initNodes() +meshroom.core.initSubmitters() graph = meshroom.core.graph.loadGraph(args.graphFile) if args.cache: graph.cacheDir = args.cache graph.update() + +def killRunningJob(node) -> NoReturn: + """ Kills current job and try to avoid job restarting """ + jobInfos = node.nodeStatus.jobInfos + submitterName = jobInfos.get("submitterName") + if not submitterName: + sys.exit(meshroom.MeshroomExitStatus.ERROR_NO_RETRY) + from meshroom.core import submitters + for subName, sub in submitters.items(): + if submitterName == subName: + sub.killRunningJob() + break + sys.exit(meshroom.MeshroomExitStatus.ERROR_NO_RETRY) + + if args.node: # Execute the node node = graph.findNode(args.node) + node.updateStatusFromCache() submittedStatuses = [Status.RUNNING] if not args.extern: # If running as "extern", the task is supposed to have the status SUBMITTED. # If not running as "extern", the SUBMITTED status should generate a warning. submittedStatuses.append(Status.SUBMITTED) + + if not node._chunksCreated: + print(f"Error: Node {node} has been submitted before chunks have been created." \ + f"See file: \"{node.nodeStatusFile}\".") + sys.exit(-1) + + if node._isInputNode(): + print(f"InputNode: No computation to do.") + if not args.forceStatus and not args.forceCompute: if args.iteration != -1: chunks = [node.chunks[args.iteration]] @@ -85,10 +112,11 @@ if args.node: for chunk in chunks: if chunk.status.status in submittedStatuses: # Particular case for the local isolated, the node status is set to RUNNING by the submitter directly. - # We ensure that no other instance has started to compute, by checking that the sessionUid is empty. - if chunk.node.getMrNodeType() == meshroom.core.MrNodeType.NODE and not chunk.status.sessionUid and chunk.status.submitterSessionUid: + # We ensure that no other instance has started to compute, by checking that the computeSessionUid is empty. + if chunk.node.getMrNodeType() == meshroom.core.MrNodeType.NODE and \ + not chunk.status.computeSessionUid and node._nodeStatus.submitterSessionUid: continue - print(f'Warning: Node is already submitted with status "{chunk.status.status.name}". See file: "{chunk.statusFile}". ExecMode: {chunk.status.execMode.name}, SessionUid: {chunk.status.sessionUid}, submitterSessionUid: {chunk.status.submitterSessionUid}') + print(f'Warning: Node is already submitted with status "{chunk.status.status.name}". See file: "{chunk.statusFile}". 
ExecMode: {chunk.status.execMode.name}, computeSessionUid: {chunk.status.computeSessionUid}, submitterSessionUid: {node._nodeStatus.submitterSessionUid}') # sys.exit(-1) if args.extern: @@ -99,8 +127,14 @@ if args.node: node.preprocess() if args.iteration != -1: chunk = node.chunks[args.iteration] + if chunk._status.status == Status.STOPPED: + print(f"Chunk {chunk} : status is STOPPED") + killRunningJob(node) chunk.process(args.forceCompute, args.inCurrentEnv) else: + if node.nodeStatus.status == Status.STOPPED: + print(f"Node {node} : status is STOPPED") + killRunningJob(node) node.process(args.forceCompute, args.inCurrentEnv) node.postprocess() node.restoreLogger() diff --git a/bin/meshroom_createChunks b/bin/meshroom_createChunks new file mode 100755 index 0000000000..e55ade807d --- /dev/null +++ b/bin/meshroom_createChunks @@ -0,0 +1,145 @@ +#!/usr/bin/env python + +""" +This is a script used to wrap the process of processing a node on the farm +It will handle chunk creation and create all the jobs for these chunks +If the submitter cannot create chunks, then it will process the chunks serially +in the current process +""" + +import argparse +import logging +import os +import sys +try: + import meshroom +except Exception: + # If meshroom module is not in the PYTHONPATH, add our root using the relative path + import pathlib + meshroomRootFolder = pathlib.Path(__file__).parent.parent.resolve() + sys.path.append(meshroomRootFolder) + import meshroom +meshroom.setupEnvironment() + +import meshroom.core +import meshroom.core.graph +from meshroom.core import submitters +from meshroom.core.submitter import SubmitterOptionsEnum +from meshroom.core.node import Status + + +parser = argparse.ArgumentParser(description='Execute a Graph of processes.') +parser.add_argument('graphFile', metavar='GRAPHFILE.mg', type=str, + help='Filepath to a graph file.') + +parser.add_argument('--submitter', type=str, required=True, + help='Name of the submitter used to create the job.') +parser.add_argument('--node', metavar='NODE_NAME', type=str, required=True, + help='Process the node. It will generate an error if the dependencies are not already computed.') +parser.add_argument('--inCurrentEnv', help='Execute process in current env without creating a dedicated runtime environment.', + action='store_true') +parser.add_argument('--forceStatus', help='Force computation if status is RUNNING or SUBMITTED.', + action='store_true') +parser.add_argument('--forceCompute', help='Compute in all cases even if already computed.', + action='store_true') +parser.add_argument('--extern', help='Use this option when you compute externally after submission to a render farm from meshroom.', + action='store_true') +parser.add_argument('--cache', metavar='FOLDER', type=str, + default=None, + help='Override the cache folder') +parser.add_argument('-v', '--verbose', + help='Set the verbosity level for logging:\n' + ' - fatal: Show only critical errors.\n' + ' - error: Show errors only.\n' + ' - warning: Show warnings and errors.\n' + ' - info: Show standard informational messages.\n' + ' - debug: Show detailed debug information.\n' + ' - trace: Show all messages, including trace-level details.', + default=os.environ.get('MESHROOM_VERBOSE', 'info'), + choices=['fatal', 'error', 'warning', 'info', 'debug', 'trace']) + +args = parser.parse_args() + +# For extern computation, we want to focus on the node computation log. +# So, we avoid polluting the log with general warning about plugins, versions of nodes in file, etc. 
+logging.getLogger().setLevel(level=logging.INFO)
+
+meshroom.core.initPlugins()
+meshroom.core.initNodes()
+meshroom.core.initSubmitters()  # Required to spool child jobs
+
+graph = meshroom.core.graph.loadGraph(args.graphFile)
+if args.cache:
+    graph.cacheDir = args.cache
+graph.update()
+
+# Execute the node
+node = graph.findNode(args.node)
+submittedStatuses = [Status.RUNNING]
+
+# Find submitter
+submitter = None
+# It's required if we want to spool chunks on different machines
+for subName, sub in submitters.items():
+    if args.submitter == subName:
+        submitter = sub
+        break
+
+if node._nodeStatus.status in (Status.STOPPED, Status.KILLED):
+    logging.error("Node status is STOPPED or KILLED.")
+    if submitter:
+        submitter.killRunningJob()
+    sys.exit(meshroom.MeshroomExitStatus.ERROR_NO_RETRY)
+
+if not node._chunksCreated:
+    # Create node chunks
+    # Once created we don't have to do it again even if we relaunch the job
+    node.createChunks()
+    # Set the chunks statuses
+    for chunk in node._chunks:
+        if args.forceCompute or chunk._status.status != Status.SUCCESS:
+            hasChunkToLaunch = True
+            chunk._status.setNode(node)
+            chunk._status.initExternSubmit()
+            chunk.upgradeStatusFile()
+
+# Get chunks to process in the current process
+chunksToProcess = []
+if submitter:
+    if not submitter._options.includes(SubmitterOptionsEnum.EDIT_TASKS):
+        chunksToProcess = node.chunks
+else:
+    # Cannot retrieve job -> execute process serially
+    chunksToProcess = node.chunks
+
+logging.info(f"[MeshroomCreateChunks] Chunks to process here : {chunksToProcess}")
+
+if not args.forceStatus and not args.forceCompute:
+    for chunk in chunksToProcess:
+        if chunk.status.status in submittedStatuses:
+            # Particular case for the local isolated, the node status is set to RUNNING by the submitter directly.
+            # We ensure that no other instance has started to compute, by checking that the computeSessionUid is empty.
+            if chunk.node.getMrNodeType() == meshroom.core.MrNodeType.NODE and \
+               not chunk.status.computeSessionUid and node._nodeStatus.submitterSessionUid:
+                continue
+            logging.warning(
+                f"[MeshroomCreateChunks] Node is already submitted with status " \
+                f"\"{chunk.status.status.name}\". See file: \"{chunk.statusFile}\". 
" \ + f"ExecMode: {chunk.status.execMode.name}, computeSessionUid: {chunk.status.computeSessionUid}, " \ + f"submitterSessionUid: {node._nodeStatus.submitterSessionUid}") + +if chunksToProcess: + node.prepareLogger() + node.preprocess() + for chunk in chunksToProcess: + logging.info(f"[MeshroomCreateChunks] process chunk {chunk}") + chunk.process(args.forceCompute, args.inCurrentEnv) + node.postprocess() + node.restoreLogger() +else: + logging.info(f"[MeshroomCreateChunks] -> create job to process chunks {[c for c in node.chunks]}") + submitter.createChunkTask(node, graphFile=args.graphFile, cache=args.cache, + forceStatus=args.forceStatus, forceCompute=args.forceCompute) + +# Restore the log level +logging.getLogger().setLevel(meshroom.logStringToPython[args.verbose]) diff --git a/localfarm/__init__.py b/localfarm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/localfarm/localFarm.py b/localfarm/localFarm.py new file mode 100644 index 0000000000..347e3d899c --- /dev/null +++ b/localfarm/localFarm.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python + +""" +Local Farm : A simple local job runner +""" + +from __future__ import annotations # For forward references in type hints + +import logging +import json +import socket +import logging +import uuid +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Generator + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(name)s][%(levelname)s] %(message)s' +) +logger = logging.getLogger("LocalFarm") +logger.setLevel(logging.INFO) + + +class LocalFarmEngine: + """ Client to communicate with the farm backend """ + + def __init__(self, root): + self.root = Path(root) + self.tcpPortFile = self.root / "backend.port" + + def connect(self): + """Connect to the backend""" + print("Connect to farm located at", self.root) + if self.tcpPortFile.exists(): + try: + port = int(self.tcpPortFile.read_text()) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect(("localhost", port)) + return sock + except Exception as e: + logger.error(f"Could not connect via TCP: {e}") + raise ConnectionError("Cannot connect to farm backend") + raise ConnectionError("Farm backend not found") + + def _call(self, method, **params): + """Make an query to the backend""" + request = { + "method": method, + "params": params + } + sock = self.connect() + try: + # Send request + request_data = json.dumps(request) + "\n" + sock.sendall(request_data.encode("utf-8")) + # Receive response + response_data = b"" + while True: + chunk = sock.recv(4096) + if not chunk: + break + response_data += chunk + if b"\n" in chunk: + break + response = json.loads(response_data.decode("utf-8")) + if not response.get("success"): + raise RuntimeError(response.get("error", "Unknown error")) + return response + finally: + sock.close() + + def submit_job(self, job: Job): + """ Submit the job to the farm """ + # Create the job + createdJob = self._call("create_job", name=job.name) + jid = createdJob["jid"] + # Create the tasks + tasksCreated = {} + for task in job.tasksDFS(): + parentTasks = job.getTaskDependencies(task) + deps = [] + for parentTask in parentTasks: + if parentTask not in tasksCreated: + raise RuntimeError(f"Parent task {parentTask.name} not created yet") + deps.append(tasksCreated[parentTask]) + createdTask = self._call("create_task", + jid=jid, name=task.name, command=task.command, + metadata=task.metadata, dependencies=deps, env=task.env) + tasksCreated[task] = createdTask["tid"] + # Submit the 
job
+        self._call("submit_job", jid=jid)
+        return {"jid": jid}
+
+    def create_additional_task(self, jid, tid, task):
+        """ Create a new task in an existing job """
+        createdTask = self._call("expand_task",
+                                 jid=jid, name=task.name, command=task.command,
+                                 metadata=task.metadata, parentTid=tid, env=task.env)
+        return {"tid": createdTask["tid"]}
+
+    def get_job_infos(self, jid):
+        """Get job status"""
+        return self._call("get_job_infos", jid=jid)["result"]
+
+    def pause_job(self, jid):
+        """Pause a job"""
+        return self._call("pause_job", jid=jid)
+
+    def unpause_job(self, jid):
+        """Resume a job"""
+        return self._call("unpause_job", jid=jid)
+
+    def interrupt_job(self, jid):
+        """Interrupt a job"""
+        return self._call("interrupt_job", jid=jid)
+
+    def restart_job(self, jid):
+        """Restart a job"""
+        return self._call("restart_job", jid=jid)
+
+    def restart_error_tasks(self, jid):
+        """Restart error tasks"""
+        return self._call("restart_error_tasks", jid=jid)
+
+    def stop_task(self, jid, tid):
+        """Stop a specific task"""
+        return self._call("stop_task", jid=jid, tid=tid)
+
+    def skip_task(self, jid, tid):
+        """Skip a specific task"""
+        return self._call("skip_task", jid=jid, tid=tid)
+
+    def restart_task(self, jid, tid):
+        """Restart a task"""
+        return self._call("restart_task", jid=jid, tid=tid)
+
+    def list_jobs(self) -> list:
+        """List all jobs"""
+        return self._call("list_jobs")["jobs"]
+
+    def get_job_status(self, jid: int) -> dict:
+        for job in self.list_jobs():
+            if job["jid"] == jid:
+                return job
+        return {}
+
+    def get_job_errors(self, jid: int) -> str:
+        """ Get job error logs """
+        return self._call("get_job_errors", jid=jid)["result"]
+
+    def ping(self):
+        """Check if backend is alive"""
+        try:
+            self.connect().close()
+            return True
+        except Exception:
+            return False
+
+
+class Task:
+    def __init__(self, name, command, metadata=None, env=None):
+        self.uid = str(uuid.uuid1())
+        self.name = name
+        self.command = command
+        self.metadata = metadata or {}
+        self.env = env or {}
+
+    def __repr__(self):
+        return f"<Task {self.name} ({self.uid})>"
+
+    def __hash__(self):
+        return hash(self.uid)
+
+
+class Job:
+    def __init__(self, name):
+        self.name = name
+        self.tasks: Dict[str, Task] = {}
+        self.dependencies: Dict[str, List[str]] = defaultdict(set)
+        self.reverseDependencies: Dict[str, List[str]] = defaultdict(set)
+        self._engine: LocalFarmEngine = None
+
+    def setEngine(self, engine: LocalFarmEngine):
+        self._engine = engine
+
+    def addTask(self, task):
+        if task.name in self.tasks:
+            raise ValueError(f"Task {task} already exists in job")
+        self.tasks[task.uid] = task
+
+    def addTaskDependency(self, task: Task, dependsOn: Task):
+        if task.uid not in self.tasks:
+            raise ValueError(f"Task {task} not found in job")
+        if dependsOn.uid not in self.tasks:
+            raise ValueError(f"Task {dependsOn} not found in job")
+        self.dependencies[task.uid].add(dependsOn.uid)
+        self.reverseDependencies[dependsOn.uid].add(task.uid)
+        if self.hasCycle():
+            # Rollback
+            self.dependencies[task.uid].remove(dependsOn.uid)
+            self.reverseDependencies[dependsOn.uid].remove(task.uid)
+            raise ValueError("Adding this task creates a cycle in the job dependencies")
+
+    def getTaskDependencies(self, task):
+        return [self.tasks[depUid] for depUid in self.dependencies.get(task.uid, [])]
+
+    def getRootTasks(self) -> List[Task]:
+        roots = []
+        for taskUid, task in self.tasks.items():
+            if not self.dependencies.get(taskUid):
+                roots.append(task)
+        return roots
+
+    def hasCycle(self) -> bool:
+        """ Check that there are no cycles in the task graph """
+        def 
exploreTask(taskUid, taskParents=None): + taskParents = taskParents or set() + if taskUid in taskParents: + return True + childrenParents = taskParents.copy() + childrenParents.add(taskUid) + for childUid in self.reverseDependencies[taskUid]: + failed = exploreTask(childUid, childrenParents) + if failed: + return True + return False + # Start from root and explore down + roots = self.getRootTasks() + if not roots: + return True + for task in roots: + failed = exploreTask(task.uid) + if failed: + return True + return False + + def tasksDFS(self) -> Generator[Task]: + """ + Return tasks in topological order (dependencies before dependents). + Tasks closer to roots appear first. + """ + taskLevels = {} + def exploreTask(task: str, currentLevel=0): + if task in taskLevels: + if currentLevel > taskLevels[task]: + taskLevels[task] = currentLevel + else: + taskLevels[task] = currentLevel + for child in self.reverseDependencies[task]: + exploreTask(child, currentLevel + 1) + # Start from root and explore down + for task in self.getRootTasks(): + exploreTask(task.uid) + taskByLevel = defaultdict(list) + for taskUid, level in taskLevels.items(): + taskByLevel[level].append(self.tasks[taskUid]) + levels = sorted(list(taskByLevel.keys())) + for level in levels: + tasks = taskByLevel[level] + for task in tasks: + yield task + + def submit(self, engine: LocalFarmEngine = None): + engine = engine or self._engine + if engine: + result = engine.submit_job(self) + return result + else: + raise ValueError("No LocalFarmEngine set for this job") + + +def test(): + # _ B - D - F - G - H _ + # / / \ \ + # A - / - I -- J + # \ / + # - C - E - K - L - M + # \_____/ + job = Job("job") + for node in ["F", "B", "K", "J", "A", "M", "L", "E", "C", "D", "G", "H", "I"]: + job.addTask(Task(node, "")) + + def addTaskDependencies(taskName, parentTaskName): + task = next(t for t in job.tasks.values() if t.name == taskName) + parentTask = next(t for t in job.tasks.values() if t.name == parentTaskName) + job.addTaskDependency(task, parentTask) + + addTaskDependencies("B", "A") + addTaskDependencies("C", "A") + addTaskDependencies("D", "B") + addTaskDependencies("E", "C") + addTaskDependencies("F", "D") + addTaskDependencies("C", "L") + addTaskDependencies("F", "E") + addTaskDependencies("K", "E") + addTaskDependencies("M", "K") + addTaskDependencies("G", "F") + addTaskDependencies("H", "G") + addTaskDependencies("I", "G") + addTaskDependencies("J", "I") + addTaskDependencies("J", "H") + + print("Tasks order : ", end="") + for task in job.tasksDFS(): + print(f"{task.name} -> ", end="") + print("END") diff --git a/localfarm/localFarmBackend.py b/localfarm/localFarmBackend.py new file mode 100644 index 0000000000..8bb95cc31b --- /dev/null +++ b/localfarm/localFarmBackend.py @@ -0,0 +1,657 @@ +#!/usr/bin/env python + +""" +Local Farm : A simple local job runner +""" + +import os +import sys +import random +import argparse +import json +import shlex +import time +import signal +import logging +import subprocess +from pathlib import Path +from datetime import datetime +from collections import defaultdict +from typing import Optional, Union, Dict, List +from enum import Enum +# For the tcp server +import threading +from socketserver import BaseRequestHandler, ThreadingTCPServer + +FARM_MAX_PARALLEL_TASKS = 10 +MAX_BYTES_REQUEST = 4096 # 8192 / 65536 if needed + +PathLike = Union[str, Path] + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(name)s][%(levelname)s] %(message)s' +) +logger = 
logging.getLogger("LocalFarmBackend") +logger.setLevel(logging.DEBUG) + + +class Status(Enum): + NONE = 0 + SUBMITTED = 1 + RUNNING = 2 + ERROR = 3 + STOPPED = 4 + KILLED = 5 + SUCCESS = 6 + PAUSED = 7 + + +class Task: + def __init__(self, jid: str, tid: str, label: str, command: str, metadata: dict, jobDir: PathLike, env: dict = None): + self.jid: str = jid + self.tid: str = tid + self.parentTids = [] # Tasks that must be completed before this one + self.childTids = [] # Task that depend on this one + self.label: str = label + self.command: str = command + self.metadata: dict = metadata or {} + self.env: dict = env or {} + self.taskDir: Path = Path(jobDir) / "tasks" + self.taskDir.mkdir(parents=True, exist_ok=True) + self.status: Status = Status.NONE + self.created_at = datetime.now() + self.started_at = None + self.finished_at = None + self.returnCode = None + self.process = None + self.logFile: Path = self.taskDir / f"{tid}.log" + + def to_dict(self): + return { + "jid": self.jid, + "tid": self.tid, + "label": self.label, + "command": self.command, + "metadata": self.metadata, + "env": self.env, + "status": self.status.name, + "created_at": self.created_at.isoformat(), + "started_at": self.started_at.isoformat() if self.started_at else None, + "finished_at": self.finished_at.isoformat() if self.finished_at else None, + "returnCode": self.returnCode + } + + +class Job: + def __init__(self, jid: str, label: str, farmRoot: PathLike, maxParallel: int=4): + self.jid: str = jid + self.label: str = label + self.submitted: bool = False + self.jobDir: Path = Path(farmRoot) / "jobs" / str(jid) + self.jobDir.mkdir(parents=True, exist_ok=True) + self.lastJid = 0 + self.status: Status = Status.NONE + self.created_at = datetime.now() + self.started_at = None + self.tasks: List[Task] = [] + self.maxParallel: int = maxParallel + # Runtime tasks status + self.__stoppedTasks = [] + + def to_dict(self): + return { + "jid": self.jid, + "label": self.label, + "submitted": self.submitted, + "status": self.status.name, + "created_at": self.created_at.isoformat(), + "started_at": self.started_at.isoformat() if self.started_at else None, + "tasks": [t.to_dict() for t in self.tasks], + "maxParallel": self.maxParallel + } + + @property + def errorLogs(self): + errorLog = "" + for task in self.tasks: + if task.status in (Status.ERROR, Status.STOPPED, Status.KILLED): + errorLog += f"Task {task.tid} failed :\n{task.logFile.read_text()}\n" + return errorLog + + @property + def rootTasks(self): + return [t for t in self.tasks if len(t.parentTids) == 0] + + def addTaskDependency(self, parentTask: Task, childTask: Task): + parentTask.childTids.append(childTask.tid) + childTask.parentTids.append(parentTask.tid) + + def canStartTask(self, task: Task): + for parentTid in task.parentTids: + parentTask = next((t for t in self.tasks if t.tid == parentTid), None) + if parentTask and parentTask.status != Status.SUCCESS: + return False + return True + + def getNextTaskToProcess(self): + # TODO : better to use the DFS implemented in localFarm.py + # Function to explore tasks + def exploreTask(task): + if task.status == Status.SUBMITTED: + return task + if task.status != Status.SUCCESS: + return None + children = [t for t in self.tasks if t.tid in task.childTids] + for taskCandidate in children: + submittedTask = exploreTask(taskCandidate) + if submittedTask: + return submittedTask + return None + for task in self.rootTasks: + submittedTask = exploreTask(task) + if submittedTask: + return submittedTask + return None + + def 
start(self): + self.status = Status.RUNNING + self.started_at = datetime.now() + for task in self.tasks: + task.status = Status.SUBMITTED + + def updateStatusFromTasks(self): + for task in self.tasks: + if task.status in (Status.ERROR, Status.STOPPED, Status.KILLED): + self.status = Status.STOPPED + return + elif task.status == Status.RUNNING: + self.status = Status.RUNNING + return + + def interrupt(self): + logger.info(f"Interrupt job {self.jid}") + self.status = Status.STOPPED + for task in self.tasks: + if task.status == Status.RUNNING and task.process: + logger.info(f"Interrupt task {task.tid}") + self.__stoppedTasks.append(task) + task.process.terminate() + task.status = Status.STOPPED + logger.info(f"Job {self.jid} interrupted") + + def restart(self): + self.interrupt() + self.start() + + def restartErrorTasks(self): + self.status = Status.RUNNING + for task in self.tasks: + if task.status in (Status.ERROR, Status.STOPPED, Status.KILLED): + task.status = Status.SUBMITTED + + def resume(self): + logger.info(f"Resume job {self.jid}") + self.status = Status.RUNNING + for task in self.__stoppedTasks: + if task.status == Status.STOPPED: + task.status = Status.SUBMITTED + self.__stoppedTasks = [] + + def stopTask(self, tid): + for task in self.tasks: + if task.tid == tid: + if task.process and task.process.poll() is None: + task.process.terminate() + task.status = Status.STOPPED + logger.info(f"Task {tid} stopped") + return True + return False + + def skipTask(self, tid): + task = next((t for t in self.tasks if t.tid == tid), None) + if not task: + return False + task.status = Status.SUCCESS + if task.process and task.process.poll() is None: + task.process.terminate() + logger.info(f"Task {tid} skipped") + return True + + def restartTask(self, tid): + for task in self.tasks: + if task.tid == tid: + if task.process and task.process.poll() is None: + task.process.terminate() + task.status = Status.SUBMITTED + task.started_at = None + task.finished_at = None + task.return_code = None + task.process = None + logger.info(f"Task {tid} rescheduled") + return True + return False + + +class LocalFarmEngine: + def __init__(self, root: PathLike, maxParallel: int = FARM_MAX_PARALLEL_TASKS): + self.root: Path = Path(root) + self.root.mkdir(parents=True, exist_ok=True) + # Jobs + self.jobs: Dict[int, Job] = {} + self.lastJid = 0 + self.running = False + self.lock = threading.RLock() + # PID file + self.pidFile = self.root / "farm.pid" + self.pidFile.write_text(str(os.getpid())) + # Socket path + self.tcpPortFile = self.root / "backend.port" + logger.info(f"Backend initialized at {self.root}") + self.maxParallel: int = maxParallel + + def start(self): + """Start the server""" + logger.info(f"Starting the server...") + # Start the server to listen to queries + self.running = True + handler = lambda *args: LocalFarmRequestHandler(self, *args) + self.server = ThreadingTCPServer(('localhost', 0), handler) + port = self.server.server_address[1] + self.tcpPortFile.write_text(str(port)) + logger.info(f"Server listening on TCP port: {port}") + # Start server in separate thread + serverThread = threading.Thread(target=self.server.serve_forever, daemon=True) + serverThread.start() + # Start task processor + processThread = threading.Thread(target=self.taskRunner, daemon=True) + processThread.start() + # Wait for shutdown signal + signal.signal(signal.SIGTERM, self.signalHandler) + signal.signal(signal.SIGINT, self.signalHandler) + try: + while self.running: + time.sleep(1) + finally: + self.cleanup() + + def 
signalHandler(self, signum, frame):
+        logger.info(f"Received signal {signum}, shutting down...")
+        self.running = False
+
+    def taskRunner(self):
+        """Background thread that processes tasks"""
+        while self.running:
+            try:
+                with self.lock:
+                    self.processJobs()
+                time.sleep(0.5)
+            except Exception as e:
+                logger.error(f"Error in task processor: {e}", exc_info=True)
+
+    def processJobs(self):
+        """Process all active jobs"""
+        runningTasks = defaultdict(list)
+        tasksToStart = defaultdict(list)
+        for job in self.jobs.values():
+            job.updateStatusFromTasks()
+            if not job.submitted or job.status in [Status.PAUSED, Status.SUCCESS, Status.STOPPED]:
+                continue
+            elif job.status == Status.SUBMITTED:
+                job.start()
+            # Update running tasks
+            runningTasks[job.jid] = [t for t in job.tasks if t.status == Status.RUNNING]
+            # Update tasks to start
+            for task in job.tasks:
+                if task.status == Status.SUBMITTED:
+                    if job.canStartTask(task):
+                        tasksToStart[job].append(task)
+                elif task.status == Status.RUNNING and task.process:
+                    # Check if process finished
+                    returncode = task.process.poll()
+                    if returncode is not None:
+                        self.finishTask(task, returncode)
+
+            # Check if job is complete
+            if any(t.status in [Status.ERROR, Status.STOPPED, Status.KILLED] for t in job.tasks):
+                job.status = Status.ERROR
+                logger.error(f"Job {job.jid} failed !")
+            elif all(t.status in [Status.SUCCESS, Status.NONE] for t in job.tasks):
+                job.status = Status.SUCCESS
+                logger.info(f"Job {job.jid} finished !")
+            # else : keep running or paused
+
+        # Launch tasks
+        nbRunningTasks = sum(len(tasks) for tasks in runningTasks.values())
+        tasks = []
+        for job, jobTasks in tasksToStart.items():
+            # while True:
+            #     nextTask = job.getNextTaskToProcess()
+            #     if not nextTask:
+            #         break
+            for task in jobTasks:
+                tasks.append((job, task))
+        random.shuffle(tasks)  # Randomize task order to be fair between jobs
+        for job, task in tasks:
+            nbJobRunningTasks = len(runningTasks[job.jid])
+            if job.maxParallel > nbJobRunningTasks and self.maxParallel > nbRunningTasks:
+                nbRunningTasks += 1
+                nbJobRunningTasks += 1
+                self.startTask(task)
+
+    def startTask(self, task: Task):
+        """Start a task process"""
+        logger.info(f"Starting task {task.tid}: {task.command}")
+        task.status = Status.RUNNING
+        task.started_at = datetime.now()
+        # Create log file
+        additional_env = {
+            "LOCALFARM_CURRENT_JID": str(task.jid),
+            "LOCALFARM_CURRENT_TID": str(task.tid),
+            "MR_LOCAL_FARM_PATH": str(self.root)
+        }
+        additional_env.update(task.env)
+        process_env = os.environ.copy()
+        process_env.update(additional_env)
+        try:
+            with open(task.logFile, "w") as log:
+                log.write(f"# ========== Starting task {task.tid} at {task.started_at.isoformat()}" \
+                          f" (command=\"{task.command}\") ==========\n")
+                log.write(f"# process_env:\n")
+                log.write(f"# Additional env variables:\n")
+                for _k, _v in additional_env.items():
+                    log.write(f"# - {str(_k)}={str(_v)}\n")
+                log.write(f"\n")
+                task.process = subprocess.Popen(
+                    task.command,
+                    # shlex.split(task.command),
+                    stdout=log,
+                    stderr=log,
+                    cwd=task.taskDir,
+                    env=process_env,
+                    shell=True
+                )
+        except Exception as e:
+            logger.error(f"Failed to start task {task.tid}: {e}")
+            task.status = Status.ERROR
+            task.finished_at = datetime.now()
+
+    def finishTask(self, task: Task, returncode: int):
+        task.finished_at = datetime.now()
+        task.returnCode = returncode
+        if returncode == 0:
+            task.status = Status.SUCCESS
+            logger.info(f"Task {task.tid} completed")
+        else:
+            task.status = Status.ERROR
+            logger.error(f"Task {task.tid} failed with code 
{returncode}") + with open(task.logFile, "a") as log: + log.write(f"\n# ========== Task {task.tid} finished at {task.finished_at.isoformat()} with status {task.status} ==========\n") + + def cleanup(self): + logger.info("Cleaning up...") + with self.lock: + for job in self.jobs.values(): + for task in job.tasks: + if task.process and task.process.poll() is None: + logger.info(f"Terminating task {task.tid}") + task.process.terminate() + try: + task.process.wait(timeout=5) + except subprocess.TimeoutExpired: + task.process.kill() + self.server.shutdown() + self.pidFile.unlink(missing_ok=True) + logger.info("Cleanup complete") + + # ====================== + # API Calls + # ====================== + + # Author + + def create_job(self, name): + """Create a new job""" + with self.lock: + # Generate new jid + self.lastJid += 1 + jid = self.lastJid + try: + job = Job(jid, label=name, farmRoot=self.root) + except Exception as err: + return {"success": False, "error": str(err)} + self.jobs[jid] = job + logger.info(f"Created job {jid}") + return {"success": True, "jid": jid} + + def create_task(self, jid, name, command, metadata, dependencies, env=None): + """Add a task to a job""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + job = self.jobs[jid] + job.lastJid += 1 + tid = job.lastJid + task = Task(jid, tid, name, command, metadata, job.jobDir, env=env) + job.tasks.append(task) + for parentTid in dependencies: + parentTask = next((t for t in job.tasks if t.tid == parentTid), None) + if parentTask: + job.addTaskDependency(parentTask, task) + else: + logger.warning(f"Task {tid} : Cannot add dependency to {parentTid}, task not found in job {jid}") + logger.info(f"Added task {tid} to job {jid}") + return {"success": True, "tid": tid} + + def expand_task(self, jid, name, command, metadata, parentTid, env=None): + with self.lock: + if jid not in self.jobs: + logger.info(f"Available jobs: {list(self.jobs.keys())}") + return {"success": False, "error": "Job not found"} + job = self.jobs[jid] + job.lastJid += 1 + tid = job.lastJid + task = Task(jid, tid, name, command, metadata, job.jobDir, env=env) + task.status = Status.SUBMITTED + job.tasks.append(task) + parentTask = next((t for t in job.tasks if t.tid == parentTid), None) + if not parentTask: + logger.error(f"Could not expand task {parentTid} : cannot find it in the job {job} ({jid})") + return {"success": False, "error": f"Parent task {parentTid} not found in job {jid}"} + for childTid in parentTask.childTids: + childTask = next((t for t in job.tasks if t.tid == childTid), None) + if not childTask: + logger.error(f"Could not find expanded task child {childTid}") + job.addTaskDependency(task, childTask) + logger.info(f"Added expanded task {tid} to job {jid}") + return {"success": True, "tid": tid} + + def submit_job(self, jid): + """Create a new job""" + with self.lock: + if jid not in self.jobs: + return {'success': False, "error": "Job not found"} + try: + job = self.jobs[jid] + job.submitted = True + job.status = Status.SUBMITTED + except Exception as err: + return {"success": False, "error": str(err)} + logger.info(f"Submitted job {jid}") + return {"success": True, "jid": jid} + + # Query + + def get_job_infos(self, jid): + """Get job status""" + with self.lock: + if jid not in self.jobs: + return {'success': False, "error": "Job not found"} + job = self.jobs[jid] + return {"success": True, "result": job.to_dict()} + + def get_job_errors(self, jid): + """Get job error logs""" + with self.lock: + 
if jid not in self.jobs: + return {'success': False, "error": "Job not found"} + job = self.jobs[jid] + return {"success": True, "result": job.errorLogs} + + def pause_job(self, jid): + """Pause a job""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + self.jobs[jid].status = Status.PAUSED + logger.info(f"Job {jid} paused") + return {"success": True} + + def unpause_job(self, jid): + """Resume a job""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + self.jobs[jid].resume() + return {"success": True} + + def interrupt_job(self, jid): + """Interrupt a job and kill running tasks""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + self.jobs[jid].interrupt() + return {"success": True} + + def restart_job(self, jid): + """Restarts a job and kill running tasks""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + self.jobs[jid].restart() + return {"success": True} + + def restart_error_tasks(self, jid): + """Restarts all error tasks in the job""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + self.jobs[jid].restartErrorTasks() + return {"success": True} + + def stop_task(self, jid, tid): + """Stop a specific task""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + res = self.jobs[jid].stopTask(tid) + if res: + return {"success": True} + else: + return {"success": False, "error": "Task not found"} + + def skip_task(self, jid, tid): + """Stop a specific task""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + res = self.jobs[jid].skipTask(tid) + if res: + return {"success": True} + else: + return {"success": False, "error": "Task not found"} + + def restart_task(self, jid, tid): + """Restart a task""" + with self.lock: + if jid not in self.jobs: + return {"success": False, "error": "Job not found"} + res = self.jobs[jid].restartTask(tid) + if res: + return {"success": True} + else: + return {"success": False, "error": "Task not found"} + + def list_jobs(self): + """List all jobs""" + with self.lock: + return { + "success": True, + "jobs": [job.to_dict() for job in self.jobs.values()] + } + + +class LocalFarmRequestHandler(BaseRequestHandler): + """Handle requests""" + + def __init__(self, backend, *args, **kwargs): + self.backend = backend + super().__init__(*args, **kwargs) + + @property + def pid(self): + return self.server.server_address[1] + + def handle(self): + """Handle incoming request""" + try: + # Read request + data = b"" + while True: + token = self.request.recv(MAX_BYTES_REQUEST) + if not token: + break + data += token + if b"\n" in token: + break + if not data: + return + request = json.loads(data.decode("utf-8")) + logger.debug(f"Received request: {request}") + # Dispatch method + method = request.get("method") + params = request.get("params", {}) + if not hasattr(self.backend, method): + response = {"success": False, "error": f"Unknown request: {method}"} + else: + try: + result = getattr(self.backend, method)(**params) + response = result + except Exception as e: + logger.error(f"Error executing {method}: {e}", exc_info=True) + response = {'success': False, 'error': str(e)} + # Send response + response_data = json.dumps(response) + '\n' + self.request.sendall(response_data.encode('utf-8')) + + except Exception as e: + 
logger.error(f"Error handling request: {e}", exc_info=True) + + +def main(root): + # Daemonize + if os.fork() > 0: + sys.exit(0) + os.setsid() + if os.fork() > 0: + sys.exit(0) + + # Redirect standard file descriptors + sys.stdout.flush() + sys.stderr.flush() + with open(os.devnull, 'r') as devnull: + os.dup2(devnull.fileno(), sys.stdin.fileno()) + + backend = LocalFarmEngine(root=root) + backend.start() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Execute a Graph of processes.') + parser.add_argument('--root', type=str, required=False, help='Root path for the farm.') + args = parser.parse_args() + root = args.root + if not root: + root = os.getenv("MR_LOCAL_FARM_PATH", os.path.join(os.path.expanduser("~"), ".local_farm")) + main(root) diff --git a/localfarm/localFarmLauncher.py b/localfarm/localFarmLauncher.py new file mode 100644 index 0000000000..a6037f43c7 --- /dev/null +++ b/localfarm/localFarmLauncher.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python + +import os +import shutil +import sys +import time +import signal +import argparse +from pathlib import Path +import subprocess +from collections import defaultdict + +from localfarm.localFarm import LocalFarmEngine + + +class FarmLauncher: + def __init__(self, root=None): + self.root = Path(root or Path.home() / ".local_farm") + self.root.mkdir(parents=True, exist_ok=True) + self.pidFile = self.root / "farm.pid" + self.logFile = self.root / "backend.log" + + def clean(self): + """Clean farm backend files""" + print("Clean farm files...") + if self.logFile.exists(): + self.logFile.unlink() + if (self.root / "jobs").exists(): + shutil.rmtree(str((self.root / "jobs"))) + if not self.is_running(): + self.pidFile.unlink(missing_ok=True) + (self.root / "backend.port").unlink(missing_ok=True) + print("Done.") + + def start(self): + """Start the farm backend""" + if self.is_running(): + print("Farm backend is already running") + return + self.clean() + + print("Starting farm backend...") + print(f"Farm root is: {self.root}") + # Get path to backend script + backendScript = Path(__file__).parent / "localFarmBackend.py" + # Start backend as daemon + with open(self.logFile, 'a') as log: + subprocess.Popen( + [sys.executable, str(backendScript), "--root", str(self.root)], + stdout=log, + stderr=log, + # stderr=subprocess.PIPE, + start_new_session=True + ) + + # Wait for it to start + for _ in range(10): + time.sleep(0.5) + if self.is_running(): + print(f"Farm backend started (PID: {self.getFarmPid()})") + print(f"Logs: {self.logFile}") + return + + print("Failed to start farm backend") + sys.exit(1) + + def stop(self): + """Stop the farm backend""" + if not self.is_running(): + print("Farm backend is not running") + return + + pid = self.getFarmPid() + print(f"Stopping farm backend (PID: {pid})...") + + try: + os.kill(pid, signal.SIGTERM) + + # Wait for it to stop + for _ in range(10): + time.sleep(0.5) + if not self.is_running(): + print("Farm backend stopped") + return + + # Force kill if still running + print("Force killing farm backend...") + os.kill(pid, signal.SIGKILL) + + except ProcessLookupError: + print("Backend process not found") + self.pidFile.unlink(missing_ok=True) + + def restart(self): + """Restart the farm backend""" + self.stop() + time.sleep(1) + self.start() + + def getJobsInfos(self): + if self.is_running(): + # Try to get job list + try: + engine = LocalFarmEngine(root=self.root) + jobs = engine.list_jobs() + return jobs + except Exception as e: + raise ValueError(f"Could not fetch jobs: {e}") + 
else: + print("Farm backend is not running") + return [] + + def status(self, allInfos=False): + """Show status of the farm backend""" + if self.is_running(): + pid = self.getFarmPid() + print(f"Farm backend is running (PID: {pid})") + + # Try to get job list + try: + engine = LocalFarmEngine(root=self.root) + jobs = engine.list_jobs() + print(f"Active jobs: {len(jobs)}") + for job in jobs: + jid = job.get("jid") + taskByStatus = defaultdict(set) + for task in job['tasks']: + status = task.get("status", "UNKNOWN") + taskByStatus[status].add(task.get("tid")) + print(f" - {jid}: {job['status']} ({len(job['tasks'])} tasks) -> {dict(taskByStatus)}") + if allInfos: + for task in job['tasks']: + print(f" * Task {task['tid']}: {task}") + print("") + except Exception as e: + print(f"Could not get job list: {e}") + else: + print("Farm backend is not running") + + def is_running(self): + """Check if backend is running""" + pid = self.getFarmPid() + if pid is None: + return False + try: + os.kill(pid, 0) + return True + except ProcessLookupError: + return False + + def getFarmPid(self): + """Get PID of running backend""" + if not self.pidFile.exists(): + return None + try: + return int(self.pidFile.read_text()) + except: + return None + + +def main(root, command): + launcher = FarmLauncher(root=root) + if command == 'clean': + return launcher.clean() + if command == 'start': + return launcher.start() + elif command == 'stop': + return launcher.stop() + elif command == 'restart': + return launcher.restart() + elif command == 'status': + return launcher.status() + elif command == 'fullInfos': + return launcher.status(allInfos=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Local Farm Launcher') + parser.add_argument('command', + choices=['clean', 'start', 'stop', 'restart', 'status', 'fullInfos'], + help='Command to execute') + parser.add_argument('--root', required=False, help='Farm directory path') + args = parser.parse_args() + + root = args.root + if not root: + root = os.getenv("MR_LOCAL_FARM_PATH", os.path.join(os.path.expanduser("~"), ".local_farm")) + + main(root, args.command) diff --git a/localfarm/test.py b/localfarm/test.py new file mode 100644 index 0000000000..03f12f749c --- /dev/null +++ b/localfarm/test.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python + +import os +from time import sleep +from localfarm.localFarm import Task, Job, LocalFarmEngine +from localfarm.localFarmLauncher import FarmLauncher +from collections import defaultdict +from typing import List + + +class TestLocalFarm: + def __init__(self, farmPath): + self.launcher = FarmLauncher(root=farmPath) + self.engine = LocalFarmEngine(farmPath) + + def prepare(self): + self.launcher.clean() + self.launcher.start() + + def createTask(self, job: Job, i: int, sleepTime=2, dependencies: List[Task] = None): + dependencies = dependencies or [] + task = Task(f"Task {i}", f"echo 'Hello from Task {i}' && sleep {sleepTime}") + job.addTask(task) + for parentTask in dependencies: + job.addTaskDependency(task, parentTask) + return task + + def expandTask(self, jid, tid, n=2): + for i in range(n): + task = Task(f"Expanded Task {i}", f"echo 'Hello from Expanded Task {i}' && sleep 5") + self.engine.create_additional_task(jid, tid, task) + + def getTasksByStatus(self, jid: int): + jobInfos = self.engine.get_job_status(jid) + if not jobInfos: + return {} + taskByStatus = defaultdict(set) + for task in jobInfos.get("tasks", []): + status = task.get("status", "UNKNOWN") + taskByStatus[status].add(task.get("tid")) + 
return dict(taskByStatus)
+
+    def run(self):
+        # Create job
+        job = Job("Example Job")
+        job.setEngine(self.engine)
+        # Add tasks
+        task1 = self.createTask(job, 1, sleepTime=2, dependencies=[])
+        task2 = self.createTask(job, 2, sleepTime=2, dependencies=[task1])
+        task3 = self.createTask(job, 3, sleepTime=2, dependencies=[task1])
+        task4 = self.createTask(job, 4, sleepTime=2, dependencies=[task2, task3])
+        task5 = self.createTask(job, 5, sleepTime=2, dependencies=[task4])
+        # Submit job
+        res = job.submit()
+        jid = res['jid']
+        # Monitor job
+        currentRunningTids = set()
+        hasExpanded = False
+        while True:
+            sleep(1)
+            tasks = self.getTasksByStatus(jid)
+            if not tasks:
+                print("No tasks found for job")
+                break
+            runningTids = tasks.get("RUNNING")
+            activeTasks = tasks.get("SUBMITTED", set()).union(tasks.get("RUNNING", set()))
+            if not activeTasks:
+                print("All tasks completed")
+                break
+            if runningTids:
+                runningTids = [int(t) for t in runningTids]
+                newRunningTasks = set(runningTids)
+                if currentRunningTids != newRunningTasks:
+                    print(f"Now running tasks: {runningTids} (active tasks: {activeTasks})")
+                    currentRunningTids = newRunningTasks
+                expandingTid = 5
+                if not hasExpanded and expandingTid in runningTids:
+                    hasExpanded = True
+                    print(f"Expanding task {expandingTid}")
+                    self.expandTask(jid, expandingTid, n=2)
+
+    def finish(self):
+        self.launcher.stop()
+        # self.launcher.clean()
+
+
+def test():
+    farm_path = os.getenv("MR_LOCAL_FARM_PATH", os.path.join(os.path.expanduser("~"), ".local_farm"))
+    # farm_path = "/s/prods/mvg/_source_global/users/sonoleta/tmp/local_farm"
+    _test = TestLocalFarm(farm_path)
+    try:
+        _test.prepare()
+        _test.run()
+    except Exception as e:
+        print(f"Test failed: {e}")
+        _test.finish()
+        raise e
+    finally:
+        _test.finish()
+
+
+if __name__ == "__main__":
+    test()
diff --git a/meshroom/__init__.py b/meshroom/__init__.py
index d487f22d67..d957abdbed 100644
--- a/meshroom/__init__.py
+++ b/meshroom/__init__.py
@@ -1,5 +1,5 @@
 from distutils import util
-from enum import Enum
+from enum import Enum, IntEnum
 import logging
 import os
 import sys
@@ -72,6 +72,18 @@ def logToRoot(message, *args, **kwargs):
 logging.getLogger().setLevel(logStringToPython[os.environ.get('MESHROOM_VERBOSE', 'warning')])
 
 
+class MeshroomExitStatus(IntEnum):
+    """ In case we want to catch some special cases from the parent process.
+    We could use 3-125 for custom exit codes:
+    https://tldp.org/LDP/abs/html/exitcodes.html
+    """
+    SUCCESS = 0
+    ERROR = 1
+    # In some farm tools, jobs are automatically retried;
+    # using ERROR_NO_RETRY will try to prevent that
+    ERROR_NO_RETRY = -999  # It's actually -999 % 256 => 25
+
+
 def setupEnvironment(backend=Backend.STANDALONE):
     """ Setup environment for Meshroom to work in a prebuilt, standalone configuration.
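For reference, a minimal sketch of how a farm-side wrapper could react to this exit status when it runs bin/meshroom_compute as a child process. This snippet is not part of the diff; the command line, graph file and node name are placeholders, and on POSIX the child's exit code is truncated to 8 bits, which is why the comparison uses the modulo value mentioned in the comment above.

import subprocess
from meshroom import MeshroomExitStatus

# Hypothetical invocation of the compute script for one node.
proc = subprocess.run(["python", "bin/meshroom_compute", "graph.mg", "--node", "Node_1"])
# sys.exit(-999) in the child is reported here as -999 % 256 == 25.
if proc.returncode == MeshroomExitStatus.ERROR_NO_RETRY % 256:
    print("Node failed and should not be resubmitted by the farm")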
diff --git a/meshroom/core/__init__.py b/meshroom/core/__init__.py
index cf5c602fe0..cfc758cbcf 100644
--- a/meshroom/core/__init__.py
+++ b/meshroom/core/__init__.py
@@ -121,7 +121,7 @@ def loadClasses(folder: str, packageName: str, classType: type) -> list[type]:
             classes.append(p)
         except Exception as e:
             if classType == BaseSubmitter:
-                logging.warning(f" Could not load submitter {pluginName} from package '{package.__name__}'")
+                logging.warning(f" Could not load submitter {pluginName} from package '{package.__name__}'\n{e}")
             else:
                 tb = traceback.extract_tb(e.__traceback__)
                 last_call = tb[-1]
diff --git a/meshroom/core/desc/computation.py b/meshroom/core/desc/computation.py
index 0386fab474..0384e8b840 100644
--- a/meshroom/core/desc/computation.py
+++ b/meshroom/core/desc/computation.py
@@ -1,13 +1,15 @@
 import math
-from enum import Enum
+from enum import IntEnum
 
 from .attribute import ListAttribute, IntParam
 
 
-class Level(Enum):
+class Level(IntEnum):
     NONE = 0
     NORMAL = 1
     INTENSIVE = 2
+    EXTREME = 3
+    SCRIPT = -1
 
 
 class Range:
@@ -45,6 +47,9 @@ def toDict(self):
             "rangeFullSize": self.fullSize,
             "rangeBlocksCount": self.nbBlocks
         }
+
+    def __repr__(self):
+        return f"<Range iteration={self.iteration}, blockSize={self.blockSize}, fullSize={self.fullSize}, nbBlocks={self.nbBlocks}>"
 
 
 class Parallelization:
diff --git a/meshroom/core/desc/node.py b/meshroom/core/desc/node.py
index c8d795ae5a..da1a6bd0d1 100644
--- a/meshroom/core/desc/node.py
+++ b/meshroom/core/desc/node.py
@@ -2,11 +2,12 @@
 from inspect import getfile
 from pathlib import Path
 import logging
-import os
 import psutil
 import shlex
 import shutil
 import sys
+import signal
+import subprocess
 
 from .computation import Level, StaticNodeSize
 from .attribute import StringParam, ColorParam, ChoiceParam
@@ -20,6 +21,34 @@
 _MESHROOM_COMPUTE_DEPS = ["psutil"]
+# Handle cleanup
+class ExitCleanup:
+    """
+    Make sure we kill child subprocesses when the main process exits or receives SIGTERM.
+ """ + + def __init__(self): + self._subprocesses = [] + signal.signal(signal.SIGTERM, self.exit) + + def addSubprocess(self, process): + logging.debug(f"[ExitCleanup] Register subprocess {process}") + self._subprocesses.append(process) + + def exit(self, signum, frame): + for proc in self._subprocesses: + logging.debug(f"[ExitCleanup] Kill subprocess {proc}") + try: + if proc.is_running(): + proc.terminate() + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + sys.exit(0) + +exitCleanup = ExitCleanup() + + class MrNodeType(enum.Enum): NONE = enum.auto() BASENODE = enum.auto() @@ -91,6 +120,9 @@ class BaseNode(object): documentation = "" category = "Other" plugin = None + # Licenses required to run the plugin + # Only used to select machines on the farm when the node is submitted + _licenses = [] def __init__(self): super(BaseNode, self).__init__() @@ -159,7 +191,7 @@ def processChunk(self, chunk): def executeChunkCommandLine(self, chunk, cmd, env=None): try: - with open(chunk.logFile, 'w') as logF: + with open(chunk.getLogFile(), 'w') as logF: chunk.status.commandLine = cmd chunk.saveStatusFile() cmdList = shlex.split(cmd) @@ -168,7 +200,7 @@ def executeChunkCommandLine(self, chunk, cmd, env=None): print(f"Starting Process for '{chunk.node.name}'") print(f" - commandLine: {cmd}") - print(f" - logFile: {chunk.logFile}") + print(f" - logFile: {chunk.getLogFile()}") if prog: cmdList[0] = Path(prog).as_posix() print(f" - command full path: {cmdList[0]}") @@ -193,6 +225,7 @@ def executeChunkCommandLine(self, chunk, cmd, env=None): env=env, **platformArgs, ) + exitCleanup.addSubprocess(chunk.subprocess) if hasattr(chunk, "statThread"): # We only have a statThread if the node is running in the current process @@ -213,7 +246,7 @@ def executeChunkCommandLine(self, chunk, cmd, env=None): pass if chunk.subprocess.returncode != 0: - with open(chunk.logFile, "r") as logF: + with open(chunk.getLogFile(), "r") as logF: logContent = "".join(logF.readlines()) raise RuntimeError(f'Error on node "{chunk.name}":\nLog:\n{logContent}') finally: diff --git a/meshroom/core/graph.py b/meshroom/core/graph.py index c5c5d2728c..8edf682527 100644 --- a/meshroom/core/graph.py +++ b/meshroom/core/graph.py @@ -15,12 +15,14 @@ import meshroom.core from meshroom.common import BaseObject, DictModel, Slot, Signal, Property from meshroom.core import Version +from meshroom.core import submitters from meshroom.core.attribute import Attribute, ListAttribute, GroupAttribute from meshroom.core.exception import GraphCompatibilityError, StopGraphVisit, StopBranchVisit from meshroom.core.graphIO import GraphIO, GraphSerializer, TemplateGraphSerializer, PartialGraphSerializer from meshroom.core.node import BaseNode, Status, Node, CompatibilityNode from meshroom.core.nodeFactory import nodeFactory from meshroom.core.mtyping import PathLike +from meshroom.core.submitter import BaseSubmittedJob, jobManager # Replace default encoder to support Enums @@ -1091,6 +1093,10 @@ def discoverVertex(vertex, graph): raise StopBranchVisit() def finishVertex(vertex, graph): + if not vertex.chunks: + # Chunks have not been initialized + nodes.append(vertex) + return chunksToProcess = [] for chunk in vertex.chunks: if chunk.status.status is not Status.SUCCESS: @@ -1452,6 +1458,23 @@ def updateNodesPerUid(self): # Now, update each individual node for node in self.nodes: node.updateDuplicates(nodesPerUid) + + def updateJobManagerWithNode(self, node): + if node._uid in jobManager._nodeToJob.keys(): + return + jobInfos = 
node._nodeStatus.jobInfos + if not jobInfos: + return + jid, subName = jobInfos.get("jid"), jobInfos.get("submitterName") + for _subName, sub in submitters.items(): + if _subName == subName: + try: + job = sub.retrieveJob(int(jid)) + jobManager.addJob(job, [node]) + break + except Exception as e: + logging.warning(f"Failed to retrieve job {jid} from submitter {subName} : {e}") + break def update(self): if not self._updateEnabled: @@ -1464,6 +1487,7 @@ def update(self): self.updateStatusFromCache() for node in self.nodes: node.dirty = False + self.updateJobManagerWithNode(node) self.updateNodesPerUid() @@ -1474,6 +1498,9 @@ def update(self): self.dirtyTopology = False self.updated.emit() + + def updateMonitoredFiles(self): + self.statusUpdated.emit() def markNodesDirty(self, fromNode): """ @@ -1597,6 +1624,7 @@ def setVerbose(self, v): cacheDirChanged = Signal() cacheDir = Property(str, cacheDir.fget, cacheDir.fset, notify=cacheDirChanged) updated = Signal() + statusUpdated = Signal() canComputeLeavesChanged = Signal() canComputeLeaves = Property(bool, lambda self: self._canComputeLeaves, notify=canComputeLeavesChanged) @@ -1661,7 +1689,7 @@ def executeGraph(graph, toNodes=None, forceCompute=False, forceStatus=False): graph.save() for node in nodes: - node.beginSequence(forceCompute) + node.initStatusOnCompute(forceCompute) for n, node in enumerate(nodes): try: @@ -1707,11 +1735,19 @@ def submitGraph(graph, submitter, toNodes=None, submitLabel="{projectName}"): raise RuntimeError("Unknown Submitter: '{submitter}'. Available submitters are: '{allSubmitters}'.".format( submitter=submitter, allSubmitters=str(meshroom.core.submitters.keys()))) + for node in nodesToProcess: + node.initStatusOnSubmit() + jobManager.resetNodeJob(node) + try: res = sub.submit(nodesToProcess, edgesToProcess, graph.filepath, submitLabel=submitLabel) if res: + if isinstance(res, BaseSubmittedJob): + jobManager.addJob(res, nodesToProcess) + else: for node in nodesToProcess: - node.initStatusOnSubmit() # update node status + # TODO : Notify the node that there was an issue on submit + pass except Exception as e: logging.error(f"Error on submit : {e}") diff --git a/meshroom/core/node.py b/meshroom/core/node.py index 8af8e4e642..1af5d1ec8e 100644 --- a/meshroom/core/node.py +++ b/meshroom/core/node.py @@ -31,7 +31,7 @@ def renameWritingToFinalPath(writingFilepath: str, filepath: str) -> str: if platform.system() == 'Windows': # On Windows, attempting to remove a file that is in use causes an exception to be raised. # So we may need multiple trials, if someone is reading it at the same time. 
- for i in range(20): + for _ in range(20): try: os.remove(filepath) # If remove is successful, we can stop the iterations @@ -40,7 +40,6 @@ def renameWritingToFinalPath(writingFilepath: str, filepath: str) -> str: pass os.rename(writingFilepath, filepath) - class Status(Enum): """ """ @@ -62,11 +61,164 @@ class ExecMode(Enum): EXTERN = auto() -class StatusData(BaseObject): +# Simple structure for storing chunk infos +NodeChunkSetup = namedtuple("NodeChunks", ["blockSize", "fullSize", "nbBlocks"]) + +class NodeStatusData(BaseObject): + __slots__ = ("nodeName", "status", "execMode", "nodeType", "packageName", "packageVersion", + "mrNodeType", "submitterSessionUid", "chunks", "jobInfos") + + def __init__(self, nodeName='', nodeType='', packageName='', packageVersion='', + mrNodeType: MrNodeType = MrNodeType.NONE, parent: BaseObject = None): + super().__init__(parent) + self.nodeName: str = nodeName + self.nodeType: str = nodeType + self.packageName: str = packageName + self.packageVersion: str = packageVersion + self.mrNodeType: str = mrNodeType + + # Session UID where the node was submitted + self.submitterSessionUid: Optional[str] = None + + self.reset() + + def reset(self): + self.resetChunkInfos() + self.resetDynamicValues() + + def resetChunkInfos(self): + self.chunks: NodeChunkSetup = None + + def resetDynamicValues(self): + self.status: Status = Status.NONE + self.execMode: ExecMode = ExecMode.NONE + self.jobInfos: dict = {} + + def setNodeType(self, node): + """ + Set the node type and package information from the given node. + We do not set the name in this method as it may vary if there are duplicates. + """ + self.nodeType = node.nodeType + self.packageName = node.packageName + self.packageVersion = node.packageVersion + self.mrNodeType = node.getMrNodeType() + + def setNode(self, node): + """ Set the node information from one node instance. """ + self.nodeName = node.name + self.setNodeType(node) + + def setJob(self, jid, submitterName): + """ Set Job infos on the node so that """ + self.jobInfos = { + "jid": str(jid), + "submitterName": str(submitterName), + } + + @property + def jobName(self): + if self.jobInfos: + return f"{self.jobInfos['submitterName']}<{self.jobInfos['jid']}>" + else: + return "UNKNOWN" + + def initExternSubmit(self): + """ + When submitting a node, we reset the status information to ensure that we do not keep + outdated information. + """ + self.resetDynamicValues() + self.submitterSessionUid = meshroom.core.sessionUid + self.status = Status.SUBMITTED + self.execMode = ExecMode.EXTERN + + def initLocalSubmit(self): + """ + When submitting a node, we reset the status information to ensure that we do not keep + outdated information. 
+ """ + self.resetDynamicValues() + self.submitterSessionUid = meshroom.core.sessionUid + self.status = Status.SUBMITTED + self.execMode = ExecMode.LOCAL + + def toDict(self): + keys = list(self.__slots__) or [] + d = {key:getattr(self, key) for key in keys} + for _k, _v in d.items(): + if isinstance(_v, Enum): + d[_k] = _v.name + chunks = None + if self.chunks: + chunks = list(self.chunks) + d["chunks"] = chunks + return d + + def fromDict(self, d): + self.reset() + if "mrNodeType" in d: + self.mrNodeType = MrNodeType[d.pop("mrNodeType")] + if "chunks" in d: + chunks = d.pop("chunks") + if chunks: + self.chunks = NodeChunkSetup(*chunks) + if "status" in d: + self.status: Status = Status[d.pop("status")] + if "execMode" in d: + self.execMode = ExecMode[d.pop("execMode")] + for _key, _value in d.items(): + if _key in self.__slots__: + setattr(self, _key, _value) + + def loadFromCache(self, statusFile): + self.reset() + try: + with open(statusFile) as jsonFile: + statusData = json.load(jsonFile) + self.fromDict(statusData) + except Exception as e: + logging.warning(f"(loadFromCache) {self.nodeName}: Error while loading status file {statusFile}: {e}") + self.reset() + + @property + def nbChunks(self): + nbBlocks = self.chunks.nbBlocks if self.chunks else -1 + return nbBlocks + + def getChunkRanges(self): + if not self.chunks: + return [] + ranges = [] + for i in range(self.chunks.nbBlocks): + ranges.append(desc.Range( + iteration=i, + blockSize=self.chunks.blockSize, + fullSize=self.chunks.fullSize, + nbBlocks=self.chunks.nbBlocks + )) + return ranges + + def setChunks(self, chunks): + blockSize, fullSize, nbBlocks = 1, 1, 1 + for c in chunks: + r = c.range + blockSize, fullSize, nbBlocks = r.blockSize, r.fullSize, r.nbBlocks + break + self.chunks = NodeChunkSetup(blockSize, fullSize, nbBlocks) + + +class ChunkStatusData(BaseObject): """ """ dateTimeFormatting = '%Y-%m-%d %H:%M:%S.%f' + __slots__ = ( + "nodeName", "nodeType", "packageName", "packageVersion", "mrNodeType", + "computeSessionUid", "execMode", "status", "graph", "commandLine", "env", + "startDateTime", "endDateTime", "elapsedTime", "hostname" + ) + def __init__(self, nodeName='', nodeType='', packageName='', packageVersion='', mrNodeType: MrNodeType = MrNodeType.NONE, parent: BaseObject = None): super().__init__(parent) @@ -77,13 +229,23 @@ def __init__(self, nodeName='', nodeType='', packageName='', packageVersion='', self.packageVersion: str = packageVersion self.mrNodeType = mrNodeType - self.sessionUid: Optional[str] = None - self.submitterSessionUid: Optional[str] = None + self.computeSessionUid: Optional[str] = None # Session where computation is done self.execMode: ExecMode = ExecMode.NONE self.resetDynamicValues() + def resetDynamicValues(self): + self.status: Status = Status.NONE + self.graph = "" + self.commandLine: str = "" + self.env: str = "" + self._startTime: Optional[datetime.datetime] = None + self.startDateTime: str = "" + self.endDateTime: str = "" + self.elapsedTime: float = 0.0 + self.hostname: str = "" + def setNode(self, node): """ Set the node information from one node instance. 
""" self.nodeName = node.name @@ -112,21 +274,10 @@ def reset(self): self.mrNodeType: MrNodeType = MrNodeType.NONE self.execMode: ExecMode = ExecMode.NONE self.resetDynamicValues() - - def resetDynamicValues(self): - self.status: Status = Status.NONE - self.graph = "" - self.commandLine: str = "" - self.env: str = "" - self._startTime: Optional[datetime.datetime] = None - self.startDateTime: str = "" - self.endDateTime: str = "" - self.elapsedTime: float = 0.0 - self.hostname: str = "" - + def initStartCompute(self): import platform - self.sessionUid = meshroom.core.sessionUid + self.computeSessionUid = meshroom.core.sessionUid self.hostname = platform.node() self._startTime = time.time() self.startDateTime = datetime.datetime.now().strftime(self.dateTimeFormatting) @@ -144,8 +295,7 @@ def initIsolatedCompute(self): self.resetDynamicValues() self.initStartCompute() assert self.mrNodeType == MrNodeType.NODE - self.sessionUid = None - self.submitterSessionUid = meshroom.core.sessionUid + self.computeSessionUid = None def initExternSubmit(self): """ @@ -153,8 +303,7 @@ def initExternSubmit(self): outdated information. """ self.resetDynamicValues() - self.sessionUid = None - self.submitterSessionUid = meshroom.core.sessionUid + self.computeSessionUid = None self.status = Status.SUBMITTED self.execMode = ExecMode.EXTERN @@ -164,13 +313,12 @@ def initLocalSubmit(self): outdated information. """ self.resetDynamicValues() - self.sessionUid = None - self.submitterSessionUid = meshroom.core.sessionUid + self.computeSessionUid = None self.status = Status.SUBMITTED self.execMode = ExecMode.LOCAL def initEndCompute(self): - self.sessionUid = meshroom.core.sessionUid + self.computeSessionUid = meshroom.core.sessionUid self.endDateTime = datetime.datetime.now().strftime(self.dateTimeFormatting) if self._startTime != None: self.elapsedTime = time.time() - self._startTime @@ -180,41 +328,24 @@ def elapsedTimeStr(self): return str(datetime.timedelta(seconds=self.elapsedTime)) def toDict(self): - d = self.__dict__.copy() - d["elapsedTimeStr"] = self.elapsedTimeStr - - # Skip some attributes (some are from BaseObject) - d.pop("destroyed", None) - d.pop("objectNameChanged", None) - d.pop("_parent", None) - d.pop("_startTime", None) - + keys = list(self.__slots__) or [] + d = {key:getattr(self, key) for key in keys} + for _k, _v in d.items(): + if isinstance(_v, Enum): + d[_k] = _v.name return d def fromDict(self, d): - self.status = d.get("status", Status.NONE) - if not isinstance(self.status, Status): - self.status = Status[self.status] - self.execMode = d.get("execMode", ExecMode.NONE) - if not isinstance(self.execMode, ExecMode): - self.execMode = ExecMode[self.execMode] - self.mrNodeType = d.get("mrNodeType", MrNodeType.NONE) - if not isinstance(self.mrNodeType, MrNodeType): - self.mrNodeType = MrNodeType[self.mrNodeType] - - self.nodeName = d.get("nodeName", "") - self.nodeType = d.get("nodeType", "") - self.packageName = d.get("packageName", "") - self.packageVersion = d.get("packageVersion", "") - self.graph = d.get("graph", "") - self.commandLine = d.get("commandLine", "") - self.env = d.get("env", "") - self.startDateTime = d.get("startDateTime", "") - self.endDateTime = d.get("endDateTime", "") - self.elapsedTime = d.get("elapsedTime", 0) - self.hostname = d.get("hostname", "") - self.sessionUid = d.get("sessionUid", "") - self.submitterSessionUid = d.get("submitterSessionUid", "") + self.reset() + if "status" in d: + self.status: Status = Status[d.pop("status")] + if "execMode" in d: + self.execMode 
= ExecMode[d.pop("execMode")] + if "mrNodeType" in d: + self.mrNodeType = MrNodeType[d.pop("mrNodeType")] + for _key, _value in d.items(): + if _key in self.__slots__: + setattr(self, _key, _value) class LogManager: @@ -341,7 +472,7 @@ def __init__(self, node, range, parent=None): self.node = node self.range = range self._logManager = None - self._status: StatusData = StatusData(node.name, node.nodeType, node.packageName, + self._status: ChunkStatusData = ChunkStatusData(node.name, node.nodeType, node.packageName, node.packageVersion, node.getMrNodeType()) self.statistics: stats.Statistics = stats.Statistics() self.statusFileLastModTime = -1 @@ -349,6 +480,9 @@ def __init__(self, node, range, parent=None): # Notify update in filepaths when node's internal folder changes self.node.internalFolderChanged.connect(self.nodeFolderChanged) + def __repr__(self): + return f"" + @property def index(self): return self.range.iteration @@ -364,26 +498,36 @@ def name(self): def logManager(self): if self._logManager is None: logger = logging.getLogger(self.node.getName()) - self._logManager = LogManager(logger, self.logFile) + self._logManager = LogManager(logger, self.getLogFile()) return self._logManager - @property - def statusName(self): + def getStatusName(self): return self._status.status.name @property def logger(self): return self.logManager.logger - @property - def execModeName(self): + def getExecModeName(self): return self._status.execMode.name + + def shouldMonitorChanges(self): + """ Check whether we should monitor changes in minimal mode + Only chunks that are run externally or local_isolated should be monitored, + when run locally, status changes are already notified. + Chunks with an ERROR status may be re-submitted externally and should thus still be monitored + """ + return (self.isExtern() and self._status.status in (Status.SUBMITTED, Status.RUNNING, Status.ERROR)) or \ + (self.node.getMrNodeType() == MrNodeType.NODE and self._status.status in (Status.SUBMITTED, Status.RUNNING)) def updateStatusFromCache(self): """ - Update node status based on status file content/existence. + Update chunk status based on status file content/existence. 
""" - statusFile = self.statusFile + # TODO : If this is a placeholder chunk + # Then we shouldn't do anything here + + statusFile = self.getStatusFile() oldStatus = self._status.status # No status file => reset status to Status.None if not os.path.exists(statusFile): @@ -394,7 +538,7 @@ def updateStatusFromCache(self): try: with open(statusFile) as jsonFile: statusData = json.load(jsonFile) - # logging.debug(f"updateStatusFromCache({self.node.name}): From status {self.status.status} to {statusData['status']}") + # logging.debug(f"updateStatusFromCache({self.node.name}): From status {self._status.status} to {statusData['status']}") self._status.fromDict(statusData) self.statusFileLastModTime = os.path.getmtime(statusFile) except Exception as e: @@ -403,38 +547,33 @@ def updateStatusFromCache(self): self._status.reset() self._status.setNodeType(self.node) - if oldStatus != self.status.status: + if oldStatus != self._status.status: self.statusChanged.emit() - @property - def statusFile(self): + def getStatusFile(self): if self.range.blockSize == 0: return os.path.join(self.node.internalFolder, "status") else: - return os.path.join(self.node.internalFolder, - str(self.index) + ".status") + return os.path.join(self.node.internalFolder, str(self.index) + ".status") - @property - def statisticsFile(self): + def getStatisticsFile(self): if self.range.blockSize == 0: return os.path.join(self.node.internalFolder, "statistics") else: return os.path.join(self.node.internalFolder, str(self.index) + ".statistics") - @property - def logFile(self): + def getLogFile(self): if self.range.blockSize == 0: return os.path.join(self.node.internalFolder, "log") else: - return os.path.join(self.node.internalFolder, - str(self.index) + ".log") + return os.path.join(self.node.internalFolder, str(self.index) + ".log") def saveStatusFile(self): """ Write node status on disk. 
""" data = self._status.toDict() - statusFilepath = self.statusFile + statusFilepath = self.getStatusFile() folder = os.path.dirname(statusFilepath) os.makedirs(folder, exist_ok=True) @@ -449,6 +588,8 @@ def upgradeStatusFile(self): """ self.saveStatusFile() self.statusChanged.emit() + # We want to make sure the nodeStatus is up to date too + self.node.upgradeStatusFile() def upgradeStatusTo(self, newStatus, execMode=None): if newStatus.value < self._status.status.value: @@ -463,7 +604,7 @@ def updateStatisticsFromCache(self): """ """ oldTimes = self.statistics.times - statisticsFile = self.statisticsFile + statisticsFile = self.getStatisticsFile() if not os.path.exists(statisticsFile): return with open(statisticsFile) as jsonFile: @@ -474,7 +615,7 @@ def updateStatisticsFromCache(self): def saveStatistics(self): data = self.statistics.toDict() - statisticsFilepath = self.statisticsFile + statisticsFilepath = self.getStatisticsFile() folder = os.path.dirname(statisticsFilepath) os.makedirs(folder, exist_ok=True) statisticsFilepathWriting = getWritingFilepath(statisticsFilepath) @@ -575,9 +716,6 @@ def _processInIsolatedEnvironment(self): self.node.updateOutputAttr() def stopProcess(self): - if self.isExtern(): - raise ValueError("Cannot stop process: node is computed externally (another instance of Meshroom)") - # Ensure that we are up-to-date self.updateStatusFromCache() @@ -612,20 +750,20 @@ def isExtern(self): return True elif self._status.execMode == ExecMode.LOCAL: if self._status.status in (Status.SUBMITTED, Status.RUNNING): - return meshroom.core.sessionUid not in (self._status.submitterSessionUid, self._status.sessionUid) + return meshroom.core.sessionUid not in (self.node._nodeStatus.submitterSessionUid, self._status.computeSessionUid) return False return False statusChanged = Signal() status = Property(Variant, lambda self: self._status, notify=statusChanged) - statusName = Property(str, statusName.fget, notify=statusChanged) - execModeName = Property(str, execModeName.fget, notify=statusChanged) + statusName = Property(str, getStatusName, notify=statusChanged) + execModeName = Property(str, getExecModeName, notify=statusChanged) statisticsChanged = Signal() nodeFolderChanged = Signal() - statusFile = Property(str, statusFile.fget, notify=nodeFolderChanged) - logFile = Property(str, logFile.fget, notify=nodeFolderChanged) - statisticsFile = Property(str, statisticsFile.fget, notify=nodeFolderChanged) + statusFile = Property(str, getStatusFile, notify=nodeFolderChanged) + logFile = Property(str, getLogFile, notify=nodeFolderChanged) + statisticsFile = Property(str, getStatisticsFile, notify=nodeFolderChanged) nodeName = Property(str, lambda self: self.node.name, constant=True) statusNodeName = Property(str, lambda self: self._status.nodeName, notify=statusChanged) @@ -679,7 +817,8 @@ def __init__(self, nodeType: str, position: Position = None, parent: BaseObject self._name: str = f"_{nodeType}_{uuid.uuid1()}" self.graph = None self.dirty: bool = True # whether this node's outputs must be re-evaluated on next Graph update - self._chunks = ListModel(parent=self) + self._chunks: list[NodeChunk] = ListModel(parent=self) + self._chunksCreated = False # Only initialize chunks on compute self._uid: str = uid self._expVars: dict = {} self._size: int = 0 @@ -692,6 +831,10 @@ def __init__(self, nodeType: str, position: Position = None, parent: BaseObject self._locked: bool = False self._duplicates = ListModel(parent=self) # list of nodes with the same uid self._hasDuplicates: bool = False + 
+ self._nodeStatus: NodeStatusData = NodeStatusData(self._name, nodeType, self.packageName, + self.packageVersion, self.getMrNodeType()) + self.nodeStatusFileLastModTime = -1 self.globalStatusChanged.connect(self.updateDuplicatesStatusAndLocked) @@ -1094,13 +1237,11 @@ def _buildAttributeCmdLineVars(cmdLineVars, name, attr): def isParallelized(self): return bool(self.nodeDesc.parallelization) if meshroom.useMultiChunks else False - @property - def nbParallelizationBlocks(self): - return len(self._chunks) - def hasStatus(self, status: Status): - if not self._chunks: - return status == Status.INPUT + if not self._chunks or not self._chunksCreated: + if self.isInputNode: + return status == Status.INPUT + return status == Status.NONE for chunk in self._chunks: if chunk.status.status != status: return False @@ -1123,6 +1264,10 @@ def clearData(self): """ Delete this Node internal folder. Status will be reset to Status.NONE """ + # Clear cache + self._nodeStatus.reset() + # Reset chunks + self._resetChunks() if self.internalFolder and os.path.exists(self.internalFolder): try: shutil.rmtree(self.internalFolder) @@ -1143,16 +1288,16 @@ def getStartDateTime(self): return min(dateTime) if len(dateTime) != 0 else "" def isAlreadySubmitted(self): - for chunk in self._chunks: - if chunk.isAlreadySubmitted(): - return True - return False + if self._chunksCreated: + return any(c.isAlreadySubmitted() for c in self._chunks) + else: + return self._nodeStatus.status in (Status.SUBMITTED, Status.RUNNING) def isAlreadySubmittedOrFinished(self): - for chunk in self._chunks: - if not chunk.isAlreadySubmittedOrFinished(): - return False - return True + if self._chunksCreated: + return all(c.isAlreadySubmittedOrFinished() for c in self._chunks) + else: + return self._nodeStatus.status in (Status.SUBMITTED, Status.RUNNING, Status.SUCCESS) @Slot(result=bool) def isSubmittedOrRunning(self): @@ -1160,6 +1305,8 @@ def isSubmittedOrRunning(self): Return True if all chunks are at least submitted and there is one running chunk, False otherwise. """ + if not self._chunksCreated: + return False if not self.isAlreadySubmittedOrFinished(): return False for chunk in self._chunks: @@ -1178,6 +1325,8 @@ def isFinishedOrRunning(self): Return True if all chunks of this Node is either finished or running, False otherwise. """ + if not self._chunks: + return False return all(chunk.isFinishedOrRunning() for chunk in self._chunks) @Slot(result=bool) @@ -1185,9 +1334,6 @@ def isPartiallyFinished(self): """ Return True is at least one chunk of this Node is finished, False otherwise. """ return any(chunk.isFinished() for chunk in self._chunks) - def alreadySubmittedChunks(self): - return [ch for ch in self._chunks if ch.isAlreadySubmitted()] - def isExtern(self): """ Return True if at least one chunk of this Node has an external execution mode, @@ -1199,7 +1345,11 @@ def isExtern(self): interrupted, its execution mode will always be local, even if computations resume externally. """ - if len(self._chunks) == 0: + if not self._chunksCreated: + if self._nodeStatus.execMode == ExecMode.EXTERN: + return True + elif self._nodeStatus.execMode == ExecMode.LOCAL and self._nodeStatus.status in (Status.SUBMITTED, Status.RUNNING): + return meshroom.core.sessionUid != self._nodeStatus.submitterSessionUid return False return any(chunk.isExtern() for chunk in self._chunks) @@ -1213,26 +1363,49 @@ def clearSubmittedChunks(self): This must be used with caution. This could lead to inconsistent node status if the graph is still being computed. 
""" - for chunk in self._chunks: - if chunk.isAlreadySubmitted(): - chunk.upgradeStatusTo(Status.NONE, ExecMode.NONE) + if self._chunksCreated: + for chunk in self._chunks: + if chunk.isAlreadySubmitted(): + chunk.upgradeStatusTo(Status.NONE, ExecMode.NONE) + else: + if self.isAlreadySubmitted(): + self.upgradeStatusTo(Status.NONE, ExecMode.NONE) + self.globalStatusChanged.emit() def clearLocallySubmittedChunks(self): """ Reset all locally submitted chunks to Status.NONE. """ - for chunk in self._chunks: - if chunk.isAlreadySubmitted() and not chunk.isExtern(): - chunk.upgradeStatusTo(Status.NONE, ExecMode.NONE) + if self._chunksCreated: + for chunk in self._chunks: + if chunk.isAlreadySubmitted() and not chunk.isExtern(): + chunk.upgradeStatusTo(Status.NONE, ExecMode.NONE) + else: + if self.isAlreadySubmitted() and not self.isExtern(): + self.upgradeStatusTo(Status.NONE, ExecMode.NONE) + self.globalStatusChanged.emit() - def upgradeStatusTo(self, newStatus): + def upgradeStatusTo(self, newStatus, execMode=None): """ Upgrade node to the given status and save it on disk. """ - for chunk in self._chunks: - chunk.upgradeStatusTo(newStatus) + if self._chunksCreated: + for chunk in self._chunks: + chunk.upgradeStatusTo(newStatus) + else: + if execMode is not None: + self._nodeStatus.execMode = execMode + self._nodeStatus.status = newStatus + self.upgradeStatusFile() + self.globalStatusChanged.emit() def updateStatisticsFromCache(self): for chunk in self._chunks: chunk.updateStatisticsFromCache() - def _updateChunks(self): + def _resetChunks(self): + pass + + def createChunksFromCache(self): + pass + + def _createChunks(self): pass def _updateNodeSize(self): @@ -1321,8 +1494,8 @@ def updateInternals(self, cacheDir=None): self._updateNodeSize() - # Update chunks splitting - self._updateChunks() + # Reset chunks splitting + self._resetChunks() # Retrieve current internal folder (if possible) try: folder = self.internalFolder @@ -1350,30 +1523,112 @@ def internalFolder(self): def sourceCodeFolder(self): return self._sourceCodeFolder + @property + def nodeStatusFile(self): + return os.path.join(self.graph.cacheDir, self.internalFolder, "nodeStatus") + + def shouldMonitorChanges(self): + """ Check whether we should monitor changes in minimal mode + Only chunks that are run externally or local_isolated should be monitored, + when run locally, status changes are already notified. + Chunks with an ERROR status may be re-submitted externally and should thus still be monitored + """ + if self._chunksCreated: + # Only monitor when chunks are not created (in this case monitor chunk status files instead) + return False + return (self.isExtern() and self._nodeStatus.status in (Status.SUBMITTED, Status.RUNNING, Status.ERROR)) or \ + (self.getMrNodeType() == MrNodeType.NODE and self._nodeStatus.status in (Status.SUBMITTED, Status.RUNNING)) + + def updateNodeStatusFromCache(self): + """ + Update node status based on status file content/existence. + # TODO : integrate nodeStatusFileLastModTime ? 
+ Returns True if a change on the chunk setup has been detected + """ + chunksRangeHasChanged = False + if os.path.exists(self.nodeStatusFile): + oldChunkSetup = self._nodeStatus.chunks + self._nodeStatus.loadFromCache(self.nodeStatusFile) + if self._nodeStatus.chunks != oldChunkSetup: + chunksRangeHasChanged = True + self.nodeStatusFileLastModTime = os.path.getmtime(self.nodeStatusFile) + else: + # No status file => reset status to Status.None + self.nodeStatusFileLastModTime = -1 + self._nodeStatus.reset() + self._nodeStatus.setNodeType(self) + return chunksRangeHasChanged + def updateStatusFromCache(self): """ Update node status based on status file content/existence. """ + # Update nodeStatus from cache + chunkChanged = self.updateNodeStatusFromCache() + # Create chunks if we found info on them on the node cache + if chunkChanged and self._nodeStatus.nbChunks > 0: + # Update number of chunks + try: + self.createChunksFromCache() + except Exception as e: + logging.warning(f"Could not create chunks from cache :{e}") + return s = self.globalStatus - for chunk in self._chunks: - chunk.updateStatusFromCache() - # logging.warning(f"updateStatusFromCache: {self.name}, status: {s} => {self.globalStatus}") + if self._chunksCreated: + for chunk in self._chunks: + chunk.updateStatusFromCache() + # logging.debug(f"updateStatusFromCache: {self.name}, status: {s} => {self.globalStatus}") self.updateOutputAttr() + def upgradeStatusFile(self): + """ Write node status on disk. """ + # Make sure the node has the globalStatus before saving it + self._nodeStatus.status = self.getGlobalStatus() + data = self._nodeStatus.toDict() + statusFilepath = self.nodeStatusFile + folder = os.path.dirname(statusFilepath) + os.makedirs(folder, exist_ok=True) + statusFilepathWriting = getWritingFilepath(statusFilepath) + with open(statusFilepathWriting, 'w') as jsonFile: + json.dump(data, jsonFile, indent=4) + renameWritingToFinalPath(statusFilepathWriting, statusFilepath) + + def setJobId(self, jid, submitterName): + self._nodeStatus.setJob(jid, submitterName) + self.upgradeStatusFile() + def initStatusOnSubmit(self, forceCompute=False): """ Prepare chunks status when the node is in a graph that was submitted """ + hasChunkToLaunch = False + if not self._chunksCreated: + hasChunkToLaunch = True for chunk in self._chunks: - if forceCompute or chunk.status.status != Status.SUCCESS: + if forceCompute or chunk._status.status != Status.SUCCESS: + hasChunkToLaunch = True chunk._status.setNode(self) chunk._status.initExternSubmit() chunk.upgradeStatusFile() + if hasChunkToLaunch: + self._nodeStatus.setNode(self) + self._nodeStatus.initExternSubmit() + self.upgradeStatusFile() + self.globalStatusChanged.emit() - def beginSequence(self, forceCompute=False): + def initStatusOnCompute(self, forceCompute=False): + hasChunkToLaunch = False + if not self._chunksCreated: + hasChunkToLaunch = True for chunk in self._chunks: - if forceCompute or (chunk.status.status not in (Status.RUNNING, Status.SUCCESS)): + if forceCompute or (chunk._status.status not in (Status.RUNNING, Status.SUCCESS)): + hasChunkToLaunch = True chunk._status.setNode(self) chunk._status.initLocalSubmit() chunk.upgradeStatusFile() + if hasChunkToLaunch: + self._nodeStatus.setNode(self) + self._nodeStatus.initLocalSubmit() + self.upgradeStatusFile() + self.globalStatusChanged.emit() def processIteration(self, iteration): self._chunks[iteration].process() @@ -1483,8 +1738,15 @@ def endSequence(self): def stopComputation(self): """ Stop the computation of this node. 
""" - for chunk in self._chunks.values(): - chunk.stopProcess() + if self._chunks: + for chunk in self._chunks.values(): + chunk.stopProcess() + else: + # Ensure that we are up-to-date + self.updateNodeStatusFromCache() + # The only status possible here is submitted + if self._nodeStatus.status is Status.SUBMITTED: + self.upgradeStatusTo(Status.NONE) def getGlobalStatus(self): """ @@ -1495,12 +1757,15 @@ def getGlobalStatus(self): """ if isinstance(self.nodeDesc, desc.InputNode): return Status.INPUT + if not self._chunksCreated: + # Get status from nodeStatus + return self._nodeStatus.status if not self._chunks: return Status.NONE if len( self._chunks) == 1: - return self._chunks[0].status.status + return self._chunks[0]._status.status - chunksStatus = [chunk.status.status for chunk in self._chunks] + chunksStatus = [chunk._status.status for chunk in self._chunks] anyOf = (Status.ERROR, Status.STOPPED, Status.KILLED, Status.RUNNING, Status.SUBMITTED) @@ -1515,18 +1780,18 @@ def getGlobalStatus(self): return Status.NONE - @Slot(result=StatusData) + @Slot(result=ChunkStatusData) def getFusedStatus(self): if not self._chunks: - return StatusData() - fusedStatus = StatusData() - fusedStatus.fromDict(self._chunks[0].status.toDict()) + return ChunkStatusData() + fusedStatus = ChunkStatusData() + fusedStatus.fromDict(self._chunks[0]._status.toDict()) for chunk in self._chunks[1:]: - fusedStatus.merge(chunk.status) + fusedStatus.merge(chunk._status) fusedStatus.status = self.getGlobalStatus() return fusedStatus - @Slot(result=StatusData) + @Slot(result=ChunkStatusData) def getRecursiveFusedStatus(self): fusedStatus = self.getFusedStatus() nodes = self.getInputNodes(recursive=True, dependenciesOnly=True) @@ -1542,11 +1807,22 @@ def _isInputNode(self): @property def globalExecMode(self): + if not self._chunksCreated: + return self._nodeStatus.execMode.name if len(self._chunks): - return self._chunks.at(0).execModeName + return self._chunks.at(0).getExecModeName() else: return ExecMode.NONE + def _getJobName(self): + execMode = self._nodeStatus.execMode + if execMode == ExecMode.LOCAL: + return "LOCAL" + elif execMode == ExecMode.EXTERN: + return self._nodeStatus.jobName + else: + return "NONE" + def getChunks(self) -> list[NodeChunk]: return self._chunks @@ -1662,26 +1938,28 @@ def updateDuplicates(self, nodesPerUid): self.hasDuplicatesChanged.emit() def statusInThisSession(self) -> bool: + """ Check if chunks of the node are being computed in the current session + TODO: Not used -> depreciate ? + """ if not self._chunks: return False for chunk in self._chunks: - if chunk.status.sessionUid != meshroom.core.sessionUid: + if chunk._status.computeSessionUid != meshroom.core.sessionUid: return False return True def submitterStatusInThisSession(self) -> bool: - if not self._chunks: - return False - for chunk in self._chunks: - if chunk.status.submitterSessionUid != meshroom.core.sessionUid: - return False - return True + """ Check if the node is submitted by the current session + TODO: Not used -> depreciate ? 
+ """ def initFromThisSession(self) -> bool: - if len(self._chunks) == 0: - return False + """ Check if the node was submitted from the current session """ + if not self._chunksCreated or not self._chunks: + return meshroom.core.sessionUid == self._nodeStatus.submitterSessionUid for chunk in self._chunks: - if meshroom.core.sessionUid not in (chunk.status.sessionUid, chunk.status.submitterSessionUid): + # Technically the check on chunk._status.computeSessionUid is useless + if meshroom.core.sessionUid not in (chunk._status.computeSessionUid, self._nodeStatus.submitterSessionUid): return False return True @@ -1702,7 +1980,7 @@ def canBeStopped(self) -> bool: if self.isCompatibilityNode: return False # Only locked nodes running in local with the same - # sessionUid as the Meshroom instance can be stopped + # computeSessionUid as the Meshroom instance can be stopped return (self.getGlobalStatus() == Status.RUNNING and self.globalExecMode == ExecMode.LOCAL.name and self.isMainNode() and @@ -1715,7 +1993,7 @@ def canBeCanceled(self) -> bool: if self.isCompatibilityNode: return False # Only locked nodes submitted in local with the same - # sessionUid as the Meshroom instance can be canceled + # computeSessionUid as the Meshroom instance can be canceled return (self.getGlobalStatus() == Status.SUBMITTED and self.globalExecMode == ExecMode.LOCAL.name and self.isMainNode() and @@ -1768,6 +2046,9 @@ def _hasDisplayableShape(self): nodeType = Property(str, nodeType.fget, constant=True) documentation = Property(str, getDocumentation, constant=True) nodeInfos = Property(Variant, getNodeInfos, constant=True) + nodeStatusChanged = Signal() + nodeStatus = Property(Variant, lambda self: self._nodeStatus, notify=nodeStatusChanged) + nodeStatusNodeName = Property(str, lambda self: self._nodeStatus.nodeName, notify=nodeStatusChanged) positionChanged = Signal() position = Property(Variant, position.fget, position.fset, notify=positionChanged) x = Property(float, lambda self: self._position.x, notify=positionChanged) @@ -1785,13 +2066,16 @@ def _hasDisplayableShape(self): depthChanged = Signal() depth = Property(int, depth.fget, notify=depthChanged) minDepth = Property(int, minDepth.fget, notify=depthChanged) + chunksCreatedChanged = Signal() + chunksCreated = Property(bool, lambda self: self._chunksCreated, notify=chunksCreatedChanged) chunksChanged = Signal() chunks = Property(Variant, getChunks, notify=chunksChanged) + nbParallelizationBlocks = Property(int, lambda self: len(self._chunks) if self._chunksCreated else 0, notify=chunksChanged) sizeChanged = Signal() size = Property(int, getSize, notify=sizeChanged) globalStatusChanged = Signal() globalStatus = Property(str, lambda self: self.getGlobalStatus().name, notify=globalStatusChanged) - fusedStatus = Property(StatusData, getFusedStatus, notify=globalStatusChanged) + fusedStatus = Property(ChunkStatusData, getFusedStatus, notify=globalStatusChanged) elapsedTime = Property(float, lambda self: self.getFusedStatus().elapsedTime, notify=globalStatusChanged) recursiveElapsedTime = Property(float, lambda self: self.getRecursiveFusedStatus().elapsedTime, notify=globalStatusChanged) @@ -1800,6 +2084,7 @@ def _hasDisplayableShape(self): isInputNode = Property(bool, lambda self: self._isInputNode(), constant=True) globalExecMode = Property(str, globalExecMode.fget, notify=globalStatusChanged) + jobName = Property(str, lambda self: self._getJobName(), notify=globalStatusChanged) isExternal = Property(bool, isExtern, notify=globalStatusChanged) isComputed = 
Property(bool, _isComputed, notify=globalStatusChanged) isComputableType = Property(bool, _isComputableType, notify=globalStatusChanged) @@ -1923,18 +2208,57 @@ def toDict(self): 'outputs': outputs, } - def _updateChunks(self): - """ Update Node's computation task splitting into NodeChunks based on its description """ + def _resetChunks(self): + """ Set chunks on the node + # TODO : Maybe don't delete chunks if we will recreate them as before ? + """ if isinstance(self.nodeDesc, desc.InputNode): + self._chunksCreated = True return - if self.isParallelized: + # Disconnect signals + for chunk in self._chunks: + chunk.statusChanged.disconnect(self.globalStatusChanged) + # Empty list + self._chunks.setObjectList([]) + # Recreate list with reset values (1 chunk or the static size) + if not self.isParallelized: + self.setSize(1) + self._chunks.setObjectList([NodeChunk(self, desc.Range())]) + self._chunks[0].statusChanged.connect(self.globalStatusChanged) + self._chunksCreated = True + elif isinstance(self.nodeDesc.size, desc.computation.StaticNodeSize): + self._chunksCreated = True + self.setSize(self.nodeDesc.size.computeSize(self)) + self._chunks.setObjectList([NodeChunk(self, desc.Range())]) + self._chunks[0].statusChanged.connect(self.globalStatusChanged) try: ranges = self.nodeDesc.parallelization.getRanges(self) + self._chunks.setObjectList([NodeChunk(self, range) for range in ranges]) + for c in self._chunks: + c.statusChanged.connect(self.globalStatusChanged) + logging.debug(f"Created {len(self._chunks)} chunks for node: {self.name}") + except RuntimeError: + # TODO: set node internal status to error + logging.warning(f"Invalid Parallelization on node {self._name}") + self._chunks.clear() + else: + self._chunksCreated = False + self.setSize(0) + self._chunks.setObjectList([]) + # Create chunks when possible + self.chunksCreatedChanged.emit() + self.chunksChanged.emit() + self.globalStatusChanged.emit() + + def __createChunks(self, ranges): + if self.isParallelized: + try: if len(ranges) != len(self._chunks): self._chunks.setObjectList([NodeChunk(self, range) for range in ranges]) for c in self._chunks: c.statusChanged.connect(self.globalStatusChanged) - else: + logging.debug(f"Created {len(self._chunks)} chunks for node: {self.name}") + else: for chunk, range in zip(self._chunks, ranges): chunk.range = range except RuntimeError: @@ -1947,6 +2271,51 @@ def _updateChunks(self): self._chunks[0].statusChanged.connect(self.globalStatusChanged) else: self._chunks[0].range = desc.Range() + self._chunksCreated = True + # Update node status + # TODO : update all chunks status ? + # TODO : update node status ? 
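# Worked example (editor's sketch, values assumed) of the chunk bookkeeping used here:
# for a parallelized node with fullSize=10 and blockSize=4, getRanges() would yield
# nbBlocks=3 ranges, iterations 0..2 covering roughly elements 0-3, 4-7 and 8-9.
# The same triple (blockSize=4, fullSize=10, nbBlocks=3) is what NodeStatusData stores
# in its NodeChunkSetup, which is why createChunksFromCache below can rebuild identical
# chunks without re-evaluating the node description.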
+ # Emit signals for UI updates + self.chunksChanged.emit() + self.chunksCreatedChanged.emit() + + def createChunksFromCache(self): + """Create chunks when a node cache exists""" + try: + # Get size from cache + size = self._nodeStatus.nbChunks + self.setSize(size) + ranges = self._nodeStatus.getChunkRanges() + self.__createChunks(ranges) + except Exception as e: + logging.error(f"Failed to create chunks for {self.name}") + self._chunks.clear() + self._chunksCreated = False + raise e + + def createChunks(self): + """Create chunks when computation is about to start""" + if self._chunksCreated: + return + if isinstance(self.nodeDesc, desc.InputNode): + self._chunksCreated = True + self.chunksChanged.emit() + return + # Grab current chunk infos + logging.debug(f"Creating chunks for node: {self.name}") + try: + size = self.nodeDesc.size.computeSize(self) + self.setSize(size) + ranges = self.nodeDesc.parallelization.getRanges(self) + self.__createChunks(ranges) + except Exception as e: + logging.error(f"Failed to create chunks for {self.name}: {e}") + self._chunks.clear() + self._chunksCreated = False + raise e + # Update status + self._nodeStatus.setChunks(self._chunks) + self.upgradeStatusFile() class CompatibilityIssue(Enum): @@ -2231,4 +2600,3 @@ def upgrade(self): compatibilityIssue = Property(int, lambda self: self.issue.value, constant=True) canUpgrade = Property(bool, canUpgrade.fget, constant=True) issueDetails = Property(str, issueDetails.fget, constant=True) - diff --git a/meshroom/core/submitter.py b/meshroom/core/submitter.py index a0c4f113ef..45706e6744 100644 --- a/meshroom/core/submitter.py +++ b/meshroom/core/submitter.py @@ -1,5 +1,14 @@ #!/usr/bin/env python +import sys +import logging +import operator + +from enum import IntFlag, auto +from typing import Optional +from itertools import accumulate + +import meshroom from meshroom.common import BaseObject, Property import logging @@ -7,17 +16,230 @@ logger.setLevel(logging.INFO) +logger = logging.getLogger("Submitter") +logger.setLevel(logging.INFO) + + +class SubmitterOptionsEnum(IntFlag): + RETRIEVE = auto() # Can retrieve job (read job tasks, ...) 
+    INTERRUPT_JOB = auto()  # Can interrupt
+    RESUME_JOB = auto()  # Can resume after interruption
+    EDIT_TASKS = auto()  # Can edit tasks
+    ATTACH_JOB = auto()  # Can attach a job that will execute after another job
+
+    @classmethod
+    def get(cls, option):
+        if isinstance(option, str):
+            # Try to cast to SubmitterOptionsEnum
+            option = getattr(cls, option.upper(), None)
+        elif isinstance(option, int):
+            option = cls(option)
+        if isinstance(option, cls):
+            return option
+        return 0
+
+# SubmitterOptionsEnum.ALL = SubmitterOptionsEnum(SubmitterOptionsEnum._all_bits_)  # _all_bits_ -> py 3.11
+SubmitterOptionsEnum.ALL = list(accumulate(SubmitterOptionsEnum, operator.__ior__))[-1]
+
+
+class SubmitterOptions:
+    def __init__(self, *args):
+        self._options = 0
+        for option in args:
+            self.addOption(option)
+
+    def addOption(self, option):
+        option = SubmitterOptionsEnum.get(option)
+        self._options |= option
+
+    def includes(self, option):
+        option = SubmitterOptionsEnum.get(option)
+        return self._options & option > 0
+
+    def __iter__(self):
+        for o in SubmitterOptionsEnum:
+            if self.includes(o):
+                yield o
+
+    def __repr__(self):
+        if self._options == 0:
+            return f"<{self.__class__.__name__} NONE>"
+        if self._options == SubmitterOptionsEnum.ALL:
+            return f"<{self.__class__.__name__} ALL>"
+        return f"<{self.__class__.__name__} {'|'.join(o.name for o in self)}>"
+
+
+class BaseSubmittedJob:
+    """
+    Interface to manipulate the job via Meshroom
+    """
+
+    def __init__(self, jobId, submitter):
+        self.jid = jobId
+        self.submitterName: str = submitter._name
+        self.submitterOptions: SubmitterOptions = submitter._options
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__} {self.jid}>"
+
+    # Task actions
+    # For all of these methods, if iteration is -1 the action applies to all the tasks of the given node
+
+    def stopChunkTask(self, node, iteration):
+        """ This will kill one task
+        If iteration is -1 then it kills all the tasks for the given node
+        """
+        if self.submitterOptions.includes(SubmitterOptionsEnum.INTERRUPT_JOB):
+            raise NotImplementedError("'stopChunkTask' method must be implemented in subclasses")
+        else:
+            raise RuntimeError(f"Submitter {self.__class__.__name__} cannot interrupt the job")
+
+    def skipChunkTask(self, node, iteration):
+        """ This will skip one task """
+        if self.submitterOptions.includes(SubmitterOptionsEnum.INTERRUPT_JOB):
+            raise NotImplementedError("'skipChunkTask' method must be implemented in subclasses")
+        else:
+            raise RuntimeError(f"Submitter {self.__class__.__name__} cannot interrupt the job")
+
+    def restartChunkTask(self, node, iteration):
+        """ This will restart one task """
+        if self.submitterOptions.includes(SubmitterOptionsEnum.RESUME_JOB):
+            raise NotImplementedError("'restartChunkTask' method must be implemented in subclasses")
+        else:
+            raise RuntimeError(f"Submitter {self.__class__.__name__} cannot resume the job")
+
+    # Job actions
+
+    def pauseJob(self):
+        """ This will pause the job: new tasks will not be processed """
+        if self.submitterOptions.includes(SubmitterOptionsEnum.INTERRUPT_JOB):
+            raise NotImplementedError("'pauseJob' method must be implemented in subclasses")
+        else:
+            raise RuntimeError(f"Submitter {self.__class__.__name__} cannot interrupt the job")
+
+    def resumeJob(self):
+        """ This will unpause the job """
+        if self.submitterOptions.includes(SubmitterOptionsEnum.RESUME_JOB):
+            raise NotImplementedError("'resumeJob' method must be implemented in subclasses")
+        else:
+            raise RuntimeError(f"Submitter {self.__class__.__name__} cannot resume the job")
+
+    def interruptJob(self):
+        """ This will interrupt the job (and kill running tasks) """
+        if self.submitterOptions.includes(SubmitterOptionsEnum.INTERRUPT_JOB):
+            raise NotImplementedError("'interruptJob' method must be implemented in subclasses")
+        else:
+            raise RuntimeError(f"Submitter {self.__class__.__name__} cannot interrupt the job")
+
+    def restartErrorTasks(self):
+        if self.submitterOptions.includes(SubmitterOptionsEnum.RESUME_JOB):
+            raise NotImplementedError("'restartErrorTasks' method must be implemented in subclasses")
+        else:
+            raise RuntimeError(f"Submitter {self.__class__.__name__} cannot restart the job")
+
+
+class JobManager(BaseObject):
+    """ Central manager for all jobs """
+
+    def __init__(self):
+        super().__init__()
+        self._jobs = {}  # jobId -> BaseSubmittedJob
+        self._nodeToJob = {}  # node uid -> Job
+
+    def addJob(self, job: BaseSubmittedJob, nodes):
+        jid = job.jid
+        if jid not in self._jobs:
+            self._jobs[jid] = job
+        for node in nodes:
+            nodeUid = node._uid
+            self._nodeToJob[nodeUid] = jid
+            # Update the node status file to store the job ID
+            node.setJobId(jid, job.submitterName)
+
+    def resetNodeJob(self, node):
+        node._nodeStatus.jobInfos = {}
+        if node._uid in self._nodeToJob:
+            del self._nodeToJob[node._uid]
+
+    def getJob(self, jobId: str) -> Optional[BaseSubmittedJob]:
+        return self._jobs.get(jobId)
+
+    def removeJob(self, jobId: str):
+        if jobId in self._jobs:
+            del self._jobs[jobId]
+
+    def getNodeJob(self, node):
+        nodeUid = node._uid
+        jobId = self._nodeToJob.get(nodeUid)
+        if jobId:
+            return self.getJob(jobId)
+        return None
+
+    def getAllNodesUIDForJob(self, job):
+        return [n for n, j in self._nodeToJob.items() if j == job.jid]
+
+    def retrieveJob(self, submitter, jid) -> Optional[BaseSubmittedJob]:
+        if not submitter._options.includes(SubmitterOptionsEnum.RETRIEVE):
+            return None
+        job = submitter.retrieveJob(jid)
+        return job
+
+
+# Global instance that manages submitted jobs
+jobManager = JobManager()
+
+
 class BaseSubmitter(BaseObject):
-    def __init__(self, name, parent=None):
+    _options: SubmitterOptions = SubmitterOptions()
+    _name = ""
+
+    def __init__(self, parent=None):
+        if not self._name:
+            raise ValueError("Could not register submitter without name")
         super().__init__(parent)
-        self._name = name
-        logger.info(f"Registered submitter {self._name}")
+        logger.info(f"Registered submitter {self._name} (options={self._options})")
+
+    @property
+    def name(self):
+        return self._name
 
-    def submit(self, nodes, edges, filepath, submitLabel="{projectName}"):
+    def createJob(self, nodes, edges, filepath, submitLabel="{projectName}"):
         """ Submit the given graph
         Returns:
             bool: whether the submission succeeded
         """
-        raise NotImplementedError("'submit' method must be implemented in subclasses")
+        raise NotImplementedError("'createJob' method must be implemented in subclasses")
+
+    def createChunkTask(self, node, graphFile, **kwargs):
+        if self._options.includes(SubmitterOptionsEnum.EDIT_TASKS):
+            raise NotImplementedError("'createChunkTask' method must be implemented in subclasses")
+        else:
+            raise RuntimeError(f"Submitter {self.name} cannot edit the job")
+
+    def retrieveJob(self, jobId) -> BaseSubmittedJob:
+        raise NotImplementedError("'retrieveJob' method must be implemented in subclasses")
+
+    def submit(self, nodes, edges, filepath, submitLabel="{projectName}") -> BaseSubmittedJob:
+        """ Submit the given graph
+        Returns:
+            BaseSubmittedJob | None: the created job, or None if the submission failed
+        """
+        job = self.createJob(nodes, edges, filepath, submitLabel)
+        if not job:
+            # Failed to create the job
+            return None
+        return job
+
+    @staticmethod
+    def killRunningJob():
+        """ Some farms automatically retry a job once when it was killed, for instance by a user
+        who does not want their machine to be used. Unfortunately, this means jobs may be launched
+        twice even if they failed for a good reason. This function can be used to make sure the
+        current job will not restart.
+        Note: the ERROR_NO_RETRY exit status by itself won't do anything. This function must be
+        implemented on a case-by-case basis for each possible farm system.
+        """
+        sys.exit(meshroom.MeshroomExitStatus.ERROR_NO_RETRY)
 
     name = Property(str, lambda self: self._name, constant=True)
diff --git a/meshroom/core/taskManager.py b/meshroom/core/taskManager.py
index c45b4f5eb3..a4a5b14888 100644
--- a/meshroom/core/taskManager.py
+++ b/meshroom/core/taskManager.py
@@ -1,11 +1,14 @@
+import traceback
 import logging
 from threading import Thread
+from PySide6.QtCore import QThread, QEventLoop, QTimer
 from enum import Enum
 
 import meshroom
 from meshroom.common import BaseObject, DictModel, Property, Signal, Slot
-from meshroom.core.node import Status, Node
+from meshroom.core.node import Node, Status
 from meshroom.core.graph import Graph
+from meshroom.core.submitter import jobManager, BaseSubmittedJob
 import meshroom.core.graph
 
 
@@ -20,31 +23,74 @@ class State(Enum):
     ERROR = 4
 
 
-class TaskThread(Thread):
+class TaskThread(QThread):
     """ A thread with a pile of nodes to compute """
     def __init__(self, manager):
-        Thread.__init__(self, target=self.run)
+        QThread.__init__(self)
         self._state = State.IDLE
         self._manager = manager
         self.forceCompute = False
+        # Connect to manager's chunk creation handler
+        self.createChunksSignal.connect(manager.createChunks)
 
     def isRunning(self):
         return self._state == State.RUNNING
 
+    def waitForChunkCreation(self, node):
+        """ Block until the manager has created the chunks for the given node, or a timeout expires. """
+        if hasattr(node, "_chunksCreated") and node._chunksCreated:
+            return True
+
+        loop = QEventLoop()
+
+        # A timer is used to make sure we don't indefinitely block the taskManager
+        timer = QTimer()
+        timer.timeout.connect(loop.quit)
+        timer.setSingleShot(True)
+        timer.start(1*60*1000)  # 1 min timeout
+
+        # Connect to completion signal
+        def onChunksCreated(createdNode):
+            if createdNode == node:
+                loop.quit()
+
+        self._manager.chunksCreated.connect(onChunksCreated)
+
+        try:
+            # Start the event loop - will block until signal or timeout
+            loop.exec()
+            success = hasattr(node, "_chunksCreated") and node._chunksCreated
+            if not success:
+                logging.error(f"Timeout or failure creating chunks for {node.name}")
+            return success
+        finally:
+            self._manager.chunksCreated.disconnect(onChunksCreated)
+            timer.stop()
+
     def run(self):
         """ Consume compute tasks.
""" self._state = State.RUNNING - stopAndRestart = False for nId, node in enumerate(self._manager._nodesToProcess): + if node not in self._manager._nodesToProcess: + # Node was removed from the processing list + continue # skip already finished/running nodes if node.isFinishedOrRunning(): continue + # Request chunk creation if not already done + if not (hasattr(node, "_chunksCreated") and node._chunksCreated): + self.createChunksSignal.emit(node) + # Wait for chunk creation to complete + if not self.waitForChunkCreation(node): + logging.error(f"Failed to create chunks for {node.name}, stopping the process") + break + # if a node does not exist anymore, node.chunks becomes a PySide property try: multiChunks = len(node.chunks) > 1 @@ -56,13 +102,16 @@ def run(self): if chunk.isFinishedOrRunning() or not self.isRunning(): continue + if self._manager.isChunkCancelled(chunk): + continue + + _nodeName, _node, _nbNodes = node.nodeType, nId+1, len(self._manager._nodesToProcess) + if multiChunks: - logging.info('[{node}/{nbNodes}]({chunk}/{nbChunks}) {nodeName}'.format( - node=nId+1, nbNodes=len(self._manager._nodesToProcess), - chunk=cId+1, nbChunks=len(node.chunks), nodeName=node.nodeType)) + _chunk, _nbChunks = cId+1, len(node.chunks) + logging.info(f"[{_node}/{_nbNodes}]({_chunk}/{_nbChunks}) {_nodeName}") else: - logging.info('[{node}/{nbNodes}] {nodeName}'.format( - node=nId+1, nbNodes=len(self._manager._nodesToProcess), nodeName=node.nodeType)) + logging.info(f"[{_node}/{_nbNodes}] {_nodeName}") try: chunk.process(self.forceCompute) except Exception as e: @@ -91,6 +140,9 @@ def run(self): else: self._manager._nodesToProcess = [] self._state = State.DEAD + + # Signals and properties + createChunksSignal = Signal(BaseObject) class TaskManager(BaseObject): @@ -102,6 +154,7 @@ def __init__(self, parent: BaseObject = None): self._graph = None self._nodes = DictModel(keyAttrName='_name', parent=self) self._nodesToProcess = [] + self._cancelledChunks = [] self._nodesExtern = [] # internal thread in which local tasks are executed self._thread = TaskThread(self) @@ -109,7 +162,31 @@ def __init__(self, parent: BaseObject = None): self._blockRestart = False self.restartRequested.connect(self.restart) - def requestBlockRestart(self): + def join(self): + self._thread.wait() + self._cancelledChunks = [] + + @Slot(BaseObject) + def createChunks(self, node: Node): + """ Create chunks on main process """ + try: + if not node._chunksCreated: + node.createChunks() + # Prepare all chunks + node.initStatusOnCompute() + self.chunksCreated.emit(node) + except Exception as e: + logging.error(f"Failed to create chunks for {node.name}: {e}") + self.chunksCreated.emit(node) # Still emit to unblock waiting thread + + def isChunkCancelled(self, chunk): + for i, ch in enumerate(self._cancelledChunks): + if ch == chunk: + del self._cancelledChunks[i] + return True + return False + + def requestBlockRestart(self): """ Block computing. Note: should only be used to completely stop computing. 
@@ -129,7 +206,24 @@ def blockRestart(self): self._blockRestart = False self._nodesToProcess = [] + self._cancelledChunks = [] self._thread._state = State.DEAD + + @Slot() + def pauseProcess(self): + if self._thread.isRunning(): + self.join() + for node in self._nodesToProcess: + if node.getGlobalStatus() == Status.STOPPED: + # Remove node from the computing list + self.removeNode(node, displayList=False, processList=True) + + # Remove output nodes from display and computing lists + outputNodes = node.getOutputNodes(recursive=True, dependenciesOnly=True) + for n in outputNodes: + if n.getGlobalStatus() in (Status.ERROR, Status.SUBMITTED): + n.upgradeStatusTo(Status.NONE) + self.removeNode(n, displayList=True, processList=True) @Slot() def restart(self): @@ -138,7 +232,8 @@ def restart(self): Note: this is done like this to avoid app freezing. """ # Make sure to wait the end of the current thread - self._thread.join() + if self._thread.isRunning(): + self.join() # Avoid restart if thread was globally stopped if self._blockRestart: @@ -174,9 +269,11 @@ def compute(self, graph: Graph = None, toNodes: list[Node] = None, forceCompute: :param forceCompute: force the computation despite nodes status. :param forceStatus: force the computation even if some nodes are submitted externally. """ + self._graph = graph self.updateNodes() + self._cancelledChunks = [] if forceCompute: nodes, edges = graph.dfsOnFinish(startNodes=toNodes) @@ -222,7 +319,7 @@ def compute(self, graph: Graph = None, toNodes: list[Node] = None, forceCompute: for node in nodes: node.destroyed.connect(lambda obj=None, name=node.name: self.onNodeDestroyed(obj, name)) - node.beginSequence(forceCompute) + node.initStatusOnCompute(forceCompute) self._nodes.update(nodes) self._nodesToProcess.extend(nodes) @@ -383,7 +480,6 @@ def submit(self, graph, submitter=None, toNodes=None, submitLabel="{projectName} :param toNodes: :return: """ - # Ensure submitter is properly set sub = None if submitter: @@ -400,6 +496,8 @@ def submit(self, graph, submitter=None, toNodes=None, submitLabel="{projectName} submitter=submitter, allSubmitters=str(meshroom.core.submitters.keys()) )) + + # TODO : If possible with the submitter (ATTACH_JOB) # Update task manager's lists self.updateNodes() @@ -423,6 +521,14 @@ def submit(self, graph, submitter=None, toNodes=None, submitLabel="{projectName} self.checkCompatibilityNodes(graph, nodesToProcess, "SUBMITTING") # name of the context is important for QML self.checkDuplicates(nodesToProcess, "SUBMITTING") # name of the context is important for QML + # Update nodes status + for node in nodesToProcess: + node.destroyed.connect(lambda obj=None, name=node.name: self.onNodeDestroyed(obj, name)) + node.initStatusOnSubmit() + jobManager.resetNodeJob(node) + + graph.updateMonitoredFiles() + flowEdges = graph.flowEdges(startNodes=toNodes) edgesToProcess = set(edgesToProcess).intersection(flowEdges) @@ -432,9 +538,12 @@ def submit(self, graph, submitter=None, toNodes=None, submitLabel="{projectName} try: res = sub.submit(nodesToProcess, edgesToProcess, graph.filepath, submitLabel=submitLabel) if res: + if isinstance(res, BaseSubmittedJob): + jobManager.addJob(res, nodesToProcess) + else: for node in nodesToProcess: - node.destroyed.connect(lambda obj=None, name=node.name: self.onNodeDestroyed(obj, name)) - node.initStatusOnSubmit() # update node status + # TODO : Notify the node that there was an issue on submit + pass self._nodes.update(nodesToProcess) self._nodesExtern.extend(nodesToProcess) @@ -442,7 +551,7 @@ def 
submit(self, graph, submitter=None, toNodes=None, submitLabel="{projectName} if not allReady: self.raiseDependenciesMessage("SUBMITTING") except Exception as e: - logging.error(f"Error on submit : {e}") + logging.error(f"Error on submit : {e}\n{traceback.format_exc()}") def submitFromFile(self, graphFile, submitter, toNode=None, submitLabel="{projectName}"): """ @@ -466,4 +575,5 @@ def getAlreadySubmittedChunks(self, nodes): return out nodes = Property(BaseObject, lambda self: self._nodes, constant=True) + chunksCreated = Signal(BaseObject) restartRequested = Signal() diff --git a/meshroom/submitters/localFarmSubmitter.py b/meshroom/submitters/localFarmSubmitter.py new file mode 100644 index 0000000000..33187747dd --- /dev/null +++ b/meshroom/submitters/localFarmSubmitter.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python + +import os +import re +import shutil +import logging +import shutil +from pathlib import Path +from typing import List, Dict +from meshroom.core.submitter import BaseSubmitter, SubmitterOptions, BaseSubmittedJob, SubmitterOptionsEnum +from meshroom.core.node import Status +from collections import namedtuple, defaultdict + +from localfarm.localFarm import Task, Job, LocalFarmEngine + + +logger = logging.getLogger("LocalFarmSubmitter") +logger.setLevel(logging.INFO) + + +DEFAULT_FARM_PATH = os.getenv("MR_LOCAL_FARM_PATH", os.path.join(os.path.expanduser("~"), ".local_farm")) +REZ_DELIMITER_PATTERN = re.compile(r"(-|==|>=|>|<=|<)") +MESHROOM_ROOT = Path(__file__).resolve().parent.parent.parent + + +Chunk = namedtuple("chunk", ["iteration", "start", "end"]) +CreatedTask = namedtuple("task", ["task", "chunkParams"]) + + + +def wrapMeshroomBin(_bin): + if shutil.which(_bin): + # The alias exists so use it directly + return _bin + binFolder = str(MESHROOM_ROOT / "bin") + return os.path.join(binFolder, _bin) + + +def getResolvedVersionsDict(): + """ Get a dict {packageName: version} corresponding to the current context """ + resolvedPackages = os.environ.get('REZ_RESOLVE', '').split() + resolvedVersions = {} + for r in resolvedPackages: + if r.startswith('~'): # remove implicit packages + continue + v = r.split('-') + if len(v) == 2: + resolvedVersions[v[0]] = v[1] + elif len(v) > 2: # Handle case with multiple hyphen-minus + resolvedVersions[v[0]] = "-".join(v[1:]) + return resolvedVersions + + +def getRequestPackages(packagesDelimiter="=="): + """ + Get list of packages required for the job + Depends on env var and current rez context + + By default we use the "==" delimiter to make sure we have the same version + in the job that the one we have in the env where meshroom is launched + """ + reqPackages = set() + if 'REZ_REQUEST' in os.environ: + # Get the names of the packages that have been requested + requestedPackages = os.environ.get('REZ_USED_REQUEST', '').split() + usedPackages = set() # Use set to remove duplicates + for p in requestedPackages: + if p.startswith('~') or p.startswith("!"): + continue + v = REZ_DELIMITER_PATTERN.split(p) + usedPackages.add(v[0]) + # Add requested packages to the reqPackages set + resolvedVersions = getResolvedVersionsDict() + for p in usedPackages: + reqPackages.add(packagesDelimiter.join([p, resolvedVersions[p]])) + logging.debug(f"LocalFarmSubmitter: REZ Packages: {str(reqPackages)}") + elif 'REZ_MESHROOM_VERSION' in os.environ: + reqPackages.add(f"meshroom{packagesDelimiter}{os.environ.get('REZ_MESHROOM_VERSION', '')}") + return list(reqPackages) + + +def rezWrapCommand(cmd, useCurrentContext=False, useRequestedContext=True, otherRezPkg: 
list[str] = None):
+    """ Wrap command to be run using rez
+    :param cmd: command to run
+    :type cmd: str
+    :param useCurrentContext: use current rez context to retrieve a list of rez packages
+    :type useCurrentContext: bool
+    :param useRequestedContext: use rez packages that have been requested (not the full context)  # TODO : remove it
+    :type useRequestedContext: bool
+    :param otherRezPkg: Additional rez packages
+    :type otherRezPkg: list[str]
+    """
+    packages = set()
+    if useCurrentContext:
+        # In this case we want to use the full context
+        packages.update([p for p in os.environ.get('REZ_RESOLVE', '').split(" ") if p])
+    elif useRequestedContext:
+        # In this case we want to use only packages in the rez request
+        packages.update(getRequestPackages())
+    # Add additional packages
+    if otherRezPkg:
+        packages.update(otherRezPkg)
+    packagesStr = " ".join([p for p in packages if p])
+    if packagesStr:
+        rezBin = "rez"
+        if "REZ_BIN" in os.environ and os.environ["REZ_BIN"]:
+            rezBin = os.environ["REZ_BIN"]
+        elif "REZ_PACKAGES_ROOT" in os.environ and os.environ["REZ_PACKAGES_ROOT"]:
+            rezBin = os.path.join(os.environ["REZ_PACKAGES_ROOT"], "bin/rez")
+        elif shutil.which("rez"):
+            rezBin = shutil.which("rez")
+        return f"{rezBin} env {packagesStr} -- {cmd}"
+    return cmd
+
+
+class LocalFarmJob(BaseSubmittedJob):
+    """Interface to manipulate the job via Meshroom"""
+
+    def __init__(self, jid, submitter, farmPath=None):
+        super().__init__(jid, submitter)
+        self.jid = jid
+        self.submitter: LocalFarmSubmitter = submitter
+        self.__localJob = None
+        self.__localJobTasks = None
+        self.farmPath = farmPath or DEFAULT_FARM_PATH
+        self._engine = LocalFarmEngine(self.farmPath)
+
+    def __getJobInfos(self):
+        """ Find job """
+        self.__localJob = self._engine.get_job_infos(self.jid)
+        self.__localJobTasks = {t.get("tid"): t for t in self.__localJob["tasks"]}
+
+    @property
+    def localfarmJob(self):
+        if not self.__localJob:
+            self.__getJobInfos()
+        return self.__localJob
+
+    @property
+    def localfarmTasks(self):
+        if not self.__localJobTasks:
+            self.__getJobInfos()
+        return self.__localJobTasks
+
+    def __getChunkTasks(self, nodeUid, iteration):
+        tasks = []
+        for _, task in self.localfarmTasks.items():
+            taskNodeUid = task["metadata"].get("nodeUid", None)
+            taskIt = task["metadata"].get("iteration", -1)
+            if taskNodeUid == nodeUid and taskIt == iteration:
+                tasks.append(task)
+        return tasks
+
+    # Task actions
+
+    def stopChunkTask(self, node, iteration):
+        """ This will kill one task """
+        tasks = self.__getChunkTasks(node._uid, iteration)
+        for task in tasks:
+            self._engine.stop_task(self.jid, task["tid"])
+
+    def skipChunkTask(self, node, iteration):
+        """ This will skip one task """
+        tasks = self.__getChunkTasks(node._uid, iteration)
+        for task in tasks:
+            self._engine.skip_task(self.jid, task["tid"])
+
+    def restartChunkTask(self, node, iteration):
+        """ This will restart one task """
+        tasks = self.__getChunkTasks(node._uid, iteration)
+        for task in tasks:
+            self._engine.restart_task(self.jid, task["tid"])
+
+    # Job actions
+
+    def getJobErrors(self):
+        """ Check for errors in the job """
+        return self._engine.get_job_errors(self.jid)
+
+    def pauseJob(self):
+        """ This will pause the job: new tasks will not be processed """
+        self._engine.pause_job(self.jid)
+
+    def resumeJob(self):
+        """ This will unpause the job """
+        self._engine.unpause_job(self.jid)
+
+    def interruptJob(self):
+        """ This will interrupt the job (and kill running tasks) """
+        self._engine.interrupt_job(self.jid)
+
+    def restartJob(self):
+ """ Restarts the whole job """ + self._engine.restart_job(self.jid) + + def restartErrorTasks(self): + """ Restart all error tasks on the job """ + self._engine.restart_error_tasks(self.jid) + + +class LocalFarmSubmitter(BaseSubmitter): + """ + Meshroom submitter to localfarm + """ + + _name = "LocalFarm" + _options = SubmitterOptions(SubmitterOptionsEnum.ALL) + + dryRun = False + environment = {} + + def __init__(self, parent=None): + super().__init__(parent=parent) + self.farmPath = DEFAULT_FARM_PATH + self.reqPackages = getRequestPackages() + self.jobEnv = {} + + def setFarmPath(self, path: str): + self.farmPath = path + + def setJobEnv(self, env: dict): + self.jobEnv = env + + def retrieveJob(self, jid) -> LocalFarmJob: + job = LocalFarmJob(jid, self, farmPath=self.farmPath) + return job + + @staticmethod + def getChunks(chunkParams) -> list[Chunk]: + """ Get list of chunks """ + it = None + ignoreIterations = chunkParams.get("ignoreIterations", []) + if chunkParams: + start, end = chunkParams.get("start", -1), chunkParams.get("end", -2) + size = chunkParams.get("packetSize", 1) + frameRange = list(range(start, end+1, 1)) + if frameRange: + slices = [frameRange[i:i + size] for i in range(0, len(frameRange), size)] + it = [Chunk(i, item[0], item[-1]) for i, item in enumerate(slices) if i not in ignoreIterations] + return it + + @staticmethod + def getExpandWrappedCmd(cmdArgs, rezPackages): + # Wrap with create_chunks + cmdBin = wrapMeshroomBin("meshroom_createChunks") + cmd = f"{cmdBin} --submitter LocalFarm {cmdArgs}" + # Wrap with rez + cmd = rezWrapCommand(cmd, otherRezPkg=rezPackages) + return cmd + + def __createChunkTasks(self, job: Job, parentTask: Task, children: List[Task], chunkParams: dict) -> Task: + cmdArgs = chunkParams.get("chunkCmdArgs") + chunks = self.getChunks(chunkParams) + for c in chunks: + name = f"{parentTask.name}_{c.start}_{c.end}" + meta = parentTask.metadata.copy() + meta["iteration"] = c.iteration + cmdBin = wrapMeshroomBin("meshroom_compute") + cmd = f"{cmdBin} {cmdArgs} --iteration {c.iteration}" + cmd = rezWrapCommand(cmd, otherRezPkg=self.reqPackages) + chunkTask = Task(name=name, command=cmd, metadata=meta, env=self.jobEnv) + job.addTask(chunkTask) + for child in children: + job.addTaskDependency(child, chunkTask) + job.addTaskDependency(chunkTask, parentTask) + + def createTask(self, meshroomFile: str, node) -> CreatedTask: + cmdArgs = f"--node {node.name} \"{meshroomFile}\" --extern" + metadata = {"nodeUid": node._uid} + + if not node._chunksCreated: + cmd = self.getExpandWrappedCmd(cmdArgs, self.reqPackages) + task = Task(name=node.name, command=cmd, metadata=metadata, env=self.jobEnv) + task = CreatedTask(task, None) + + elif node.isParallelized: + _, _, nbBlocks = node.nodeDesc.parallelization.getSizes(node) + iterationsToIgnore = [] + for c in node._chunks: + if c._status.status == Status.SUCCESS: + iterationsToIgnore.append(c.range.iteration) + chunkParams = { + "start": 0, "end": nbBlocks - 1, "step": 1, + "ignoreIterations": iterationsToIgnore, + "chunkCmdArgs": cmdArgs + } + task = Task(name=node.name, command="", metadata=metadata, env=self.jobEnv) + task = CreatedTask(task, chunkParams) + + else: + cmdBin = wrapMeshroomBin("meshroom_compute") + cmd = f"{cmdBin} {cmdArgs} --iteration 0" + cmd = rezWrapCommand(cmd, otherRezPkg=self.reqPackages) + task = Task(name=node.name, command=cmd, metadata=metadata, env=self.jobEnv) + task = CreatedTask(task, None) + + print("Created task : ", task) + + return task + + def buildDependencies(self, job: 
Job, nodeUidToTask: Dict[str, CreatedTask], edges): + """ Gather and create dependencies + First we get all parents and all children for each task + Then for each task : + - we add the dependency to their parent and children + - if the task is a chunked task (which means multi iteration tasks) the we create the + chunk tasks and add dependencies from chunk tasks to children tasks + + # TODO : there's a lot of confusion between nodes and tasks here + # I wrote bad code I'm sorry, I'll do better when I get more sleep + """ + # Gather dependencies + tasksParentsUids = defaultdict(set) + tasksChildrenUids = defaultdict(set) + for u, v in edges: + # tasksParentsUids[v._uid].add(u._uid) + # tasksChildrenUids[u._uid].add(v._uid) + tasksParentsUids[u._uid].add(v._uid) + tasksChildrenUids[v._uid].add(u._uid) + # Create dependencies + for taskUid, createdTask in nodeUidToTask.items(): + parentsTasks = [nodeUidToTask[tuid].task for tuid in tasksParentsUids.get(taskUid, set())] + childrenTasks = [nodeUidToTask[tuid].task for tuid in tasksChildrenUids.get(taskUid, set())] + # Create regular dependencies + for parentTask in parentsTasks: + job.addTaskDependency(createdTask.task, parentTask) + for childTask in childrenTasks: + job.addTaskDependency(childTask, createdTask.task) + # Create chunk tasks if necessary + if createdTask.chunkParams: + self.__createChunkTasks(job, createdTask.task, childrenTasks, createdTask.chunkParams) + + def createJob(self, nodes, edges, filepath, submitLabel="{projectName}") -> LocalFarmJob: + projectName = os.path.splitext(os.path.basename(filepath))[0] + name = submitLabel.format(projectName=projectName) + # Create job + job = Job(name) + # Create tasks + nodeUidToTask: Dict[str, CreatedTask] = {} + for node in nodes: + if node._uid in nodeUidToTask: + continue # HACK: Should not be necessary + createdTask: CreatedTask = self.createTask(filepath, node) + job.addTask(createdTask.task) + nodeUidToTask[node._uid] = createdTask + # Build dependencies + self.buildDependencies(job, nodeUidToTask, edges) + # Submit job + engine = LocalFarmEngine(self.farmPath) + res = job.submit(engine) + print(f"Submitted job : {res}") + if self.dryRun: + return True + if len(res) == 0: + return False + submittedJob = LocalFarmJob(res.get("jid"), LocalFarmSubmitter, farmPath=self.farmPath) + return submittedJob + + def createChunkTask(self, node, graphFile, **kwargs): + """ + Dynamically create chunk tasks for the given node (executed by meshroom_createChunks) + """ + # Retrieve current job/task infos + currentJid, currentTid = int(os.getenv("LOCALFARM_CURRENT_JID")), int(os.getenv("LOCALFARM_CURRENT_TID")) + # Make sure we inherit current MESHROOM_PLUGINS_PATH for submission + # TODO : later we can immplement a proper env inheriting system like what we have in tractor + taskEnv = { + "MESHROOM_PLUGINS_PATH": os.environ.get("MESHROOM_PLUGINS_PATH", "") + } + if self.jobEnv: + taskEnv.update(self.jobEnv) + # Get engine + engine = LocalFarmEngine(self.farmPath) + # Get chunk infos + cmdArgs = f"--node {node.name} \"{graphFile}\" --extern" + _, _, nbBlocks = node.nodeDesc.parallelization.getSizes(node) + if nbBlocks <= 0: + return + chunkRangeParams = {'start': 0, 'end': nbBlocks - 1, 'step': 1} + # Create subtasks + for chunk in self.getChunks(chunkRangeParams): + name = f"{node.name}_{chunk.start}_{chunk.end}" + metadata = {"nodeUid": node._uid, "iteration": chunk.iteration} + cmdBin = wrapMeshroomBin("meshroom_compute") + cmd = f"{cmdBin} {cmdArgs} --iteration {chunk.iteration}" + cmd = 
rezWrapCommand(cmd, otherRezPkg=self.reqPackages) + print("Additional chunk task command: ", cmd) + task = Task(name=name, command=cmd, metadata=metadata, env=taskEnv) + engine.create_additional_task(currentJid, currentTid, task) diff --git a/meshroom/ui/graph.py b/meshroom/ui/graph.py index 5a2bbb86af..b51f3ea273 100644 --- a/meshroom/ui/graph.py +++ b/meshroom/ui/graph.py @@ -8,6 +8,7 @@ from multiprocessing.pool import ThreadPool from typing import Optional, Union from collections.abc import Iterator +from collections import OrderedDict from PySide6.QtCore import ( Slot, @@ -28,6 +29,7 @@ from meshroom.core.graphIO import GraphIO from meshroom.core.taskManager import TaskManager +from meshroom.core.submitter import jobManager, SubmitterOptionsEnum from meshroom.core.node import NodeChunk, Node, Status, ExecMode, CompatibilityNode, Position from meshroom.core import submitters, MrNodeType @@ -87,6 +89,7 @@ def setFiles(self, files): Args: files: the list of files to monitor """ + logging.debug(f"FilesModTimePollerThread: Watch files {files}") with self._mutex: self._files = files @@ -129,68 +132,59 @@ def onFilePollerRefreshChanged(self, value): filePollerRefreshReady = Signal() # The refresh status has been updated and is ready to be used -class ChunksMonitor(QObject): +class NodeStatusMonitor(QObject): """ - ChunksMonitor regularly check NodeChunks' status files for modification and trigger their update on change. + NodeStatusMonitor regularly check status files for modification and trigger their update on change. When working locally, status changes are reflected through the emission of 'statusChanged' signals. But when a graph is being computed externally - either via a Submitter or on another machine, - NodeChunks status files are modified by another instance, potentially outside this machine file system scope. + Status files are modified by another instance, potentially outside this machine file system scope. Same goes when status files are deleted/modified manually. Thus, for genericity, monitoring is based on regular polling and not file system watching. """ - def __init__(self, chunks=(), parent=None): + def __init__(self, parent=None): super().__init__(parent) - self.monitorableChunks = [] - self.monitoredChunks = [] + self.monitorableNodes = [] + self.monitoredFiles = {} # Dict {filepath: node} self._filesTimePoller = FilesModTimePollerThread(parent=self) self._filesTimePoller.timesAvailable.connect(self.compareFilesTimes) self._filesTimePoller.start() - self.setChunks(chunks) - + self.setMonitored([]) self.filePollerRefreshChanged.connect(self._filesTimePoller.onFilePollerRefreshChanged) self._filesTimePoller.filePollerRefreshReady.connect(self.onFilePollerRefreshUpdated) - def setChunks(self, chunks): - """ - Set the lists of chunks that can be monitored and that are monitored. - When the file poller status is set to AUTO_ENABLED, the lists of monitorable and monitored chunks are identical. - """ - self.monitorableChunks = chunks - files, monitoredChunks = self.watchedStatusFiles - self._filesTimePoller.setFiles(files) - self.monitoredChunks = monitoredChunks + def setWatchedFiles(self): + self.monitoredItems = self.getMonitoredFiles() + monitoredFiles = list([f for f in self.monitoredItems.keys()]) + self._filesTimePoller.setFiles(monitoredFiles) + + def setMonitored(self, nodes): + self.monitorableNodes = nodes + self.setWatchedFiles() def stop(self): """ Stop the status files monitoring. 
""" self._filesTimePoller.stop() - @property - def statusFiles(self): - """ Get status file paths from the monitorable chunks. """ - return [c.statusFile for c in self.monitorableChunks] - - @property - def watchedStatusFiles(self): - """ - Get the status file paths from the currently monitored chunks. - Depending on the file poller status, the paths may either be those of all the current chunks, or those from the currently submitted/running chunks. - """ - - files = [] - chunks = [] - if self.filePollerRefresh is PollerRefreshStatus.AUTO_ENABLED.value: - return self.statusFiles, self.monitorableChunks - elif self.filePollerRefresh is PollerRefreshStatus.MINIMAL_ENABLED.value: - for c in self.monitorableChunks: + def getMonitoredFiles(self): + monitoredItems = OrderedDict() + for node in self.monitorableNodes: + if node._chunksCreated: + fileItems = {c.getStatusFile(): ("chunk", c) for c in node._chunks} + else: + fileItems = {node.nodeStatusFile: ("node", node)} + if self.filePollerRefresh is PollerRefreshStatus.AUTO_ENABLED.value: + # Add everything + monitoredItems.update(fileItems) + elif self.filePollerRefresh is PollerRefreshStatus.MINIMAL_ENABLED.value: # Only chunks that are run externally or local_isolated should be monitored, # when run locally, status changes are already notified. # Chunks with an ERROR status may be re-submitted externally and should thus still be monitored - if (c.isExtern() and c._status.status in (Status.SUBMITTED, Status.RUNNING, Status.ERROR)) or ( - (c.node.getMrNodeType() == MrNodeType.NODE) and (c._status.status in (Status.SUBMITTED, Status.RUNNING))): - files.append(c.statusFile) - chunks.append(c) - return files, chunks + for file, (_type, _item) in fileItems.items(): + if not _item.shouldMonitorChanges(): + continue + monitoredItems[file] = (_type, _item) + return monitoredItems def compareFilesTimes(self, times): """ @@ -200,14 +194,27 @@ def compareFilesTimes(self, times): Args: times: the last modification times for currently monitored files. """ - newRecords = dict(zip(self.monitoredChunks, times)) hasChangesAndSuccess = False - for chunk, fileModTime in newRecords.items(): - # update chunk status if last modification time has changed since previous record - if fileModTime != chunk.statusFileLastModTime: - chunk.updateStatusFromCache() - if chunk.status.status == Status.SUCCESS: - hasChangesAndSuccess = True + newRecords = dict(zip(self.monitoredItems.items(), times)) + for monitoredItem, fileModTime in newRecords.items(): + _, (_type, _item) = monitoredItem + if _type == "chunk": + chunk = _item + # update chunk status if last modification time has changed since previous record + if fileModTime != chunk.statusFileLastModTime: + chunk.updateStatusFromCache() + if chunk._status.status == Status.SUCCESS: + hasChangesAndSuccess = True + elif _type == "node": + node = _item + if fileModTime != node.nodeStatusFileLastModTime: + node.updateStatusFromCache() + # Check for success + if node.getGlobalStatus() == Status.SUCCESS: + hasChangesAndSuccess = True + elif node._chunksCreated: + # Chunks have been created -> set the watched files again + self.setWatchedFiles() if hasChangesAndSuccess: chunk.node.loadOutputAttr() @@ -219,9 +226,7 @@ def onFilePollerRefreshUpdated(self): In minimal auto-refresh mode, this includes only the chunks that are submitted or running. 
""" if self.filePollerRefresh is not PollerRefreshStatus.DISABLED.value: - files, chunks = self.watchedStatusFiles - self._filesTimePoller.setFiles(files) - self.monitoredChunks = chunks + self.setWatchedFiles() def onComputeStatusChanged(self): """ @@ -229,9 +234,7 @@ def onComputeStatusChanged(self): file poller status is minimal auto-refresh. """ if self.filePollerRefresh is PollerRefreshStatus.MINIMAL_ENABLED.value: - files, chunks = self.watchedStatusFiles - self._filesTimePoller.setFiles(files) - self.monitoredChunks = chunks + self.setWatchedFiles() filePollerRefreshChanged = Signal(int) filePollerRefresh = Property(int, lambda self: self._filesTimePoller.filePollerRefresh, notify=filePollerRefreshChanged) @@ -371,12 +374,13 @@ def __init__(self, undoStack: commands.UndoStack, taskManager: TaskManager, pare self._graph: Graph = Graph('', self) self._modificationCount = 0 - self._chunksMonitor: ChunksMonitor = ChunksMonitor(parent=self) + self._chunksMonitor: NodeStatusMonitor = NodeStatusMonitor(parent=self) self._computeThread: Thread = Thread() self._computingLocally = self._submitted = False self._sortedDFSChunks: QObjectListModel = QObjectListModel(parent=self) self._layout: GraphLayout = GraphLayout(self) self._selectedNode = None + self._selectedChunk = None self._nodeSelection: QItemSelectionModel = QItemSelectionModel(self._graph.nodes, parent=self) self._hoveredNode = None @@ -398,6 +402,7 @@ def setGraph(self, g): oldGraph.deleteLater() self._graph.updated.connect(self.onGraphUpdated) + self._graph.statusUpdated.connect(self.updateChunkMonitor) self._taskManager.update(self._graph) # update and connect chunks when the graph is set for the first time @@ -425,11 +430,18 @@ def onGraphUpdated(self): def updateChunks(self): dfsNodes = self._graph.dfsOnFinish(None)[0] - chunks = self._graph.getChunks(dfsNodes) - # Nothing has changed, return + chunks = [] + for node in dfsNodes: + if node._chunksCreated: + nodechunks = node.getChunks() + chunks.extend(nodechunks) if self._sortedDFSChunks.objectList() == chunks: + # Nothing has changed, return return for chunk in self._sortedDFSChunks: + if chunk not in chunks: + # Chunk have been already deleted + continue chunk.statusChanged.disconnect(self.updateGraphComputingStatus) chunk.statusChanged.disconnect(self._chunksMonitor.onComputeStatusChanged) self._sortedDFSChunks.setObjectList(chunks) @@ -437,13 +449,19 @@ def updateChunks(self): chunk.statusChanged.connect(self.updateGraphComputingStatus) chunk.statusChanged.connect(self._chunksMonitor.onComputeStatusChanged) # provide ChunkMonitor with the update list of chunks - self.updateChunkMonitor(self._sortedDFSChunks) + self.updateChunkMonitor() # update graph computing status based on the new list of NodeChunks self.updateGraphComputingStatus() - def updateChunkMonitor(self, chunks): + def updateChunkMonitor(self): """ Update the list of chunks for status files monitoring. 
""" - self._chunksMonitor.setChunks(chunks) + nodes = set() + for node in self._graph.dfsOnFinish(None)[0]: + if not node._chunksCreated: + nodes.add(node) + for chunk in self._sortedDFSChunks: + nodes.add(chunk.node) + self._chunksMonitor.setMonitored(list(nodes)) def clear(self): if self._graph: @@ -512,7 +530,7 @@ def _saveAs(self, url, setupProjectFile=True, template=False): self._undoStack.setClean() # saving file on disk impacts cache folder location # => force re-evaluation of monitored status files paths - self.updateChunkMonitor(self._sortedDFSChunks) + self.updateChunkMonitor() @Slot() def saveAsTemp(self): @@ -542,25 +560,28 @@ def execute(self, nodes: Optional[Union[list[Node], Node]] = None): @Slot() def stopExecution(self): + self.updateChunks() if not self.isComputingLocally(): return self._taskManager.requestBlockRestart() self._graph.stopExecution() - self._taskManager._thread.join() + self._taskManager.join() @Slot(Node) def stopNodeComputation(self, node): """ Stop the computation of the node and update all the nodes depending on it. """ + self.updateChunks() if not self.isComputingLocally(): return # Stop the node and wait Task Manager node.stopComputation() - self._taskManager._thread.join() + self._taskManager.join() @Slot(Node) def cancelNodeComputation(self, node): """ Cancel the computation of the node and all the nodes depending on it. """ + self.updateChunks() if node.getGlobalStatus() == Status.SUBMITTED: # Status from SUBMITTED to NONE # Make sure to remove the nodes from the Task Manager list @@ -570,6 +591,222 @@ def cancelNodeComputation(self, node): for n in node.getOutputNodes(recursive=True, dependenciesOnly=True): n.clearSubmittedChunks() self._taskManager.removeNode(n, displayList=True, processList=True) + + def isChunkComputingLocally(self, chunk): + # update graph computing status + computingLocally = chunk._status.execMode == ExecMode.LOCAL and \ + (sessionUid in (chunk.node._nodeStatus.submitterSessionUid, chunk._status.computeSessionUid)) and \ + (chunk._status.status in (Status.RUNNING, Status.SUBMITTED)) + return computingLocally + + def isChunkComputingExternally(self, chunk): + # Note: We do not check computeSessionUid for the submitted status, + # as the source instance of the submit has no importance. + return (chunk._status.execMode == ExecMode.EXTERN) and \ + chunk._status.status in (Status.RUNNING, Status.SUBMITTED) + + @Slot(NodeChunk) + def stopTask(self, chunk: NodeChunk): + """ Stop the selected task """ + chunk.updateStatusFromCache() + if not chunk.isAlreadySubmitted(): + return + node = chunk.node + job = jobManager.getNodeJob(node) + if job: + chunkIteration = chunk.range.iteration + try: + job.stopChunkTask(node, chunkIteration) + except Exception as e: + self.parent().showMessage(f"Failed to stop chunk {chunkIteration} of {node.label}", "error") + logging.warning(f"Error on stopTask:\n{e}") + else: + chunk.updateStatusFromCache() + chunk.upgradeStatusTo(Status.STOPPED) + # TODO : Stop depending nodes ? 
+ self.parent().showMessage(f"Stopped chunk {chunkIteration} of {node.label}") + else: + chunk.stopProcess() + self._taskManager._cancelledChunks.append(chunk) + for chunk in node._chunks: + if chunk._status.status == Status.SUBMITTED: + chunk.stopProcess() + self._taskManager._cancelledChunks.append(chunk) + for n in node.getOutputNodes(recursive=True, dependenciesOnly=True): + n.clearSubmittedChunks() + self._taskManager.removeNode(n, displayList=True, processList=True) + + @Slot(Node) + def stopNode(self, node: Node): + """ Stop the selected task """ + job = jobManager.getNodeJob(node) + if job: + try: + job.stopChunkTask(node, -1) + except Exception as e: + self.parent().showMessage(f"Failed to stop node {node.label}", "error") + logging.warning(f"Error on stopTask:\n{e}") + else: + node.updateNodeStatusFromCache() + node.upgradeStatusTo(Status.STOPPED) + # TODO : Stop depending nodes ? + self.parent().showMessage(f"Stopped node {node.label}") + else: + self.cancelNodeComputation(node) + node.stopComputation() + + @Slot(NodeChunk) + def restartTask(self, chunk: NodeChunk): + """ Relaunch a stopped task """ + node = chunk.node + job = jobManager.getNodeJob(node) + if job: + chunkIteration = chunk.range.iteration + try: + chunk.updateStatusFromCache() + chunk.upgradeStatusTo(Status.SUBMITTED) + job.restartChunkTask(node, chunkIteration) + except Exception as e: + chunk.updateStatusFromCache() + chunk.upgradeStatusTo(Status.ERROR) + self.parent().showMessage(f"Failed to relaunch chunk {chunkIteration} of {node.label}", "error") + logging.warning(f"Error on restartTask:\n{e}") + else: + self.parent().showMessage(f"Relaunched chunk {chunkIteration} of {node.label}") + else: + # For this we would need to use a pool (with either chunks or nodes) + # instead of the list of nodes that are processed serially + self.parent().showMessage(f"Chunks cannot be launched individually locally", "warning") + if self.canComputeNode(node): + self.execute([node]) + + @Slot(NodeChunk) + def skipTask(self, chunk: NodeChunk): + """ Skip the task : the job will continue as if the task succeeded + In local mode, the chunk status will be set to success + """ + chunk.updateStatusFromCache() + node = chunk.node + chunkIteration = chunk.range.iteration + job = jobManager.getNodeJob(node) + if job: + try: + job.skipChunkTask(node, chunkIteration) + except Exception as e: + self.parent().showMessage(f"Failed to skip chunk {chunkIteration} of {node.label}", "error") + logging.warning(f"Error on skipTask:\n{e}") + else: + chunk.upgradeStatusTo(Status.SUCCESS) + self.parent().showMessage(f"Skipped chunk {chunkIteration} of {node.label}") + else: + chunk.stopProcess() + chunk.upgradeStatusTo(Status.SUCCESS) + self._taskManager._cancelledChunks.append(chunk) + self.parent().showMessage(f"Skipped chunk {chunkIteration} of {node.label}") + + @Slot(Node) + def pauseJob(self, node: Node): + """ Pause the running job : cancel all scheduled tasks. 
+ Current task don't stop but future tasks won't be launched + """ + job = jobManager.getNodeJob(node) + if job: + try: + job.pauseJob() + except Exception as e: + logging.warning(f"Error on pauseJob:\n{e}") + self.parent().showMessage(f"Failed to pause the job for node {node}", "error") + else: + self.parent().showMessage(f"Paused node {node.label} on farm") + elif not node.isExtern(): + self.parent().showMessage(f"PauseJob is only available in external computation mode!", "warning") + else: + self.parent().showMessage(f"Cannot retrieve the job", "error") + + @Slot(Node) + def resumeJob(self, node: Node): + """ Resume the paused job + """ + job = jobManager.getNodeJob(node) + if job: + # Node is submitted to farm + try: + job.resumeJob() + except Exception as e: + self.parent().showMessage(f"Failed to resume node {node.label} on farm") + logging.warning(f"Error on resumeJob:\n{e}") + else: + self.parent().showMessage(f"Resumed the job for node {node}") + else: + # In this case user can just relaunch the node computation + # Could be implemented if we had a paused state on the task manager + # Where unprocessed nodes are retained + pass + + @Slot(Node) + def interruptJob(self, node: Node): + """ Interrupt the job that processes the node + """ + job = jobManager.getNodeJob(node) + if job: + try: + job.interruptJob() + except Exception as e: + self.parent().showMessage(f"Failed to interrupt node {node.label} on farm", "error") + logging.warning(f"Error on interruptJob:\n{e}") + else: + for chunk in self._sortedDFSChunks: + if jobManager.getNodeJob(chunk.node) == job: + if chunk._status.status in (Status.SUBMITTED, Status.RUNNING): + chunk.updateStatusFromCache() + chunk.upgradeStatusTo(Status.STOPPED) + for _node in self._graph.dfsOnFinish(None)[0]: + if jobManager.getNodeJob(_node) == job and not _node._chunksCreated and \ + _node._nodeStatus.status in (Status.SUBMITTED, Status.RUNNING): + _node.upgradeStatusTo(Status.STOPPED) + self.parent().showMessage(f"Interrupted the job for node {node}") + elif not node.isExtern(): + for chunk in self._sortedDFSChunks: + if not chunk.isExtern() and chunk._status.status in (Status.SUBMITTED, Status.RUNNING): + chunk.updateStatusFromCache() + chunk.upgradeStatusTo(Status.STOPPED) + for node in self._graph.dfsOnFinish(None)[0]: + if not node.isExtern() and not node._chunksCreated and \ + node._nodeStatus.status in (Status.SUBMITTED, Status.RUNNING): + node.upgradeStatusTo(Status.STOPPED) + self.stopExecution() + self.parent().showMessage(f"Stopped the local job process") + else: + self.parent().showMessage(f"Could not retrieve job for node {node}", "error") + + @Slot(Node) + def restartJobErrorTasks(self, node: Node): + """ Restart all tasks in the job that have failed + """ + job = jobManager.getNodeJob(node) + if job: + try: + # Fist update status of each chunk to submitted + for chunk in self._sortedDFSChunks: + if chunk._status.status not in (Status.ERROR, Status.STOPPED, Status.KILLED): + continue + if jobManager.getNodeJob(chunk.node) == job: + chunk.upgradeStatusTo(Status.SUBMITTED) + for node in self._graph.dfsOnFinish(None)[0]: + if not node._chunksCreated and node._nodeStatus.status in (Status.ERROR, Status.STOPPED, Status.KILLED): + node.upgradeStatusTo(Status.SUBMITTED) + job.restartErrorTasks() + job.resumeJob() + except Exception as e: + self.parent().showMessage(f"Failed to restart error tasks for node {node.label} on farm", "error") + logging.warning(f"Error on restartJobErrorTasks:\n{e}") + else: + self.parent().showMessage(f"Restarted 
error tasks for the node {node}") + else: + # In this case user can just relaunch the node computation + # Could be implemented if we had a paused state on the task manager + # Where error/failed nodes are retained + pass @Slot() @Slot(Node) @@ -587,18 +824,37 @@ def submit(self, nodes: Optional[Union[list[Node], Node]] = None): nodes = [nodes] if not isinstance(nodes, Iterable) and nodes else nodes mrDefaultSubmitter = os.environ.get('MESHROOM_DEFAULT_SUBMITTER', '') chosenSubmitter = self.parent()._defaultSubmitterName or mrDefaultSubmitter + self.parent().showMessage(f"Submit job on farm through {chosenSubmitter}") + self.parent().showMessage(f"Nodes to submit : {nodes}") self._taskManager.submit(self._graph, chosenSubmitter, nodes, submitLabel=self.submitLabel) def updateGraphComputingStatus(self): + dfsNodes = self._graph.dfsOnFinish(None)[0] + # TODO : these functions should go on the node part + # We should do any([node.isRunning for node in dfsNodes]) + # update graph computing status computingLocally = any([ - ch.status.execMode == ExecMode.LOCAL and - (sessionUid in (ch.status.submitterSessionUid, ch.status.sessionUid)) and ( - ch.status.status in (Status.RUNNING, Status.SUBMITTED)) - for ch in self._sortedDFSChunks]) - # Note: We do not check sessionUid for the submitted status, + ch._status.execMode == ExecMode.LOCAL and \ + (sessionUid in (ch.node._nodeStatus.submitterSessionUid, ch._status.computeSessionUid)) and \ + (ch._status.status in (Status.RUNNING, Status.SUBMITTED)) + for ch in self._sortedDFSChunks + ]) + # Note: We do not check computeSessionUid for the submitted status, # as the source instance of the submit has no importance. - submitted = any([ch.status.execMode == ExecMode.EXTERN and ch.status.status in (Status.RUNNING, Status.SUBMITTED) for ch in self._sortedDFSChunks]) + submitted = any([ch._status.execMode == ExecMode.EXTERN and ch._status.status in (Status.RUNNING, Status.SUBMITTED) for ch in self._sortedDFSChunks]) + + # Handle nodes with uninitialized chunks + for node in dfsNodes: + if node._chunksCreated: + continue + if node._nodeStatus.status in (Status.RUNNING, Status.SUBMITTED): + # TODO : save session ID in node + if (node._nodeStatus.execMode == ExecMode.LOCAL): + computingLocally = True + elif (node._nodeStatus.execMode == ExecMode.EXTERN): + submitted = True + if self._computingLocally != computingLocally or self._submitted != submitted: self._computingLocally = computingLocally self._submitted = submitted @@ -737,11 +993,6 @@ def alignVertically(self): for selectedNode in selectedNodes: self.moveNode(selectedNode, Position(meanX, selectedNode.y)) - @Slot() - def removeSelectedNodes(self): - """Remove selected nodes from the graph.""" - self.removeNodes(list(self.iterSelectedNodes())) - @Slot(list) def removeNodes(self, nodes: list[Node]): """ @@ -757,6 +1008,11 @@ def removeNodes(self, nodes: list[Node]): for node in nodes: self.push(commands.RemoveNodeCommand(self._graph, node)) + @Slot() + def removeSelectedNodes(self): + """Remove selected nodes from the graph.""" + self.removeNodes(list(self.iterSelectedNodes())) + @Slot(list) def removeNodesFrom(self, nodes: list[Node]): """ @@ -1244,6 +1500,9 @@ def canSubmitNode(self, node: Node) -> bool: selectedNodeChanged = Signal() # Current main selected node selectedNode = makeProperty(QObject, "_selectedNode", selectedNodeChanged, resetOnDestroy=True) + # Current chunk selected (used to send signals from TaskManager to ChunksListView) + selectedChunkChanged = Signal() + selectedChunk = 
makeProperty(QObject, "_selectedChunk", selectedChunkChanged, resetOnDestroy=True) nodeSelection = makeProperty(QObject, "_nodeSelection") diff --git a/meshroom/ui/qml/Controls/NodeActions.qml b/meshroom/ui/qml/Controls/NodeActions.qml index 26c199b988..b52764ad8a 100644 --- a/meshroom/ui/qml/Controls/NodeActions.qml +++ b/meshroom/ui/qml/Controls/NodeActions.qml @@ -80,6 +80,9 @@ Item { // Position header above the node (fixed offset in screen pixels) x = nodeScreenX + (selectedNodeDelegate.width * draggable.scale - width) / 2 y = nodeScreenY - height - headerOffset + if (y < 0) { + y = 0 + } } onWidthChanged: { @@ -210,13 +213,13 @@ Item { ToolTip.delay: 1000 visible: actionHeader.computeButtonState != NodeActions.ButtonState.DELETABLE enabled: actionHeader.computeButtonState % 2 == 1 // Launchable & Stoppable + // Icon color + textColor: (!enabled && actionHeader.nodeSubmitted) ? Colors.statusColors["SUBMITTED"] : (checked ? palette.highlight : palette.text) + // Background color background: Rectangle { color: { - if (!computeButton.enabled) { - if (actionHeader.nodeSubmitted) - return Qt.darker(Colors.statusColors["SUBMITTED"], 1.2) + if (!computeButton.enabled) return activePalette.button - } if (actionHeader.computeButtonState == NodeActions.ButtonState.STOPPABLE) return computeButton.hovered ? Colors.orange : Qt.darker(Colors.orange, 1.3) return computeButton.hovered ? activePalette.highlight : activePalette.button @@ -272,13 +275,13 @@ Item { ToolTip.delay: 1000 visible: root.uigraph ? root.uigraph.canSubmit : false enabled: actionHeader.submitButtonState != NodeActions.ButtonState.DISABLED + // Icon color + textColor: (!enabled && actionHeader.nodeSubmitted) ? Colors.statusColors["SUBMITTED"] : (checked ? palette.highlight : palette.text) + // Background color background: Rectangle { color: { - if (!submitButton.enabled) { - if (actionHeader.nodeSubmitted) - return Qt.darker(Colors.statusColors["SUBMITTED"], 1.2) + if (!submitButton.enabled) return activePalette.button - } return submitButton.hovered ? activePalette.highlight : activePalette.button } opacity: submitButton.hovered ? 1 : root._opacity diff --git a/meshroom/ui/qml/GraphEditor/ChunksListView.qml b/meshroom/ui/qml/GraphEditor/ChunksListView.qml index 1afc3182b9..6ddd6d2c26 100644 --- a/meshroom/ui/qml/GraphEditor/ChunksListView.qml +++ b/meshroom/ui/qml/GraphEditor/ChunksListView.qml @@ -5,11 +5,13 @@ import QtQuick.Layouts import Utils 1.0 /** - * ChunkListView + * ChunksListView */ ColumnLayout { id: root + + property var uigraph: null property variant chunks property int currentIndex: 0 property variant currentChunk: (chunks && currentIndex >= 0) ? chunks.at(currentIndex) : undefined @@ -90,4 +92,17 @@ ColumnLayout { } } } + + Connections { + target: _reconstruction + function onSelectedChunkChanged() { + for (var i = 0; i < root.chunks.count; i++) { + if (_reconstruction.selectedChunk === root.chunks.at(i)) { + root.currentIndex = i + break; + } + } + } + ignoreUnknownSignals: true + } } diff --git a/meshroom/ui/qml/GraphEditor/Node.qml b/meshroom/ui/qml/GraphEditor/Node.qml index 438aad0535..cbe02059c8 100755 --- a/meshroom/ui/qml/GraphEditor/Node.qml +++ b/meshroom/ui/qml/GraphEditor/Node.qml @@ -496,10 +496,11 @@ Item { // Node Chunks NodeChunks { visible: node.isComputableType + targetNode: node defaultColor: Colors.sysPalette.mid implicitHeight: 3 width: parent.width - model: node ? node.chunks : undefined + model: node && node.chunksCreated ? 
node.chunks : undefined Rectangle { anchors.fill: parent diff --git a/meshroom/ui/qml/GraphEditor/NodeChunks.qml b/meshroom/ui/qml/GraphEditor/NodeChunks.qml index 5505da7c38..5048a1cadc 100644 --- a/meshroom/ui/qml/GraphEditor/NodeChunks.qml +++ b/meshroom/ui/qml/GraphEditor/NodeChunks.qml @@ -9,13 +9,15 @@ ListView { SystemPalette { id: activePalette } + property var targetNode: null + property color defaultColor: Qt.darker(activePalette.window, 1.1) property real chunkHeight: height - property bool modelIsBig: (3 * model.count >= width) + property int modelSize: model ? model.count : 0 + property bool modelIsBig: (3 * modelSize >= width) property real chunkWidth: { - if (!model || model.count == 0) - return 0 - return (width / model.count) - spacing + if (modelSize == 0) return 0 + return (width / modelSize) - spacing } orientation: ListView.Horizontal @@ -28,7 +30,7 @@ ListView { width: root.chunkWidth property var chunkColor: Colors.getChunkColor(object, { "NONE": root.defaultColor }) color: { - if (!highlightChunks || model.count == 1) + if (!highlightChunks || modelSize == 1) return chunkColor if (index % 2 == 0) return Qt.lighter(chunkColor, 1.1) @@ -36,4 +38,16 @@ ListView { return Qt.darker(chunkColor, 1.1) } } + + // Default rectangle shown when model is empty/undefined (= no chunks) + Rectangle { + anchors.fill: parent + color: root.targetNode !== null + ? (root.targetNode.globalStatus === "NONE" + ? Colors.darkpurple + : Colors.getNodeColor(root.targetNode, { "NONE": root.defaultColor })) + : "transparent" + enabled: modelSize == 0 + visible: enabled + } } diff --git a/meshroom/ui/qml/GraphEditor/NodeEditor.qml b/meshroom/ui/qml/GraphEditor/NodeEditor.qml index db824eebdb..9a4579cfd4 100644 --- a/meshroom/ui/qml/GraphEditor/NodeEditor.qml +++ b/meshroom/ui/qml/GraphEditor/NodeEditor.qml @@ -284,8 +284,9 @@ Panel { // The list of chunks ChunksListView { id: chunksLV - visible: (tabBar.currentIndex >= 1 && tabBar.currentIndex <= 3) - chunks: root.node.chunks + enabled: root.node ? root.node.chunksCreated : false + chunks: root.node ? root.node.chunks : null + visible: enabled && (tabBar.currentIndex >= 1 && tabBar.currentIndex <= 3) SplitView.preferredWidth: 55 SplitView.minimumWidth: 20 } diff --git a/meshroom/ui/qml/GraphEditor/TaskManager.qml b/meshroom/ui/qml/GraphEditor/TaskManager.qml index aa3bbc79aa..96ab531cb0 100644 --- a/meshroom/ui/qml/GraphEditor/TaskManager.qml +++ b/meshroom/ui/qml/GraphEditor/TaskManager.qml @@ -2,6 +2,7 @@ import QtQuick import QtQuick.Controls import QtQuick.Layouts +import MaterialIcons 2.2 import Controls 1.0 import Utils 1.0 @@ -22,10 +23,20 @@ Item { property color tableBorder: Colors.sysPalette.window property int borderWidth: 3 + // Max wifth for some columns + readonly property int maxExecWidth: 200 + + property var selectedChunk: null + function selectNode(node) { uigraph.selectedNode = node } + function selectChunk(chunk) { + root.selectedChunk = chunk + uigraph.selectedChunk = chunk + } + TextMetrics { id: nbMetrics text: root.taskManager ? root.taskManager.nodes.count : "0" @@ -51,222 +62,424 @@ Item { text: "Progress" } - ListView { - id: taskList + RowLayout { anchors.fill: parent - ScrollBar.vertical: MScrollBar {} - model: parent.taskManager ? 
parent.taskManager.nodes : null - spacing: 3 + ColumnLayout { + Layout.alignment: Qt.AlignLeft | Qt.AlignTop + width: childrenRect.width + spacing: 8 - headerPositioning: ListView.OverlayHeader + // TODO : enable/disable buttons depending on selectedChunk + // TODO : Also handle case where uigraph.selectedNode and selectedNode.chunksCreated==false - header: RowLayout { - height: 30 - spacing: 3 + // Task toolbar + Rectangle { + Layout.preferredWidth: 40 + Layout.preferredHeight: taskColumn.height + 8 + color: "transparent" + border.color: Colors.darkpurple + border.width: 2 + radius: 8 - width: parent.width + ColumnLayout { + id: taskColumn + anchors.centerIn: parent + spacing: 2 - z: 2 + MaterialToolButton { + ToolTip.text: "Stop Task" + Layout.alignment: Qt.AlignHCenter + enabled: selectedChunk !== null || root.uigraph.selectedNode !== null + text: MaterialIcons.stop_circle + font.pointSize: 15 + onClicked: { + if (selectedChunk !== null) { + root.uigraph.stopTask(selectedChunk) + } else { + root.uigraph.stopNode(root.uigraph.selectedNode) + } + } + } - Label { - text: qsTr("Nb") - Layout.preferredWidth: nbMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - background: Rectangle { - color: headBgColor - } - } - Label { - text: qsTr("Node") - Layout.preferredWidth: 250 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - background: Rectangle { - color: headBgColor - } - } - Label { - text: qsTr("State") - Layout.preferredWidth: statusMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - background: Rectangle { - color: headBgColor - } - } - Label { - text: qsTr("Chunks Done") - Layout.preferredWidth: chunksMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - background: Rectangle { - color: headBgColor - } - } - Label { - text: qsTr("Exec Mode") - Layout.preferredWidth: execMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - background: Rectangle { - color: headBgColor + MaterialToolButton { + ToolTip.text: "Restart Task" + Layout.alignment: Qt.AlignHCenter + enabled: selectedChunk !== null + text: MaterialIcons.replay_circle_filled + font.pointSize: 15 + onClicked: { + uigraph.restartTask(selectedChunk) + } + } + + MaterialToolButton { + ToolTip.text: "Skip Task" + Layout.alignment: Qt.AlignHCenter + enabled: selectedChunk !== null + text: MaterialIcons.skip_next + font.pointSize: 15 + onClicked: { + uigraph.skipTask(selectedChunk) + } + } + + Item { + Layout.preferredWidth: 40 + Layout.preferredHeight: 50 + + Text { + text: "TASK" + anchors.centerIn: parent + color: Colors.sysPalette.text + font.pixelSize: 11 + font.bold: true + rotation: -90 + transformOrigin: Item.Center + } + } } } - Label { - text: qsTr("Progress") - Layout.fillWidth: true - Layout.minimumWidth: progressMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - background: Rectangle { - color: headBgColor + + // Job toolbar + Rectangle { + Layout.preferredWidth: 40 + Layout.preferredHeight: jobColumn.height + 8 + color: "transparent" + border.color: Colors.darkpurple + border.width: 2 + radius: 8 + + 
ColumnLayout { + id: jobColumn + anchors.centerIn: parent + spacing: 2 + + MaterialToolButton { + ToolTip.text: "Pause Job" + Layout.alignment: Qt.AlignHCenter + enabled: root.uigraph.selectedNode !== null + text: MaterialIcons.pause_circle_filled + font.pointSize: 15 + onClicked: { + uigraph.pauseJob(uigraph.selectedNode) + } + } + + MaterialToolButton { + ToolTip.text: "Resume Job" + Layout.alignment: Qt.AlignHCenter + enabled: root.uigraph.selectedNode !== null + text: MaterialIcons.play_circle_filled + font.pointSize: 15 + onClicked: { + uigraph.resumeJob(uigraph.selectedNode) + } + } + + MaterialToolButton { + ToolTip.text: "Interrupt Job" + Layout.alignment: Qt.AlignHCenter + enabled: root.uigraph.selectedNode !== null + text: MaterialIcons.stop_circle + font.pointSize: 15 + onClicked: { + uigraph.interruptJob(uigraph.selectedNode) + } + } + + MaterialToolButton { + ToolTip.text: "Restart All Error Tasks" + Layout.alignment: Qt.AlignHCenter + enabled: root.uigraph.selectedNode !== null + text: MaterialIcons.replay_circle_filled + font.pointSize: 15 + onClicked: { + uigraph.restartJobErrorTasks(uigraph.selectedNode) + } + } + + Item { + Layout.preferredWidth: 40 + Layout.preferredHeight: 40 + + Text { + text: "JOB" + anchors.centerIn: parent + color: Colors.sysPalette.text + font.pixelSize: 11 + font.bold: true + rotation: -90 + transformOrigin: Item.Center + } + } } } } - delegate: RowLayout { - width: ListView.view.width - height: 18 + ListView { + id: taskList + Layout.alignment: Qt.AlignLeft | Qt.AlignTop + Layout.fillWidth: true + Layout.fillHeight: true + ScrollBar.vertical: MScrollBar {} + + model: root.taskManager ? root.taskManager.nodes : null spacing: 3 - function getNbFinishedChunks(chunks) { - var nbSuccess = 0 - for (var i = 0; i < chunks.count; i++) { - if (chunks.at(i).statusName === "SUCCESS") { - nbSuccess += 1 + headerPositioning: ListView.OverlayHeader + + header: RowLayout { + height: 30 + spacing: 3 + + width: parent.width + + z: 2 + + Label { + text: qsTr("Nb") + Layout.preferredWidth: nbMetrics.width + 20 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + background: Rectangle { + color: headBgColor } } - return nbSuccess - } - - Label { - text: index + 1 - Layout.preferredWidth: nbMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text - background: Rectangle { - color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + Label { + text: qsTr("Node") + Layout.preferredWidth: 200 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + background: Rectangle { + color: headBgColor + } } - - MouseArea { - anchors.fill: parent - onPressed: { - selectNode(object) + Label { + text: qsTr("State") + Layout.preferredWidth: statusMetrics.width + 20 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + background: Rectangle { + color: headBgColor } } - } - Label { - text: object.label - Layout.preferredWidth: 250 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - color: object === uigraph.selectedNode ? 
Colors.sysPalette.window : Colors.sysPalette.text - background: Rectangle { - color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + Label { + text: qsTr("Chunks Done") + Layout.preferredWidth: chunksMetrics.width + 20 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + background: Rectangle { + color: headBgColor + } } - - MouseArea { - anchors.fill: parent - onPressed: { - selectNode(object) + Label { + text: qsTr("Exec Mode") + Layout.preferredWidth: execMetrics.width + 60 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + background: Rectangle { + color: headBgColor + } + } + Label { + text: qsTr("Progress") + Layout.fillWidth: true + Layout.minimumWidth: progressMetrics.width + 20 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + background: Rectangle { + color: headBgColor } } } - Label { - text: object.globalStatus - Layout.preferredWidth: statusMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text - background: Rectangle { - color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + + delegate: RowLayout { + width: ListView.view.width + height: 18 + spacing: 3 + + function getNbFinishedChunks(chunks) { + var nbSuccess = 0 + for (var i = 0; i < chunks.count; i++) { + if (chunks.at(i).statusName === "SUCCESS") { + nbSuccess += 1 + } + } + return nbSuccess } - MouseArea { - anchors.fill: parent - onPressed: { - selectNode(object) + Label { + text: index + 1 + Layout.preferredWidth: nbMetrics.width + 20 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text + background: Rectangle { + color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + } + + MouseArea { + anchors.fill: parent + onPressed: { + selectNode(object) + } } } - } - Label { - text: getNbFinishedChunks(object.chunks) + "/" + object.chunks.count - Layout.preferredWidth: chunksMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text - background: Rectangle { - color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + Label { + text: object.label + elide: Text.ElideRight + Layout.preferredWidth: 200 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text + background: Rectangle { + color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + } + + MouseArea { + anchors.fill: parent + acceptedButtons: Qt.LeftButton | Qt.RightButton + onPressed: (mouse) => { + if (mouse.button === Qt.LeftButton) { + selectNode(object) + } else if (mouse.button === Qt.RightButton) { + contextMenu.popup() + } + } + Menu { + id: contextMenu + MenuItem { + text: "Open Folder" + height: visible ? 
implicitHeight : 0 + onTriggered: Qt.openUrlExternally(Filepath.stringToUrl(object.internalFolder)) + } + } + } } + Label { + text: object.globalStatus + Layout.preferredWidth: statusMetrics.width + 20 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text + background: Rectangle { + color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + } - MouseArea { - anchors.fill: parent - onPressed: { - selectNode(object) + MouseArea { + anchors.fill: parent + onPressed: { + selectNode(object) + } } } - } - Label { - text: object.globalExecMode - Layout.preferredWidth: execMetrics.width + 20 - Layout.preferredHeight: parent.height - horizontalAlignment: Label.AlignHCenter - verticalAlignment: Label.AlignVCenter - color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text - background: Rectangle { - color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + Label { + text: getNbFinishedChunks(object.chunks) + "/" + object.chunks.count + Layout.preferredWidth: chunksMetrics.width + 20 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text + background: Rectangle { + color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + } + + MouseArea { + anchors.fill: parent + onPressed: { + selectNode(object) + } + } } + Label { + text: object.jobName + elide: Text.ElideRight + Layout.preferredWidth: execMetrics.width + 60 + Layout.preferredHeight: parent.height + horizontalAlignment: Label.AlignHCenter + verticalAlignment: Label.AlignVCenter + color: object === uigraph.selectedNode ? Colors.sysPalette.window : Colors.sysPalette.text + background: Rectangle { + color: object === uigraph.selectedNode ? Colors.sysPalette.text : bgColor + } - MouseArea { - anchors.fill: parent - onPressed: { - selectNode(object) + MouseArea { + anchors.fill: parent + onPressed: { + selectNode(object) + } } } - } - Item { - Layout.fillWidth: true - Layout.minimumWidth: progressMetrics.width + 20 - Layout.preferredHeight: parent.height - - ListView { - id: chunkList - width: parent.width - height: parent.height - orientation: ListView.Horizontal - model: object.chunks - property var node: object - - spacing: 3 - - delegate: Label { - width: ListView.view.model ? (ListView.view.width / ListView.view.model.count) - 3 : 0 - height: ListView.view.height - anchors.verticalCenter: parent.verticalCenter - background: Rectangle { - color: Colors.getChunkColor(object, {"NONE": bgColor}) - radius: 3 - border.width: 2 - border.color: chunkList.node === uigraph.selectedNode ? Colors.sysPalette.text : Colors.getChunkColor(object, {"NONE": bgColor}) + Item { + Layout.fillWidth: true + Layout.minimumWidth: progressMetrics.width + 20 + Layout.preferredHeight: parent.height + + ListView { + id: chunkList + width: parent.width + height: parent.height + orientation: ListView.Horizontal + model: object.chunks + property var node: object + + spacing: 3 + + delegate: Loader { + id: chunkDelegate + width: ListView.view.model + ? 
(ListView.view.width - (ListView.view.model.count - 1) * chunkList.spacing) / ListView.view.model.count + : 0 + + height: ListView.view.height + + sourceComponent: Label { + anchors.fill: parent + background: Rectangle { + color: Colors.getChunkColor(object, {"NONE": bgColor}) + radius: 3 + border.width: 2 + border.color: (root.selectedChunk == object) ? Qt.darker(color, 1.3) : "transparent" + } + + MouseArea { + anchors.fill: parent + onPressed: { + selectNode(chunkList.node) + selectChunk(object) + } + } + } } - MouseArea { + // Placeholder for uninitialized chunks + Label { + enabled: chunkList.model.count == 0 + visible: enabled anchors.fill: parent - onPressed: { - selectNode(chunkList.node) + background: Rectangle { + color: Colors.getNodeColor(chunkList.node, {"NONE": Colors.darkpurple}) + radius: 3 + border.width: 2 + border.color: (chunkList.node === uigraph.selectedNode) ? Qt.lighter(color, 1.3) : "transparent" + } + + MouseArea { + anchors.fill: parent + onPressed: { + selectNode(chunkList.node) + selectChunk(null) + } } } } diff --git a/meshroom/ui/qml/MaterialIcons/MaterialToolButton.qml b/meshroom/ui/qml/MaterialIcons/MaterialToolButton.qml index 8378b20ee9..7ead423c1d 100644 --- a/meshroom/ui/qml/MaterialIcons/MaterialToolButton.qml +++ b/meshroom/ui/qml/MaterialIcons/MaterialToolButton.qml @@ -14,9 +14,13 @@ ToolButton { font.pointSize: 13 ToolTip.visible: ToolTip.text && hovered ToolTip.delay: 100 + + property color textColor: checked ? palette.highlight : palette.text + Component.onCompleted: { - contentItem.color = Qt.binding(function() { return checked ? palette.highlight : palette.text }) + contentItem.color = Qt.binding(function() { return textColor }) } + background: Rectangle { color: { if (enabled && (pressed || checked || hovered)) { diff --git a/meshroom/ui/qml/Utils/Colors.qml b/meshroom/ui/qml/Utils/Colors.qml index af27c91ce5..749ce63dfa 100644 --- a/meshroom/ui/qml/Utils/Colors.qml +++ b/meshroom/ui/qml/Utils/Colors.qml @@ -20,6 +20,7 @@ QtObject { readonly property color lime: "#CDDC39" readonly property color grey: "#555555" readonly property color lightgrey: "#999999" + readonly property color darkpurple: "#5c4885" readonly property var statusColors: { "NONE": "transparent", @@ -27,7 +28,8 @@ QtObject { "RUNNING": orange, "ERROR": red, "SUCCESS": green, - "STOPPED": pink + "STOPPED": pink, + "INPUT": "transparent" } readonly property var ghostColors: { @@ -62,6 +64,22 @@ QtObject { console.warn("Unknown status : " + chunk.status) return "magenta" } + + function getNodeColor(node, overrides) { + if (node === undefined) + return "transparent" + if (overrides && node.globalStatus in overrides) { + return overrides[node.globalStatus] + } else if (node.globalExecMode === "EXTERN" && node.globalStatus in statusColorsExternOverrides) { + return statusColorsExternOverrides[node.globalStatus] + } else if (node.name !== node.nodeStatusNodeName && node.globalStatus in ghostColors) { + return ghostColors[node.globalStatus] + } else if (node.globalStatus in statusColors) { + return statusColors[node.globalStatus] + } + console.warn("Unknown status : " + node.globalStatus) + return "magenta" + } function toRgb(color) { return [ diff --git a/meshroom/ui/reconstruction.py b/meshroom/ui/reconstruction.py index 770cf9c518..10ca798b44 100755 --- a/meshroom/ui/reconstruction.py +++ b/meshroom/ui/reconstruction.py @@ -1031,7 +1031,8 @@ def _setSfm(self, node): # disconnection step in 'setSfm' (at this point, 'self._sfm' underlying object # has been destroyed and can't be 
evaluated anymore) self._sfm.destroyed.connect(self._unsetSfm) - self._sfm.chunks[0].statusChanged.connect(self.updateSfMResults) + if len(self._sfm._chunks) > 0: + self._sfm.chunks[0].statusChanged.connect(self.updateSfMResults) self.sfmChanged.emit() def setSfm(self, node): diff --git a/tests/plugins/meshroom/pluginSubmitter/PluginSubmitter.py b/tests/plugins/meshroom/pluginSubmitter/PluginSubmitter.py new file mode 100644 index 0000000000..c438bc487a --- /dev/null +++ b/tests/plugins/meshroom/pluginSubmitter/PluginSubmitter.py @@ -0,0 +1,54 @@ +__version__ = "1.0" + + +import logging +from meshroom.core import desc + + +LOGGER = logging.getLogger("TestSubmit") + + +class PluginSubmitterA(desc.BaseNode): + """ + Test process no parallelization + """ + parallelization = None + + inputs = [ + desc.IntParam( + name="input", + label="Input", + description="input", + value=1, + ), + ] + outputs = [ + desc.IntParam( + name="output", + label="Output", + description="Output", + value=None, + ), + ] + + def processChunk(self, chunk): + iteration = chunk.range.iteration + nbBlocks = chunk.range.nbBlocks + LOGGER.info(f"> Process chunk {iteration}/{nbBlocks}") + LOGGER.info(f"> Done") + + +class PluginSubmitterB(PluginSubmitterA): + """ + Test process with parallelization adn static node size + """ + size = desc.StaticNodeSize(2) + parallelization = desc.Parallelization(blockSize=1) + + +class PluginSubmitterC(PluginSubmitterA): + """ + Test process with parallelization and dynamic node size + """ + size = desc.DynamicNodeSize("input") + parallelization = desc.Parallelization(blockSize=1) diff --git a/tests/plugins/meshroom/pluginSubmitter/__init__.py b/tests/plugins/meshroom/pluginSubmitter/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_submit.py b/tests/test_submit.py new file mode 100644 index 0000000000..dfcdbdbde0 --- /dev/null +++ b/tests/test_submit.py @@ -0,0 +1,151 @@ +# coding:utf-8 + +""" +This test aims to replicate toe process on node submission +""" + +import os +import time +from sys import platform + +from .utils import registerNodeDesc + +import meshroom +from meshroom.core import pluginManager, loadClassesNodes +from meshroom.core.graph import Graph +from meshroom.core.plugins import Plugin +from meshroom.core.node import Node, Status +from meshroom.core.submitter import jobManager +from meshroom.submitters.localFarmSubmitter import LocalFarmSubmitter, LocalFarmJob + +from localfarm.localFarmLauncher import FarmLauncher + + +IS_LINUX = (platform == "linux" or platform == "linux2") + + +def get_submitter() -> LocalFarmSubmitter: + for sName, s in meshroom.core.submitters.items(): + if sName == "LocalFarm": + return s + raise RuntimeError("LocalFarm submitter not found") + + +def getJobEnv(): + """ Required to have meshroom recognize plugins that were created here """ + pluginFolder = os.path.join(os.path.dirname(__file__), "plugins") + return { + "MESHROOM_PLUGINS_PATH": pluginFolder + } + + +def waitForNodeCompletion(job: LocalFarmJob, node: Node, timeout=25): + """ + Wait for a node to complete processing + """ + print(f"Waiting for node {node.name} to complete...") + startTime = time.time() + while True: + node.updateStatusFromCache() + nodeStatus = node.getGlobalStatus() + if nodeStatus not in (Status.SUBMITTED, Status.RUNNING): + print(f"Node status switched to {nodeStatus}") + return + # Check for job error + err = job.getJobErrors() + if err: + raise RuntimeError(f"Job encountered an error: {err}") + if time.time() - startTime > 
timeout: + raise TimeoutError(f"Node {node.name} did not complete within {timeout} seconds") + time.sleep(1) + +def processSubmit(node: Node, graph, tmp_path): + """ + Actual function that test the submit process + """ + # Save graph + tmp_path = str(tmp_path) + graph.save(os.path.join(tmp_path, "graph.mg")) + # Prepare all chunks + node.initStatusOnSubmit() + # Start farm + farmLauncher = FarmLauncher(tmp_path) + farmLauncher.start() + time.sleep(1) + error = None + try: + print(f"submit {node}") + submitter = get_submitter() + submitter.setFarmPath(tmp_path) + submitter.setJobEnv(getJobEnv()) + nodesToProcess, edgesToProcess = [node], [] + # Update nodes status + for node in nodesToProcess: + node.initStatusOnSubmit() + # Update monitored to make sure meshroom knows when task status change + graph.updateMonitoredFiles() + assert node.getGlobalStatus() == Status.SUBMITTED + res = submitter.submit(nodesToProcess, edgesToProcess, graph.filepath, submitLabel="TestSubmit") + assert res is not None, "Submitter returned no job" + assert res.__class__.__name__ == "LocalFarmJob", "Submitted job is not a LocalFarmJob" + jobManager.addJob(res, nodesToProcess) + waitForNodeCompletion(res, node) + except Exception as e: + error = e + finally: + farmLauncher.stop() + if error: + raise error + else: + farmLauncher.clean() + + +class TestNodeSubmit: + __test__ = IS_LINUX + + @classmethod + def setup_class(cls): + meshroom.core.initSubmitters() + + cls.folder = os.path.join(os.path.dirname(__file__), "plugins", "meshroom") + package = "pluginSubmitter" + cls.plugin = Plugin(package, cls.folder) + nodes = loadClassesNodes(cls.folder, package) + for node in nodes: + cls.plugin.addNodePlugin(node) + pluginManager.addPlugin(cls.plugin) + + @classmethod + def teardown_class(cls): + for node in cls.plugin.nodes.values(): + pluginManager.unregisterNode(node) + cls.plugin = None + + def setupNode(self, graph, name): + plugin = pluginManager.getPlugin("pluginSubmitter") + node = plugin.nodes[name] + nodeType = node.nodeDescriptor + registerNodeDesc(nodeType) + node = graph.addNewNode(nodeType.__name__) + return node + + def test_submitNoParallel(self, tmp_path): + graph = Graph("") + graph._cacheDir = os.path.join(tmp_path, "cache") + node = self.setupNode(graph, "PluginSubmitterA") + # Submit + processSubmit(node, graph, tmp_path) + + def test_submitStaticSize(self, tmp_path): + graph = Graph("") + graph._cacheDir = os.path.join(tmp_path, "cache") + node = self.setupNode(graph, "PluginSubmitterB") + # Submit + processSubmit(node, graph, tmp_path) + + def test_submitDynamicSize(self, tmp_path): + graph = Graph("") + graph._cacheDir = os.path.join(tmp_path, "cache") + node = self.setupNode(graph, "PluginSubmitterC") + # Submit + processSubmit(node, graph, tmp_path)
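
For reference, here is a minimal standalone sketch of how LocalFarmSubmitter.getChunks slices a frame range into per-iteration chunks. A namedtuple stands in for the real Chunk class, and the parameter values are made up for the example; it only illustrates the slicing and the exclusion of already-successful iterations.

    from collections import namedtuple

    # Stand-in for the Chunk class used by the submitter (hypothetical, for illustration only).
    Chunk = namedtuple("Chunk", ["iteration", "start", "end"])

    def getChunks(chunkParams):
        """Slice a [start, end] frame range into chunks of `packetSize` frames."""
        ignoreIterations = chunkParams.get("ignoreIterations", [])
        start, end = chunkParams.get("start", -1), chunkParams.get("end", -2)
        size = chunkParams.get("packetSize", 1)
        frameRange = list(range(start, end + 1))
        slices = [frameRange[i:i + size] for i in range(0, len(frameRange), size)]
        return [Chunk(i, item[0], item[-1]) for i, item in enumerate(slices) if i not in ignoreIterations]

    # Example: 6 blocks, packet size 2, iteration 1 already SUCCESS and therefore ignored.
    print(getChunks({"start": 0, "end": 5, "packetSize": 2, "ignoreIterations": [1]}))
    # -> [Chunk(iteration=0, start=0, end=1), Chunk(iteration=2, start=4, end=5)]

This mirrors how createTask builds its chunkParams for a parallelized node: iterations whose chunks are already in SUCCESS are listed in ignoreIterations so only the remaining iterations are turned into farm tasks on resubmission.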