
Commit 63d2bd6

Merge pull request #246 from djarecka/mnt/memory_tojob
moving from deepcopy to pickle files in to_job method

2 parents f042738 + d17b593

File tree

10 files changed: +310 -126 lines changed
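The gist of the change: rather than deep-copying the whole task object for every state index in to_job, the task is now pickled to disk once, and each job loads a fresh copy and applies its own inputs. Below is a minimal, self-contained sketch of that pattern; the Task class and stdlib pickle are stand-ins (pydra itself uses cloudpickle and its TaskBase), so this is an illustration of the idea, not pydra's code:

import pickle
import tempfile
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Task:                      # hypothetical stand-in for pydra's TaskBase
    name: str
    inputs: dict
    state: object = None

def pickle_task(task: Task, cache_dir: Path) -> Path:
    """Serialize the task once; every job re-reads this one file."""
    pkl_path = cache_dir / f"{task.name}_task.pklz"
    pkl_path.write_bytes(pickle.dumps(task))
    return pkl_path

def load_task(pkl_path: Path, inputs_for_ind: dict) -> Task:
    """Each state index gets a fresh copy from disk instead of a deepcopy."""
    task = pickle.loads(pkl_path.read_bytes())
    task.inputs = {**task.inputs, **inputs_for_ind}
    task.state = None
    return task

with tempfile.TemporaryDirectory() as tmp:
    t = Task(name="add", inputs={"x": None})
    pkl = pickle_task(t, Path(tmp))
    jobs = [load_task(pkl, {"x": i}) for i in range(3)]
    print([j.inputs for j in jobs])  # [{'x': 0}, {'x': 1}, {'x': 2}]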

.travis.yml (+5)

@@ -61,6 +61,11 @@ matrix:
   allow_failures:
   - python: 3.7
     env: INSTALL_DEPENDS="pip==10.0.1 setuptools==30.3.0"
+  - python: 3.7
+    env:
+    - INSTALL_TYPE="develop"
+    - CHECK_TYPE="test_dask"
+
 
 
 before_install:

pydra/conftest.py (+2 -2)

@@ -8,7 +8,7 @@ def pytest_addoption(parser):
 def pytest_generate_tests(metafunc):
     if "plugin_dask_opt" in metafunc.fixturenames:
         if bool(shutil.which("sbatch")):
-            Plugins = ["cf", "slurm"]
+            Plugins = ["slurm"]
         else:
             Plugins = ["cf"]
         if metafunc.config.getoption("dask"):
@@ -19,7 +19,7 @@ def pytest_generate_tests(metafunc):
         if metafunc.config.getoption("dask"):
             Plugins = []
         elif bool(shutil.which("sbatch")):
-            Plugins = ["cf", "slurm"]
+            Plugins = ["slurm"]
         else:
             Plugins = ["cf"]
         metafunc.parametrize("plugin", Plugins)
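For context, this is roughly how the metafunc hook above drives plugin selection; a simplified, standalone conftest.py fragment (not the full pydra version):

import shutil

def pytest_generate_tests(metafunc):
    # parametrize every test that requests the "plugin" fixture
    if "plugin" in metafunc.fixturenames:
        if bool(shutil.which("sbatch")):  # SLURM is available on this host
            plugins = ["slurm"]           # after this commit: slurm only
        else:
            plugins = ["cf"]              # concurrent-futures fallback
        metafunc.parametrize("plugin", plugins)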

pydra/engine/core.py (+8 -11)

@@ -6,7 +6,7 @@
 import os
 from pathlib import Path
 import typing as ty
-from copy import deepcopy
+from copy import deepcopy, copy
 
 import cloudpickle as cp
 from filelock import SoftFileLock
@@ -503,16 +503,13 @@ def get_input_el(self, ind):
         inputs_dict = {inp: getattr(self.inputs, inp) for inp in self.input_names}
         return None, inputs_dict
 
-    def to_job(self, ind):
-        """Run interface one element generated from node_state."""
-        # logger.debug("Run interface el, name={}, ind={}".format(self.name, ind))
-        el = deepcopy(self)
-        el.state = None
-        # dj might be needed
-        # el._checksum = None
-        _, inputs_dict = self.get_input_el(ind)
-        el.inputs = attr.evolve(el.inputs, **inputs_dict)
-        return el
+    def pickle_task(self):
+        """Pickle the task with its full inputs."""
+        pkl_files = self.cache_dir / "pkl_files"
+        pkl_files.mkdir(exist_ok=True, parents=True)
+        task_main_path = pkl_files / f"{self.name}_{self.checksum}_task.pklz"
+        save(task_path=pkl_files, task=self, name_prefix=f"{self.name}_{self.checksum}")
+        return task_main_path
 
     @property
     def done(self):
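A hedged sketch of what pickle_task produces and how a worker reads it back; the cache directory, task name, and checksum below are made-up stand-ins, and the dict takes the place of a real task, but the path layout follows the diff:

import tempfile
from pathlib import Path

import cloudpickle as cp

with tempfile.TemporaryDirectory() as tmp:
    cache_dir = Path(tmp)                    # stand-in for self.cache_dir
    pkl_files = cache_dir / "pkl_files"
    pkl_files.mkdir(exist_ok=True, parents=True)

    name, checksum = "taskA", "abc123"       # stand-ins for self.name / self.checksum
    task_main_path = pkl_files / f"{name}_{checksum}_task.pklz"
    task_main_path.write_bytes(cp.dumps({"demo": "task"}))  # a real task in pydra

    task = cp.loads(task_main_path.read_bytes())  # a worker re-loads the same file
    print(task)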

pydra/engine/helpers.py (+111 -57)
@@ -4,12 +4,18 @@
 import attr
 import cloudpickle as cp
 from pathlib import Path
+from filelock import SoftFileLock
 import os
 import sys
 from hashlib import sha256
 import subprocess as sp
+import getpass
+import uuid
+from time import strftime
+from traceback import format_exception
 
-from .specs import Runtime, File, Directory, attr_fields
+
+from .specs import Runtime, File, Directory, attr_fields, Result
 from .helpers_file import hash_file, hash_dir, copyfile, is_existing_file
 
 
@@ -94,7 +100,7 @@ def load_result(checksum, cache_locations):
     return None
 
 
-def save(task_path: Path, result=None, task=None):
+def save(task_path: Path, result=None, task=None, name_prefix=None):
    """
    Save a :class:`~pydra.engine.core.TaskBase` object and/or results.
 
@@ -106,20 +112,28 @@ def save(task_path: Path, result=None, task=None):
         Result to pickle and write
     task : :class:`~pydra.engine.core.TaskBase`
         Task to pickle and write
-
     """
+
     if task is None and result is None:
         raise ValueError("Nothing to be saved")
+
+    if not isinstance(task_path, Path):
+        task_path = Path(task_path)
     task_path.mkdir(parents=True, exist_ok=True)
-    if result:
-        if Path(task_path).name.startswith("Workflow"):
-            # copy files to the workflow directory
-            result = copyfile_workflow(wf_path=task_path, result=result)
-        with (task_path / "_result.pklz").open("wb") as fp:
-            cp.dump(result, fp)
-    if task:
-        with (task_path / "_task.pklz").open("wb") as fp:
-            cp.dump(task, fp)
+    if name_prefix is None:
+        name_prefix = ""
+
+    lockfile = task_path.parent / (task_path.name + "_save.lock")
+    with SoftFileLock(lockfile):
+        if result:
+            if task_path.name.startswith("Workflow"):
+                # copy files to the workflow directory
+                result = copyfile_workflow(wf_path=task_path, result=result)
+            with (task_path / f"{name_prefix}_result.pklz").open("wb") as fp:
+                cp.dump(result, fp)
+        if task:
+            with (task_path / f"{name_prefix}_task.pklz").open("wb") as fp:
+                cp.dump(task, fp)
 
 
 def copyfile_workflow(wf_path, result):
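Since many pickled jobs can now write into the same cache concurrently, save() serializes writers with a soft file lock. A minimal sketch of that pattern (the directory and payload are stand-ins):

import tempfile
from pathlib import Path

import cloudpickle as cp
from filelock import SoftFileLock

with tempfile.TemporaryDirectory() as tmp:
    task_path = Path(tmp) / "TaskA"
    task_path.mkdir(parents=True, exist_ok=True)
    lockfile = task_path.parent / (task_path.name + "_save.lock")
    with SoftFileLock(lockfile):  # only one process writes at a time
        with (task_path / "_result.pklz").open("wb") as fp:
            cp.dump({"ok": True}, fp)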
@@ -221,7 +235,7 @@ def make_klass(spec):
             if isinstance(item[1], attr._make._CountingAttr):
                 newfields[item[0]] = item[1]
             else:
-                newfields[item[0]] = attr.ib(repr=False, type=item[1])
+                newfields[item[0]] = attr.ib(type=item[1])
         else:
             if (
                 any([isinstance(ii, attr._make._CountingAttr) for ii in item])
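Why dropping repr=False matters (presumably the motivation here): fields declared with attr.ib(..., repr=False) are omitted from the generated __repr__, so such fields never show up when a spec is printed. A toy attrs example:

import attr

@attr.s
class Spec:
    shown = attr.ib(type=int, default=0)              # repr defaults to True
    hidden = attr.ib(type=int, default=0, repr=False)

print(Spec())  # Spec(shown=0) -- 'hidden' is left out of the repr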
@@ -369,8 +383,33 @@ def create_checksum(name, inputs):
 
 def record_error(error_path, error):
     """Write an error file."""
+
+    error_message = str(error)
+
+    resultfile = error_path / "_result.pklz"
+    if not resultfile.exists():
+        error_message += """\n
+        When creating this error file, the results file corresponding
+        to the task could not be found."""
+
+    name_checksum = str(error_path.name)
+    timeofcrash = strftime("%Y%m%d-%H%M%S")
+    try:
+        login_name = getpass.getuser()
+    except KeyError:
+        login_name = "UID{:d}".format(os.getuid())
+
+    full_error = {
+        "time of crash": timeofcrash,
+        "login name": login_name,
+        "name with checksum": name_checksum,
+        "error message": error,
+    }
+
     with (error_path / "_error.pklz").open("wb") as fp:
-        cp.dump(error, fp)
+        cp.dump(full_error, fp)
+
+    return error_path / "_error.pklz"
 
 
 def get_open_loop():
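The error file is now a structured dict rather than a bare error object. A sketch of reading one back (the path is hypothetical; the keys match the diff above):

from pathlib import Path

import cloudpickle as cp

error_file = Path("/tmp/SomeTask_abc123/_error.pklz")  # hypothetical location
if error_file.exists():
    full_error = cp.loads(error_file.read_bytes())
    print(full_error["time of crash"], full_error["login name"])
    print(full_error["error message"])  # e.g. a list of traceback lines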
@@ -397,49 +436,6 @@ def get_open_loop():
     return loop
 
 
-def create_pyscript(script_path, checksum, rerun=False):
-    """
-    Create standalone script for task execution in a different environment.
-
-    Parameters
-    ----------
-    script_path : :obj:`os.pathlike`
-        Path to the script.
-    checksum : str
-        Task checksum.
-
-    Returns
-    -------
-    pyscript : :obj:`File`
-        Execution script
-
-    """
-    task_pkl = script_path / "_task.pklz"
-    if not task_pkl.exists() or not task_pkl.stat().st_size:
-        raise Exception("Missing or empty task!")
-
-    content = f"""import cloudpickle as cp
-from pathlib import Path
-
-
-cache_path = Path("{str(script_path)}")
-task_pkl = (cache_path / "_task.pklz")
-task = cp.loads(task_pkl.read_bytes())
-
-# submit task
-task(rerun={rerun})
-
-if not task.result():
-    raise Exception("Something went wrong")
-print("Completed", task.checksum, task)
-task_pkl.unlink()
-"""
-    pyscript = script_path / f"pyscript_{checksum}.py"
-    with pyscript.open("wt") as fp:
-        fp.writelines(content)
-    return pyscript
-
-
 def hash_function(obj):
     """Generate hash of object."""
     return sha256(str(obj).encode()).hexdigest()
@@ -544,3 +540,61 @@ def get_available_cpus():
 
     # Last resort
     return os.cpu_count()
+
+
+def load_and_run(
+    task_pkl, ind=None, rerun=False, submitter=None, plugin=None, **kwargs
+):
+    """
+    Load a task from a pickle file, set the proper input
+    and run the task.
+    """
+    try:
+        task = load_task(task_pkl=task_pkl, ind=ind)
+    except Exception as excinfo:
+        if task_pkl.parent.exists():
+            etype, eval, etr = sys.exc_info()
+            traceback = format_exception(etype, eval, etr)
+            errorfile = record_error(task_pkl.parent, error=traceback)
+            result = Result(output=None, runtime=None, errored=True)
+            save(task_pkl.parent, result=result)
+        raise
+
+    resultfile = task.output_dir / "_result.pklz"
+    try:
+        task(rerun=rerun, plugin=plugin, submitter=submitter, **kwargs)
+    except Exception as excinfo:
+        # creating result and error files if missing
+        errorfile = task.output_dir / "_error.pklz"
+        if not resultfile.exists():
+            etype, eval, etr = sys.exc_info()
+            traceback = format_exception(etype, eval, etr)
+            errorfile = record_error(task.output_dir, error=traceback)
+            result = Result(output=None, runtime=None, errored=True)
+            save(task.output_dir, result=result)
+        raise type(excinfo)(
+            str(excinfo.with_traceback(None)),
+            f" full crash report is here: {errorfile}",
+        )
+    return resultfile
+
+
+async def load_and_run_async(task_pkl, ind=None, submitter=None, rerun=False, **kwargs):
+    """
+    Load a task from a pickle file, set the proper input
+    and run the workflow.
+    """
+    task = load_task(task_pkl=task_pkl, ind=ind)
+    await task._run(submitter=submitter, rerun=rerun, **kwargs)
+
+
+def load_task(task_pkl, ind=None):
+    """Load a task from a pickle file, setting the proper input for the specific ind."""
+    if isinstance(task_pkl, str):
+        task_pkl = Path(task_pkl)
+    task = cp.loads(task_pkl.read_bytes())
+    if ind is not None:
+        _, inputs_dict = task.get_input_el(ind)
+        task.inputs = attr.evolve(task.inputs, **inputs_dict)
+        task.state = None
+    return task
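load_task applies the per-index inputs with attr.evolve, which returns a copy of an attrs instance with the given fields replaced. A self-contained illustration with a toy input spec (the class and values are stand-ins):

import attr

@attr.s
class Inputs:
    x = attr.ib(default=None)
    y = attr.ib(default=None)

inputs = Inputs(x=1)
inputs_dict = {"y": 10}  # shaped like what get_input_el(ind) would return
print(attr.evolve(inputs, **inputs_dict))  # Inputs(x=1, y=10)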

pydra/engine/submitter.py (+21 -12)

@@ -2,7 +2,7 @@
 import asyncio
 from .workers import SerialWorker, ConcurrentFuturesWorker, SlurmWorker, DaskWorker
 from .core import is_workflow
-from .helpers import get_open_loop
+from .helpers import get_open_loop, load_and_run_async
 
 import logging
 
@@ -64,11 +64,21 @@ def __call__(self, runnable, cache_locations=None, rerun=False):
 
     async def submit_workflow(self, workflow, rerun=False):
         """Distribute or initiate workflow execution."""
-        if workflow.plugin and workflow.plugin != self.plugin:
-            # dj: this is not tested!!!
-            await self.worker.run_el(workflow, rerun=rerun)
-        else:
-            await workflow._run(self, rerun=rerun)
+        if is_workflow(workflow):
+            if workflow.plugin and workflow.plugin != self.plugin:
+                # dj: this is not tested!!! TODO
+                await self.worker.run_el(workflow, rerun=rerun)
+            else:
+                await workflow._run(self, rerun=rerun)
+        else:  # could be a tuple with paths to pickle files with tasks and inputs
+            ind, wf_main_pkl, wf_orig = workflow
+            if wf_orig.plugin and wf_orig.plugin != self.plugin:
+                # dj: this is not tested!!! TODO
+                await self.worker.run_el(workflow, rerun=rerun)
+            else:
+                await load_and_run_async(
+                    task_pkl=wf_main_pkl, ind=ind, submitter=self, rerun=rerun
+                )
 
     async def submit(self, runnable, wait=False, rerun=False):
         """
@@ -100,17 +110,16 @@ async def submit(self, runnable, wait=False, rerun=False):
             logger.debug(
                 f"Expanding {runnable} into {len(runnable.state.states_val)} states"
             )
+            task_pkl = runnable.pickle_task()
+
             for sidx in range(len(runnable.state.states_val)):
-                job = runnable.to_job(sidx)
-                logger.debug(
-                    f'Submitting runnable {job}{str(sidx) if sidx is not None else ""}'
-                )
+                job_tuple = (sidx, task_pkl, runnable)
                 if is_workflow(runnable):
                     # job has no state anymore
-                    futures.add(self.submit_workflow(job, rerun=rerun))
+                    futures.add(self.submit_workflow(job_tuple, rerun=rerun))
                 else:
                     # tasks are submitted to worker for execution
-                    futures.add(self.worker.run_el(job, rerun=rerun))
+                    futures.add(self.worker.run_el(job_tuple, rerun=rerun))
         else:
             if is_workflow(runnable):
                 await self._run_workflow(runnable, rerun=rerun)
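Finally, the tuple protocol the submitter now passes around: each expanded job is (state index, path to the pickled task, the original runnable). A toy, self-contained illustration of the shape (all objects below are stand-ins, not pydra types):

import pickle
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    task_pkl = Path(tmp) / "demo_task.pklz"   # what pickle_task() would return
    task_pkl.write_bytes(pickle.dumps({"name": "demo"}))
    runnable = object()                        # stand-in for the original task

    job_tuple = (0, task_pkl, runnable)        # (sidx, task_pkl, runnable)
    ind, wf_main_pkl, wf_orig = job_tuple      # unpacked as in submit_workflow
    print(ind, pickle.loads(wf_main_pkl.read_bytes()))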
