
Commit 7e8dbb5

Merge pull request #494 from kyleam/slurm
run: Add Slurm support
2 parents 5911619 + 8456499 commit 7e8dbb5

6 files changed: +231, -63 lines

.travis.yml

Lines changed: 4 additions & 0 deletions
@@ -17,6 +17,7 @@ matrix:
     - REPROMAN_TESTS_SSH=1
     - INSTALL_DATALAD=1
     - INSTALL_CONDOR=1
+    - SETUP_SLURM=1
   - python: 3.5
     env:
     - REPROMAN_TESTS_SSH=1
@@ -66,6 +67,9 @@ before_install:
       sudo eatmydata tools/ci/prep-travis-forssh-sudo.sh;
      tools/ci/prep-travis-forssh.sh;
     fi
+  - if [ ! -z "${SETUP_SLURM:-}" ]; then
+      tools/ci/setup-slurm-container.sh;
+    fi
  - git config --global user.email "[email protected]"
  - git config --global user.name "Travis Almighty"

(new file: Slurm sbatch submission template; filename not captured in this view)

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+#!/bin/sh
+
+#SBATCH --output={{ shlex_quote(_meta_directory) }}/stdout.%a
+#SBATCH --error={{ shlex_quote(_meta_directory) }}/stderr.%a
+{#
+TODO: We need to assess how we treat batch parameters across different
+submitters---things like whether we should try to expose common names and, if
+so, what are the discrepancies in the behavior, and how should we deal with
+that. We should also revisit the goal of making it possible for the caller to
+extend the submit file template to add stuff like parameters we do not expose
+and environment modules.
+#}
+{% if memory is defined %}
+#SBATCH --mem={{ memory }}
+{% endif %}
+{% if num_processes is defined %}
+#SBATCH --cpus-per-task={{ num_processes }}
+{% endif %}
+{% if _num_subjobs == 1 %}
+#SBATCH --array=0
+{% else %}
+#SBATCH --array=0-{{ _num_subjobs - 1}}
+{% endif %}
+
+{{ shlex_quote(_meta_directory) }}/runscript $SLURM_ARRAY_TASK_ID
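
The template above is a Jinja2 file: shlex_quote is exposed to the template for quoting paths, and _meta_directory, _num_subjobs, memory, and num_processes are supplied when the submit file is generated. A rough, standalone sketch of rendering such a template (the loader path, template filename, and sample values below are made up for illustration, not ReproMan's actual rendering code):

# Standalone sketch: render a Slurm submission template with Jinja2,
# exposing shlex.quote under the name the template expects.
from shlex import quote as shlex_quote
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("templates"),
                  trim_blocks=True, lstrip_blocks=True)
env.globals["shlex_quote"] = shlex_quote

submit_file = env.get_template("slurm_submission.template").render(
    _meta_directory="/tmp/example-job/.reproman/jobs/example",  # made-up path
    _num_subjobs=2,      # yields "#SBATCH --array=0-1"
    memory="4G",         # optional; guarded by "is defined" in the template
    num_processes=2,     # optional; becomes --cpus-per-task
)
print(submit_file)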

reproman/support/jobs/submitters.py

Lines changed: 51 additions & 0 deletions
@@ -255,6 +255,56 @@ def _status_no_json(self):
         return ours, theirs
 
 
+class SlurmSubmitter(Submitter):
+    """Submit a Slurm job.
+    """
+    name = "slurm"
+
+    def __init__(self, session):
+        super(SlurmSubmitter, self).__init__(session)
+
+    @property
+    @borrowdoc(Submitter)
+    def submit_command(self):
+        return ["sbatch"]
+
+    @borrowdoc(Submitter)
+    def submit(self, script, submit_command=None):
+        out = super(SlurmSubmitter, self).submit(script, submit_command)
+        # Output example (v19.05): Submitted batch job 5
+        job_id = out.strip().split()[-1]
+        self.submission_id = job_id
+        return job_id
+
+    @property
+    @assert_submission_id
+    @borrowdoc(Submitter)
+    def status(self):
+        try:
+            stat_out, _ = self.session.execute_command(
+                "scontrol show jobid={}".format(self.submission_id))
+        except CommandError:
+            return "unknown", None
+
+        # Running scontrol with our jobid will show an entry for each subjob.
+        matches = re.findall(r"JobState=([A-Z]+)\b", stat_out)
+        if not matches:
+            lgr.warning("No job status match found in %s", stat_out)
+            return "unknown", None
+
+        # https://github.com/SchedMD/slurm/blob/db82f4eb3d844501b53a72ea313a9166d7a421b2/src/common/slurm_protocol_defs.c#L2656
+        waiting_states = ["PENDING", "RUNNING"]
+        if any(m in waiting_states for m in matches):
+            our_state = "waiting"
+        elif all(m == "COMPLETED" for m in matches):
+            our_state = "completed"
+        else:
+            our_state = "unknown"
+        # FIXME: their status should represent all subjobs, but right now we're
+        # just taking the first code.
+        return our_state, matches[0]
+
+
 class LocalSubmitter(Submitter):
     """Submit a local job.
     """
@@ -298,6 +348,7 @@ def status(self):
     (o.name, o) for o in [
         PbsSubmitter,
         CondorSubmitter,
+        SlurmSubmitter,
         LocalSubmitter,
     ]
 )
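
Two details of the submitter above are worth spelling out: sbatch prints a line like "Submitted batch job 5", so submit() takes the last whitespace-separated token as the job id, and for an array job "scontrol show jobid=<id>" reports one record per subjob, which is why status collects every JobState= value before collapsing them into ReproMan's coarse waiting/completed/unknown states. A self-contained sketch of that mapping on fabricated scontrol output:

import re

# Fabricated two-subjob excerpt of `scontrol show jobid=5` output; real
# records contain many more fields per entry.
stat_out = (
    "JobId=6 ArrayJobId=5 ArrayTaskId=1 JobName=runscript JobState=RUNNING Reason=None\n"
    "JobId=5 ArrayJobId=5 ArrayTaskId=0 JobName=runscript JobState=COMPLETED Reason=None\n"
)

matches = re.findall(r"JobState=([A-Z]+)\b", stat_out)  # ['RUNNING', 'COMPLETED']

waiting_states = ["PENDING", "RUNNING"]
if any(m in waiting_states for m in matches):
    our_state = "waiting"      # at least one subjob is still queued or running
elif all(m == "COMPLETED" for m in matches):
    our_state = "completed"
else:
    our_state = "unknown"      # e.g. FAILED, CANCELLED, TIMEOUT

print(our_state, matches[0])   # -> waiting RUNNING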

reproman/support/jobs/tests/test_orchestrators.py

Lines changed: 99 additions & 63 deletions
@@ -55,6 +55,14 @@ def ssh():
     return SSH("testssh", host="reproman-test")
 
 
+@pytest.fixture(scope="module")
+def ssh_slurm():
+    skipif.no_ssh()
+    skipif.no_slurm()
+    from reproman.resource.ssh import SSH
+    return SSH("slurm-res", host="slurm")
+
+
 def test_orc_root_directory(shell):
     orc = orcs.PlainOrchestrator(shell, submission_type="local")
     assert orc.root_directory == op.expanduser("~/.reproman/run-root")
@@ -166,6 +174,49 @@ def container_dataset(tmpdir_factory):
     return ds
 
 
+@pytest.fixture()
+def check_orc_datalad(job_spec, dataset):
+    def fn(resource, orc_class, sub_type):
+        dataset.repo.tag("start-pt")
+
+        def run_and_check(spec):
+            with chpwd(dataset.path):
+                orc = orc_class(resource,
+                                submission_type=sub_type, job_spec=spec)
+                orc.prepare_remote()
+                orc.submit()
+                orc.follow()
+
+                orc.fetch()
+                assert dataset.repo.file_has_content("out")
+                assert open("out").read() == "content\nmore\n"
+                return orc
+
+        orc = run_and_check(job_spec)
+
+        # Perform another run based on the dumped job spec from the first.
+        assert dataset.repo.get_active_branch() == "master"
+        metadir = op.relpath(orc.meta_directory, orc.working_directory)
+        with open(op.join(dataset.path, metadir, "spec.yaml")) as f:
+            dumped_spec = yaml.safe_load(f)
+        assert "_reproman_version" in dumped_spec
+        assert "_spec_version" in dumped_spec
+        if orc.name == "datalad-local-run":
+            # Our reproman-based copying of data isn't (yet) OK with
+            # data files that already exist.
+            dumped_spec["inputs"] = []
+        # FIXME: Use exposed method once available.
+        dataset.repo._git_custom_command(
+            [], ["git", "reset", "--hard", "start-pt"])
+        if dataset.repo.dirty:
+            # The submitter log file is ignored (currently only relevant for
+            # condor; see b9277ebc0 for more details). Add the directory to get
+            # to a clean state.
+            dataset.add(".reproman")
+        orc = run_and_check(dumped_spec)
+    return fn
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("orc_class",
                          [orcs.DataladLocalRunOrchestrator,
@@ -175,43 +226,13 @@ def container_dataset(tmpdir_factory):
                          ["local",
                           pytest.param("condor", marks=mark.skipif_no_condor)],
                          ids=["sub:local", "sub:condor"])
-def test_orc_datalad_run(job_spec, dataset, shell, orc_class, sub_type):
-    dataset.repo.tag("start-pt")
-
-    def run_and_check(spec):
-        with chpwd(dataset.path):
-            orc = orc_class(shell, submission_type=sub_type, job_spec=spec)
-            orc.prepare_remote()
-            orc.submit()
-            orc.follow()
-
-            orc.fetch()
-            assert dataset.repo.file_has_content("out")
-            assert open("out").read() == "content\nmore\n"
-            return orc
+def test_orc_datalad_run(check_orc_datalad, shell, orc_class, sub_type):
+    check_orc_datalad(shell, orc_class, sub_type)
 
-    orc = run_and_check(job_spec)
 
-    # Perform another run based on the dumped job spec from the first.
-    assert dataset.repo.get_active_branch() == "master"
-    metadir = op.relpath(orc.meta_directory, orc.working_directory)
-    with open(op.join(dataset.path, metadir, "spec.yaml")) as f:
-        dumped_spec = yaml.safe_load(f)
-    assert "_reproman_version" in dumped_spec
-    assert "_spec_version" in dumped_spec
-    if orc.name == "datalad-local-run":
-        # Our reproman-based copying of data doesn't isn't (yet) OK with data
-        # files that already exist.
-        dumped_spec["inputs"] = []
-    # FIXME: Use exposed method once available.
-    dataset.repo._git_custom_command(
-        [], ["git", "reset", "--hard", "start-pt"])
-    if dataset.repo.dirty:
-        # The submitter log file is ignored (currently only relevant for
-        # condor; see b9277ebc0 for more details). Add the directory to get to
-        # a clean state.
-        dataset.add(".reproman")
-    orc = run_and_check(dumped_spec)
+@pytest.mark.integration
+def test_orc_datalad_slurm(check_orc_datalad, ssh_slurm):
+    check_orc_datalad(ssh_slurm, orcs.DataladLocalRunOrchestrator, "slurm")
 
 
 @pytest.mark.integration
@@ -574,6 +595,41 @@ def test_dataset_as_dict(shell, dataset, job_spec):
     assert "_dataset_id" in d
 
 
+@pytest.fixture()
+def check_orc_datalad_concurrent(job_spec, dataset):
+    def fn(ssh, orc_class, sub_type):
+        names = ["paul", "rosa"]
+
+        job_spec["inputs"] = ["{p[name]}.in"]
+        job_spec["outputs"] = ["{p[name]}.out"]
+        job_spec["_resolved_command_str"] = "sh -c 'cat {inputs} {inputs} >{outputs}'"
+        job_spec["_resolved_batch_parameters"] = [{"name": n} for n in names]
+
+        in_files = [n + ".in" for n in names]
+        for fname in in_files:
+            with open(op.join(dataset.path, fname), "w") as fh:
+                fh.write(fname[0])
+        dataset.save(path=in_files)
+
+        with chpwd(dataset.path):
+            orc = orc_class(ssh, submission_type=sub_type, job_spec=job_spec)
+            orc.prepare_remote()
+            orc.submit()
+            orc.follow()
+            # Just make sure each fetch() seems to have wired up
+            # on_remote_finish. test_run.py tests the actual --follow actions.
+            remote_fn = MagicMock()
+            orc.fetch(on_remote_finish=remote_fn)
+            remote_fn.assert_called_once_with(orc.resource, [])
+
+            out_files = [n + ".out" for n in names]
+            for ofile in out_files:
+                assert dataset.repo.file_has_content(ofile)
+                with open(ofile) as ofh:
+                    assert ofh.read() == ofile[0] * 2
+    return fn
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("orc_class",
                          [orcs.DataladLocalRunOrchestrator,
@@ -584,33 +640,13 @@ def test_dataset_as_dict(shell, dataset, job_spec):
                          ["local",
                           pytest.param("condor", marks=mark.skipif_no_condor)],
                          ids=["sub:local", "sub:condor"])
-def test_orc_datalad_concurrent(job_spec, dataset, ssh, orc_class, sub_type):
-    names = ["paul", "rosa"]
+def test_orc_datalad_concurrent(check_orc_datalad_concurrent,
+                                ssh, orc_class, sub_type):
+    check_orc_datalad_concurrent(ssh, orc_class, sub_type)
 
-    job_spec["inputs"] = ["{p[name]}.in"]
-    job_spec["outputs"] = ["{p[name]}.out"]
-    job_spec["_resolved_command_str"] = "sh -c 'cat {inputs} {inputs} >{outputs}'"
-    job_spec["_resolved_batch_parameters"] = [{"name": n} for n in names]
 
-    in_files = [n + ".in" for n in names]
-    for fname in in_files:
-        with open(op.join(dataset.path, fname), "w") as fh:
-            fh.write(fname[0])
-    dataset.save(path=in_files)
-
-    with chpwd(dataset.path):
-        orc = orc_class(ssh, submission_type=sub_type, job_spec=job_spec)
-        orc.prepare_remote()
-        orc.submit()
-        orc.follow()
-        # Just make sure each fetch() seems to have wired up on_remote_finish.
-        # test_run.py tests the actual --follow actions.
-        remote_fn = MagicMock()
-        orc.fetch(on_remote_finish=remote_fn)
-        remote_fn.assert_called_once_with(orc.resource, [])
-
-        out_files = [n + ".out" for n in names]
-        for ofile in out_files:
-            assert dataset.repo.file_has_content(ofile)
-            with open(ofile) as ofh:
-                assert ofh.read() == ofile[0] * 2
+@pytest.mark.integration
+def test_orc_datalad_concurrent_slurm(check_orc_datalad_concurrent, ssh_slurm):
+    check_orc_datalad_concurrent(ssh_slurm,
+                                 orcs.DataladLocalRunOrchestrator,
+                                 "slurm")

reproman/tests/skip.py

Lines changed: 13 additions & 0 deletions
@@ -113,6 +113,18 @@ def no_singularity():
            not external_versions["cmd:singularity"])
 
 
+def no_slurm():
+    def is_running():
+        # Does it look like tools/ci/setup-slurm-container.sh was called?
+        try:
+            out, _ = Runner().run(
+                ["docker", "port", "reproman-slurm-container"])
+        except CommandError:
+            return False
+        return out.strip()
+    return "slurm container is not running", not is_running()
+
+
 def no_ssh():
     if _on_windows:
         reason = "no ssh on windows"
@@ -140,6 +152,7 @@ def on_windows():
     no_docker_engine,
     no_network,
     no_singularity,
+    no_slurm,
     no_ssh,
     no_svn,
     on_windows,
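
no_slurm follows the same convention as the other helpers in this module: it returns a (reason, condition) pair, and adding it to the list of conditions at the end is what wires it into the skip helpers used by the ssh_slurm fixture above (skipif.no_slurm()). The container check itself just asks Docker whether the test container is up; a rough standalone approximation using only the standard library (the container name comes from tools/ci/setup-slurm-container.sh):

# Rough standalone equivalent of the is_running() check above, using
# subprocess instead of ReproMan's Runner.
import subprocess

def slurm_container_running():
    try:
        out = subprocess.run(
            ["docker", "port", "reproman-slurm-container"],
            check=True, capture_output=True, text=True).stdout
    except (OSError, subprocess.CalledProcessError):
        return False
    # `docker port` prints the published port mappings; empty output means
    # the container publishes nothing.
    return bool(out.strip())

print(slurm_container_running())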

tools/ci/setup-slurm-container.sh

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+set -eu
+
+if ! test -f /tmp/rman-test-ssh-id
+then
+    echo "prep-travis-forssh.sh needs to be executed before this script" >&2
+    exit 1
+fi
+
+cat >>~/.ssh/config <<'EOF'
+
+Host slurm
+HostName localhost
+Port 42241
+User root
+StrictHostKeyChecking no
+IdentityFile /tmp/rman-test-ssh-id
+EOF
+
+docker run --name reproman-slurm-container \
+       -dit -p 42241:22 -h ernie \
+       repronim/reproman-slurm:latest
+
+cat /tmp/rman-test-ssh-id.pub \
+    | docker exec -i reproman-slurm-container \
+          sh -c 'cat >>/root/.ssh/authorized_keys'
+
+# Without the sleep below, the ssh call fails with
+#
+#   ssh_exchange_identification: read: Connection reset by peer
+#
+# A 10 second sleep is probably longer than we need, but a 3 second sleep did
+# not seem to be enough:
+# https://travis-ci.org/ReproNim/reproman/jobs/627568055#L584
sleep 10
+
+echo "DEBUG: test connection to slurm container ..."
+ssh -v slurm exit
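
The script's own comment flags the fixed sleep as a blunt instrument. One common alternative, shown here only as a sketch and not part of the committed script, is to poll until sshd inside the container actually accepts a connection, with an upper bound on attempts:

# Sketch of polling for sshd instead of a fixed sleep (hypothetical helper,
# not part of the script above).  Retries "ssh slurm exit" until it succeeds.
import subprocess
import sys
import time

for attempt in range(30):
    if subprocess.run(["ssh", "-o", "BatchMode=yes", "slurm", "exit"]).returncode == 0:
        break
    time.sleep(1)
else:
    sys.exit("slurm container never became reachable over ssh")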
