
Commit 7e8dbb5

Merge pull request #494 from kyleam/slurm
run: Add Slurm support
2 parents 5911619 + 8456499 commit 7e8dbb5

6 files changed: +231, -63 lines

.travis.yml

Lines changed: 4 additions & 0 deletions
@@ -17,6 +17,7 @@ matrix:
     - REPROMAN_TESTS_SSH=1
     - INSTALL_DATALAD=1
     - INSTALL_CONDOR=1
+    - SETUP_SLURM=1
   - python: 3.5
     env:
     - REPROMAN_TESTS_SSH=1
@@ -66,6 +67,9 @@ before_install:
       sudo eatmydata tools/ci/prep-travis-forssh-sudo.sh;
      tools/ci/prep-travis-forssh.sh;
     fi
+  - if [ ! -z "${SETUP_SLURM:-}" ]; then
+      tools/ci/setup-slurm-container.sh;
+    fi
  - git config --global user.email "[email protected]"
  - git config --global user.name "Travis Almighty"

(new file: Slurm sbatch submission template; filename not captured in this view)

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+#!/bin/sh
+
+#SBATCH --output={{ shlex_quote(_meta_directory) }}/stdout.%a
+#SBATCH --error={{ shlex_quote(_meta_directory) }}/stderr.%a
+{#
+TODO: We need to assess how we treat batch parameters across different
+submitters---things like whether we should try to expose common names and, if
+so, what are the discrepancies in the behavior, and how should we deal with
+that. We should also revisit the goal of making it possible for the caller to
+extend the submit file template to add stuff like parameters we do not expose
+and environment modules.
+#}
+{% if memory is defined %}
+#SBATCH --mem={{ memory }}
+{% endif %}
+{% if num_processes is defined %}
+#SBATCH --cpus-per-task={{ num_processes }}
+{% endif %}
+{% if _num_subjobs == 1 %}
+#SBATCH --array=0
+{% else %}
+#SBATCH --array=0-{{ _num_subjobs - 1}}
+{% endif %}
+
+{{ shlex_quote(_meta_directory) }}/runscript $SLURM_ARRAY_TASK_ID
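
The template above is a Jinja2 file: shlex_quote is exposed to the template for quoting paths, and _meta_directory, _num_subjobs, memory, and num_processes are supplied when the submit file is generated. A rough, standalone sketch of rendering such a template (the loader path, template filename, and sample values below are made up for illustration, not ReproMan's actual rendering code):

# Standalone sketch: render a Slurm submission template with Jinja2,
# exposing shlex.quote under the name the template expects.
from shlex import quote as shlex_quote
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("templates"),
                  trim_blocks=True, lstrip_blocks=True)
env.globals["shlex_quote"] = shlex_quote

submit_file = env.get_template("slurm_submission.template").render(
    _meta_directory="/tmp/example-job/.reproman/jobs/example",  # made-up path
    _num_subjobs=2,      # yields "#SBATCH --array=0-1"
    memory="4G",         # optional; guarded by "is defined" in the template
    num_processes=2,     # optional; becomes --cpus-per-task
)
print(submit_file)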

reproman/support/jobs/submitters.py

Lines changed: 51 additions & 0 deletions
@@ -255,6 +255,56 @@ def _status_no_json(self):
         return ours, theirs
 
 
+class SlurmSubmitter(Submitter):
+    """Submit a Slurm job.
+    """
+    name = "slurm"
+
+    def __init__(self, session):
+        super(SlurmSubmitter, self).__init__(session)
+
+    @property
+    @borrowdoc(Submitter)
+    def submit_command(self):
+        return ["sbatch"]
+
+    @borrowdoc(Submitter)
+    def submit(self, script, submit_command=None):
+        out = super(SlurmSubmitter, self).submit(script, submit_command)
+        # Output example (v19.05): Submitted batch job 5
+        job_id = out.strip().split()[-1]
+        self.submission_id = job_id
+        return job_id
+
+    @property
+    @assert_submission_id
+    @borrowdoc(Submitter)
+    def status(self):
+        try:
+            stat_out, _ = self.session.execute_command(
+                "scontrol show jobid={}".format(self.submission_id))
+        except CommandError:
+            return "unknown", None
+
+        # Running scontrol with our jobid will show an entry for each subjob.
+        matches = re.findall(r"JobState=([A-Z]+)\b", stat_out)
+        if not matches:
+            lgr.warning("No job status match found in %s", stat_out)
+            return "unknown", None
+
+        # https://github.com/SchedMD/slurm/blob/db82f4eb3d844501b53a72ea313a9166d7a421b2/src/common/slurm_protocol_defs.c#L2656
+        waiting_states = ["PENDING", "RUNNING"]
+        if any(m in waiting_states for m in matches):
+            our_state = "waiting"
+        elif all(m == "COMPLETED" for m in matches):
+            our_state = "completed"
+        else:
+            our_state = "unknown"
+        # FIXME: their status should represent all subjobs, but right now we're
+        # just taking the first code.
+        return our_state, matches[0]
+
+
 class LocalSubmitter(Submitter):
     """Submit a local job.
     """
@@ -298,6 +348,7 @@ def status(self):
     (o.name, o) for o in [
         PbsSubmitter,
         CondorSubmitter,
+        SlurmSubmitter,
         LocalSubmitter,
     ]
 )
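
Two details of the submitter above are worth spelling out: sbatch prints a line like "Submitted batch job 5", so submit() takes the last whitespace-separated token as the job id, and for an array job "scontrol show jobid=<id>" reports one record per subjob, which is why status collects every JobState= value before collapsing them into ReproMan's coarse waiting/completed/unknown states. A self-contained sketch of that mapping on fabricated scontrol output:

import re

# Fabricated two-subjob excerpt of `scontrol show jobid=5` output; real
# records contain many more fields per entry.
stat_out = (
    "JobId=6 ArrayJobId=5 ArrayTaskId=1 JobName=runscript JobState=RUNNING Reason=None\n"
    "JobId=5 ArrayJobId=5 ArrayTaskId=0 JobName=runscript JobState=COMPLETED Reason=None\n"
)

matches = re.findall(r"JobState=([A-Z]+)\b", stat_out)  # ['RUNNING', 'COMPLETED']

waiting_states = ["PENDING", "RUNNING"]
if any(m in waiting_states for m in matches):
    our_state = "waiting"      # at least one subjob is still queued or running
elif all(m == "COMPLETED" for m in matches):
    our_state = "completed"
else:
    our_state = "unknown"      # e.g. FAILED, CANCELLED, TIMEOUT

print(our_state, matches[0])   # -> waiting RUNNING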

reproman/support/jobs/tests/test_orchestrators.py

Lines changed: 99 additions & 63 deletions
@@ -55,6 +55,14 @@ def ssh():
     return SSH("testssh", host="reproman-test")
 
 
+@pytest.fixture(scope="module")
+def ssh_slurm():
+    skipif.no_ssh()
+    skipif.no_slurm()
+    from reproman.resource.ssh import SSH
+    return SSH("slurm-res", host="slurm")
+
+
 def test_orc_root_directory(shell):
     orc = orcs.PlainOrchestrator(shell, submission_type="local")
     assert orc.root_directory == op.expanduser("~/.reproman/run-root")
@@ -166,6 +174,49 @@ def container_dataset(tmpdir_factory):
     return ds
 
 
+@pytest.fixture()
+def check_orc_datalad(job_spec, dataset):
+    def fn(resource, orc_class, sub_type):
+        dataset.repo.tag("start-pt")
+
+        def run_and_check(spec):
+            with chpwd(dataset.path):
+                orc = orc_class(resource,
+                                submission_type=sub_type, job_spec=spec)
+                orc.prepare_remote()
+                orc.submit()
+                orc.follow()
+
+                orc.fetch()
+                assert dataset.repo.file_has_content("out")
+                assert open("out").read() == "content\nmore\n"
+                return orc
+
+        orc = run_and_check(job_spec)
+
+        # Perform another run based on the dumped job spec from the first.
+        assert dataset.repo.get_active_branch() == "master"
+        metadir = op.relpath(orc.meta_directory, orc.working_directory)
+        with open(op.join(dataset.path, metadir, "spec.yaml")) as f:
+            dumped_spec = yaml.safe_load(f)
+        assert "_reproman_version" in dumped_spec
+        assert "_spec_version" in dumped_spec
+        if orc.name == "datalad-local-run":
+            # Our reproman-based copying of data isn't (yet) OK with
+            # data files that already exist.
+            dumped_spec["inputs"] = []
+        # FIXME: Use exposed method once available.
+        dataset.repo._git_custom_command(
+            [], ["git", "reset", "--hard", "start-pt"])
+        if dataset.repo.dirty:
+            # The submitter log file is ignored (currently only relevant for
+            # condor; see b9277ebc0 for more details). Add the directory to get
+            # to a clean state.
+            dataset.add(".reproman")
+        orc = run_and_check(dumped_spec)
+    return fn
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("orc_class",
                          [orcs.DataladLocalRunOrchestrator,
@@ -175,43 +226,13 @@ def container_dataset(tmpdir_factory):
                          ["local",
                           pytest.param("condor", marks=mark.skipif_no_condor)],
                          ids=["sub:local", "sub:condor"])
-def test_orc_datalad_run(job_spec, dataset, shell, orc_class, sub_type):
-    dataset.repo.tag("start-pt")
-
-    def run_and_check(spec):
-        with chpwd(dataset.path):
-            orc = orc_class(shell, submission_type=sub_type, job_spec=spec)
-            orc.prepare_remote()
-            orc.submit()
-            orc.follow()
-
-            orc.fetch()
-            assert dataset.repo.file_has_content("out")
-            assert open("out").read() == "content\nmore\n"
-            return orc
+def test_orc_datalad_run(check_orc_datalad, shell, orc_class, sub_type):
+    check_orc_datalad(shell, orc_class, sub_type)
 
-    orc = run_and_check(job_spec)
 
-    # Perform another run based on the dumped job spec from the first.
-    assert dataset.repo.get_active_branch() == "master"
-    metadir = op.relpath(orc.meta_directory, orc.working_directory)
-    with open(op.join(dataset.path, metadir, "spec.yaml")) as f:
-        dumped_spec = yaml.safe_load(f)
-    assert "_reproman_version" in dumped_spec
-    assert "_spec_version" in dumped_spec
-    if orc.name == "datalad-local-run":
-        # Our reproman-based copying of data doesn't isn't (yet) OK with data
-        # files that already exist.
-        dumped_spec["inputs"] = []
-    # FIXME: Use exposed method once available.
-    dataset.repo._git_custom_command(
-        [], ["git", "reset", "--hard", "start-pt"])
-    if dataset.repo.dirty:
-        # The submitter log file is ignored (currently only relevant for
-        # condor; see b9277ebc0 for more details). Add the directory to get to
-        # a clean state.
-        dataset.add(".reproman")
-    orc = run_and_check(dumped_spec)
+@pytest.mark.integration
+def test_orc_datalad_slurm(check_orc_datalad, ssh_slurm):
+    check_orc_datalad(ssh_slurm, orcs.DataladLocalRunOrchestrator, "slurm")
 
 
 @pytest.mark.integration
@@ -574,6 +595,41 @@ def test_dataset_as_dict(shell, dataset, job_spec):
     assert "_dataset_id" in d
 
 
+@pytest.fixture()
+def check_orc_datalad_concurrent(job_spec, dataset):
+    def fn(ssh, orc_class, sub_type):
+        names = ["paul", "rosa"]
+
+        job_spec["inputs"] = ["{p[name]}.in"]
+        job_spec["outputs"] = ["{p[name]}.out"]
+        job_spec["_resolved_command_str"] = "sh -c 'cat {inputs} {inputs} >{outputs}'"
+        job_spec["_resolved_batch_parameters"] = [{"name": n} for n in names]
+
+        in_files = [n + ".in" for n in names]
+        for fname in in_files:
+            with open(op.join(dataset.path, fname), "w") as fh:
+                fh.write(fname[0])
+        dataset.save(path=in_files)
+
+        with chpwd(dataset.path):
+            orc = orc_class(ssh, submission_type=sub_type, job_spec=job_spec)
+            orc.prepare_remote()
+            orc.submit()
+            orc.follow()
+            # Just make sure each fetch() seems to have wired up
+            # on_remote_finish. test_run.py tests the actual --follow actions.
+            remote_fn = MagicMock()
+            orc.fetch(on_remote_finish=remote_fn)
+            remote_fn.assert_called_once_with(orc.resource, [])
+
+            out_files = [n + ".out" for n in names]
+            for ofile in out_files:
+                assert dataset.repo.file_has_content(ofile)
+                with open(ofile) as ofh:
+                    assert ofh.read() == ofile[0] * 2
+    return fn
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("orc_class",
                          [orcs.DataladLocalRunOrchestrator,
@@ -584,33 +640,13 @@ def test_dataset_as_dict(shell, dataset, job_spec):
                          ["local",
                           pytest.param("condor", marks=mark.skipif_no_condor)],
                          ids=["sub:local", "sub:condor"])
-def test_orc_datalad_concurrent(job_spec, dataset, ssh, orc_class, sub_type):
-    names = ["paul", "rosa"]
+def test_orc_datalad_concurrent(check_orc_datalad_concurrent,
+                                ssh, orc_class, sub_type):
+    check_orc_datalad_concurrent(ssh, orc_class, sub_type)
 
-    job_spec["inputs"] = ["{p[name]}.in"]
-    job_spec["outputs"] = ["{p[name]}.out"]
-    job_spec["_resolved_command_str"] = "sh -c 'cat {inputs} {inputs} >{outputs}'"
-    job_spec["_resolved_batch_parameters"] = [{"name": n} for n in names]
 
-    in_files = [n + ".in" for n in names]
-    for fname in in_files:
-        with open(op.join(dataset.path, fname), "w") as fh:
-            fh.write(fname[0])
-    dataset.save(path=in_files)
-
-    with chpwd(dataset.path):
-        orc = orc_class(ssh, submission_type=sub_type, job_spec=job_spec)
-        orc.prepare_remote()
-        orc.submit()
-        orc.follow()
-        # Just make sure each fetch() seems to have wired up on_remote_finish.
-        # test_run.py tests the actual --follow actions.
-        remote_fn = MagicMock()
-        orc.fetch(on_remote_finish=remote_fn)
-        remote_fn.assert_called_once_with(orc.resource, [])
-
-        out_files = [n + ".out" for n in names]
-        for ofile in out_files:
-            assert dataset.repo.file_has_content(ofile)
-            with open(ofile) as ofh:
-                assert ofh.read() == ofile[0] * 2
+@pytest.mark.integration
+def test_orc_datalad_concurrent_slurm(check_orc_datalad_concurrent, ssh_slurm):
+    check_orc_datalad_concurrent(ssh_slurm,
+                                 orcs.DataladLocalRunOrchestrator,
+                                 "slurm")

reproman/tests/skip.py

Lines changed: 13 additions & 0 deletions
@@ -113,6 +113,18 @@ def no_singularity():
            not external_versions["cmd:singularity"])
 
 
+def no_slurm():
+    def is_running():
+        # Does it look like tools/ci/setup-slurm-container.sh was called?
+        try:
+            out, _ = Runner().run(
+                ["docker", "port", "reproman-slurm-container"])
+        except CommandError:
+            return False
+        return out.strip()
+    return "slurm container is not running", not is_running()
+
+
 def no_ssh():
     if _on_windows:
         reason = "no ssh on windows"
@@ -140,6 +152,7 @@ def on_windows():
     no_docker_engine,
     no_network,
     no_singularity,
+    no_slurm,
     no_ssh,
     no_svn,
     on_windows,
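
no_slurm follows the same convention as the other helpers in this module: it returns a (reason, condition) pair, and adding it to the list of conditions at the end is what wires it into the skip helpers used by the ssh_slurm fixture above (skipif.no_slurm()). The container check itself just asks Docker whether the test container is up; a rough standalone approximation using only the standard library (the container name comes from tools/ci/setup-slurm-container.sh):

# Rough standalone equivalent of the is_running() check above, using
# subprocess instead of ReproMan's Runner.
import subprocess

def slurm_container_running():
    try:
        out = subprocess.run(
            ["docker", "port", "reproman-slurm-container"],
            check=True, capture_output=True, text=True).stdout
    except (OSError, subprocess.CalledProcessError):
        return False
    # `docker port` prints the published port mappings; empty output means
    # the container publishes nothing.
    return bool(out.strip())

print(slurm_container_running())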

tools/ci/setup-slurm-container.sh

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+set -eu
+
+if ! test -f /tmp/rman-test-ssh-id
+then
+    echo "prep-travis-forssh.sh needs to be executed before this script" >&2
+    exit 1
+fi
+
+cat >>~/.ssh/config <<'EOF'
+
+Host slurm
+HostName localhost
+Port 42241
+User root
+StrictHostKeyChecking no
+IdentityFile /tmp/rman-test-ssh-id
+EOF
+
+docker run --name reproman-slurm-container \
+       -dit -p 42241:22 -h ernie \
+       repronim/reproman-slurm:latest
+
+cat /tmp/rman-test-ssh-id.pub \
+    | docker exec -i reproman-slurm-container \
+          sh -c 'cat >>/root/.ssh/authorized_keys'
+
+# Without the sleep below, the ssh call fails with
+#
+#   ssh_exchange_identification: read: Connection reset by peer
+#
+# A 10 second sleep is probably longer than we need, but a 3 second sleep did
+# not seem to be enough:
+# https://travis-ci.org/ReproNim/reproman/jobs/627568055#L584
sleep 10
+
+echo "DEBUG: test connection to slurm container ..."
+ssh -v slurm exit
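
The script's own comment flags the fixed sleep as a blunt instrument. One common alternative, shown here only as a sketch and not part of the committed script, is to poll until sshd inside the container actually accepts a connection, with an upper bound on attempts:

# Sketch of polling for sshd instead of a fixed sleep (hypothetical helper,
# not part of the script above).  Retries "ssh slurm exit" until it succeeds.
import subprocess
import sys
import time

for attempt in range(30):
    if subprocess.run(["ssh", "-o", "BatchMode=yes", "slurm", "exit"]).returncode == 0:
        break
    time.sleep(1)
else:
    sys.exit("slurm container never became reachable over ssh")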
