Skip to content

Commit cdce122

Browse files
committed
ENH: orchestrators(datalad-pair): Do a more targeted update
datalad-pair's fetch() fetches the job ref from the resource and then relies on 'datalad update' to bring down changes from the remote. If the merged in branch doesn't contain the job ref, then we temporarily switch to the job ref, get the outputs, and then let the caller know that the changes weren't brought in. This approach is problematic with submodules. One issue is that the head switch mentioned above does not check out submodules, and in general DataLad doesn't yet offer a way to set/reload a dataset hierarchy to a particular state. Another issue is that update() selects a merge target from the submodule remote that doesn't necessarily contain the registered commit. See DataLad's 94efa8637a (NF: update: Add option to merge in recorded submodule commit, 2020-02-17) for more information. In what will be the 0.13 release, update() learned a --follow=parentds mode that makes update() take the registered commit as the merge target when updating submodules. Use it. However, this still doesn't deal with bringing a non-mainline job ref into the top-level dataset because update() unconditionally merges in a branch from the remote. Try harder to bring in the job ref by doing an explicit 'git merge' of the ref before the update() call.
1 parent 0cb54a8 commit cdce122

File tree

2 files changed

+121
-9
lines changed

2 files changed

+121
-9
lines changed

reproman/support/jobs/orchestrators.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
import reproman
3737
from reproman.dochelpers import borrowdoc
38+
from reproman.dochelpers import exc_str
3839
from reproman.utils import cached_property
3940
from reproman.utils import chpwd
4041
from reproman.utils import write_update
@@ -922,6 +923,8 @@ def fetch(self, on_remote_finish=None):
922923
will be passed two arguments, the resource and the failed subjobs
923924
(list of ints).
924925
"""
926+
from datalad.support.exceptions import CommandError as DCError
927+
925928
lgr.info("Fetching results for %s", self.jobid)
926929
failed = self.get_failed_subjobs()
927930
resource_name = self.resource.name
@@ -930,10 +933,28 @@ def fetch(self, on_remote_finish=None):
930933
resource_name)
931934
self.ds.repo.fetch(resource_name, "{0}:{0}".format(ref),
932935
recurse_submodules="no")
933-
self.ds.update(sibling=resource_name,
934-
merge=True, recursive=True)
935-
lgr.info("Getting outputs from '%s'", resource_name)
936-
with head_at(self.ds, ref):
936+
failure = False
937+
try:
938+
self.ds.repo.merge(ref)
939+
except DCError as exc:
940+
lgr.warning(
941+
"Failed to merge in changes from %s. "
942+
"Check %s for merge conflicts. %s",
943+
ref, self.ds.path, exc_str(exc))
944+
else:
945+
# Handle any subdataset updates. We could avoid this if we knew
946+
# there were no subdataset changes, but it's probably simplest to
947+
# just unconditionally call update().
948+
for res in self.ds.update(
949+
sibling=resource_name,
950+
merge=True, follow="parentds", recursive=True,
951+
on_failure="ignore"):
952+
if res["status"] == "error":
953+
# DataLad will log failure.
954+
failure = True
955+
956+
if not failure:
957+
lgr.info("Getting outputs from '%s'", resource_name)
937958
outputs = list(self.get_outputs())
938959
if outputs:
939960
self.ds.get(path=outputs)
@@ -944,11 +965,6 @@ def fetch(self, on_remote_finish=None):
944965
lgr.info("Finished with remote resource '%s'", resource_name)
945966
if on_remote_finish:
946967
on_remote_finish(self.resource, failed)
947-
if not self.ds.repo.is_ancestor(ref, "HEAD"):
948-
lgr.info("Results stored on %s. "
949-
"Bring them into this branch with "
950-
"'git merge %s'",
951-
ref, ref)
952968

953969

954970
class FetchDataladRunMixin(object):

reproman/support/jobs/tests/test_orchestrators.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from reproman.consts import TEST_SSH_DOCKER_DIGEST
2020
from reproman.utils import chpwd
2121
from reproman.utils import swallow_logs
22+
from reproman.utils import swallow_outputs
2223
from reproman.resource.shell import Shell
2324
from reproman.support.exceptions import MissingExternalDependency
2425
from reproman.support.exceptions import OrchestratorError
@@ -665,3 +666,98 @@ def test_orc_datalad_pair_submodule(job_spec, dataset, shell):
665666
orc.submit()
666667
orc.follow()
667668
orc.fetch()
669+
670+
671+
def test_orc_datalad_pair_need_follow_parent(job_spec, dataset, shell):
    # Exercise a case that fails unless `datalad update` is called with
    # --follow=parentds: a second job's submodule output must be merged at
    # the commit registered in the parent dataset.
    with chpwd(dataset.path):
        dataset.create("sub")
        dataset.save()

        job_spec["_resolved_command_str"] = "sh -c 'echo baz >baz'"
        job_spec["inputs"] = []
        job_spec["outputs"] = []

        # First job: writes only to the top-level dataset.
        orc0 = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        orc0.prepare_remote()
        orc0.submit()
        orc0.follow()

        # Second job: writes into the subdataset.
        job_spec["_resolved_command_str"] = "sh -c 'echo bar >sub/bar'"
        output = op.join("sub", "bar")
        job_spec["outputs"] = [output]
        orc1 = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        orc1.prepare_remote()
        orc1.submit()
        orc1.follow()
        orc1.fetch()
        # The submodule result should have been brought back locally.
        assert op.exists(output)
698+
699+
def test_orc_datalad_pair_submodule_conflict(caplog, job_spec, dataset, shell):
    # In this scenario, one job modifies a submodule, and before that change,
    # another job is launched that modifies the same submodule. This creates a
    # change that can't be brought in with `datalad update` because, even with
    # --follow=parentds, the top-level repo still brings in changes from the
    # remote, whose master branch points to the first job. In a diagram, the
    # remote state is:
    #
    #         ---- job 1 (master)
    #  base --|
    #         ---- job 2 (detached)
    #
    # On fetch of job 2, we merge the job 2 ref. The `datalad update` call
    # fails trying to merge in master.
    #
    # If this scenario ends up being common enough, we could consider modifying
    # `datalad update` to optionally not try to merge the remote state of the
    # top-level repo.
    with chpwd(dataset.path):
        dataset.create("sub")
        dataset.save()

        job_spec["_resolved_command_str"] = "sh -c 'echo baz >sub/baz'"
        job_spec["inputs"] = []
        job_spec["outputs"] = []

        # Job 1: lands on the remote's master branch.
        orc0 = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        orc0.prepare_remote()
        orc0.submit()
        orc0.follow()

        # Job 2: a sibling change to the same submodule, detached on the remote.
        job_spec["_resolved_command_str"] = "sh -c 'echo bar >sub/bar'"
        orc1 = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        orc1.prepare_remote()
        orc1.submit()
        orc1.follow()
        # swallow_logs() won't work here because it hard codes the logger and
        # the log message being checked is bubbled up by DataLad.
        caplog.clear()
        with caplog.at_level(logging.ERROR):
            orc1.fetch()
        assert "CONFLICT" in caplog.text
        # The working tree is left with unmerged entries for inspection.
        assert dataset.repo.call_git(["ls-files", "--unmerged"]).strip()
745+
746+
def test_orc_datalad_pair_merge_conflict(job_spec, dataset, shell):
    # A local commit that conflicts with the job ref should make fetch()
    # warn rather than silently merge, leaving the conflict for the user.
    with chpwd(dataset.path):
        job_spec["_resolved_command_str"] = "sh -c 'echo baz >baz'"
        job_spec["inputs"] = []
        job_spec["outputs"] = []

        orc0 = orcs.DataladPairOrchestrator(
            shell, submission_type="local", job_spec=job_spec)
        orc0.prepare_remote()
        orc0.submit()
        orc0.follow()
        # Introduce a conflict.
        (dataset.pathobj / "baz").write_text("different")
        dataset.save()
        with swallow_logs(new_level=logging.WARNING) as logs:
            orc0.fetch()
            assert "Failed to merge in changes" in logs.out
        # Unmerged index entries confirm the merge conflicted.
        assert dataset.repo.call_git(["ls-files", "--unmerged"]).strip()

0 commit comments

Comments
 (0)