
Commit 5f906d2

Merge pull request #139 from shazj99/dp_on_yarn
DP on yarn
2 parents f34d7fb + 65e3351 commit 5f906d2

File tree

19 files changed: +1029 -1 lines changed


doc/dpdispatcher_on_yarn.md

Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
# Support DPDispatcher on Yarn

## Background

Currently, DPGen (and other DP software) supports HPC systems such as Slurm, PBS, and LSF, as well as cloud machines. In order to run DPGen jobs on ByteDance's internal platform, we need to extend it to support YARN resources. The Hadoop ecosystem is a widely used platform for processing big data, and while developing the new interface we found that it can be implemented with open-source Hadoop components alone. For the convenience of other users, we decided to contribute the code to the open-source community.

## Design

We use DistributedShell and HDFS to implement it. The control flow is shown below:

![image](https://github.com/shazj99/dpdispatcher/blob/yarn/doc/dpgen_yarn.jpg?raw=true)

- Use DistributedShell to submit YARN jobs. This involves generating a shell script and submitting it to a YARN queue.
- Use HDFS to store input files and output results. For performance reasons, we pack the forward files into a tar.gz archive and upload it to an HDFS directory. Accordingly, the task downloads the archive before running and uploads a result archive to HDFS after it finishes (see the sketch after this list).
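The pack-and-upload step can be pictured with a minimal sketch, which is not part of this patch: the helper name `pack_and_upload`, the example task directory and file names are hypothetical, and the `hadoop` CLI is assumed to be on `PATH`. In the patch itself, `HDFSContext.upload` plays this role for a whole submission.

```
import os
import subprocess
import tarfile

def pack_and_upload(task_dir, forward_files, hdfs_root):
    """Pack the forward files of one task directory into a .tgz and push it to HDFS."""
    tgz_name = "%s_upload.tgz" % os.path.basename(os.path.abspath(task_dir))
    with tarfile.open(tgz_name, "w:gz") as tar:
        for fname in forward_files:
            # keep the task directory in the archive path so the container can untar in place
            tar.add(os.path.join(task_dir, fname))
    # `hadoop fs -put -f` overwrites any existing archive under the HDFS root directory
    subprocess.check_call(["hadoop", "fs", "-put", "-f", tgz_name, hdfs_root])

# e.g. pack_and_upload("sys-0001-0015", ["INCAR", "POSCAR", "POTCAR"], "hdfs://path/to/remote/root")
```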

## Implementation

We only need to add two classes, `HDFSContext` and `DistributedShell`:

```
class HDFSContext(BaseContext):
    def upload(self, job_dirs, local_up_files):
        """ upload forward files and forward command files to HDFS root dir

        Parameters
        ----------
        job_dirs : list
            the directories that contain the files to upload
        local_up_files : list
            the file names which will be uploaded

        Returns
        -------
        none
        """
        pass

    def download(self, submission):
        """ download backward files from HDFS root dir

        Parameters
        ----------
        submission : Submission class instance
            represents a collection of tasks, including the backward file names

        Returns
        -------
        none
        """
        pass

    def check_file_exists(self, fname):
        """ check whether the given file exists, often used to check whether the corresponding job has finished

        Parameters
        ----------
        fname : string
            file name to be checked

        Returns
        -------
        status : boolean
        """
        pass
```
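As a point of reference, `check_file_exists` can be backed by `hadoop fs -test -e`, the same command the generated script below uses to probe finish tags. A minimal sketch, for illustration only and not the exact code of this patch (the `hadoop` CLI is assumed to be on `PATH`):

```
import subprocess

def hdfs_file_exists(path):
    # `hadoop fs -test -e` exits with code 0 when the given path exists on HDFS
    return subprocess.call(["hadoop", "fs", "-test", "-e", path]) == 0

# e.g. hdfs_file_exists("/root/uuid/uuid_tag_finished")
```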

```
class DistributedShell(Machine):
    def do_submit(self, job):
        """ submit the job to yarn using distributed shell

        Parameters
        ----------
        job : Job class instance
            job to be submitted

        Returns
        -------
        job_id : string
            usually a yarn application id
        """
        pass

    def check_status(self, job):
        """ check the yarn job status

        Parameters
        ----------
        job : Job class instance
            the submitted job

        Returns
        -------
        status : JobStatus
        """
        pass

    def gen_script_command(self, job):
        """ generate the shell script to be executed in the DistributedShell container

        Parameters
        ----------
        job : Job class instance
            the submitted job

        Returns
        -------
        script : string
            script command string
        """
        pass
```
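The status check combines two signals: the finish tag file on HDFS and the liveness of the local submit process. A simplified sketch of that decision logic follows; the real implementation is in `dpdispatcher/distributed_shell.py` below, and `finish_tag_exists` / `submit_process_alive` are hypothetical boolean inputs.

```
from dpdispatcher.JobStatus import JobStatus

def resolve_status(finish_tag_exists, submit_process_alive):
    # the finish tag is only written after the result archive has been uploaded,
    # so it takes precedence over the state of the submit process
    if finish_tag_exists:
        return JobStatus.finished
    # no tag yet: the job is running as long as the submit process is alive
    if submit_process_alive:
        return JobStatus.running
    return JobStatus.terminated
```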

The following is an example of a generated shell script. It will be executed in a YARN container:
```
#!/bin/bash

## set environment variables
source /opt/intel/oneapi/setvars.sh

## download the tar file from HDFS which contains the forward files
if ! ls uuid_upload_*.tgz 1>/dev/null 2>&1; then
  hadoop fs -get /root/uuid/uuid_upload_*.tgz .
fi
for tgz_file in `ls *.tgz`; do tar xvf $tgz_file; done

## check whether the task has already finished successfully
hadoop fs -test -e /root/uuid/sys-0001-0015/tag_0_finished
{ if [ ! $? -eq 0 ]; then
  cur_dir=`pwd`
  cd sys-0001-0015
  test $? -ne 0 && exit 1

  ## do your job here
  mpirun -n 32 vasp_std 1>> log 2>> err

  if test $? -ne 0; then
    exit 1
  else
    hadoop fs -touchz /root/uuid/sys-0001-0015/tag_0_finished
  fi
  cd $cur_dir
  test $? -ne 0 && exit 1
fi; } &

wait

## upload result files to hdfs
tar czf uuid_download.tar.gz sys-0001-0015
hadoop fs -put -f uuid_download.tar.gz /root/uuid/sys-0001-0015

## mark that the job has finished
hadoop fs -touchz /root/uuid/uuid_tag_finished
```
An example of machine.json is as follows, where `batch_type` is `DistributedShell` and `context_type` is `HDFSContext`:

```
"fp": [
    {
        "command": "mpirun -n 32 vasp_std",
        "machine": {
            "batch_type": "DistributedShell",
            "context_type": "HDFSContext",
            "local_root": "./",
            "remote_root": "hdfs://path/to/remote/root"
        },
        "resources": {
            "number_node": 1,
            "cpu_per_node": 32,
            "gpu_per_node": 0,
            "queue_name": "queue_name",
            "group_size": 1,
            "source_list": ["/opt/intel/oneapi/setvars.sh"],
            "kwargs": {
                "img_name": "",
                "mem_limit": 32,
                "yarn_path": "/path/to/yarn/jars"
            },
            "envs": {
                "HADOOP_HOME": "${HADOOP_HOME:/path/to/hadoop/bin}",
                "CLASSPATH": "`${HADOOP_HOME}/bin/hadoop classpath --glob`",
                "PATH": "${HADOOP_HOME}/bin:${PATH}"
            }
        }
    }
]
```
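With such a configuration, jobs can be driven through the usual dpdispatcher workflow. A rough usage sketch, assuming the standard `Machine`/`Resources`/`Task`/`Submission` API of dpdispatcher; the task directory `sys-0001-0015` and its forward/backward file names are placeholders:

```
from dpdispatcher.machine import Machine
from dpdispatcher.submission import Resources, Submission, Task

# machine and resources mirror the machine.json example above
machine = Machine.load_from_dict({
    "batch_type": "DistributedShell",
    "context_type": "HDFSContext",
    "local_root": "./",
    "remote_root": "hdfs://path/to/remote/root",
})
resources = Resources.load_from_dict({
    "number_node": 1, "cpu_per_node": 32, "gpu_per_node": 0,
    "queue_name": "queue_name", "group_size": 1,
    "source_list": ["/opt/intel/oneapi/setvars.sh"],
    "kwargs": {"img_name": "", "mem_limit": 32, "yarn_path": "/path/to/yarn/jars"},
})

# one VASP task per system directory; file names are placeholders
task = Task(command="mpirun -n 32 vasp_std",
            task_work_path="sys-0001-0015/",
            forward_files=["INCAR", "POSCAR", "POTCAR"],
            backward_files=["OUTCAR", "log", "err"])

submission = Submission(work_base="./", machine=machine, resources=resources,
                        task_list=[task])
submission.run_submission()
```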

dpdispatcher/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -47,12 +47,14 @@
 from .shell import Shell
 from .lsf import LSF
 from .dp_cloud_server import DpCloudServer
+from .distributed_shell import DistributedShell
 from .machine import Machine

 from .lazy_local_context import LazyLocalContext
 from .local_context import LocalContext
 from .ssh_context import SSHContext
 from .dp_cloud_server_context import DpCloudServerContext
+from .hdfs_context import HDFSContext

 def info():
     """

dpdispatcher/distributed_shell.py

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
from dpdispatcher.JobStatus import JobStatus
from dpdispatcher import dlog
from dpdispatcher.machine import Machine
from dpdispatcher.utils import run_cmd_with_all_output
import subprocess as sp


shell_script_header_template="""
#!/bin/bash -l
set -x
"""

script_env_template="""
{module_unload_part}
{module_load_part}
{source_files_part}
{export_envs_part}

REMOTE_ROOT=`pwd`
echo 0 > {flag_if_job_task_fail}
test $? -ne 0 && exit 1

if ! ls {submission_hash}_upload.tgz 1>/dev/null 2>&1; then
    hadoop fs -get {remote_root}/*.tgz .
fi
for TGZ in `ls *.tgz`; do tar xvf $TGZ; done

"""

script_end_template="""
cd $REMOTE_ROOT
test $? -ne 0 && exit 1

wait
FLAG_IF_JOB_TASK_FAIL=$(cat {flag_if_job_task_fail})
if test $FLAG_IF_JOB_TASK_FAIL -eq 0; then
    tar czf {submission_hash}_{job_hash}_download.tar.gz {all_task_dirs}
    hadoop fs -put -f {submission_hash}_{job_hash}_download.tar.gz {remote_root}
    hadoop fs -touchz {remote_root}/{job_tag_finished}
else
    exit 1
fi
"""


class DistributedShell(Machine):
    def gen_script_env(self, job):
        source_files_part = ""

        module_unload_part = ""
        module_unload_list = job.resources.module_unload_list
        for ii in module_unload_list:
            module_unload_part += f"module unload {ii}\n"

        module_load_part = ""
        module_list = job.resources.module_list
        for ii in module_list:
            module_load_part += f"module load {ii}\n"

        source_list = job.resources.source_list
        for ii in source_list:
            line = "{ source %s; } \n" % ii
            source_files_part += line

        export_envs_part = ""
        envs = job.resources.envs
        for k, v in envs.items():
            export_envs_part += f"export {k}={v}\n"

        flag_if_job_task_fail = job.job_hash + '_flag_if_job_task_fail'

        script_env = script_env_template.format(
            flag_if_job_task_fail=flag_if_job_task_fail,
            module_unload_part=module_unload_part,
            module_load_part=module_load_part,
            source_files_part=source_files_part,
            export_envs_part=export_envs_part,
            remote_root=self.context.remote_root,
            submission_hash=self.context.submission.submission_hash,
        )
        return script_env

    def gen_script_end(self, job):
        all_task_dirs = ""
        for task in job.job_task_list:
            all_task_dirs += "%s " % task.task_work_path
        job_tag_finished = job.job_hash + '_job_tag_finished'
        flag_if_job_task_fail = job.job_hash + '_flag_if_job_task_fail'
        script_end = script_end_template.format(
            job_tag_finished=job_tag_finished,
            flag_if_job_task_fail=flag_if_job_task_fail,
            all_task_dirs=all_task_dirs,
            remote_root=self.context.remote_root,
            submission_hash=self.context.submission.submission_hash,
            job_hash=job.job_hash
        )
        return script_end

    def gen_script_header(self, job):
        shell_script_header = shell_script_header_template
        return shell_script_header

    def do_submit(self, job):
        """ submit the job to yarn using distributed shell

        Parameters
        ----------
        job : Job class instance
            job to be submitted

        Returns
        -------
        job_id : string
            submit process id
        """

        script_str = self.gen_script(job)
        script_file_name = job.script_file_name
        job_id_name = job.job_hash + '_job_id'
        output_name = job.job_hash + '.out'
        self.context.write_file(fname=script_file_name, write_str=script_str)

        # build the DistributedShell client command; queue, docker image, memory
        # and vcores come from the job resources
        resources = job.resources
        submit_command = 'hadoop jar %s/hadoop-yarn-applications-distributedshell-*.jar ' \
                'org.apache.hadoop.yarn.applications.distributedshell.Client ' \
                '-jar %s/hadoop-yarn-applications-distributedshell-*.jar ' \
                '-queue %s -appname "distributedshell_dpgen_%s" ' \
                '-shell_env YARN_CONTAINER_RUNTIME_TYPE=docker ' \
                '-shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=%s ' \
                '-shell_env ENV_DOCKER_CONTAINER_SHM_SIZE=\'600m\' ' \
                '-master_memory 1024 -master_vcores 2 -num_containers 1 ' \
                '-container_resources memory-mb=%s,vcores=%s ' \
                '-shell_script /tmp/%s' % (resources.kwargs.get('yarn_path', ''),
                    resources.kwargs.get('yarn_path', ''), resources.queue_name, job.job_hash,
                    resources.kwargs.get('img_name', ''), resources.kwargs.get('mem_limit', 1) * 1024,
                    resources.cpu_per_node, script_file_name)

        # run the client in the background and record its local process id as the job id
        cmd = '{ nohup %s 1>%s 2>%s & } && echo $!' % (submit_command, output_name, output_name)
        ret, stdout, stderr = run_cmd_with_all_output(cmd)

        if ret != 0:
            err_str = stderr.decode('utf-8')
            raise RuntimeError\
                ("Command fails to execute, error message:%s\nreturn code %d\n" % (err_str, ret))
        job_id = int(stdout.decode('utf-8').strip())

        self.context.write_file(job_id_name, str(job_id))
        return job_id

    def check_status(self, job):
        job_id = job.job_id
        if job_id == '':
            return JobStatus.unsubmitted

        # the job id is the pid of the local submit process; check whether it is still alive
        ret, stdout, stderr = run_cmd_with_all_output(f"if ps -p {job_id} > /dev/null; then echo 1; fi")
        if ret != 0:
            err_str = stderr.decode('utf-8')
            raise RuntimeError \
                ("Command fails to execute, error message:%s\nreturn code %d\n" % (err_str, ret))

        if_job_exists = bool(stdout.decode('utf-8').strip())
        # the finish tag file on HDFS is authoritative for successful completion
        if self.check_finish_tag(job=job):
            dlog.info(f"job: {job.job_hash} {job.job_id} finished")
            return JobStatus.finished

        if if_job_exists:
            return JobStatus.running
        else:
            return JobStatus.terminated

    def check_finish_tag(self, job):
        job_tag_finished = job.job_hash + '_job_tag_finished'
        return self.context.check_file_exists(job_tag_finished)
