rokroskar
diff --git a/‎.gitignore‎
Lines changed: 9 additions & 0 deletions b/‎.gitignore‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 4 additions & 4 deletions b/‎README.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎scripts/hpcnotebook‎
Lines changed: 47 additions & 24 deletions b/‎scripts/hpcnotebook‎
Lines changed: 47 additions & 24 deletions
diff --git a/‎scripts/sparkcluster‎
Lines changed: 35 additions & 16 deletions b/‎scripts/sparkcluster‎
Lines changed: 35 additions & 16 deletions
diff --git a/‎sparkhpc/__init__.py‎
Lines changed: 7 additions & 60 deletions b/‎sparkhpc/__init__.py‎
Lines changed: 7 additions & 60 deletions
diff --git a/‎sparkhpc/lsfsparkjob.py‎
Lines changed: 19 additions & 0 deletions b/‎sparkhpc/lsfsparkjob.py‎
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,9 @@
+*pyc
+*~
+*.log
+_build
+build
+.ipynb_checkpoints
+dist
+*egg-info
+job
@@ -84,16 +84,16 @@ Job <31463649> is being terminated
 ### Python code
 
 ```python
-from sparkhpc.sparkjob import LSFSparkJob
+from sparkhpc import sparkjob
 import findspark 
 findspark.init() # this sets up the paths required to find spark libraries
 import pyspark
 
-sj = LSFSparkJob(ncores=10)
+sj = sparkjob.sparkjob(ncores=10)
 
 sj.wait_to_start()
 
-sc = pyspark.SparkContext(master=sj.master_url)
+sc = sj.start_spark()
 
 sc.parallelize(...)
 ```
@@ -125,7 +125,7 @@ Currently only LSF is supported. However, adding support for other schedulers is
 
 To implement support for a new scheduler you should subclass `SparkCluster`. You must define the following *class* variables: 
 
-* `_peek_command` (command to get stdout of current job)
+* `_peek()` (function to get stdout of the current job)
 * `_submit_command` (command to submit a job to the scheduler)
 * `_job_regex` (regex to get the job ID from return string of submit command)
 * `_kill_command` (scheduler command to kill a job)
 
@@ -11,6 +11,37 @@ import shutil
 import click
 import pkg_resources
 
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger('hpcnotebook')
+
+# try to figure out which scheduler we have
+def which(program):
+    """Locate an executable, similar to the shell's ``which``.
+
+    If ``program`` contains a directory component it is checked as given;
+    otherwise every entry of the ``PATH`` environment variable is searched
+    in order.
+
+    Returns the path of the first executable regular file found, or
+    ``None`` when nothing matches (including when ``PATH`` is not set).
+    """
+    import os
+
+    def is_exe(fpath):
+        # must be a regular file that the current user is allowed to execute
+        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
+
+    if os.path.dirname(program):
+        # an explicit location was given: do not consult PATH
+        if is_exe(program):
+            return program
+    else:
+        # which() is called while this module is being imported, so a
+        # missing PATH must yield "not found" rather than a KeyError
+        search_path = os.environ.get("PATH")
+        if search_path is None:
+            return None
+        for path in search_path.split(os.pathsep):
+            # drop double quotes surrounding a PATH entry
+            path = path.strip('"')
+            exe_file = os.path.join(path, program)
+            if is_exe(exe_file):
+                return exe_file
+
+    return None
+
+# Pick the scheduler by probing for its queue-listing command on PATH.
+# HOST_ENVIRON names the environment variable that is looked up later to
+# decide whether we are running on a compute node.
+if which('bjobs') is not None:
+    SCHEDULER = 'LSF'
+    HOST_ENVIRON = 'LSB_HOSTS'
+elif which('squeue') is not None:
+    SCHEDULER = 'SLURM'
+    HOST_ENVIRON = 'SLURM_NODELIST'
+else:
+    # No supported scheduler was found: still bind both names so later
+    # uses do not fail with NameError.  An empty variable name is never
+    # present in os.environ, so the compute-node check evaluates to False.
+    SCHEDULER = None
+    HOST_ENVIRON = ''
+
 #
 # Initial inspiration for this script from https://github.com/felixcheung/vagrant-projects
 #
@@ -43,15 +74,6 @@ home = expanduser("~")
 # setup the path to the jupyter notebook configuration
 jupyter_config_path = "{home}/.jupyter_notebook".format(home=home)
 
-def which(name):
-    """helper function to find whether a command exists"""
-    found = 0 
-    for path in os.getenv("PATH").split(os.path.pathsep):
-        full_path = path + os.sep + name
-        if os.path.exists(full_path):
-            found = full_path
-    return found
-
 @click.group()
 @click.option('--port', default=8889, help='Port for the notebook server')
 @click.pass_context
@@ -75,13 +97,16 @@ def setup(ctx, force):
         # get a password
         from notebook.auth import passwd
 
-        print(bc.WARNING + '[hpcnotebook] '+bc.ENDC+'This script will create a Jupyter notebook profile for working remotely')
-        print(bc.WARNING + '[hpcnotebook] '+bc.ENDC+'When it is finished, you can find the configuration in %s\n'%(bc.UNDERLINE + jupyter_config_path + bc.ENDC))
-        print(bc.WARNING + '[hpcnotebook] '+bc.ENDC+'First, we need a *new* password for your Jupyter notebook\n')
+        logger.info("""
+            This script will create a Jupyter notebook profile for working remotely.
+            When it is finished, you can find the configuration in {conf_path}.
+            First, please enter a *new* password for your Jupyter notebook:
+            """.format(conf_path=bc.UNDERLINE + jupyter_config_path + bc.ENDC))
+        
         new_pass = passwd()
 
-        print(bc.WARNING + '[hpcnotebook] '+bc.ENDC+'Creating an SSL certificate to enable a secure connection; the certificate will be in your ~/.ssh directory\n')
-
+        logger.info("Creating an SSL certificate to enable a secure connection; the certificate will be in your ~/.ssh directory")
+        
         # make sure the .ssh directory is there before continuing
         sshdir = '{home}/.ssh'.format(home=home)
         if not os.path.exists(sshdir):
@@ -95,7 +120,7 @@ def setup(ctx, force):
 
         lines = out.split('\n')
         for l in lines : 
-            print(bc.OKGREEN + '[openssl] ' + bc.ENDC + l)
+            logger.info(bc.OKGREEN + '[openssl] ' + bc.ENDC + l)
 
         # write the notebook config
 
@@ -105,10 +130,10 @@ def setup(ctx, force):
             f.write(notebook_config_template.format(
                 password=new_pass, certfile=certfile, port=port))
 
-        print(bc.WARNING + '[hpcnotebook] '+ bc.BOLD + 'Notebook setup complete' + bc.ENDC)
+        logger.info(bc.BOLD + 'Notebook setup complete' + bc.ENDC)
 
     else:
-        print(bc.FAIL + "The jupyter notebook already looks set up; if you want to force setup, use --force".format(dir=jupyter_config_path) + bc.ENDC)
+        logger.error(bc.FAIL + "The jupyter notebook already looks set up; if you want to force setup, use --force".format(dir=jupyter_config_path) + bc.ENDC)
 
 @cli.command()
 @click.pass_context
@@ -127,11 +152,11 @@ def launch(ctx):
         conf_port = int(re.findall('port = (\d+)', conf.read())[0])
 
     if conf_port != port:
-        print(bc.WARNING + "Overriding the port found in the existing configuration" + bc.ENDC)
+        logger.warning(bc.WARNING + "Overriding the port found in the existing configuration" + bc.ENDC)
         argv.append('--port={port}'.format(port=port))
 
     # determine if we're running on a compute node
-    if 'LSB_HOSTS' in os.environ:
+    if HOST_ENVIRON in os.environ:
         compute = True
     else:
         compute = False
@@ -143,7 +168,7 @@ def launch(ctx):
     else:
         ip = 'localhost'
 
-    print(bc.BOLD + "To access the notebook, inspect the output below for the port number, then point your browser to https://{ip}:<port_number>".format(ip=ip) + bc.ENDC)
+    logger.info(bc.BOLD + "To access the notebook, inspect the output below for the port number, then point your browser to https://{ip}:<port_number>".format(ip=ip) + bc.ENDC)
     sys.stdout.flush()
     launch_new_instance()
 
@@ -156,10 +181,8 @@ def launch(ctx):
 @click.pass_context
 def submit(ctx, ncores, walltime, memory, template, jobname):
     """Submit a notebook job to the scheduler"""
-    if which('bsub'): 
-        scheduler='LSF'
-    else: 
-        raise RuntimeWarning('bsub not found and no other schedulers are supported')
+    if SCHEDULER != 'LSF':
+        raise RuntimeError('only the LSF scheduler is supported at the moment')
 
     if template is None: 
         template_file = templates[scheduler]
 
@@ -7,6 +7,7 @@
 
 from __future__ import print_function
 import click
+import sparkhpc
 from sparkhpc import sparkjob
 import subprocess
 import os
@@ -17,30 +18,52 @@ logger = logging.getLogger('sparkhpc')
 
 home = os.path.expanduser('~')
 
+SCHEDULER = sparkjob.get_scheduler()
+
 @click.group()
-@click.option('--scheduler', type=click.Choice(['lsf', 'slurm']), default='lsf', help='Which scheduler to use')
+@click.option('--scheduler', type=click.Choice(['lsf', 'slurm']), default=SCHEDULER, help='Which scheduler to use')
 @click.pass_context
 def cli(ctx, scheduler):
-    ctx.obj['SJ'] = sparkjob._sparkjob_factory(scheduler)
+    ctx.obj['SJ'] = sparkjob.sparkjob
 
 
 @cli.command()
-@click.argument('ncores')
+@click.argument('ncores', type=int)
 @click.option('--walltime', default="00:30", help="Walltime in HH:MM format")
 @click.option('--jobname', default='sparkcluster', help='Name to use for the job')
 @click.option('--template', default=None, help='Job template path')
-@click.option('--memory', default='2000', envvar='SPARK_WORKER_MEMORY',
-              help='Memory for each worker in MB')
+@click.option('--memory-per-executor', default=2000, envvar='SPARK_EXECUTOR_MEMORY',
+              help='Memory to reserve for each executor (i.e. the JVM) in MB')
+@click.option('--memory-per-core', default=2000,
+              help='Memory per core to request from scheduler in MB')
+@click.option('--cores-per-executor', default=1,
+              help='Cores per executor')
 @click.option('--spark-home', default=os.path.join(home,'spark'), envvar='SPARK_HOME', 
               help='Location of the Spark distribution')
 @click.option('--wait', default=False, is_flag=True, help='Wait until the job starts')
 @click.pass_context
-def start(ctx, ncores, walltime, jobname, template, memory, spark_home, wait):
+def start(ctx, 
+          ncores, 
+          walltime, 
+          jobname, 
+          template, 
+          memory_per_executor, 
+          memory_per_core, 
+          cores_per_executor,
+          spark_home, 
+          wait):
     """Start the spark cluster as a batch job"""
 
     SJ = ctx.obj['SJ']
 
-    sj = SJ(ncores=ncores, walltime=walltime, jobname=jobname, template=template, memory=memory, spark_home=spark_home)
+    sj = SJ(ncores=ncores, 
+            walltime=walltime, 
+            jobname=jobname, 
+            template=template, 
+            memory_per_core=memory_per_core,
+            memory_per_executor=memory_per_executor, 
+            cores_per_executor=cores_per_executor,
+            spark_home=spark_home)
 
     if wait: 
         logger.info(' Waiting for job to start - ctrl-c to stop')
@@ -53,13 +76,8 @@ def start(ctx, ncores, walltime, jobname, template, memory, spark_home, wait):
 @click.pass_context
 def info(ctx):
     """Get info about currently running clusters"""
-    SJ = ctx.obj['SJ']
-    sjs = SJ.current_clusters()
+    sparkhpc.show_clusters()
 
-    if len(sjs)>0:
-        sjs[0].show_clusters()
-    else: 
-        logger.info(' No spark clusters running')
 
 @cli.command()
 @click.argument('clusterid')
@@ -81,12 +99,13 @@ def stop(ctx, clusterid):
 
 
 @cli.command()
-@click.option('--memory', default='2000', help='Memory for each worker in MB')
+@click.option('--memory', default='2000M', help='Memory for each executor using a Java memory string')
 @click.option('--timeout', default=30, help='Timeout for starting spark master')
+@click.option('--cores-per-executor', default=1, help='Number of cores per executor')
 @click.pass_context
-def launch(ctx, memory, timeout):
+def launch(ctx, memory, timeout, cores_per_executor):
     """Launch the Spark master and workers within a current job context"""
-    sparkjob.start_cluster(memory,timeout)
+    sparkjob.start_cluster(memory, timeout=timeout, cores_per_executor=cores_per_executor)
 
 if __name__ == "__main__":
     cli(obj={})
@@ -1,67 +1,14 @@
 from __future__ import print_function
 import os, sys
 import logging
+# explicit relative imports: the implicit form is only resolved on Python 2
+from . import sparkjob
+from . import lsfsparkjob
+from .lsfsparkjob import LSFSparkJob
+from .slurmsparkjob import SLURMSparkJob
+
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-from . import sparkjob
-
-try : 
-    def start_spark(master = 'local[*]', 
-                    spark_conf='./spark_conf', 
-                    executor_memory=None,
-                    profiling=False, 
-                    graphframes_package='graphframes:graphframes:0.3.0-spark2.0-s_2.11', 
-                    extra_conf = None):
-        """Launch a SparkContext 
-        
-        Inputs
-        ------
-
-        master : URL to spark master in the form 'spark://<master>:<port>'
-        
-        spark_conf : path to a spark configuration directory
-
-        executor_memory : executor memory in java memory string format, e.g. '4G'
-
-        profiling: whether to turn on python profiling or not
-
-        graphframes_package : which graphframes to load
-        """
-
-        os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {graphframes_package} pyspark-shell"\
-                                            .format(graphframes_package=graphframes_package)
-        
-        os.environ['SPARK_CONF_DIR'] = os.path.realpath(spark_conf)
-
-        os.environ['PYSPARK_PYTHON'] = sys.executable
-
-        from pyspark import SparkContext, SparkConf
-
-        conf = SparkConf()
-
-        conf.set('spark.driver.maxResultSize', '0')
-
-        if executor_memory is not None: 
-            conf.set('spark.executor.memory', executor_memory)
-        if profiling: 
-            conf.set('spark.python.profile', 'true')
-        else:
-            conf.set('spark.python.profile', 'false')
-        
-        if extra_conf is not None: 
-            for k,v in extra_conf.iteritems(): 
-                conf.set(k,v)
-
-        sc = SparkContext(master=master, conf=conf)
-
-        return sc
-
-    def get_sqc(sc):
-        from pyspark.sql import SQLContext
-
-        return SQLContext(sc)
-
-except ImportError: 
-    logger.warning('Could not import pyspark -- make sure SPARK_HOME is set')
+def show_clusters():
+    """Call ``show_clusters()`` on a default-constructed ``sparkjob.sparkjob``."""
+    job = sparkjob.sparkjob()
+    job.show_clusters()
@@ -0,0 +1,19 @@
+import os
+import time
+from  .sparkjob import SparkJob
+import re
+import subprocess
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger('sparkhpc.lsfsparkjob')
+
+class LSFSparkJob(SparkJob):
+    """Class for submitting spark jobs with the LSF scheduler"""
+    # command to submit a job to the scheduler
+    # (presumably %s is replaced with the job script path -- confirm in SparkJob)
+    _submit_command = 'bsub < %s'
+    # regex to get the job ID from the return string of the submit command;
+    # raw string so that \d is not treated as an (invalid) string escape
+    _job_regex = r'Job <(\d+)>'
+    # scheduler command to kill a job
+    _kill_command = 'bkill'
+    # bjobs invocation restricted to the job_name, stat and jobid columns
+    _get_current_jobs = 'bjobs -o "job_name stat jobid"'
+
+    def _peek(self):
+        """Return the output of ``bpeek`` for this job's ``jobid``.
+
+        Raises ``subprocess.CalledProcessError`` if ``bpeek`` exits with a
+        non-zero status.
+        """
+        return subprocess.check_output(["bpeek", str(self.jobid)])
-Original file line number
+Diff line change
@@ @@ -0,0 +1,9 @@ @@
 +*pyc
 +*~
 +*.log
 +_build
 +build
 +.ipynb_checkpoints
 +dist
 +*egg-info
 +job