173 changes: 151 additions & 22 deletions bin/run-detectors-timelines.sh
@@ -3,6 +3,12 @@
set -e
set -u
source $(dirname $0)/environ.sh
# constants ############################################################
# slurm settings
SLURM_MEMORY=1500
SLURM_TIME=10:00:00
SLURM_LOG=/farm_out/%u/%x-%A_%a
########################################################################

# default options
match="^"
@@ -12,7 +18,7 @@ outputDir=""
numThreads=8
singleTimeline=""
declare -A modes
for key in list build skip-mya focus-timelines focus-qa run-slurm submit-slurm after-slurm debug help; do
modes[$key]=false
done

@@ -55,6 +61,16 @@ usage() {
--focus-timelines only produce the detector timelines, do not run detector QA code
--focus-qa only run the QA code (assumes you have detector timelines already)

*** EXECUTION CONTROL OPTIONS: choose at most one; with --run-slurm, the default is to
generate a Slurm job description and print the suggested \`sbatch\` command

--run-slurm run timelines on SLURM instead of running multi-threaded locally
--submit-slurm submit the slurm jobs, rather than just
printing the \`sbatch\` command
--after-slurm organize timelines **after** running them with --run-slurm
**Note:** if this option is not used after running on Slurm,
all files in the output directories will be removed.

--debug enable debug mode: run a single timeline with stderr and stdout printed to screen;
it is best to use this with the '-t' option to debug specific timeline issues

@@ -91,6 +107,10 @@ if ${modes['help']}; then
usage
exit 101
fi
if (${modes['run-slurm']} && ${modes['debug']}); then
echo "ERROR: --run-slurm and --debug are mutually exclusive" >&2
exit 100
fi

# set class path to include groovy's classpath, for `java` calls
export CLASSPATH="$JYPATH${CLASSPATH:+:${CLASSPATH}}"
@@ -177,13 +197,13 @@ detDirs=(
trigger
)

# cleanup output directories IF you are not just organizing files after running on SLURM
if (${modes['focus-all']} || ${modes['focus-timelines']}) && ! ${modes['after-slurm']}; then
if [ -d $finalDirPreQA ]; then
rm -rv $finalDirPreQA
fi
fi
if [ -d $logDir ] && ! ${modes['after-slurm']}; then
for fail in $(find $logDir -name "*.fail"); do
rm $fail
done
@@ -231,26 +251,135 @@ if ${modes['focus-all']} || ${modes['focus-timelines']}; then
done

# produce timelines, multithreaded
if (! ${modes['run-slurm']} || ${modes['debug']}) && ! ${modes['after-slurm']}; then
job_ids=()
job_names=()
for timelineObj in $timelineList; do
logFile=$logDir/$timelineObj
[ -n "$singleTimeline" -a "$timelineObj" != "$singleTimeline" ] && continue
echo ">>> producing timeline '$timelineObj' ..."
if ${modes['debug']}; then
java $TIMELINE_JAVA_OPTS $run_detectors_script $timelineObj $inputDir
echo "PREMATURE EXIT, since --debug option was used"
exit
else
#sleep 1
java $TIMELINE_JAVA_OPTS $run_detectors_script $timelineObj $inputDir > $logFile.out 2> $logFile.err || touch $logFile.fail &
job_ids+=($!)
job_names+=($timelineObj)
fi
wait_for_jobs $numThreads
done

wait_for_jobs 0

fi # condition end: produce timelines, multi-threaded

# produce timelines, distributed on SLURM
if ${modes['run-slurm']} && ! ${modes['after-slurm']}; then

# initial checks and preparations
echo $dataset | grep -q "/" && printError "dataset name must not contain '/' " && echo && exit 100
[ -z "$dataset" ] && printError "dataset name must not be empty" && echo && exit 100
slurmJobName=clas12-timeline--$dataset

# start job lists
echo """
Generating job scripts..."""
slurmDir=$outputDir/slurm/step2
mkdir -p $slurmDir/scripts
jobkeys=()
for timelineObj in $timelineList; do
[ -n "$singleTimeline" -a "$timelineObj" != "$singleTimeline" ] && continue
jobkeys+=($timelineObj)
done
#NOTE: run-monitoring.sh creates a separate list for each key, but here all
# timelines are submitted in the same Slurm job array, so only one job list is created.
joblist=$slurmDir/job.$dataset.detectors.list
> $joblist

# record the input directory as an absolute path
echo "..... getting input files ....."
inputListFile=$slurmDir/files.$dataset.inputs.list
realpath $inputDir > $inputListFile

# generate job scripts
echo "..... generating job scripts ....."
for key in ${jobkeys[@]}; do

# make job scripts for each $key
jobscript=$slurmDir/scripts/$key.$dataset.sh

cat > $jobscript << EOF
#!/usr/bin/env bash
set -e
set -u
set -o pipefail
echo "TIMELINE OBJECT $key"

# set classpath
export CLASSPATH=$CLASSPATH
Comment on lines +320 to +321

**Member:** `$CLASSPATH` is necessary for now, but is removed in #293. Depending on whether we merge this PR or #293 first, we'll need to remember to deal with this (though if we forget, the script will just fail, reporting `$CLASSPATH` as unbound).

**Collaborator (author):** Ok, thanks! Will keep an eye on this.

# produce detector timelines
java $TIMELINE_JAVA_OPTS $run_detectors_script $key $inputDir
EOF

# grant execute permission and add it to `joblist`
chmod u+x $jobscript
echo $jobscript >> $joblist

done # loop over `jobkeys`

# now generate slurm descriptions and/or local scripts
echo """
Generating batch scripts..."""
exelist=()

# check if we have any jobs to run
[ ! -s $joblist ] && printError "there are no timeline jobs to run" && exit 100
slurm=$(echo $joblist | sed 's;.list$;.slurm;')

cat > $slurm << EOF
#!/bin/sh
#SBATCH --ntasks=1
#SBATCH --job-name=$slurmJobName
#SBATCH --output=$SLURM_LOG.out
#SBATCH --error=$SLURM_LOG.err
#SBATCH --partition=production
#SBATCH --account=clas12

#SBATCH --mem-per-cpu=$SLURM_MEMORY
#SBATCH --time=$SLURM_TIME

#SBATCH --array=1-$(cat $joblist | wc -l)
#SBATCH --ntasks=1

srun \$(head -n\$SLURM_ARRAY_TASK_ID $joblist | tail -n1)
EOF
exelist+=($slurm)

# execution
[ ${#exelist[@]} -eq 0 ] && printError "no jobs were created at all; check errors and warnings above" && exit 100
echo """
$sep
"""
if ${modes['submit-slurm']}; then
echo "SUBMITTING JOBS TO SLURM"
echo $sep
for exe in ${exelist[@]}; do sbatch $exe; done
echo $sep
echo "JOBS SUBMITTED!"
else
echo """	SLURM JOB DESCRIPTIONS GENERATED
- Slurm job name prefix will be: $slurmJobName
- To submit all jobs to slurm, run:
------------------------------------------"""
for exe in ${exelist[@]}; do echo " sbatch $exe"; done
echo """ ------------------------------------------
"""
fi

exit 0
fi # condition end: produce timelines, distributed on SLURM

# organize output timelines
echo ">>> organizing output timelines..."
2 changes: 1 addition & 1 deletion bin/run-monitoring.sh
@@ -281,7 +281,7 @@ slurmJobName=clas12-timeline--$dataset
# start job lists
echo """
Generating job scripts..."""
slurmDir=./slurm/step1
mkdir -p $slurmDir/scripts
jobkeys=()
for key in detectors physics; do
4 changes: 3 additions & 1 deletion doc/chef_guide.md
@@ -17,10 +17,12 @@ Output files will appear in your chosen output directory, within `hist/detectors
## :green_circle: Step 2: Make the timelines

```bash
run-detectors-timelines.sh -d $dataset -i $out_dir/hist/detectors --run-slurm
```
where `$out_dir` is your output directory from **Step 1** and `$dataset` is a unique name for this cook, _e.g._, `rga_v1.23`.

Notice the `--run-slurm` option, which sets up a Slurm script for each detector timeline. To submit the jobs, run the printed `sbatch` command, **or** rerun with the `--submit-slurm` option to submit automatically. Once all jobs have finished successfully, rerun with the `--after-slurm` option to finish organizing the output.
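Before rerunning with `--after-slurm`, it can help to confirm that no job wrote to its Slurm `.err` log. A minimal, self-contained sketch of such a check — the log directory and file names below are fabricated stand-ins; real logs land under `/farm_out/$USER/`:

```bash
# Fabricated logs standing in for /farm_out/$USER/clas12-timeline--<dataset>-*.err
logdir=$(mktemp -d)
touch "$logdir/clas12-timeline--rga_v1.23-1_0.err"              # empty .err: job OK
echo "exception" > "$logdir/clas12-timeline--rga_v1.23-1_1.err" # non-empty: job failed
# any .err file with content signals a failed job
bad=$(find "$logdir" -name '*.err' -size +0c)
[ -n "$bad" ] && { echo "some jobs reported errors:"; echo "$bad"; }
rm -r "$logdir"
```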

Output will appear in `./outfiles/$dataset/`.

## :green_circle: Step 3: Deploy the timelines
14 changes: 14 additions & 0 deletions doc/procedure.md
@@ -74,6 +74,20 @@ bin/run-physics-timelines.sh -d rga_sp19_v5 # for physics timelines
```
- the dataset name must match that of Step 1, otherwise you need to specify the path to the input files with `-i`

### Distributing Detector Timelines with SLURM
The detector timelines can take quite a while to run locally, so running locally is only recommended for a single timeline or for debugging. It is more efficient to distribute each timeline to its own Slurm job. Continuing with the example scenario above, first run with the `--run-slurm` option:
```bash
bin/run-detectors-timelines.sh -d rga_sp19_v5 --run-slurm # for detector timelines
```
which will set up the Slurm scripts and print the appropriate `sbatch` command to submit all the detector timeline jobs. This is analogous to submitting jobs with `bin/run-monitoring.sh` in [Step 1](#-step-1-data-monitoring), so see the directions there for monitoring your jobs and checking the Slurm output files. To submit automatically instead, add the `--submit-slurm` option:
```bash
bin/run-detectors-timelines.sh -d rga_sp19_v5 --run-slurm --submit-slurm # for detector timelines
```
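The Slurm log files follow the `SLURM_LOG=/farm_out/%u/%x-%A_%a` pattern set in `bin/run-detectors-timelines.sh`; a sketch of how Slurm expands those placeholders, with an illustrative user name and job IDs:

```bash
# %u = user, %x = job name, %A = array master job ID, %a = array task index
pattern='/farm_out/%u/%x-%A_%a'
user=alice; jobname=clas12-timeline--rga_sp19_v5; master=123456; task=7
log=$(echo "$pattern" | sed "s/%u/$user/; s/%x/$jobname/; s/%A/$master/; s/%a/$task/")
echo "$log.err"   # /farm_out/alice/clas12-timeline--rga_sp19_v5-123456_7.err
```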
**After** all the detector timeline jobs finish successfully, run the script again with the `--after-slurm` option:
```bash
bin/run-detectors-timelines.sh -d rga_sp19_v5 --after-slurm # for detector timelines
```
which will finish organizing all the output files in the appropriate directories.
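For reference, the generated batch script runs one timeline per Slurm array task by selecting the corresponding line of the job list (`srun $(head -n$SLURM_ARRAY_TASK_ID $joblist | tail -n1)`); the selection mechanism can be sketched in plain bash with illustrative job-script names:

```bash
# Fake job list standing in for the generated job.<dataset>.detectors.list
joblist=$(mktemp)
printf '%s\n' run-ec.sh run-ftof.sh run-dc.sh > "$joblist"

SLURM_ARRAY_TASK_ID=2   # Slurm sets this per array task (array indices start at 1 here)
jobscript=$(head -n"$SLURM_ARRAY_TASK_ID" "$joblist" | tail -n1)
echo "$jobscript"       # task 2 selects run-ftof.sh
rm "$joblist"
```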

> [!NOTE]
> - detector timeline production is handled by the [`detectors/` subdirectory](/detectors);