Updated to add GPU farms and use the accelerator format

mp15 · mp15 · commit b2bd7ff00343 · 2025-12-09T16:51:11.000Z
diff --git a/conf/sanger.config b/conf/sanger.config
@@ -17,40 +17,12 @@ process {
     cpus   = 1
     memory = 6.Gb
 
-    // Currently a single set of rules for all clusters, but we could apply
-    // different rules to different clusters in their respective configs under ./sanger/
-    queue = {
-        if (task.time >= 15.day) {
-            if (task.memory > 680.GB) {
-                error "There is no queue for jobs that need >680 GB and >15 days"
-            } else {
-                return "basement"
-            }
-        } else if (task.memory > 720.GB) {
-            return "teramem"
-        } else if (task.memory > 350.GB) {
-            return "hugemem"
-        } else if (task.time > 7.day) {
-            return "basement"
-        } else if (task.time > 2.day) {
-            return "week"
-        } else if (task.time > 12.hour) {
-            return "long"
-        } else if (task.time > 1.min || !task.time) {
-            return "normal"
-        } else {
-            return "small"
-        }
-    }
+    clusterOptions = { task.accelerator ? "-gpu \"num=${task.accelerator.request}/host:mode=shared:j_exclusive=yes\"" : null }
 
-    withLabel: gpu {
-        clusterOptions = {
-            "-M "+task.memory.toMega()+" -R 'select[mem>="+task.memory.toMega()+"] rusage[mem="+task.memory.toMega()+"] span[ptile=1]' -gpu 'num=1:j_exclusive=yes'"
-        }
-        queue = { task.time > 12.h ? 'gpu-huge' : task.time > 48.h ? 'gpu-basement' : 'gpu-normal' }
-        containerOptions = {
+    containerOptions = {
+        if (task.accelerator) {
             workflow.containerEngine == "singularity" ? '--containall --cleanenv --nv':
-            ( workflow.containerEngine == "docker" ? '--gpus all': null )
+                ( workflow.containerEngine == "docker" ? '--gpus all': null )
         }
     }
 }
@@ -76,8 +48,10 @@ includeConfig ({
 
     if (clustername == "tol22") {
         return "sanger/tol22.config"
-    } else if (clustername == "farm22") {
-        return "sanger/farm22.config"
+    } else if (clustername == "farm22" || clustername == "casm22") {
+        return "sanger/cpu-farms22.config"
+    } else if (clustername == "tiger22" || clustername == "cub22") {
+        return "sanger/gpu-farms22.config"
     } else {
         return "/dev/null"
     }
diff --git a/conf/sanger/cpu-farms22.config b/conf/sanger/cpu-farms22.config
@@ -0,0 +1,50 @@
+// farm22 and casm22 CPU clusters at Wellcome Sanger Institute
+
+params {
+    max_memory = 2.9.TB
+    max_cpus = 256
+    max_time = 43200.min // 30 days
+}
+
+process {
+    resourceLimits = [
+        memory: 2.9.TB,
+        cpus: 256,
+        time: 43200.min
+    ]
+}
+
+singularity {
+    // Mount all filesystems by default
+    runOptions = '--bind /lustre --bind /nfs --bind /data --bind /software'
+}
+
+// Queue selection for the CPU farms. Assigned as `process.queue`: a bare top-level `queue`
+// lies outside the `process` scope and is not applied as the process queue directive.
+process.queue = {
+    if (task.accelerator) { // GPU request: pick the GPU queue from the requested time, largest threshold first
+        return task.time > 48.h ? 'gpu-basement' : task.time > 12.h ? 'gpu-huge' : 'gpu-normal'
+    } else {
+        if (task.time >= 15.day) { // very long jobs can only go to "basement", which has a memory ceiling
+            if (task.memory > 680.GB) {
+                error "There is no queue for jobs that need >680 GB and >15 days we suggest you use checkpointing"
+            } else {
+                return "basement"
+            }
+        } else if (task.memory > 720.GB) { // memory-driven queues are checked before the time-driven ones
+            return "teramem"
+        } else if (task.memory > 350.GB) {
+            return "hugemem"
+        } else if (task.time > 7.day) {
+            return "basement"
+        } else if (task.time > 2.day) {
+            return "week"
+        } else if (task.time > 12.hour) {
+            return "long"
+        } else if (task.time > 1.min || !task.time) { // no time requested also falls back to "normal"
+            return "normal"
+        } else {
+            return "small"
+        }
+    }
+}
diff --git a/conf/sanger/farm22.config b/conf/sanger/farm22.config
diff --git a/conf/sanger/gpu-farms22.config b/conf/sanger/gpu-farms22.config
@@ -0,0 +1,31 @@
+// tiger22 and cub22 GPU clusters at Wellcome Sanger Institute
+
+params {
+    max_memory = 2.9.TB
+    max_cpus = 256
+    max_time = 43200.min // 30 days
+}
+
+process {
+    resourceLimits = [
+        memory: 2.9.TB,
+        cpus: 256,
+        time: 43200.min
+    ]
+}
+
+singularity {
+    // Mount all filesystems by default
+    runOptions = '--bind /lustre --bind /nfs --bind /data --bind /software'
+}
+
+// Queue selection for the GPU farms. Assigned as `process.queue`: a bare top-level `queue`
+// lies outside the `process` scope and is not applied as the process queue directive.
+process.queue = {
+    if (task.accelerator) { // tasks that request an accelerator go to the GPU queue
+        return "inference"
+    } else {
+        // Temporary oversubscribed queue for CPU jobs until we assign more CPU resources to these clusters
+        return "oversubscribed"
+    }
+}
diff --git a/conf/sanger/tol22.config b/conf/sanger/tol22.config
@@ -15,3 +15,33 @@ process {
         time: 89280.min
     ]
 }
+
+// Queue selection for tol22. Assigned as `process.queue`: a bare top-level `queue`
+// lies outside the `process` scope and is not applied as the process queue directive.
+process.queue = {
+    if (task.accelerator) { // fail fast: this cluster has no queue for accelerator requests
+        error "There is no queue for GPU jobs on tol22"
+    } else {
+        if (task.time >= 15.day) { // very long jobs can only go to "basement", which has a memory ceiling
+            if (task.memory > 680.GB) {
+                error "There is no queue for jobs that need >680 GB and >15 days we suggest you use checkpointing"
+            } else {
+                return "basement"
+            }
+        } else if (task.memory > 720.GB) { // memory-driven queues are checked before the time-driven ones
+            return "teramem"
+        } else if (task.memory > 350.GB) {
+            return "hugemem"
+        } else if (task.time > 7.day) {
+            return "basement"
+        } else if (task.time > 2.day) {
+            return "week"
+        } else if (task.time > 12.hour) {
+            return "long"
+        } else if (task.time > 1.min || !task.time) { // no time requested also falls back to "normal"
+            return "normal"
+        } else {
+            return "small"
+        }
+    }
+}