Skip to content

Commit b2bd7ff

Browse files
committed
Updated to add GPU farms and use the accelerator format
1 parent bc051cb commit b2bd7ff

File tree

5 files changed

+119
-54
lines changed

5 files changed

+119
-54
lines changed

conf/sanger.config

Lines changed: 8 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -17,40 +17,12 @@ process {
1717
cpus = 1
1818
memory = 6.Gb
1919

20-
// Currently a single set of rules for all clusters, but we could apply
21-
// different rules to different clusters in their respective configs under ./sanger/
22-
queue = {
23-
if (task.time >= 15.day) {
24-
if (task.memory > 680.GB) {
25-
error "There is no queue for jobs that need >680 GB and >15 days"
26-
} else {
27-
return "basement"
28-
}
29-
} else if (task.memory > 720.GB) {
30-
return "teramem"
31-
} else if (task.memory > 350.GB) {
32-
return "hugemem"
33-
} else if (task.time > 7.day) {
34-
return "basement"
35-
} else if (task.time > 2.day) {
36-
return "week"
37-
} else if (task.time > 12.hour) {
38-
return "long"
39-
} else if (task.time > 1.min || !task.time) {
40-
return "normal"
41-
} else {
42-
return "small"
43-
}
44-
}
20+
clusterOptions = { task.accelerator ? "-gpu \"num=${task.accelerator.request}/host:mode=shared:j_exclusive=yes\"" : null }
4521

46-
withLabel: gpu {
47-
clusterOptions = {
48-
"-M "+task.memory.toMega()+" -R 'select[mem>="+task.memory.toMega()+"] rusage[mem="+task.memory.toMega()+"] span[ptile=1]' -gpu 'num=1:j_exclusive=yes'"
49-
}
50-
queue = { task.time > 12.h ? 'gpu-huge' : task.time > 48.h ? 'gpu-basement' : 'gpu-normal' }
51-
containerOptions = {
22+
containerOptions = {
23+
if (task.accelerator) {
5224
workflow.containerEngine == "singularity" ? '--containall --cleanenv --nv':
53-
( workflow.containerEngine == "docker" ? '--gpus all': null )
25+
( workflow.containerEngine == "docker" ? '--gpus all': null )
5426
}
5527
}
5628
}
@@ -76,8 +48,10 @@ includeConfig ({
7648

7749
if (clustername == "tol22") {
7850
return "sanger/tol22.config"
79-
} else if (clustername == "farm22") {
80-
return "sanger/farm22.config"
51+
} else if (clustername == "farm22" || clustername == "casm22") {
52+
return "sanger/cpu-farms22.config"
53+
} else if (clustername == "tiger22" || clustername == "cub22") {
54+
return "sanger/gpu-farms22.config"
8155
} else {
8256
return "/dev/null"
8357
}

conf/sanger/cpu-farms22.config

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// CPU farms (farm22, casm22) at Wellcome Sanger Institute
2+
3+
params {
4+
max_memory = 2.9.TB
5+
max_cpus = 256
6+
max_time = 43200.min // 30 days
7+
}
8+
9+
process {
10+
resourceLimits = [
11+
memory: 2.9.TB,
12+
cpus: 256,
13+
time: 43200.min
14+
]
15+
}
16+
17+
singularity {
18+
// Mount all filesystems by default
19+
runOptions = '--bind /lustre --bind /nfs --bind /data --bind /software'
20+
}
21+
22+
// Queue selection rules for the clusters that load this config.
23+
// Accelerator (GPU) jobs are routed by requested time; other jobs by requested time and memory.
24+
// Assigned through the `process.` prefix because this statement sits outside any process { } block;
// a bare top-level `queue` would not be applied as the process queue directive.
process.queue = {
25+
if (task.accelerator) {
26+
// Test the longer limit first: with `> 12.h` checked first, the `> 48.h` branch could never match.
return task.time > 48.h ? 'gpu-basement' : task.time > 12.h ? 'gpu-huge' : 'gpu-normal'
27+
} else {
28+
if (task.time >= 15.day) {
29+
if (task.memory > 680.GB) {
30+
error "There is no queue for jobs that need >680 GB and >15 days we suggest you use checkpointing"
31+
} else {
32+
return "basement"
33+
}
34+
} else if (task.memory > 720.GB) {
35+
return "teramem"
36+
} else if (task.memory > 350.GB) {
37+
return "hugemem"
38+
} else if (task.time > 7.day) {
39+
return "basement"
40+
} else if (task.time > 2.day) {
41+
return "week"
42+
} else if (task.time > 12.hour) {
43+
return "long"
44+
} else if (task.time > 1.min || !task.time) {
45+
return "normal"
46+
} else {
47+
return "small"
48+
}
49+
}
50+
}

conf/sanger/farm22.config

Lines changed: 0 additions & 20 deletions
This file was deleted.

conf/sanger/gpu-farms22.config

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
// GPU farms (tiger22, cub22) at Wellcome Sanger Institute
2+
3+
params {
4+
max_memory = 2.9.TB
5+
max_cpus = 256
6+
max_time = 43200.min // 30 days
7+
}
8+
9+
process {
10+
resourceLimits = [
11+
memory: 2.9.TB,
12+
cpus: 256,
13+
time: 43200.min
14+
]
15+
}
16+
17+
singularity {
18+
// Mount all filesystems by default
19+
runOptions = '--bind /lustre --bind /nfs --bind /data --bind /software'
20+
}
21+
22+
// Queue selection rules for the clusters that load this config.
23+
// Accelerator (GPU) jobs and all other jobs are each sent to one fixed queue.
24+
// Assigned through the `process.` prefix because this statement sits outside any process { } block;
// a bare top-level `queue` would not be applied as the process queue directive.
process.queue = {
25+
if (task.accelerator) {
26+
return "inference"
27+
} else {
28+
// Temporary oversubscribed queue for CPU jobs until we assign more CPU resources to these clusters
29+
return "oversubscribed"
30+
}
31+
}

conf/sanger/tol22.config

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,33 @@ process {
1515
time: 89280.min
1616
]
1717
}
18+
19+
// Queue selection rules for this cluster.
20+
// Accelerator (GPU) jobs are rejected with an error; other jobs are routed by requested time and memory.
21+
// Assigned through the `process.` prefix because this statement sits outside any process { } block;
// a bare top-level `queue` would not be applied as the process queue directive.
process.queue = {
22+
if (task.accelerator) {
23+
error "There is no queue for GPU jobs on tol22"
24+
} else {
25+
if (task.time >= 15.day) {
26+
if (task.memory > 680.GB) {
27+
error "There is no queue for jobs that need >680 GB and >15 days we suggest you use checkpointing"
28+
} else {
29+
return "basement"
30+
}
31+
} else if (task.memory > 720.GB) {
32+
return "teramem"
33+
} else if (task.memory > 350.GB) {
34+
return "hugemem"
35+
} else if (task.time > 7.day) {
36+
return "basement"
37+
} else if (task.time > 2.day) {
38+
return "week"
39+
} else if (task.time > 12.hour) {
40+
return "long"
41+
} else if (task.time > 1.min || !task.time) {
42+
return "normal"
43+
} else {
44+
return "small"
45+
}
46+
}
47+
}

0 commit comments

Comments
 (0)