Commit 54270c1

Merge pull request #600 from GoogleCloudPlatform/release-candidate
Release v1.6.0
2 parents: 6e8e1f7 + 8878f3f

141 files changed: +1433 -563 lines changed


Makefile

Lines changed: 11 additions & 1 deletion
```diff
@@ -14,11 +14,21 @@ ENG = ./cmd/... ./pkg/...
 TERRAFORM_FOLDERS=$(shell find ./modules ./community/modules ./tools -type f -name "*.tf" -not -path '*/\.*' -exec dirname "{}" \; | sort -u)
 PACKER_FOLDERS=$(shell find ./modules ./community/modules ./tools -type f -name "*.pkr.hcl" -not -path '*/\.*' -exec dirname "{}" \; | sort -u)
 
+ifneq (, $(shell which git))
+## GIT IS PRESENT
+ifneq (,$(wildcard .git))
+## GIT DIRECTORY EXISTS
+GIT_TAG_VERSION=$(shell git tag --points-at HEAD)
+GIT_BRANCH=$(shell git branch --show-current)
+GIT_COMMIT_INFO=$(shell git describe --tags --dirty --long)
+endif
+endif
+
 # RULES MEANT TO BE USED DIRECTLY
 
 ghpc: warn-go-version warn-terraform-version warn-packer-version $(shell find ./cmd ./pkg ghpc.go -type f)
 	$(info **************** building ghpc ************************)
-	go build ghpc.go
+	@go build -ldflags="-X 'main.gitTagVersion=$(GIT_TAG_VERSION)' -X 'main.gitBranch=$(GIT_BRANCH)' -X 'main.gitCommitInfo=$(GIT_COMMIT_INFO)'" ghpc.go
 
 install-user:
 	$(info ******** installing ghpc in ~/bin *********************)
```
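The `-X` linker flags above overwrite package-level string variables at link time, which is how the build threads git metadata into the binary without code changes. A minimal sketch of exercising the new rule (assuming a local clone with git history and a Go toolchain on PATH):

```shell
# Build through the Makefile rule so the git metadata is injected,
# then ask the resulting binary to report what it was built from.
make ghpc
./ghpc --version
```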

README.md

Lines changed: 53 additions & 0 deletions
````diff
@@ -204,6 +204,59 @@ In the right side, expand the Filters view and then filter by label, specifying
 
 ## Troubleshooting
 
+### Network is unreachable (Slurm V5)
+
+Slurm requires access to Google APIs to function. This can be achieved through one of the following methods:
+
+1. Create a [Cloud NAT](https://cloud.google.com/nat) (preferred).
+2. Set `disable_controller_public_ips: false` &
+   `disable_login_public_ips: false` on the controller and login nodes
+   respectively.
+3. Enable
+   [private access to Google APIs](https://cloud.google.com/vpc/docs/private-access-options).
+
+By default the Toolkit VPC module will create an associated Cloud NAT, so this is
+typically seen when working with the pre-existing-vpc module. If no access
+exists, you will see the following errors:
+
+When you ssh into the login node or controller you will see the following
+message:
+
+```text
+*** Slurm setup failed! Please view log: /slurm/scripts/setup.log ***
+```
+
+> **_NOTE:_** Many different potential issues could be indicated by the above
+> message, so be sure to verify the issue in the logs.
+
+To confirm the issue, ssh onto the controller and call `sudo cat /slurm/scripts/setup.log`. Look for
+the following logs:
+
+```text
+google_metadata_script_runner: startup-script: ERROR: [Errno 101] Network is unreachable
+google_metadata_script_runner: startup-script: OSError: [Errno 101] Network is unreachable
+google_metadata_script_runner: startup-script: ERROR: Aborting setup...
+google_metadata_script_runner: startup-script exit status 0
+google_metadata_script_runner: Finished running startup scripts.
+```
+
+You may also notice mount failure logs on the login node:
+
+```text
+INFO: Waiting for '/usr/local/etc/slurm' to be mounted...
+INFO: Waiting for '/home' to be mounted...
+INFO: Waiting for '/opt/apps' to be mounted...
+INFO: Waiting for '/etc/munge' to be mounted...
+ERROR: mount of path '/usr/local/etc/slurm' failed: <class 'subprocess.CalledProcessError'>: Command '['mount', '/usr/local/etc/slurm']' returned non-zero exit status 32.
+ERROR: mount of path '/opt/apps' failed: <class 'subprocess.CalledProcessError'>: Command '['mount', '/opt/apps']' returned non-zero exit status 32.
+ERROR: mount of path '/home' failed: <class 'subprocess.CalledProcessError'>: Command '['mount', '/home']' returned non-zero exit status 32.
+ERROR: mount of path '/etc/munge' failed: <class 'subprocess.CalledProcessError'>: Command '['mount', '/etc/munge']' returned non-zero exit status 32.
+```
+
+> **_NOTE:_** The above logs only indicate that something went wrong with the
+> startup of the controller. Check logs on the controller to be sure it is a
+> network issue.
+
 ### Failure to Create Auto Scale Nodes (Slurm)
 
 If your deployment succeeds but your jobs fail with the following error:
````
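As a quick connectivity check to pair with the log inspection described above (a minimal sketch, assuming `curl` is available on the node and you can ssh to it):

```shell
# Run on the controller or login node. A timeout here is consistent with the
# "Network is unreachable" errors above; a response means Google APIs are
# reachable and the problem likely lies elsewhere.
curl --silent --show-error --max-time 10 https://www.googleapis.com/ > /dev/null \
  && echo "Google APIs reachable" \
  || echo "Google APIs unreachable: add a Cloud NAT, public IPs, or Private Google Access"
```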

cmd/root.go

Lines changed: 26 additions & 2 deletions
```diff
@@ -23,8 +23,16 @@ import (
 	"github.com/spf13/cobra"
 )
 
+// Git references, set by the Makefile at build time
 var (
-	rootCmd = &cobra.Command{
+	GitTagVersion string
+	GitBranch     string
+	GitCommitInfo string
+)
+
+var (
+	annotation = make(map[string]string)
+	rootCmd    = &cobra.Command{
 		Use:   "ghpc",
 		Short: "A blueprint and deployment engine for HPC clusters in GCP.",
 		Long: `gHPC provides a flexible and simple to use interface to accelerate
@@ -34,12 +42,28 @@ HPC deployments on the Google Cloud Platform.`,
 				log.Fatalf("cmd.Help function failed: %s", err)
 			}
 		},
-		Version: "v1.5.0",
+		Version:     "v1.6.0",
+		Annotations: annotation,
 	}
 )
 
 // Execute the root command
 func Execute() error {
+	if len(GitCommitInfo) > 0 {
+		if len(GitTagVersion) == 0 {
+			GitTagVersion = "- not built from official release"
+		}
+		if len(GitBranch) == 0 {
+			GitBranch = "detached HEAD"
+		}
+		annotation["version"] = GitTagVersion
+		annotation["branch"] = GitBranch
+		annotation["commitInfo"] = GitCommitInfo
+		rootCmd.SetVersionTemplate(`ghpc version {{index .Annotations "version"}}
+Built from '{{index .Annotations "branch"}}' branch.
+Commit info: {{index .Annotations "commitInfo"}}
+`)
+	}
 	return rootCmd.Execute()
 }
```
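For illustration, the version template above prints the three annotations on separate lines. A hypothetical invocation could look like this (the tag, branch, and `git describe` string are placeholders, not output from a real build):

```shell
$ ./ghpc --version
ghpc version v1.6.0
Built from 'main' branch.
Commit info: v1.6.0-0-gabcdef1
```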

community/examples/AMD/README.md

Lines changed: 2 additions & 7 deletions
````diff
@@ -75,20 +75,15 @@ remounted and that you should logout and login. Follow its instructions.
 Once configuration is complete, install AOCC by running:
 
 ```shell
-sudo -i bash /var/tmp/install_aocc.sh
+sudo bash /var/tmp/install_aocc.sh
 ```
 
 Spack will prompt you to accept the AOCC End User License Agreement by opening a
 text file containing information about the license. Leave the file unmodified
 and write it to disk by typing `:q` as two characters in sequence
 ([VI help][vihelp]).
 
-Installation of AOCC and OpenMPI will take approximately 15 minutes. Once they
-are installed, you can install additional packages such as `amdblis`:
-
-```shell
-sudo -i spack -d install -v amdblis %aocc@3.2.0
-```
+Installation of AOCC and OpenMPI will take approximately 15 minutes.
 
 Configure SSH user keys for access between cluster nodes:
````
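After the install script finishes, a quick way to confirm the toolchain landed is to query Spack directly (a sketch, assuming the Spack instance lives at `/sw/spack` as in this example):

```shell
# Load the example's Spack instance, then list the installed AOCC compiler
# and the OpenMPI package built against it.
. /sw/spack/share/spack/setup-env.sh
spack find aocc
spack find openmpi
spack compilers
```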

community/examples/AMD/hpc-cluster-amd-slurmv5.yaml

Lines changed: 27 additions & 1 deletion
```diff
@@ -65,20 +65,46 @@ deployment_groups:
       - type: shell
         source: modules/startup-script/examples/install_ansible.sh
         destination: install_ansible.sh
+      - $(swfs.install_nfs_client_runner)
+      - $(swfs.mount_runner)
       - $(spack.install_spack_deps_runner)
       - $(spack.install_spack_runner)
+      - type: shell
+        content: "shutdown -h +1"
+        destination: shutdown.sh
+
+  - id: slurm_startup
+    source: modules/scripts/startup-script
+    settings:
+      runners:
+      - type: data
+        destination: /etc/profile.d/spack.sh
+        content: |
+          #!/bin/sh
+          if [ -f /sw/spack/share/spack/setup-env.sh ]; then
+            . /sw/spack/share/spack/setup-env.sh
+          fi
       # the following installation of AOCC may be automated in the future
       # with a clear direction to the user to read the EULA at
       # https://developer.amd.com/aocc-compiler-eula/
       - type: data
         destination: /var/tmp/install_aocc.sh
         content: |
           #!/bin/bash
+          source /sw/spack/share/spack/setup-env.sh
           spack install aocc@3.2.0 +license-agreed
           spack load aocc@3.2.0
           spack compiler find --scope site
           spack -d install -v openmpi@4.1.3 %aocc@3.2.0 +legacylaunchers +pmi schedulers=slurm
 
+  # must restart vm to re-initiate subsequent installs
+  - id: spack_builder
+    source: modules/compute/vm-instance
+    use: [network1, swfs, spack-startup]
+    settings:
+      name_prefix: spack-builder
+      machine_type: c2d-standard-16
+
   - id: low_cost_partition
     source: community/modules/compute/schedmd-slurm-gcp-v5-partition
     use:
@@ -118,6 +144,6 @@ deployment_groups:
     use:
     - network1
    - slurm_controller
-    - spack-startup
+    - slurm_startup
     settings:
       machine_type: c2d-standard-4
```
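To exercise the updated blueprint end to end, the usual Toolkit flow applies (a hedged sketch: the project ID, deployment folder, and deployment group names below are placeholders, and the `--vars` flag is assumed to be available in this release):

```shell
# Generate the deployment folder from the blueprint, then apply it with Terraform.
./ghpc create community/examples/AMD/hpc-cluster-amd-slurmv5.yaml --vars project_id=<your-project>
terraform -chdir=<deployment-folder>/<group-name> init
terraform -chdir=<deployment-folder>/<group-name> apply
```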

community/examples/cloud-batch.yaml

Lines changed: 0 additions & 5 deletions
```diff
@@ -29,17 +29,14 @@ deployment_groups:
   modules:
   - id: network1
     source: modules/network/pre-existing-vpc
-    kind: terraform
 
   - id: appfs
     source: modules/file-system/filestore
-    kind: terraform
     use: [network1]
     settings: {local_mount: /sw}
 
   - id: hello-startup-script
     source: modules/scripts/startup-script
-    kind: terraform
     settings:
       runners:
       - type: shell
@@ -55,7 +52,6 @@
 
   - id: batch-job
     source: community/modules/scheduler/cloud-batch-job
-    kind: terraform
     use: [network1, appfs, hello-startup-script]
     settings:
       runnable: "cat /sw/hello.txt"
@@ -66,6 +62,5 @@
 
   - id: batch-login
     source: community/modules/scheduler/cloud-batch-login-node
-    kind: terraform
     use: [batch-job]
     outputs: [instructions]
```

community/examples/hpc-cluster-small-sharedvpc.yaml

Lines changed: 0 additions & 6 deletions
```diff
@@ -43,15 +43,13 @@ deployment_groups:
   modules:
   - id: network1
     source: modules/network/pre-existing-vpc
-    kind: terraform
     settings:
       project_id: $(vars.host_project_id)
       network_name: your-shared-network
       subnetwork_name: your-shared-subnetwork
 
   - id: homefs
     source: modules/file-system/filestore
-    kind: terraform
     use: [network1]
     settings:
       local_mount: /home
@@ -61,7 +59,6 @@
   # This debug_partition will work out of the box without requesting additional GCP quota.
   - id: debug_partition
     source: community/modules/compute/SchedMD-slurm-on-gcp-partition
-    kind: terraform
     use:
     - network1
     - homefs
@@ -75,7 +72,6 @@
   # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first.
   - id: compute_partition
     source: community/modules/compute/SchedMD-slurm-on-gcp-partition
-    kind: terraform
     use:
     - network1
     - homefs
@@ -85,7 +81,6 @@
 
   - id: slurm_controller
     source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller
-    kind: terraform
     use:
     - network1
     - homefs
@@ -97,7 +92,6 @@
 
   - id: slurm_login
     source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node
-    kind: terraform
     use:
     - network1
     - homefs
```

community/examples/htcondor-pool.yaml

Lines changed: 0 additions & 10 deletions
```diff
@@ -29,7 +29,6 @@ deployment_groups:
   modules:
   - id: network1
     source: modules/network/vpc
-    kind: terraform
     settings:
       network_name: htcondor-pool
       subnetwork_name: htcondor-pool-usc1
@@ -38,21 +37,17 @@
 
   - id: htcondor_install
     source: community/modules/scripts/htcondor-install
-    kind: terraform
 
   - id: htcondor_services
     source: community/modules/project/service-enablement
-    kind: terraform
     use:
     - htcondor_install
 
   - id: htcondor_configure
     source: community/modules/scheduler/htcondor-configure
-    kind: terraform
 
   - id: htcondor_configure_central_manager
     source: modules/scripts/startup-script
-    kind: terraform
     settings:
       runners:
       - type: shell
@@ -63,7 +58,6 @@
 
   - id: htcondor_cm
     source: modules/compute/vm-instance
-    kind: terraform
     use:
     - network1
     - htcondor_configure_central_manager
@@ -80,7 +74,6 @@
 
   - id: htcondor_configure_execute_point
     source: modules/scripts/startup-script
-    kind: terraform
     settings:
       runners:
       - type: shell
@@ -91,7 +84,6 @@
 
   - id: htcondor_execute_point
     source: community/modules/compute/htcondor-execute-point
-    kind: terraform
     use:
     - network1
     - htcondor_configure_execute_point
@@ -106,7 +98,6 @@
 
   - id: htcondor_configure_access_point
     source: modules/scripts/startup-script
-    kind: terraform
     settings:
       runners:
       - type: shell
@@ -130,7 +121,6 @@
           queue
   - id: htcondor_access
     source: modules/compute/vm-instance
-    kind: terraform
     use:
     - network1
     - htcondor_configure_access_point
```

community/examples/intel/daos-cluster.yaml

Lines changed: 0 additions & 3 deletions
```diff
@@ -30,14 +30,12 @@ deployment_groups:
   modules:
   - id: network1
     source: modules/network/pre-existing-vpc
-    kind: terraform
 
   # This module creates a DAOS server. Server images MUST be created before running this.
   # https://github.com/daos-stack/google-cloud-daos/tree/main/images
   # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server
   - id: daos-server
     source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1
-    kind: terraform
     use: [network1]
     settings:
       number_of_instances: 2
@@ -48,7 +46,6 @@
   # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_client
   - id: daos-client
     source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_client?ref=v0.2.1
-    kind: terraform
     use: [network1, daos-server]
     settings:
       number_of_instances: 2
```
