
Commit 9eaa870

Merge pull request #286 from GoogleCloudPlatform/develop
Release 0.7.1-alpha
2 parents: 20481b4 + 6fdf1a4


79 files changed: +2377 -1706 lines changed

README.md

Lines changed: 72 additions & 0 deletions
@@ -406,6 +406,78 @@ to `false` on the partition in question.

[partition-enable-placement]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/compute/SchedMD-slurm-on-gcp-partition#input_enable_placement

#### Insufficient Service Account Permissions

By default, the Slurm controller, login, and compute nodes use the
[Google Compute Engine Service Account (GCE SA)][def-compute-sa]. If this
service account, or a custom SA used by the Slurm modules, does not have
sufficient permissions, configuring the controller or running a job in Slurm
may fail.

If configuration of the Slurm controller fails, the error can be seen by
viewing the log of the startup script on the controller:

```shell
sudo journalctl -u google-startup-scripts.service | less
```

An error similar to the following indicates missing permissions for the
service account:

```shell
Required 'compute.machineTypes.get' permission for ...
```

To resolve this error, ensure your service account has the
`compute.instanceAdmin.v1` IAM role:

```shell
SA_ADDRESS=<SET SERVICE ACCOUNT ADDRESS HERE>

gcloud projects add-iam-policy-binding ${PROJECT_ID} \
    --member=serviceAccount:${SA_ADDRESS} --role=roles/compute.instanceAdmin.v1
```
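To confirm the binding, you can list the roles currently granted to the service
account at the project level (a sketch using standard `gcloud` policy
filtering; the same check applies after granting `roles/iam.serviceAccountUser`
below):

```shell
# List the project-level roles bound to the service account
gcloud projects get-iam-policy ${PROJECT_ID} \
    --flatten="bindings[].members" \
    --filter="bindings.members:serviceAccount:${SA_ADDRESS}" \
    --format="table(bindings.role)"
```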
If Slurm failed to run a job, view the resume log on the controller instance
with the following command:

```shell
sudo cat /var/log/slurm/resume.log
```

An error in `resume.log` similar to the following also indicates a permissions
issue:

```shell
The user does not have access to service account 'PROJECT_NUMBER-compute@developer.gserviceaccount.com'. User: ''. Ask a project owner to grant you the iam.serviceAccountUser role on the service account": ['slurm-hpc-small-compute-0-0']
```

As indicated, the service account must also have the `iam.serviceAccountUser`
IAM role, which can be granted with the following command:

```shell
SA_ADDRESS=<SET SERVICE ACCOUNT ADDRESS HERE>

gcloud projects add-iam-policy-binding ${PROJECT_ID} \
    --member=serviceAccount:${SA_ADDRESS} --role=roles/iam.serviceAccountUser
```

If the GCE SA is being used and cannot be updated, a new service account with
the correct permissions can be created and used instead. Instructions for doing
this can be found in the [Slurm on Google Cloud User Guide][slurm-on-gcp-ug],
specifically the section titled "Create Service Accounts".
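A minimal command-line sketch of that process is shown below; the account name
`slurm-cluster-sa` is illustrative, and the user guide should be consulted for
the complete set of roles to grant:

```shell
# Create a dedicated service account (the name is illustrative)
gcloud iam service-accounts create slurm-cluster-sa \
    --project=${PROJECT_ID} \
    --display-name="Slurm cluster service account"

# Grant the roles discussed above to the new account
SA_ADDRESS=slurm-cluster-sa@${PROJECT_ID}.iam.gserviceaccount.com

gcloud projects add-iam-policy-binding ${PROJECT_ID} \
    --member=serviceAccount:${SA_ADDRESS} --role=roles/compute.instanceAdmin.v1
gcloud projects add-iam-policy-binding ${PROJECT_ID} \
    --member=serviceAccount:${SA_ADDRESS} --role=roles/iam.serviceAccountUser
```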
After creating the service account, it can be set via the
`compute_node_service_account` and `controller_service_account` settings on the
[slurm-on-gcp controller module][slurm-on-gcp-con] and the
`login_service_account` setting on the
[slurm-on-gcp login module][slurm-on-gcp-login].

[def-compute-sa]: https://cloud.google.com/compute/docs/access/service-accounts#default_service_account
[slurm-on-gcp-ug]: https://goo.gle/slurm-gcp-user-guide
[slurm-on-gcp-con]: community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md
[slurm-on-gcp-login]: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md
### Terraform Deployment

When `terraform apply` fails, Terraform generally provides a useful error

cmd/create.go

Lines changed: 8 additions & 8 deletions
@@ -21,7 +21,7 @@ import (
 	"errors"
 	"fmt"
 	"hpc-toolkit/pkg/config"
-	"hpc-toolkit/pkg/reswriter"
+	"hpc-toolkit/pkg/modulewriter"
 	"log"

 	"github.com/spf13/cobra"
@@ -77,19 +77,19 @@ func runCreateCmd(cmd *cobra.Command, args []string) {
 		bpFilename = args[0]
 	}

-	blueprintConfig := config.NewBlueprintConfig(bpFilename)
-	if err := blueprintConfig.SetCLIVariables(cliVariables); err != nil {
+	deploymentConfig := config.NewDeploymentConfig(bpFilename)
+	if err := deploymentConfig.SetCLIVariables(cliVariables); err != nil {
 		log.Fatalf("Failed to set the variables at CLI: %v", err)
 	}
-	if err := blueprintConfig.SetBackendConfig(cliBEConfigVars); err != nil {
+	if err := deploymentConfig.SetBackendConfig(cliBEConfigVars); err != nil {
 		log.Fatalf("Failed to set the backend config at CLI: %v", err)
 	}
-	if err := blueprintConfig.SetValidationLevel(validationLevel); err != nil {
+	if err := deploymentConfig.SetValidationLevel(validationLevel); err != nil {
 		log.Fatal(err)
 	}
-	blueprintConfig.ExpandConfig()
-	if err := reswriter.WriteBlueprint(&blueprintConfig.Config, outputDir, overwriteDeployment); err != nil {
-		var target *reswriter.OverwriteDeniedError
+	deploymentConfig.ExpandConfig()
+	if err := modulewriter.WriteDeployment(&deploymentConfig.Config, outputDir, overwriteDeployment); err != nil {
+		var target *modulewriter.OverwriteDeniedError
 		if errors.As(err, &target) {
 			fmt.Printf("\n%s\n", err.Error())
 		} else {

cmd/expand.go

Lines changed: 6 additions & 6 deletions
@@ -58,18 +58,18 @@ func runExpandCmd(cmd *cobra.Command, args []string) {
 		bpFilename = args[0]
 	}

-	blueprintConfig := config.NewBlueprintConfig(bpFilename)
-	if err := blueprintConfig.SetCLIVariables(cliVariables); err != nil {
+	deploymentConfig := config.NewDeploymentConfig(bpFilename)
+	if err := deploymentConfig.SetCLIVariables(cliVariables); err != nil {
 		log.Fatalf("Failed to set the variables at CLI: %v", err)
 	}
-	if err := blueprintConfig.SetBackendConfig(cliBEConfigVars); err != nil {
+	if err := deploymentConfig.SetBackendConfig(cliBEConfigVars); err != nil {
 		log.Fatalf("Failed to set the backend config at CLI: %v", err)
 	}
-	if err := blueprintConfig.SetValidationLevel(validationLevel); err != nil {
+	if err := deploymentConfig.SetValidationLevel(validationLevel); err != nil {
 		log.Fatal(err)
 	}
-	blueprintConfig.ExpandConfig()
-	blueprintConfig.ExportYamlConfig(outputFilename)
+	deploymentConfig.ExpandConfig()
+	deploymentConfig.ExportBlueprint(outputFilename)
 	fmt.Printf(
 		"Expanded Environment Definition created successfully, saved as %s.\n", outputFilename)
 }

cmd/root.go

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ HPC deployments on the Google Cloud Platform.`,
 			log.Fatalf("cmd.Help function failed: %s", err)
 		}
 	},
-	Version: "v0.7.0-alpha (private preview)",
+	Version: "v0.7.1-alpha (private preview)",
 	}
 )

community/examples/intel/README.md

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
# Intel Solutions for the HPC Toolkit

## Intel-Optimized Slurm Cluster

This document is adapted from a [Cloud Shell tutorial][tutorial] developed to
demonstrate Intel Select Solutions within the Toolkit. It expands upon that
tutorial by building custom images that save provisioning time and improve
reliability when scaling up compute nodes.

The Google Cloud [HPC VM Image][hpcvmimage] has a built-in feature enabling it
to install a Google Cloud-tested release of Intel compilers and libraries that
are known to achieve optimal performance on Google Cloud.

[tutorial]: ../../../docs/tutorials/intel-select/intel-select.md
[hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm

## Provisioning the Intel-optimized Slurm cluster

Identify a project to work in and substitute its unique ID wherever you see
`<<PROJECT_ID>>` in the instructions below.
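Optionally, you can point the `gcloud` CLI at this project now; this is a
sketch of standard usage and is not required by the steps below:

```shell
# Set the default project for subsequent gcloud commands
gcloud config set project <<PROJECT_ID>>
```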
## Initial Setup

Before provisioning any infrastructure in this project, you should follow the
Toolkit guidance to enable [APIs][apis] and establish minimum resource
[quotas][quotas]. In particular, the following APIs should be enabled (a
command-line sketch for enabling them follows the list):

* file.googleapis.com (Cloud Filestore)
* compute.googleapis.com (Google Compute Engine)

[apis]: ../../../README.md#enable-gcp-apis
[quotas]: ../../../README.md#gcp-quotas
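A sketch of enabling both APIs from the command line, assuming the `gcloud` CLI
is authorized for this project:

```shell
# Enable the Cloud Filestore and Compute Engine APIs used by this example
gcloud services enable file.googleapis.com compute.googleapis.com \
    --project=<<PROJECT_ID>>
```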
The following available quota is also required in the region used by the
cluster:

* Filestore: 2560 GB
* C2 CPUs: 6000 (fully-scaled "compute" partition)
  * This quota is not necessary at initial deployment, but will be required to
    successfully scale the partition to its maximum size
* C2 CPUs: 4 (login node)
## Deploying the Blueprint

Use `ghpc` to provision the blueprint, supplying your project ID:

```shell
ghpc create --vars project_id=<<PROJECT_ID>> hpc-cluster-intel-select.yaml
```

It will create a set of directories containing Terraform modules and Packer
templates. **Please ignore the printed instructions** in favor of the following:

1. Provision the network and the startup scripts that install Intel software.

   ```shell
   terraform -chdir=hpc-intel-select/primary init
   terraform -chdir=hpc-intel-select/primary validate
   terraform -chdir=hpc-intel-select/primary apply
   ```

1. Capture the startup scripts to files that will be used by Packer to build
   the images.

   ```shell
   terraform -chdir=hpc-intel-select/primary output \
     -raw startup_script_startup_controller > \
     hpc-intel-select/packer/controller-image/startup_script.sh
   terraform -chdir=hpc-intel-select/primary output \
     -raw startup_script_startup_compute > \
     hpc-intel-select/packer/compute-image/startup_script.sh
   ```

1. Build the custom Slurm controller image. While this step is executing, you
   may begin the next step in parallel.

   ```shell
   cd hpc-intel-select/packer/controller-image
   packer init .
   packer validate .
   packer build -var startup_script_file=startup_script.sh .
   ```

1. Build the custom Slurm image for the login and compute nodes.

   ```shell
   cd -
   cd hpc-intel-select/packer/compute-image
   packer init .
   packer validate .
   packer build -var startup_script_file=startup_script.sh .
   ```

1. Provision the Slurm cluster.

   ```shell
   cd -
   terraform -chdir=hpc-intel-select/cluster init
   terraform -chdir=hpc-intel-select/cluster validate
   terraform -chdir=hpc-intel-select/cluster apply
   ```

## Connecting to the login node

Once the startup script has completed and Slurm reports readiness, connect to
the login node.

1. Open the following URL in a new tab. This will take you to `Compute Engine` >
   `VM instances` in the Google Cloud Console:

   ```text
   https://console.cloud.google.com/compute
   ```

   Ensure that you select the project in which you are provisioning the cluster.

1. Click on the `SSH` button associated with the `slurm-hpc-intel-select-login0`
   instance.

   This will open a separate pop-up window with a terminal into the newly
   created Slurm login VM.
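If you prefer the command line to the Cloud Console, a sketch of an equivalent
connection is shown below; the zone placeholder is an assumption and should be
replaced with the zone used by your deployment:

```shell
# SSH to the Slurm login node (substitute your deployment's zone)
gcloud compute ssh slurm-hpc-intel-select-login0 \
    --zone=<<ZONE>> --project=<<PROJECT_ID>>
```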
## Access the cluster and provision an example job

**The commands below should be run on the login node.**

1. Create a default SSH key so that you can SSH between nodes:

   ```shell
   ssh-keygen -q -N '' -f ~/.ssh/id_rsa
   cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys
   chmod 0600 ~/.ssh/authorized_keys
   ```

1. Submit an example job:

   ```shell
   cp /var/tmp/dgemm_job.sh .
   sbatch dgemm_job.sh
   ```
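After submitting, standard Slurm commands can be used to watch the job and the
autoscaled nodes; this is a sketch and not part of the original tutorial:

```shell
# Show the job queue; the job may stay pending while compute nodes are provisioned
squeue
# Show partition and node state as the autoscaler brings nodes up
sinfo
# Once the job completes, its output lands in the submission directory
# (slurm-<jobid>.out is the Slurm default naming)
cat slurm-*.out
```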
## Delete the infrastructure when not in use

> **_NOTE:_** If the Slurm controller is shut down before the auto-scale nodes
> are destroyed, they will be left running.
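Before tearing anything down, you can check from the command line whether any
autoscaled compute nodes are still running; this sketch uses `gcloud`, and the
name filter is an assumption based on the node naming in this example:

```shell
# List any remaining instances whose names contain "compute"
gcloud compute instances list --project=<<PROJECT_ID>> --filter="name~compute"
```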
You can also open your browser to the VM instances page and confirm that nodes
named "compute" have been shut down and deleted by the Slurm autoscaler. Then
delete the remaining infrastructure in reverse order of creation:

```shell
terraform -chdir=hpc-intel-select/cluster destroy
terraform -chdir=hpc-intel-select/primary destroy
```
