Skip to content
12 changes: 12 additions & 0 deletions docs/blueprint-validation.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ Each validator is described below:
region
* Common failure: changing 1 value but not the other
* Manual test: `gcloud compute regions describe us-central1 --format="text(zones)" --project $(vars.project_id)`
* `test_machine_type_in_zone`
* Inputs: `project_id` (string), `zone` (string), `machine_type` (string)
* PASS: If the machine type is available in the specified zone and project.
* SKIP (Soft Warning): If the Compute Engine API is disabled or the credentials lack `compute.machineTypes.get` permissions, the validator prints a warning and the check is skipped.
* FAIL: If the machine type is invalid or unavailable in that zone.
* Note: To explicitly verify multiple machine types in a zone, add this validator to the blueprint multiple times.
* Manual test: `gcloud compute machine-types describe $(vars.machine_type) --zone $(vars.zone) --project $(vars.project_id)`
* `test_module_not_used`
* Inputs: none; reads whole blueprint
* PASS: if all instances of use keyword pass matching variables
Expand Down Expand Up @@ -100,6 +107,11 @@ validators:
project_id: $(vars.project_id)
region: $(vars.region)
zone: $(vars.zone)
- validator: test_machine_type_in_zone
inputs:
project_id: $(vars.project_id)
zone: $(vars.zone)
machine_type: c2-standard-60 # any machine type to verify in the zone
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the machine type hardcoded here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since machine_type is not a global variable in most blueprints, I specified a hardcoded one. However, if a global variable is defined, it can be referenced here instead.

```

## Module-level (Metadata) Validators
Expand Down
2 changes: 2 additions & 0 deletions pkg/config/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,4 +167,6 @@ const (
errMsgValueEmptyString = string("value is an empty string")
errMsgLabelValueReqs = string("value can only contain lowercase letters, numeric characters, underscores and dashes, and must be between 0 and 63 characters long")
errMsgSlurmClusterNameReqs = string("must start with a lowercase letter, contain only lowercase letters and numbers, and be between 1 and 10 characters long")
ErrMsgResourceInZone = "%s %q is not available in zone %q in project %q"
ErrMsgResourceInAnyZone = "%s %q is not available in any requested zones [%s] in project %q"
)
53 changes: 52 additions & 1 deletion pkg/validators/cloud.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2023 "Google LLC"
// Copyright 2026 "Google LLC"
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -71,6 +71,15 @@ func handleServiceUsageError(err error, pid string) error {
return fmt.Errorf("unhandled error: %s", herr)
}

// isValidatorExplicit reports whether the blueprint's validators list contains
// an entry whose Validator name equals validatorName.
func isValidatorExplicit(bp config.Blueprint, validatorName string) bool {
	for i := range bp.Validators {
		if bp.Validators[i].Validator != validatorName {
			continue
		}
		return true
	}
	return false
}

// TestApisEnabled tests whether APIs are enabled in given project
func TestApisEnabled(projectID string, requiredAPIs []string) error {
// can return immediately if there are 0 APIs to test
Expand Down Expand Up @@ -240,3 +249,45 @@ func testZoneInRegion(bp config.Blueprint, inputs config.Dict) error {
}
return TestZoneInRegion(m["project_id"], m["zone"], m["region"])
}

// testMachineTypeInZoneAvailability implements the "test_machine_type_in_zone"
// validator.
//
// "project_id" and "zone" are always required inputs; "machine_type" is
// additionally required when the blueprint lists this validator itself.
//
// If the "machine_type" input resolves to a non-empty string, that single
// machine type is checked against the "zone" input (after confirming the zone
// exists), and a soft warning from that check is reported as success.
// Otherwise every module setting whose key ends in "machine_type" is checked
// against the zones resolved for its module.
func testMachineTypeInZoneAvailability(bp config.Blueprint, inputs config.Dict) error {
	const validatorName = "test_machine_type_in_zone"

	// "machine_type" only becomes mandatory when the validator appears in the
	// blueprint's own validators list.
	required := []string{"project_id", "zone"}
	if isValidatorExplicit(bp, validatorName) {
		required = append(required, "machine_type")
	}
	if err := checkInputs(inputs, required); err != nil {
		return err
	}

	svc, err := compute.NewService(context.Background())
	if err != nil {
		return handleClientError(err)
	}
	values, err := inputsAsStrings(inputs)
	if err != nil {
		return err
	}
	projectID, zone, machineType := values["project_id"], values["zone"], values["machine_type"]

	if machineType == "" {
		// No machine type was supplied: scan the modules' settings instead.
		return validateSettingsInModules(bp, zone, projectID, "machine_type", "machine type", func(z, name string) error {
			return validateMachineTypeInZone(svc, projectID, z, name)
		})
	}

	// A machine type was supplied: the zone from the inputs must exist before
	// the machine type can be looked up in it.
	if err := TestZoneExists(projectID, zone); err != nil {
		return err
	}
	err = validateMachineTypeInZone(svc, projectID, zone, machineType)
	if errors.Is(err, errSoftWarning) {
		// The warning has already been printed; let the deployment proceed.
		return nil
	}
	return err
}
240 changes: 240 additions & 0 deletions pkg/validators/cloud_discovery.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
// Copyright 2026 "Google LLC"
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package validators

import (
	"errors"
	"fmt"
	"hpc-toolkit/pkg/config"
	"sort"
	"strings"

	"github.com/zclconf/go-cty/cty"
	compute "google.golang.org/api/compute/v1"
	"google.golang.org/api/googleapi"
)

// errSoftWarning is a package-private sentinel error. Functions in this file
// return it after a soft warning has been printed; callers detect it with
// errors.Is and stop further checks instead of reporting a validation failure,
// which also avoids printing the same warning repeatedly.
var errSoftWarning = errors.New("abort")

// getSoftWarningMessage decides whether err should be treated as a soft warning
// (skip the check) rather than a validation failure.
//
// err qualifies when it wraps a *googleapi.Error whose code is 403, or whose
// code is 400 and whose message (case-insensitively) contains "not enabled" or
// "not been used". Any other 400 is not a soft warning.
//
// On a match it returns the formatted warning text and true. It does not print
// anything itself; printing is left to the caller. Otherwise it returns "" and
// false.
func getSoftWarningMessage(err error, validatorName, projectID, apiName, permission string) (string, bool) {
	var apiErr *googleapi.Error
	if !errors.As(err, &apiErr) {
		return "", false
	}

	soft := false
	switch apiErr.Code {
	case 403:
		soft = true
	case 400:
		lowered := strings.ToLower(apiErr.Message)
		soft = strings.Contains(lowered, "not enabled") || strings.Contains(lowered, "not been used")
	}
	if !soft {
		return "", false
	}

	header := fmt.Sprintf("\n[!] WARNING (%d): validator %q for project %q. Identity lacks permissions. Skipping check.\n", apiErr.Code, validatorName, projectID)
	hint := fmt.Sprintf(" Hint: Ensure %s is enabled and check IAM permissions (%s).\n", apiName, permission)
	return header + hint, true
}

// resolveStringSetting returns the string held by a setting value, or "" when
// no usable string is available.
//
// The value is first passed through bp.Eval; if evaluation fails, the raw value
// is inspected instead. "" is returned for cty.NilVal, for any non-string type,
// and for null, unknown or marked strings. The unknown and marked cases are
// guarded because cty's AsString panics on them.
func resolveStringSetting(bp config.Blueprint, val cty.Value) string {
	v := val
	// Best effort: keep the unevaluated value when Eval reports an error.
	if resolved, err := bp.Eval(v); err == nil {
		v = resolved
	}

	if v == cty.NilVal || v.Type() != cty.String {
		return ""
	}
	if v.IsNull() || !v.IsKnown() || v.IsMarked() {
		return ""
	}
	return v.AsString()
}

// extractZones turns a cty.Value into a slice of zone names.
//
// A null or unknown value yields (nil, nil). Otherwise the value is expanded
// with evaluateAndFlatten and every resulting element must be a string; the
// first non-string element aborts the conversion with the error produced by
// inputsAsStrings for a "zone" input of that type.
func extractZones(val cty.Value) ([]string, error) {
	if val.IsNull() || !val.IsKnown() {
		return nil, nil
	}

	var result []string
	for _, elem := range evaluateAndFlatten(val) {
		if elem.Type() != cty.String {
			// Delegate to inputsAsStrings so the type error matches the one
			// reported for other validator inputs.
			_, err := inputsAsStrings(config.NewDict(map[string]cty.Value{"zone": elem}))
			return nil, err
		}
		result = append(result, elem.AsString())
	}
	return result, nil
}

// resolveZones returns the zones a module targets, derived from its settings.
//
// Every setting whose key ends in "zones" (plural) or "zone" (singular) is
// evaluated and converted with extractZones. Settings that fail to evaluate are
// skipped; a setting that holds a non-string value returns that type error
// immediately.
//
// Priority of the result:
//  1. all zones found under plural keys, if any;
//  2. otherwise all zones found under singular keys, if any;
//  3. otherwise globalZone alone.
//
// The zones are gathered in sets, so the result is sorted before returning to
// keep the API call order and the zone list in error messages deterministic.
func resolveZones(blueprint config.Blueprint, module *config.Module, globalZone string) ([]string, error) {
	plural, singular := make(map[string]bool), make(map[string]bool)

	for key, val := range module.Settings.Items() {
		// HasSuffix also matches the bare keys "zone" and "zones".
		isPlural := strings.HasSuffix(key, "zones")
		if !isPlural && !strings.HasSuffix(key, "zone") {
			continue
		}

		resolved, err := blueprint.Eval(val)
		if err != nil {
			// Best effort: a setting that cannot be evaluated is ignored.
			continue
		}

		zones, err := extractZones(resolved)
		if err != nil {
			return nil, err
		}

		target := singular
		if isPlural {
			target = plural
		}
		for _, z := range zones {
			target[z] = true
		}
	}

	source := plural
	if len(plural) == 0 {
		source = singular
	}
	if len(source) == 0 {
		return []string{globalZone}, nil
	}

	zones := make([]string, 0, len(source))
	for z := range source {
		zones = append(zones, z)
	}
	// Map iteration order is random; sort for reproducible behavior.
	sort.Strings(zones)
	return zones, nil
}

// checkResourceInZones applies "OR" semantics across zones: the resource is
// accepted as soon as validateFn succeeds for one of them.
//
// Empty zone names are ignored. Every zone other than globalZone is first
// checked with TestZoneExists; a failure there is either turned into a printed
// soft warning (returning true, errSoftWarning) or returned as-is with false.
//
// Return values:
//   - (true, nil): validateFn succeeded in some zone, or no zone was attempted.
//   - (true, errSoftWarning): a soft warning was raised; the caller should stop.
//   - (false, err): the zone check failed, or validateFn failed in every
//     attempted zone (err then lists those zones).
func checkResourceInZones(projectID string, zones []string, globalZone, resourceLabel, resourceName string, validateFn func(string, string) error) (bool, error) {
	var tried []string
	for _, zone := range zones {
		if zone == "" {
			continue
		}

		if zone != globalZone {
			if zoneErr := TestZoneExists(projectID, zone); zoneErr != nil {
				msg, soft := getSoftWarningMessage(zoneErr, "test_machine_type_in_zone", projectID, "Compute Engine API", "compute.zones.get")
				if !soft {
					// Not a permission/API problem: report the zone error itself.
					return false, zoneErr
				}
				fmt.Println(msg)
				return true, errSoftWarning
			}
		}

		tried = append(tried, zone)
		switch err := validateFn(zone, resourceName); {
		case err == nil:
			return true, nil
		case errors.Is(err, errSoftWarning):
			return true, errSoftWarning
		}
		// Any other error means "not in this zone"; try the next one.
	}

	if len(tried) == 0 {
		return true, nil
	}
	return false, fmt.Errorf(config.ErrMsgResourceInAnyZone, resourceLabel, resourceName, strings.Join(tried, ", "), projectID)
}

// validateSettingsInModules visits every module of the blueprint and, for each
// setting whose key ends in suffix and resolves to a non-empty string, checks
// that resource against the module's target zones via checkResourceInZones.
//
// Failures are collected and returned together, each annotated with the module
// ID (and setting key). A module whose zones cannot be resolved contributes one
// error and is otherwise skipped. Once a check reports errSoftWarning, the
// remaining settings and modules are not checked; errors collected before that
// point are still returned.
func validateSettingsInModules(blueprint config.Blueprint, globalZone, projectID, suffix, resourceLabel string, validateResource func(zone string, name string) error) error {
	errs := config.Errors{}
	// stop is raised after a soft warning so that later modules make no further
	// API calls and print no repeated warnings.
	stop := false

	visit := func(_ config.ModulePath, module *config.Module) {
		if stop {
			return
		}

		zones, zoneErr := resolveZones(blueprint, module, globalZone)
		if zoneErr != nil {
			errs.Add(fmt.Errorf("in module %q: %w", module.ID, zoneErr))
			return
		}

		// stop needs no re-check inside this loop: it is only set immediately
		// before returning from the visitor.
		for key, val := range module.Settings.Items() {
			if !strings.HasSuffix(key, suffix) {
				continue
			}
			name := resolveStringSetting(blueprint, val)
			if name == "" {
				continue
			}

			found, err := checkResourceInZones(projectID, zones, globalZone, resourceLabel, name, validateResource)
			switch {
			case errors.Is(err, errSoftWarning):
				stop = true
				return
			case !found && err != nil:
				errs.Add(fmt.Errorf("in module %q setting %q: %w", module.ID, key, err))
			}
		}
	}

	blueprint.WalkModulesSafe(visit)
	return errs.OrNil()
}

// validateMachineTypeInZone looks up machineType in the given zone and project
// through the Compute Engine MachineTypes.Get call.
//
// It returns nil when the lookup succeeds. When the lookup error qualifies as a
// soft warning (see getSoftWarningMessage) the warning is printed and
// errSoftWarning is returned. Every other lookup error is reported as the
// machine type being unavailable in that zone.
func validateMachineTypeInZone(s *compute.Service, projectID, zone, machineType string) error {
	if _, err := s.MachineTypes.Get(projectID, zone, machineType).Do(); err != nil {
		msg, soft := getSoftWarningMessage(err, "test_machine_type_in_zone", projectID, "Compute Engine API", "compute.machineTypes.get")
		if soft {
			fmt.Println(msg)
			return errSoftWarning
		}
		return fmt.Errorf(config.ErrMsgResourceInZone, "machine type", machineType, zone, projectID)
	}
	return nil
}
8 changes: 8 additions & 0 deletions pkg/validators/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const (
testZoneInRegionName = "test_zone_in_region"
testModuleNotUsedName = "test_module_not_used"
testDeploymentVariableNotUsedName = "test_deployment_variable_not_used"
testMachineTypeInZone = "test_machine_type_in_zone"
)

func implementations() map[string]func(config.Blueprint, config.Dict) error {
Expand All @@ -66,6 +67,7 @@ func implementations() map[string]func(config.Blueprint, config.Dict) error {
testZoneInRegionName: testZoneInRegion,
testModuleNotUsedName: testModuleNotUsed,
testDeploymentVariableNotUsedName: testDeploymentVariableNotUsed,
testMachineTypeInZone: testMachineTypeInZoneAvailability,
}
}

Expand Down Expand Up @@ -237,6 +239,12 @@ func defaults(bp config.Blueprint) []config.Validator {
"project_id": projectRef,
"zone": zoneRef,
}),
}, config.Validator{
Validator: testMachineTypeInZone,
Inputs: config.NewDict(map[string]cty.Value{
"project_id": projectRef,
"zone": zoneRef,
}),
})
}

Expand Down
Loading
Loading