Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions util/diagnostics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# ParallelCluster Diagnostics

A collection of scripts to diagnose common ParallelCluster issues.
The diagnostics suite is meant to be executed on the cluster's head node.

## Available Scripts

| Script | Description |
|---|---|
| `diagnose-slurm-accounting.py` | Diagnoses SLURM accounting setup |

## Usage

### 1. Deploy to the head node

Run `deploy.sh` from your local machine. It uploads the diagnostics folder to the head node and installs dependencies.

```bash
bash deploy.sh --cluster-name <cluster-name> --region <region> [--ssh-key <path-to-key>]
```

At the end it prints the SSH command to log directly into the diagnostics folder on the head node.

### 2. Run a diagnostic script (example)

Once logged into the head node:

```bash
./diagnose-slurm-accounting.py --help
```
88 changes: 88 additions & 0 deletions util/diagnostics/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import json
import logging
import re
import subprocess

import boto3
import yaml
from botocore.exceptions import ClientError

CHEF_DNA_JSON_FILE = "/etc/chef/dna.json"


def setup_logging():
    """Configure INFO-level logging shared by all diagnosis scripts and return a logger."""
    log_format = "%(asctime)s %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    return logging.getLogger(__name__)


def print_success(message):
    """Echo *message* to stdout as a green, check-marked status line."""
    green, reset = "\033[32m", "\033[0m"
    print("{}[✓] {}{}".format(green, message, reset))


def print_failure(message):
    """Echo *message* to stdout as a red, cross-marked status line."""
    red, reset = "\033[31m", "\033[0m"
    print("{}[✗] {}{}".format(red, message, reset))


def read_dna_json():
    """Load and return the Chef DNA attributes file as a dict.

    Raises:
        RuntimeError: if the file is missing, unreadable, or not valid JSON.
    """
    try:
        with open(CHEF_DNA_JSON_FILE, "r") as dna_file:
            return json.load(dna_file)
    except Exception as err:
        raise RuntimeError(f"Failed to read {CHEF_DNA_JSON_FILE}: {str(err)}") from err


def parse_db_uri(uri):
    """Split a database URI of the form ``<endpoint>[:<port>]``.

    Args:
        uri: Endpoint string, optionally suffixed with ``:<port>``.

    Returns:
        Tuple of ``(endpoint, port)``; the port defaults to 3306 (MySQL)
        when the URI carries no port suffix.

    Raises:
        ValueError: if the port suffix is not a valid integer.
    """
    # Split on the LAST colon so endpoints that themselves contain colons
    # (e.g. a bracketed IPv6 literal like "[2001:db8::1]:3306") keep their
    # full host part intact; splitting on the first colon crashed on these.
    endpoint, separator, port_str = uri.rpartition(":")
    if not separator:
        # No colon at all: the whole string is the endpoint.
        return uri, 3306
    return endpoint, int(port_str)


def get_config_from_s3(region=None):
    """Download and parse the cluster YAML configuration stored in S3.

    Bucket, key, and object version are looked up from the Chef DNA
    attributes on the head node.

    Args:
        region: Optional AWS region for the S3 client; when ``None`` the
            default credential-chain region is used.

    Returns:
        The cluster configuration as parsed by ``yaml.safe_load``.

    Raises:
        RuntimeError: if the DNA file cannot be read or the S3 download fails.
    """
    try:
        cluster_attrs = read_dna_json()["cluster"]
        s3_client = boto3.client("s3", region_name=region) if region else boto3.client("s3")
        s3_object = s3_client.get_object(
            Bucket=cluster_attrs["cluster_s3_bucket"],
            Key=cluster_attrs["cluster_config_s3_key"],
            VersionId=cluster_attrs["cluster_config_version"],
        )
        cluster_config = yaml.safe_load(s3_object["Body"])
        print_success("Downloaded cluster configuration from S3")
        return cluster_config
    except RuntimeError:
        # read_dna_json already raised a descriptive RuntimeError; pass it through.
        raise
    except Exception as e:
        raise RuntimeError(f"Failed to get config from S3: {str(e)}") from e


def get_config_value(conf_file, property_name):
    """Read ``property_name`` from a root-owned ``key=value`` configuration file.

    The file is read via ``sudo cat`` because Slurm accounting config files
    are typically not world-readable.

    Args:
        conf_file: Path to the configuration file.
        property_name: Property key to look up (treated literally, not as a regex).

    Returns:
        The property value as a string, or ``None`` when the property is
        missing or the file cannot be read.
    """
    try:
        result = subprocess.run(["sudo", "cat", conf_file], capture_output=True, text=True, check=True)
        content = result.stdout
        # Escape the property name so keys containing regex metacharacters
        # cannot corrupt the pattern, and anchor at line start (allowing
        # leading whitespace) so e.g. "StorageHost" no longer matches inside
        # a longer key such as "AccountingStorageHost".
        pattern = rf"^\s*{re.escape(property_name)}=(.*?)(?:\n|$)"
        match = re.search(pattern, content, re.MULTILINE)
        if match:
            return match.group(1)
        print_failure(f"{property_name} not found in configuration file")
        return None
    except subprocess.CalledProcessError:
        print_failure(f"Failed to read configuration file (permission denied): {conf_file}")
        return None
    except FileNotFoundError:
        print_failure("'sudo' command not found. Cannot read protected configuration files.")
        return None


def get_secret(secret_arn, region=None):
    """Fetch a secret string from AWS Secrets Manager.

    Args:
        secret_arn: ARN (or name) of the secret to read.
        region: Optional AWS region for the Secrets Manager client.

    Returns:
        The secret value as a string, or ``None`` when retrieval fails.
    """
    try:
        secrets_client = boto3.session.Session().client(
            service_name="secretsmanager", region_name=region
        )
        return secrets_client.get_secret_value(SecretId=secret_arn)["SecretString"]
    except ClientError as e:
        print_failure(f"Failed to retrieve secret from AWS Secrets Manager: {str(e)}")
        return None
95 changes: 95 additions & 0 deletions util/diagnostics/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash
# Uploads the diagnostics folder to the head node of a ParallelCluster and installs dependencies.
#
# Usage:
# bash deploy.sh --cluster-name <cluster-name> --region <region> [--ssh-key <path-to-key>]
#
# Example:
# bash deploy.sh --cluster-name my-cluster --region us-east-1 --ssh-key ~/.ssh/my-key.pem

# Fail fast: abort on command errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Absolute path of the directory containing this script (the folder that gets uploaded).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

usage() {
    # Print the CLI help text, then exit with the given status (default 1).
    cat <<EOF
Usage: $0 --cluster-name <cluster-name> --region <region> [--ssh-key <path>]

Upload the diagnostics folder to the head node of a ParallelCluster.

Options:
 --cluster-name, -n Name of the cluster
 --region, -r AWS region
 --ssh-key, -i Path to the SSH private key
EOF
    exit "${1:-1}"
}

# Parsed CLI options (empty until set by the loop below).
CLUSTER_NAME=""
REGION=""
SSH_KEY=""

# Parse command-line arguments; unknown options print an error and the usage text.
while [[ $# -gt 0 ]]; do
case "$1" in
--cluster-name|-n) CLUSTER_NAME="$2"; shift 2 ;;
--region|-r) REGION="$2"; shift 2 ;;
--ssh-key|-i) SSH_KEY="$2"; shift 2 ;;
--help|-h) usage 0 ;;
*) echo "[ERROR] Unknown option: $1"; usage ;;
esac
done

# Cluster name and region are mandatory; the SSH key is optional.
if [[ -z "$CLUSTER_NAME" || -z "$REGION" ]]; then
usage
fi

# When an SSH key is given it must exist locally before we attempt to use it.
if [[ -n "$SSH_KEY" && ! -f "$SSH_KEY" ]]; then
echo "[ERROR] SSH key file not found: $SSH_KEY"
exit 1
fi

echo "[INFO] Retrieving head node connection info for cluster '${CLUSTER_NAME}' in region '${REGION}'..."

# Ask pcluster for the SSH command it would run (dry run emits JSON with a
# "command" field). Errors are swallowed here ("|| true") so that the friendly
# diagnostic message below is printed instead of a raw failure under `set -e`.
SSH_CMD=$(pcluster ssh -n "$CLUSTER_NAME" -r "$REGION" --dryrun true 2>/dev/null \
| python3 -c "import sys, json; print(json.load(sys.stdin)['command'])" 2>/dev/null || true)

if [[ -z "$SSH_CMD" ]]; then
echo "[ERROR] Failed to retrieve head node connection info. Check that the cluster exists, is running, and your credentials are valid."
exit 1
fi

# Extract user and IP from "ssh <user>@<ip>"
USER_AT_IP=$(echo "$SSH_CMD" | awk '{print $2}')
DEFAULT_USER="${USER_AT_IP%%@*}"   # portion before '@'
HEAD_NODE_IP="${USER_AT_IP##*@}"   # portion after '@'

if [[ -z "$DEFAULT_USER" || -z "$HEAD_NODE_IP" ]]; then
echo "[ERROR] Could not parse user and IP from pcluster ssh output: '${SSH_CMD}'"
exit 1
fi

echo "[INFO] Head node IP: ${HEAD_NODE_IP}"
echo "[INFO] Default user: ${DEFAULT_USER}"
echo "[INFO] Uploading ${SCRIPT_DIR} to ${DEFAULT_USER}@${HEAD_NODE_IP}:~/"

# Name of the folder as it will appear in the remote home directory.
REMOTE_DIR="$(basename "$SCRIPT_DIR")"

# Build rsync and ssh args as arrays to safely handle paths with spaces
RSYNC_ARGS=(-av --exclude="deploy.sh" --exclude="__pycache__")
SSH_ARGS=()
if [[ -n "$SSH_KEY" ]]; then
RSYNC_ARGS+=(-e "ssh -i ${SSH_KEY}")
SSH_ARGS+=(-i "${SSH_KEY}")
fi

# No trailing slash on $SCRIPT_DIR: rsync copies the directory itself into ~/.
rsync "${RSYNC_ARGS[@]}" "$SCRIPT_DIR" "${DEFAULT_USER}@${HEAD_NODE_IP}:~/"

echo "[INFO] Done. Files uploaded to /home/${DEFAULT_USER}/${REMOTE_DIR}/"

echo "[INFO] Installing requirements on head node..."

# NOTE(review): assumes `pip` resolves on the head node's login shell — verify;
# some AMIs only provide `pip3` or a virtualenv-scoped pip.
ssh "${SSH_ARGS[@]}" "${DEFAULT_USER}@${HEAD_NODE_IP}" "pip install -r ~/${REMOTE_DIR}/requirements.txt"

echo "[INFO] Requirements installed successfully."
echo "[INFO] Next steps: log into the head node and run the diagnostics scripts from ~/${REMOTE_DIR}/"
# Assemble a ready-to-paste SSH command that lands directly in the diagnostics folder.
SSH_LOGIN_CMD="ssh"
[[ -n "$SSH_KEY" ]] && SSH_LOGIN_CMD+=" -i ${SSH_KEY}"
SSH_LOGIN_CMD+=" ${DEFAULT_USER}@${HEAD_NODE_IP} -t 'cd ~/${REMOTE_DIR} && bash -l'"
echo "[INFO] ${SSH_LOGIN_CMD}"
Loading
Loading