Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions util/diagnostics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# ParallelCluster Diagnostics

A collection of scripts to diagnose common ParallelCluster issues.
The diagnostics suite is meant to be executed on the cluster's head node.

## Available Scripts

| Script | Description |
|---|---|
| `diagnose-slurm-accounting.py` | Diagnoses SLURM accounting setup |

## Usage

### 1. Deploy to the head node

Run `deploy.sh` from your local machine. It uploads the diagnostics folder to the head node and installs dependencies.

```bash
bash deploy.sh --cluster-name <cluster-name> --region <region> [--ssh-key <path-to-key>]
```

At the end it prints the SSH command to log directly into the diagnostics folder on the head node.

### 2. Run a diagnostic script (example)

Once logged into the head node:

```bash
./diagnose-slurm-accounting.py --help
```
88 changes: 88 additions & 0 deletions util/diagnostics/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import json
import logging
import re
import subprocess

import boto3
import yaml
from botocore.exceptions import ClientError

CHEF_DNA_JSON_FILE = "/etc/chef/dna.json"


def setup_logging():
    """Configure INFO-level logging shared by all diagnosis scripts and return a logger."""
    log_format = "%(asctime)s %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    return logging.getLogger(__name__)


def print_success(message):
    """Echo *message* to stdout as a green, check-marked status line."""
    green, reset = "\033[32m", "\033[0m"
    print("{}[✓] {}{}".format(green, message, reset))


def print_failure(message):
    """Echo *message* to stdout as a red, cross-marked status line."""
    red, reset = "\033[31m", "\033[0m"
    print("{}[✗] {}{}".format(red, message, reset))


def read_dna_json():
    """Load and return the Chef DNA attributes file as a dict.

    Raises:
        RuntimeError: if the file is missing, unreadable, or not valid JSON.
    """
    try:
        with open(CHEF_DNA_JSON_FILE, "r") as dna_file:
            return json.load(dna_file)
    except Exception as err:
        raise RuntimeError(f"Failed to read {CHEF_DNA_JSON_FILE}: {str(err)}") from err


def parse_db_uri(uri):
    """Split a database URI of the form ``<endpoint>[:<port>]``.

    Args:
        uri: Endpoint string, optionally suffixed with ``:<port>``.

    Returns:
        Tuple of ``(endpoint, port)``; the port defaults to 3306 (MySQL)
        when the URI carries no port suffix.

    Raises:
        ValueError: if the port suffix is not a valid integer.
    """
    # Split on the LAST colon so endpoints that themselves contain colons
    # (e.g. a bracketed IPv6 literal like "[2001:db8::1]:3306") keep their
    # full host part intact; splitting on the first colon crashed on these.
    endpoint, separator, port_str = uri.rpartition(":")
    if not separator:
        # No colon at all: the whole string is the endpoint.
        return uri, 3306
    return endpoint, int(port_str)


def get_config_from_s3(region=None):
    """Download and parse the cluster YAML configuration stored in S3.

    Bucket, key, and object version are looked up from the Chef DNA
    attributes on the head node.

    Args:
        region: Optional AWS region for the S3 client; when ``None`` the
            default credential-chain region is used.

    Returns:
        The cluster configuration as parsed by ``yaml.safe_load``.

    Raises:
        RuntimeError: if the DNA file cannot be read or the S3 download fails.
    """
    try:
        cluster_attrs = read_dna_json()["cluster"]
        s3_client = boto3.client("s3", region_name=region) if region else boto3.client("s3")
        s3_object = s3_client.get_object(
            Bucket=cluster_attrs["cluster_s3_bucket"],
            Key=cluster_attrs["cluster_config_s3_key"],
            VersionId=cluster_attrs["cluster_config_version"],
        )
        cluster_config = yaml.safe_load(s3_object["Body"])
        print_success("Downloaded cluster configuration from S3")
        return cluster_config
    except RuntimeError:
        # read_dna_json already raised a descriptive RuntimeError; pass it through.
        raise
    except Exception as e:
        raise RuntimeError(f"Failed to get config from S3: {str(e)}") from e


def get_config_value(conf_file, property_name):
    """Read ``property_name`` from a root-owned ``key=value`` configuration file.

    The file is read via ``sudo cat`` because Slurm accounting config files
    are typically not world-readable.

    Args:
        conf_file: Path to the configuration file.
        property_name: Property key to look up (treated literally, not as a regex).

    Returns:
        The property value as a string, or ``None`` when the property is
        missing or the file cannot be read.
    """
    try:
        result = subprocess.run(["sudo", "cat", conf_file], capture_output=True, text=True, check=True)
        content = result.stdout
        # Escape the property name so keys containing regex metacharacters
        # cannot corrupt the pattern, and anchor at line start (allowing
        # leading whitespace) so e.g. "StorageHost" no longer matches inside
        # a longer key such as "AccountingStorageHost".
        pattern = rf"^\s*{re.escape(property_name)}=(.*?)(?:\n|$)"
        match = re.search(pattern, content, re.MULTILINE)
        if match:
            return match.group(1)
        print_failure(f"{property_name} not found in configuration file")
        return None
    except subprocess.CalledProcessError:
        print_failure(f"Failed to read configuration file (permission denied): {conf_file}")
        return None
    except FileNotFoundError:
        print_failure("'sudo' command not found. Cannot read protected configuration files.")
        return None


def get_secret(secret_arn, region=None):
    """Fetch a secret string from AWS Secrets Manager.

    Args:
        secret_arn: ARN (or name) of the secret to read.
        region: Optional AWS region for the Secrets Manager client.

    Returns:
        The secret value as a string, or ``None`` when retrieval fails.
    """
    try:
        secrets_client = boto3.session.Session().client(
            service_name="secretsmanager", region_name=region
        )
        return secrets_client.get_secret_value(SecretId=secret_arn)["SecretString"]
    except ClientError as e:
        print_failure(f"Failed to retrieve secret from AWS Secrets Manager: {str(e)}")
        return None
95 changes: 95 additions & 0 deletions util/diagnostics/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash
# Uploads the diagnostics folder to the head node of a ParallelCluster and installs dependencies.
#
# Usage:
# bash deploy.sh --cluster-name <cluster-name> --region <region> [--ssh-key <path-to-key>]
#
# Example:
# bash deploy.sh --cluster-name my-cluster --region us-east-1 --ssh-key ~/.ssh/my-key.pem

# Fail fast: abort on command errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Absolute path of the directory containing this script (the folder that gets uploaded).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

usage() {
    # Print the CLI help text, then exit with the given status (default 1).
    cat <<EOF
Usage: $0 --cluster-name <cluster-name> --region <region> [--ssh-key <path>]

Upload the diagnostics folder to the head node of a ParallelCluster.

Options:
 --cluster-name, -n Name of the cluster
 --region, -r AWS region
 --ssh-key, -i Path to the SSH private key
EOF
    exit "${1:-1}"
}

# Parsed CLI options (empty until set by the loop below).
CLUSTER_NAME=""
REGION=""
SSH_KEY=""

# Parse command-line arguments; unknown options print an error and the usage text.
while [[ $# -gt 0 ]]; do
case "$1" in
--cluster-name|-n) CLUSTER_NAME="$2"; shift 2 ;;
--region|-r) REGION="$2"; shift 2 ;;
--ssh-key|-i) SSH_KEY="$2"; shift 2 ;;
--help|-h) usage 0 ;;
*) echo "[ERROR] Unknown option: $1"; usage ;;
esac
done

# Cluster name and region are mandatory; the SSH key is optional.
if [[ -z "$CLUSTER_NAME" || -z "$REGION" ]]; then
usage
fi

# When an SSH key is given it must exist locally before we attempt to use it.
if [[ -n "$SSH_KEY" && ! -f "$SSH_KEY" ]]; then
echo "[ERROR] SSH key file not found: $SSH_KEY"
exit 1
fi

echo "[INFO] Retrieving head node connection info for cluster '${CLUSTER_NAME}' in region '${REGION}'..."

# Ask pcluster for the SSH command it would run (dry run emits JSON with a
# "command" field). Errors are swallowed here ("|| true") so that the friendly
# diagnostic message below is printed instead of a raw failure under `set -e`.
SSH_CMD=$(pcluster ssh -n "$CLUSTER_NAME" -r "$REGION" --dryrun true 2>/dev/null \
| python3 -c "import sys, json; print(json.load(sys.stdin)['command'])" 2>/dev/null || true)

if [[ -z "$SSH_CMD" ]]; then
echo "[ERROR] Failed to retrieve head node connection info. Check that the cluster exists, is running, and your credentials are valid."
exit 1
fi

# Extract user and IP from "ssh <user>@<ip>"
USER_AT_IP=$(echo "$SSH_CMD" | awk '{print $2}')
DEFAULT_USER="${USER_AT_IP%%@*}"   # portion before '@'
HEAD_NODE_IP="${USER_AT_IP##*@}"   # portion after '@'

if [[ -z "$DEFAULT_USER" || -z "$HEAD_NODE_IP" ]]; then
echo "[ERROR] Could not parse user and IP from pcluster ssh output: '${SSH_CMD}'"
exit 1
fi

echo "[INFO] Head node IP: ${HEAD_NODE_IP}"
echo "[INFO] Default user: ${DEFAULT_USER}"
echo "[INFO] Uploading ${SCRIPT_DIR} to ${DEFAULT_USER}@${HEAD_NODE_IP}:~/"

# Name of the folder as it will appear in the remote home directory.
REMOTE_DIR="$(basename "$SCRIPT_DIR")"

# Build rsync and ssh args as arrays to safely handle paths with spaces
RSYNC_ARGS=(-av --exclude="deploy.sh" --exclude="__pycache__")
SSH_ARGS=()
if [[ -n "$SSH_KEY" ]]; then
RSYNC_ARGS+=(-e "ssh -i ${SSH_KEY}")
SSH_ARGS+=(-i "${SSH_KEY}")
fi

# No trailing slash on $SCRIPT_DIR: rsync copies the directory itself into ~/.
rsync "${RSYNC_ARGS[@]}" "$SCRIPT_DIR" "${DEFAULT_USER}@${HEAD_NODE_IP}:~/"

echo "[INFO] Done. Files uploaded to /home/${DEFAULT_USER}/${REMOTE_DIR}/"

echo "[INFO] Installing requirements on head node..."

# NOTE(review): assumes `pip` resolves on the head node's login shell — verify;
# some AMIs only provide `pip3` or a virtualenv-scoped pip.
ssh "${SSH_ARGS[@]}" "${DEFAULT_USER}@${HEAD_NODE_IP}" "pip install -r ~/${REMOTE_DIR}/requirements.txt"

echo "[INFO] Requirements installed successfully."
echo "[INFO] Next steps: log into the head node and run the diagnostics scripts from ~/${REMOTE_DIR}/"
# Assemble a ready-to-paste SSH command that lands directly in the diagnostics folder.
SSH_LOGIN_CMD="ssh"
[[ -n "$SSH_KEY" ]] && SSH_LOGIN_CMD+=" -i ${SSH_KEY}"
SSH_LOGIN_CMD+=" ${DEFAULT_USER}@${HEAD_NODE_IP} -t 'cd ~/${REMOTE_DIR} && bash -l'"
echo "[INFO] ${SSH_LOGIN_CMD}"
Loading
Loading