diff --git a/lablink-infrastructure/README.md b/lablink-infrastructure/README.md
index 13573b4..c19a6bd 100644
--- a/lablink-infrastructure/README.md
+++ b/lablink-infrastructure/README.md
@@ -14,8 +14,8 @@ If you plan to deploy via GitHub Actions workflows, you must configure one repos
 2. **Click "New repository secret"**
 3. **Add the following secret:**
 
-   | Name | Value | Description |
-   |------|-------|-------------|
+   | Name           | Value                                              | Description                          |
+   | -------------- | -------------------------------------------------- | ------------------------------------ |
    | `AWS_ROLE_ARN` | `arn:aws:iam::YOUR-ACCOUNT-ID:role/YOUR-ROLE-NAME` | IAM role ARN for OIDC authentication |
 
 **How to create the AWS IAM role for OIDC:**
@@ -65,6 +65,7 @@ aws iam get-role \
 **Note:** If deploying locally with Terraform (not via GitHub Actions), you don't need this secret. Just configure AWS CLI credentials instead.
 
 ### Prerequisites
+
 - AWS account with credentials configured
 - Terraform installed (v1.6.6+) for local deployments
 - Docker images available on GHCR (or use public LabLink images)
@@ -78,6 +79,7 @@ cp config/example.config.yaml config/config.yaml
 ```
 
 **Edit `config/config.yaml`:**
+
 - **REQUIRED**: Change `db.password` and `app.admin_password` (security!)
 - **REQUIRED**: Set `bucket_name` to a globally unique S3 bucket name (for test/prod)
 - Customize `machine.ami_id` for your AWS region
@@ -86,31 +88,32 @@ cp config/example.config.yaml config/config.yaml
 - Set `app.region` to your AWS region
 
 **Example configuration:**
+
 ```yaml
 db:
-  password: "YOUR-SECURE-DB-PASSWORD"  # CHANGE THIS!
+  password: "YOUR-SECURE-DB-PASSWORD" # CHANGE THIS!
 
 machine:
   machine_type: "g4dn.xlarge"
   image: "ghcr.io/talmolab/lablink-client-base-image:latest"
-  ami_id: "ami-0601752c11b394251"  # Ubuntu 24.04 with Docker + Nvidia (us-west-2)
+  ami_id: "ami-0601752c11b394251" # Ubuntu 24.04 with Docker + Nvidia (us-west-2)
   repository: "https://github.com/talmolab/sleap-tutorial-data.git"
   software: "sleap"
 
 app:
-  admin_password: "YOUR-SECURE-ADMIN-PASSWORD"  # CHANGE THIS!
+  admin_password: "YOUR-SECURE-ADMIN-PASSWORD" # CHANGE THIS!
   region: "us-west-2"
 
-bucket_name: "tf-state-lablink-yourname"  # Must be globally unique
+bucket_name: "tf-state-lablink-yourname" # Must be globally unique
 
 dns:
   enabled: true
   terraform_managed: true
   domain: "lablink.yourdomain.com"
-  zone_id: "Z..."  # Your Route 53 hosted zone ID
+  zone_id: "Z..." # Your Route 53 hosted zone ID
 
 ssl:
-  provider: "letsencrypt"  # or "cloudflare" or "none"
+  provider: "letsencrypt" # or "cloudflare" or "none"
   email: "admin@yourdomain.com"
 ```
 
@@ -131,6 +134,8 @@ terraform plan
 terraform apply
 ```
 
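+> **Note**: If `monitoring.enabled` is `true`, AWS SNS emails a subscription confirmation to the address in `monitoring.email` after `terraform apply`. Open that email and confirm the subscription to receive alert notifications.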
+
 **Option B: Manual Terraform commands**
 
 ```bash
@@ -145,6 +150,8 @@ terraform plan
 terraform apply
 ```
 
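+> **Note**: If `monitoring.enabled` is `true`, AWS SNS emails a subscription confirmation to the address in `monitoring.email` after `terraform apply`. Open that email and confirm the subscription to receive alert notifications.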
+
 ### 3. Verify Deployment (Optional)
 
 After deployment completes, you can verify everything is working:
@@ -159,6 +166,7 @@ IP=$(terraform output -raw ec2_public_ip)
 ```
 
 The verification script checks:
+
 - DNS resolution (if domain configured)
 - HTTP connectivity
 - HTTPS/SSL certificate (if Let's Encrypt enabled)
@@ -166,12 +174,14 @@ The verification script checks:
 ### 4. Access Your Allocator
 
 **With DNS configured:**
+
 ```
 Allocator: https://lablink.yourdomain.com
 Admin UI:  https://lablink.yourdomain.com/admin
 ```
 
 **Without DNS (IP-only):**
+
 ```
 Allocator: http://<ec2-public-ip>:5000
 Admin UI:  http://<ec2-public-ip>:5000/admin
@@ -184,7 +194,7 @@ Admin UI:  http://<ec2-public-ip>:5000/admin
 - **Lambda Function**: Processes CloudWatch logs from client VMs
 - **Route 53 DNS**: Automatic DNS record management (if configured)
 - **Security Groups**: Network security rules
-- **IAM Roles**: 
+- **IAM Roles**:
   - **Allocator Instance Role**: A role for the allocator EC2 instance with permissions to manage the lifecycle of client VMs (run, terminate, tag, etc.) and their associated resources (IAM roles, instance profiles).
   - **CloudWatch Agent Role**: A role for client VMs to send logs to CloudWatch.
 - **CloudWatch Log Groups**: Centralized logging for troubleshooting
@@ -193,22 +203,24 @@ Admin UI:  http://<ec2-public-ip>:5000/admin
 
 LabLink supports three deployment environments:
 
-| Environment | Backend State | Use Case | S3 Bucket Required? |
-|-------------|---------------|----------|---------------------|
-| `dev`       | Local file    | Local testing, experimentation | No |
-| `test`      | S3            | Staging, pre-production testing | Yes |
-| `prod`      | S3            | Production deployments | Yes |
+| Environment | Backend State | Use Case                        | S3 Bucket Required? |
+| ----------- | ------------- | ------------------------------- | ------------------- |
+| `dev`       | Local file    | Local testing, experimentation  | No                  |
+| `test`      | S3            | Staging, pre-production testing | Yes                 |
+| `prod`      | S3            | Production deployments          | Yes                 |
 
 Each environment maintains separate Terraform state to avoid conflicts.
 
 ## Configuration Reference
 
 ### Database (`db`)
+
 - `password`: PostgreSQL password (**CHANGE THIS!**)
 - `dbname`: Database name (default: `lablink_db`)
 - `user`: Database username (default: `lablink`)
 
 ### Machine Settings (`machine`)
+
 - `machine_type`: AWS EC2 instance type for client VMs (e.g., `g4dn.xlarge`, `g5.2xlarge`)
 - `image`: Docker image for client container (e.g., `ghcr.io/talmolab/lablink-client-base-image:latest`)
 - `ami_id`: Amazon Machine Image for client VMs (region-specific)
@@ -216,22 +228,40 @@ Each environment maintains separate Terraform state to avoid conflicts.
 - `software`: Software identifier (e.g., `sleap`)
 
 ### Application (`app`)
+
 - `admin_password`: Admin UI password (**CHANGE THIS!**)
 - `admin_user`: Admin username (default: `admin`)
 - `region`: AWS region (e.g., `us-west-2`)
 
 ### DNS Configuration (`dns`)
+
 - `enabled`: Enable DNS management (true/false)
 - `terraform_managed`: Let Terraform manage Route 53 records (true/false)
 - `domain`: Your domain name (e.g., `lablink.example.com`)
 - `zone_id`: Route 53 hosted zone ID (required if `terraform_managed: true`)
 
 ### SSL Configuration (`ssl`)
+
 - `provider`: SSL provider (`letsencrypt`, `cloudflare`, or `none`)
 - `email`: Email for Let's Encrypt notifications
 - `staging`: When `true`, serve HTTP only for unlimited testing. When `false`, serve HTTPS with trusted Let's Encrypt certificates (rate limited to 5 duplicate certificates per week)
 
+### Monitoring (`monitoring`)
+
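+- `enabled`: `true` to enable email alerts (subscribes `email` to the alerts SNS topic), `false` to disable (default: `false`).
+- `email`: Email address that receives alarm and budget notifications.
+- `thresholds`:
+  - `max_instances_per_5min`: Alarm when more instances than this are created within 5 minutes (default: `10`).
+  - `max_terminations_per_5min`: Alarm when more instances than this are terminated within 5 minutes (default: `10`).
+  - `max_unauthorized_calls_per_15min`: Alarm when more unauthorized API calls than this occur within 15 minutes (default: `5`).
+- `budget`:
+  - `enabled`: `true` to create a monthly AWS budget that emails at 50%, 80%, 100%, and 150% of the limit, `false` to disable (default: `false`).
+  - `monthly_budget_usd`: Monthly budget limit in USD (default: `100`).
+- `cloudtrail`:
+  - `retention_days`: Number of days to retain CloudTrail events in the CloudWatch log group (default: `90`).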
+
 **Staging Mode:**
+
 - Use `staging: true` for rapid infrastructure testing without SSL complications
 - Caddy serves HTTP only (no certificates, no redirects)
 - Client VMs connect via HTTP
@@ -239,21 +269,25 @@ Each environment maintains separate Terraform state to avoid conflicts.
 - **Not for production use** - no encryption
 
 **Production Mode:**
+
 - Use `staging: false` for production deployments
 - Caddy obtains trusted Let's Encrypt certificates
 - Serves HTTPS with automatic HTTP→HTTPS redirects
 - Subject to Let's Encrypt rate limits
 
 ### Terraform State (`bucket_name`)
+
 - S3 bucket name for Terraform state storage (test/prod only)
 - Must be globally unique
 
 ### Startup Script (`startup_script`)
+
 - `enabled`: `true` to run the custom startup script on client VMs, `false` to disable.
 - `path`: Path to the custom startup script file. Default: `config/custom-startup.sh`.
 - `on_error`: Behavior on script error. `continue` (default) ignores errors, `fail` stops VM setup.
 
 **Additional Resources:**
+
 - [Configuration Guide](../docs/configuration.md#ssltls-options-ssl) - Detailed SSL configuration reference
 - [Troubleshooting](../docs/troubleshooting.md#browser-cannot-access-http-staging-mode) - Browser HSTS cache issues
 - [Security](../docs/security.md#staging-mode-security) - Security implications of staging mode
@@ -261,27 +295,33 @@ Each environment maintains separate Terraform state to avoid conflicts.
 ## Included Scripts
 
 ### `init-terraform.sh` (Optional Helper)
+
 Simplifies Terraform initialization by automatically reading the S3 bucket name from your config file.
 
 **Usage:**
+
 ```bash
 ../scripts/init-terraform.sh [dev|test|prod|ci-test]
 ```
 
 **What it does:**
+
 - Reads `bucket_name` from `config/config.yaml`
 - Runs `terraform init` with appropriate backend configuration
 - Validates configuration before initializing
 
 **Equivalent manual command:**
+
 ```bash
 terraform init -backend-config=backend-test.hcl -backend-config="bucket=YOUR-BUCKET"
 ```
 
 ### `verify-deployment.sh` (Optional Manual Verification)
+
 Comprehensive deployment verification script for post-deployment testing.
 
 **Usage:**
+
 ```bash
 ./verify-deployment.sh [domain] [ip]
 
@@ -291,11 +331,13 @@ Comprehensive deployment verification script for post-deployment testing.
 ```
 
 **What it checks:**
+
 1. DNS resolution (waits up to 5 minutes for propagation)
 2. HTTP connectivity (waits for allocator to start)
 3. HTTPS/SSL certificate (waits for Let's Encrypt, if enabled)
 
 **When to use:**
+
 - After first deployment to verify everything works
 - When troubleshooting DNS or SSL issues
 - To confirm HTTPS certificate was obtained
@@ -303,19 +345,23 @@ Comprehensive deployment verification script for post-deployment testing.
 **Note:** GitHub Actions workflows include automatic verification, so this script is mainly for local deployments or manual troubleshooting.
 
 ### `config/custom-startup.sh` (Customizable Client Startup)
+
 The `config/custom-startup.sh` script is a customizable script that is executed upon the startup of a client VM. This script provides a way to automate the setup and configuration of the client environment.
 
 **Customization:**
 You can add custom startup behavior by modifying the `config/custom-startup.sh` script. For example, you could:
+
 - Install additional software packages.
 - Start additional services.
 
 Any changes made to this script will be reflected in the client VMs upon their next startup.
 
 ### `user_data.sh` (Automatic - DO NOT RUN MANUALLY)
+
 EC2 instance initialization script embedded in Terraform configuration.
 
 **What it does:**
+
 - Installs Docker and Caddy on the allocator EC2 instance
 - Pulls the allocator Docker image
 - Starts the allocator container
@@ -340,23 +386,27 @@ AMI IDs are region-specific. If deploying to a different region:
 3. Update `machine.ami_id` in `config/config.yaml`
 
 **Pre-configured custom AMIs (us-west-2):**
+
 - Client VM: `ami-0601752c11b394251` (Ubuntu 24.04 + Docker + Nvidia GPU drivers)
 - Allocator VM: `ami-0bd08c9d4aa9f0bc6` (Ubuntu 24.04 + Docker)
 
 ## Using Custom Docker Images
 
 ### Option 1: Use LabLink Public Images
+
 ```yaml
 machine:
   image: "ghcr.io/talmolab/lablink-client-base-image:latest"
 ```
 
 **Available tags:**
+
 - `latest` - Latest stable release
 - `linux-amd64-test` - Latest development build
 - `0.0.8a0` - Specific version tag
 
 ### Option 2: Build Your Own Images
+
 1. Fork the [LabLink repository](https://github.com/talmolab/lablink)
 2. Customize the client package in `packages/client/`
 3. Build and publish your images via GitHub Actions
@@ -387,21 +437,25 @@ See the workflows in the `.github` directory for automated deployment examples.
 ### Common Issues
 
 **DNS not resolving:**
+
 - Check Route 53 hosted zone exists and `zone_id` is correct
 - Wait up to 5 minutes for DNS propagation
 - Verify domain registrar nameservers point to Route 53
 
 **SSL certificate not obtained:**
+
 - Check DNS resolves correctly first (SSL requires valid DNS)
 - Verify port 80 and 443 are accessible (Let's Encrypt validation)
 - Check Caddy logs: `ssh ubuntu@<ip> sudo journalctl -u caddy -f`
 
 **Allocator not responding:**
+
 - Check Docker container is running: `ssh ubuntu@<ip> sudo docker ps`
 - View container logs: `ssh ubuntu@<ip> sudo docker logs $(sudo docker ps -q)`
 - Verify security group allows inbound traffic on port 5000
 
 **Terraform state locked:**
+
 - Check DynamoDB lock table in AWS console
 - Manually remove lock if workflow was interrupted
 - Use `terraform force-unlock <lock-id>` as last resort
@@ -423,6 +477,7 @@ terraform destroy
 ```
 
 This removes:
+
 - Allocator EC2 instance
 - Lambda function
 - Security groups
@@ -445,6 +500,7 @@ If `terraform destroy` fails or leaves orphaned resources, use the automated cle
 ```
 
 The script automatically handles:
+
 - Reading configuration from `config/config.yaml`
 - Backing up Terraform state files before deletion
 - Deleting resources in correct dependency order
diff --git a/lablink-infrastructure/budget.tf b/lablink-infrastructure/budget.tf
new file mode 100644
index 0000000..acfb04f
--- /dev/null
+++ b/lablink-infrastructure/budget.tf
@@ -0,0 +1,59 @@
+resource "aws_budgets_budget" "lablink_monthly" { # Monthly cost budget with email notifications at 50/80/100/150% of the limit
+  count = try(local.config_file.monitoring.budget.enabled, false) ? 1 : 0 # created only when monitoring.budget.enabled is true
+
+  name              = "lablink-monthly-budget-${var.resource_suffix}"
+  budget_type       = "COST"
+  limit_amount      = try(local.config_file.monitoring.budget.monthly_budget_usd, "100")
+  limit_unit        = "USD"
+  time_period_start = formatdate("YYYY-MM-01_00:00", timestamp()) # first day of the current month; NOTE(review): timestamp() is re-evaluated on every run, so this value changes each month - confirm that is acceptable
+  time_unit         = "MONTHLY"
+
+  cost_filter {
+    name = "TagKeyValue"
+    values = [
+      format("user:Environment$%s", var.resource_suffix), # format() avoids the "$$" + "{" escape, which yields a literal template instead of interpolating the suffix
+      "user:ManagedBy$lablink-allocator-${var.resource_suffix}"
+    ]
+  }
+
+  # 50% warning
+  notification {
+    comparison_operator        = "GREATER_THAN"
+    threshold                  = 50
+    threshold_type             = "PERCENTAGE"
+    notification_type          = "ACTUAL"
+    subscriber_email_addresses = [local.config_file.monitoring.email]
+  }
+
+  # 80% urgent
+  notification {
+    comparison_operator        = "GREATER_THAN"
+    threshold                  = 80
+    threshold_type             = "PERCENTAGE"
+    notification_type          = "ACTUAL"
+    subscriber_email_addresses = [local.config_file.monitoring.email]
+  }
+
+  # 100% critical
+  notification {
+    comparison_operator        = "GREATER_THAN"
+    threshold                  = 100
+    threshold_type             = "PERCENTAGE"
+    notification_type          = "ACTUAL"
+    subscriber_email_addresses = [local.config_file.monitoring.email]
+  }
+
+  # 150% severe overage
+  notification {
+    comparison_operator        = "GREATER_THAN"
+    threshold                  = 150
+    threshold_type             = "PERCENTAGE"
+    notification_type          = "ACTUAL"
+    subscriber_email_addresses = [local.config_file.monitoring.email]
+  }
+
+  tags = {
+    Name        = "lablink-monthly-budget-${var.resource_suffix}"
+    Environment = var.resource_suffix
+  }
+}
\ No newline at end of file
diff --git a/lablink-infrastructure/cloudtrail.tf b/lablink-infrastructure/cloudtrail.tf
new file mode 100644
index 0000000..d920b84
--- /dev/null
+++ b/lablink-infrastructure/cloudtrail.tf
@@ -0,0 +1,127 @@
+# Destination bucket for CloudTrail log files; the name embeds the environment suffix and the AWS account ID
+resource "aws_s3_bucket" "cloudtrail_logs" {
+  bucket        = "lablink-cloudtrail-${var.resource_suffix}-${data.aws_caller_identity.current.account_id}"
+  force_destroy = true
+
+  tags = {
+    Environment = var.resource_suffix
+    Name        = "lablink-cloudtrail-${var.resource_suffix}"
+  }
+}
+
+# Default server-side encryption (AES256) for objects stored in the CloudTrail log bucket
+resource "aws_s3_bucket_server_side_encryption_configuration" "cloudtrail_encryption" {
+  bucket = aws_s3_bucket.cloudtrail_logs.id
+
+  rule {
+    apply_server_side_encryption_by_default {
+      sse_algorithm = "AES256"
+    }
+  }
+}
+
+# Bucket policy that lets the CloudTrail service principal check the bucket ACL and write log objects
+resource "aws_s3_bucket_policy" "cloudtrail_policy" {
+  bucket = aws_s3_bucket.cloudtrail_logs.id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Sid    = "AWSCloudTrailAclCheck"
+        Effect = "Allow"
+        Principal = {
+          Service = "cloudtrail.amazonaws.com"
+        }
+        Action   = "s3:GetBucketAcl"
+        Resource = aws_s3_bucket.cloudtrail_logs.arn
+      },
+      {
+        Sid    = "AWSCloudTrailWrite"
+        Effect = "Allow"
+        Principal = {
+          Service = "cloudtrail.amazonaws.com"
+        }
+        Action   = "s3:PutObject"
+        Resource = "${aws_s3_bucket.cloudtrail_logs.arn}/*"
+        Condition = {
+          StringEquals = {
+            "s3:x-amz-acl" = "bucket-owner-full-control" # writes are allowed only when the upload carries this ACL
+          }
+        }
+      }
+    ]
+  })
+}
+
+# CloudWatch Log Group that receives CloudTrail events; the metric filters in cloudwatch_alarms.tf read from it
+resource "aws_cloudwatch_log_group" "cloudtrail_logs" {
+  name              = "lablink-cloudtrail-${var.resource_suffix}"
+  retention_in_days = try(local.config_file.monitoring.cloudtrail.retention_days, 90) # falls back to 90 days when the config key is absent
+}
+
+# IAM role that the CloudTrail service assumes in order to deliver events to CloudWatch Logs
+resource "aws_iam_role" "cloudtrail_cloudwatch_role" {
+  name = "lablink_cloudtrail_cloudwatch_${var.resource_suffix}"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Principal = {
+          Service = "cloudtrail.amazonaws.com" # only the CloudTrail service may assume this role
+        }
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+}
+
+# Inline policy on the CloudTrail role: create log streams and put log events in the CloudTrail log group
+resource "aws_iam_role_policy" "cloudtrail_cloudwatch_policy" {
+  name = "lablink_cloudtrail_cloudwatch_policy_${var.resource_suffix}"
+  role = aws_iam_role.cloudtrail_cloudwatch_role.id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "logs:CreateLogStream",
+          "logs:PutLogEvents"
+        ]
+        Resource = "${aws_cloudwatch_log_group.cloudtrail_logs.arn}:*" # scoped to the CloudTrail log group
+      }
+    ]
+  })
+}
+
+# Multi-region CloudTrail trail that writes management events to the S3 log bucket and to CloudWatch Logs
+resource "aws_cloudtrail" "lablink_trail" {
+  name                          = "lablink-trail-${var.resource_suffix}"
+  s3_bucket_name                = aws_s3_bucket.cloudtrail_logs.id
+  include_global_service_events = true
+  is_multi_region_trail         = true
+  enable_log_file_validation    = true
+
+  cloud_watch_logs_group_arn = "${aws_cloudwatch_log_group.cloudtrail_logs.arn}:*"
+  cloud_watch_logs_role_arn  = aws_iam_role.cloudtrail_cloudwatch_role.arn
+
+  event_selector {
+    read_write_type           = "All" # both read and write management events
+    include_management_events = true
+  }
+
+  depends_on = [ # the bucket policy, log group and role policy must exist before the trail is created
+    aws_s3_bucket_policy.cloudtrail_policy,
+    aws_cloudwatch_log_group.cloudtrail_logs,
+    aws_iam_role_policy.cloudtrail_cloudwatch_policy
+  ]
+
+  tags = {
+    Name        = "lablink-trail-${var.resource_suffix}"
+    Environment = var.resource_suffix
+  }
+}
\ No newline at end of file
diff --git a/lablink-infrastructure/cloudwatch_alarms.tf b/lablink-infrastructure/cloudwatch_alarms.tf
new file mode 100644
index 0000000..8e57a07
--- /dev/null
+++ b/lablink-infrastructure/cloudwatch_alarms.tf
@@ -0,0 +1,170 @@
+# SNS Topic for Admin Alerts: the alarm action target of every CloudWatch alarm in this file
+resource "aws_sns_topic" "admin_alerts" {
+  name = "lablink-admin-alerts-${var.resource_suffix}"
+
+  tags = {
+    Name        = "lablink-admin-alerts-${var.resource_suffix}"
+    Environment = var.resource_suffix
+  }
+}
+
+# SNS Email Subscription: delivers admin alerts to monitoring.email; created only when monitoring.enabled is true
+resource "aws_sns_topic_subscription" "admin_email" {
+  count     = try(local.config_file.monitoring.enabled, false) ? 1 : 0 # missing config key is treated as disabled
+  topic_arn = aws_sns_topic.admin_alerts.arn
+  protocol  = "email"
+  endpoint  = try(local.config_file.monitoring.email, "") # NOTE(review): an empty address is passed through when monitoring is enabled without an email - confirm config validation catches this
+}
+
+# Metric Filter: Mass Instance Launches (RunInstances calls whose principal matches the allocator instance role)
+resource "aws_cloudwatch_log_metric_filter" "run_instances" {
+  name           = "lablink-run-instances-${var.resource_suffix}"
+  log_group_name = aws_cloudwatch_log_group.cloudtrail_logs.name
+
+  pattern = <<PATTERN
+{ ($.eventName = RunInstances) && ($.userIdentity.principalId = *lablink_instance_role*) }
+PATTERN
+
+  metric_transformation {
+    name          = "RunInstancesCount"
+    namespace     = "LabLinkSecurity/${var.resource_suffix}"
+    value         = "$.requestParameters.instancesSet.items[0].maxCount" # CloudTrail nests maxCount under instancesSet.items[], not directly under requestParameters
+    unit          = "Count"
+    default_value = 0 # emitted when no event matches; a non-zero default would report launches that never happened
+  }
+}
+
+# Alarm: Mass Instance Launches - notifies the admin SNS topic when the 5-minute RunInstancesCount sum exceeds the threshold
+resource "aws_cloudwatch_metric_alarm" "mass_instance_launch" {
+  alarm_name          = "lablink-mass-instance-launch-${var.resource_suffix}"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = 1
+  metric_name         = "RunInstancesCount"
+  namespace           = "LabLinkSecurity/${var.resource_suffix}"
+  period              = 300  # 5 minutes
+  statistic           = "Sum"
+  threshold           = try(local.config_file.monitoring.thresholds.max_instances_per_5min, 10)
+  alarm_description   = "Alert when allocator launches >${try(local.config_file.monitoring.thresholds.max_instances_per_5min, 10)} instances in 5 minutes"
+  alarm_actions       = [aws_sns_topic.admin_alerts.arn]
+  treat_missing_data  = "notBreaching" # periods without data points do not trigger the alarm
+
+  tags = {
+    Name        = "lablink-mass-instance-launch-${var.resource_suffix}"
+    Environment = var.resource_suffix
+    Severity    = "high"
+  }
+}
+
+# Metric Filter: Large Instance Types - counts RunInstances calls by the allocator role for p4d.*, p3.* or g5.* instance types
+resource "aws_cloudwatch_log_metric_filter" "large_instances" {
+  name           = "lablink-large-instances-${var.resource_suffix}"
+  log_group_name = aws_cloudwatch_log_group.cloudtrail_logs.name
+
+  pattern = <<PATTERN
+{ ($.eventName = RunInstances) && ($.userIdentity.principalId = *lablink_instance_role*) && (($.requestParameters.instanceType = p4d.*) || ($.requestParameters.instanceType = p3.*) || ($.requestParameters.instanceType = g5.*)) }
+PATTERN
+
+  metric_transformation {
+    name      = "LargeInstanceLaunched"
+    namespace = "LabLinkSecurity/${var.resource_suffix}"
+    value     = "1" # one data point per matching event
+    unit      = "Count"
+  }
+}
+
+# Alarm: Large Instance Types - notifies the admin SNS topic as soon as one LargeInstanceLaunched event is recorded in a 5-minute period
+resource "aws_cloudwatch_metric_alarm" "large_instance_launched" {
+  alarm_name          = "lablink-large-instance-launched-${var.resource_suffix}"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = 1
+  metric_name         = "LargeInstanceLaunched"
+  namespace           = "LabLinkSecurity/${var.resource_suffix}"
+  period              = 300
+  statistic           = "Sum"
+  threshold           = 0  # Alert on ANY large instance
+  alarm_description   = "Alert when allocator launches expensive instance types (p4d, p3, g5)"
+  alarm_actions       = [aws_sns_topic.admin_alerts.arn]
+  treat_missing_data  = "notBreaching" # periods without data points do not trigger the alarm
+
+  tags = {
+    Name        = "lablink-large-instance-launched-${var.resource_suffix}"
+    Environment = var.resource_suffix
+    Severity    = "critical"
+  }
+}
+
+# Metric Filter: Unauthorized API Calls - AccessDenied or UnauthorizedOperation errors, restricted to the allocator instance role
+resource "aws_cloudwatch_log_metric_filter" "unauthorized_calls" {
+  name           = "lablink-unauthorized-calls-${var.resource_suffix}"
+  log_group_name = aws_cloudwatch_log_group.cloudtrail_logs.name
+
+  pattern = <<PATTERN
+{ (($.errorCode = AccessDenied) || ($.errorCode = UnauthorizedOperation)) && ($.userIdentity.principalId = *lablink_instance_role*) }
+PATTERN
+
+  metric_transformation {
+    name      = "UnauthorizedAPICalls"
+    namespace = "LabLinkSecurity/${var.resource_suffix}"
+    value     = "1" # one data point per matching event; the error-code alternatives are grouped because && binds tighter than ||
+    unit      = "Count"
+  }
+}
+
+# Alarm: Unauthorized API Calls - notifies the admin SNS topic when the 15-minute UnauthorizedAPICalls sum exceeds the threshold
+resource "aws_cloudwatch_metric_alarm" "unauthorized_calls" {
+  alarm_name          = "lablink-unauthorized-calls-${var.resource_suffix}"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = 1
+  metric_name         = "UnauthorizedAPICalls"
+  namespace           = "LabLinkSecurity/${var.resource_suffix}"
+  period              = 900  # 15 minutes
+  statistic           = "Sum"
+  threshold           = try(local.config_file.monitoring.thresholds.max_unauthorized_calls_per_15min, 5) # defaults to 5 when the config key is absent
+  alarm_description   = "Alert when allocator makes unauthorized API calls (possible attack or permission issue)"
+  alarm_actions       = [aws_sns_topic.admin_alerts.arn]
+  treat_missing_data  = "notBreaching" # periods without data points do not trigger the alarm
+
+  tags = {
+    Name        = "lablink-unauthorized-calls-${var.resource_suffix}"
+    Environment = var.resource_suffix
+    Severity    = "critical"
+  }
+}
+
+# Metric Filter: High Termination Rate - counts TerminateInstances calls whose principal matches the allocator instance role
+resource "aws_cloudwatch_log_metric_filter" "high_termination_rate" {
+  name           = "lablink-high-termination-rate-${var.resource_suffix}"
+  log_group_name = aws_cloudwatch_log_group.cloudtrail_logs.name
+  pattern        = <<PATTERN
+{ ($.eventName = TerminateInstances) && ($.userIdentity.principalId = *lablink_instance_role*) }
+PATTERN
+
+  metric_transformation {
+    name          = "TerminateInstancesCount"
+    namespace     = "LabLinkSecurity/${var.resource_suffix}"
+    value         = "1" # counts API calls: requestParameters.instancesSet.items is an array, not a numeric value a metric can publish
+    unit          = "Count"
+    default_value = 0 # emitted when no event matches
+  }
+}
+
+# Alarm: High Termination Rate - notifies the admin SNS topic when the 5-minute TerminateInstancesCount sum exceeds the threshold
+resource "aws_cloudwatch_metric_alarm" "high_termination_rate" {
+  alarm_name          = "lablink-high-termination-rate-${var.resource_suffix}"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = 1
+  metric_name         = "TerminateInstancesCount"
+  namespace           = "LabLinkSecurity/${var.resource_suffix}"
+  period              = 300 # 5 minutes
+  statistic           = "Sum"
+  threshold           = try(local.config_file.monitoring.thresholds.max_terminations_per_5min, 10)
+  alarm_description   = "Alert when allocator terminates >${try(local.config_file.monitoring.thresholds.max_terminations_per_5min, 10)} instances in 5 minutes (possible cleanup or attack)"
+  alarm_actions       = [aws_sns_topic.admin_alerts.arn]
+  treat_missing_data  = "notBreaching" # periods without data points do not trigger the alarm
+
+  tags = {
+    Name        = "lablink-high-termination-rate-${var.resource_suffix}"
+    Environment = var.resource_suffix
+    Severity    = "high"
+  }
+}
\ No newline at end of file
diff --git a/lablink-infrastructure/config/ci-test.example.yaml b/lablink-infrastructure/config/ci-test.example.yaml
index 34a2220..73de14f 100644
--- a/lablink-infrastructure/config/ci-test.example.yaml
+++ b/lablink-infrastructure/config/ci-test.example.yaml
@@ -17,7 +17,7 @@
 db:
   dbname: "lablink_db"
   user: "lablink"
-  password: "PLACEHOLDER_DB_PASSWORD"  # Injected from GitHub secret at deploy time
+  password: "PLACEHOLDER_DB_PASSWORD" # Injected from GitHub secret at deploy time
   host: "localhost"
   port: 5432
   table_name: "vms"
@@ -25,13 +25,13 @@ db:
 
 machine:
   # Client VM instance type (can use smaller for testing)
-  machine_type: "t3.medium"  # Cheaper for template testing
+  machine_type: "t3.medium" # Cheaper for template testing
 
   # Docker image for client VMs
   image: "ghcr.io/talmolab/lablink-client-base-image:linux-amd64-latest-test"
 
   # AMI ID for your AWS region (Ubuntu 24.04 with Docker + Nvidia GPU Driver)
-  ami_id: "ami-0601752c11b394251"  # us-west-2 Ubuntu 24.04 custom AMI
+  ami_id: "ami-0601752c11b394251" # us-west-2 Ubuntu 24.04 custom AMI
 
   # Git repository with test/tutorial data
   repository: "https://github.com/talmolab/sleap-tutorial-data.git"
@@ -49,30 +49,48 @@ allocator:
 
 app:
   admin_user: "admin"
-  admin_password: "PLACEHOLDER_ADMIN_PASSWORD"  # Injected from GitHub secret at deploy time
+  admin_password: "PLACEHOLDER_ADMIN_PASSWORD" # Injected from GitHub secret at deploy time
   region: "us-west-2"
 
 dns:
   # DNS enabled for testing (required to work around Caddy bug)
   enabled: true
-  terraform_managed: true  # Let Terraform manage DNS records
-  domain: "lablink-template-testing.com"  # Template testing domain
-  zone_id: ""  # Will be set by setup-aws-infrastructure.sh
-  app_name: ""  # Not used with custom pattern
-  pattern: "custom"  # Using custom pattern (workaround for allocator bug - see issue #212)
-  custom_subdomain: "ci-test"  # Creates: ci-test.lablink-template-testing.com
-  create_zone: false  # Use existing zone created by setup script
+  terraform_managed: true # Let Terraform manage DNS records
+  domain: "lablink-template-testing.com" # Template testing domain
+  zone_id: "" # Will be set by setup-aws-infrastructure.sh
+  app_name: "" # Not used with custom pattern
+  pattern: "custom" # Using custom pattern (workaround for allocator bug - see issue #212)
+  custom_subdomain: "ci-test" # Creates: ci-test.lablink-template-testing.com
+  create_zone: false # Use existing zone created by setup script
 
 eip:
   # Use dynamic EIPs for ci-test (creates new, releases on destroy)
   strategy: "dynamic"
-  tag_name: "lablink-eip"  # Will become lablink-eip-ci-test
+  tag_name: "lablink-eip" # Will become lablink-eip-ci-test
 
 ssl:
   # Use Let's Encrypt staging certs for testing (unlimited, not trusted)
   provider: "letsencrypt"
-  email: "admin@example.com"  # Update with maintainer email
-  staging: true  # Staging certs = HTTP only, no rate limits
+  email: "admin@example.com" # Update with maintainer email
+  staging: true # Staging certs = HTTP only, no rate limits
+
+startup_script:
+  enabled: false # true = run the custom startup script on client VMs
+  path: "" # Path to the custom startup script file (e.g. config/custom-startup.sh)
+  on_error: "continue" # "continue" = ignore script errors, "fail" = stop VM setup
+
+monitoring:
+  enabled: false # true = subscribe `email` to the admin-alerts SNS topic
+  email: "" # Recipient of alarm and budget notification emails
+  thresholds:
+    max_instances_per_5min: 10 # Alarm when the RunInstances metric summed over 5 minutes exceeds this
+    max_terminations_per_5min: 20 # Alarm when the TerminateInstances metric summed over 5 minutes exceeds this
+    max_unauthorized_calls_per_15min: 5 # Alarm when unauthorized API calls summed over 15 minutes exceed this
+  budget:
+    enabled: false # true = create the monthly AWS budget
+    monthly_budget_usd: 500 # Monthly budget limit in USD
+  cloudtrail:
+    retention_days: 90 # Retention of the CloudTrail CloudWatch log group, in days
 
 # S3 bucket for Terraform state (template testing infrastructure)
 # Separate from production deployments to avoid conflicts
diff --git a/lablink-infrastructure/config/config.yaml b/lablink-infrastructure/config/config.yaml
index 3034daf..8b5f366 100644
--- a/lablink-infrastructure/config/config.yaml
+++ b/lablink-infrastructure/config/config.yaml
@@ -5,7 +5,7 @@
 db:
   dbname: "lablink_db"
   user: "lablink"
-  password: "PLACEHOLDER_DB_PASSWORD"  # Injected from GitHub secret at deploy time
+  password: "PLACEHOLDER_DB_PASSWORD" # Injected from GitHub secret at deploy time
   host: "localhost"
   port: 5432
   table_name: "vms"
@@ -13,13 +13,13 @@ db:
 
 machine:
   # Client VM instance type (must support your workload)
-  machine_type: "g4dn.xlarge"  # GPU instance for ML workloads
+  machine_type: "g4dn.xlarge" # GPU instance for ML workloads
 
   # Docker image for client VMs (use version tags for production)
-  image: "ghcr.io/talmolab/lablink-client-base-image:linux-amd64-4087237fac8335040c5a1ab44861fd61a5f1d9e9-test"
+  image: "ghcr.io/talmolab/lablink-client-base-image:linux-amd64-latest-test"
 
   # AMI ID for your AWS region (Ubuntu 24.04 with Docker + Nvidia GPU Driver)
-  ami_id: "ami-0601752c11b394251"  # us-west-2 Ubuntu 24.04 custom AMI
+  ami_id: "ami-0601752c11b394251" # us-west-2 Ubuntu 24.04 custom AMI
 
   # Git repository with your data/code (optional - leave empty if not needed)
   repository: "https://github.com/talmolab/sleap-tutorial-data.git"
@@ -35,38 +35,51 @@ allocator:
   # Docker image tag for the allocator service
   # For test: use "linux-amd64-latest-test"
   # For prod: use specific version like "linux-amd64-v1.2.3"
-  image_tag: "linux-amd64-4087237fac8335040c5a1ab44861fd61a5f1d9e9-test"
+  image_tag: "linux-amd64-latest-test"
 
 app:
   admin_user: "admin"
-  admin_password: "PLACEHOLDER_ADMIN_PASSWORD"  # Injected from GitHub secret at deploy time
-  region: "us-west-2"  # Your AWS region
+  admin_password: "PLACEHOLDER_ADMIN_PASSWORD" # Injected from GitHub secret at deploy time
+  region: "us-west-2" # Your AWS region
 
 dns:
-  enabled: true  # true = use DNS for allocator URL, false = IP-only access
-  terraform_managed: true  # false = manual DNS records (you create in Route53), true = Terraform creates/destroys records
-  domain: "lablink-template-testing.com"  # Base hosted zone domain
-  zone_id: "Z1038183268T83E91AYJF"  # (Optional) Hardcode zone ID to skip lookup - leave empty for auto-lookup
-  app_name: ""  # Not used with custom pattern
-  pattern: "custom"  # Using custom pattern
-  custom_subdomain: "ci-test"  # Creates: ci-test.lablink-template-testing.com
-  create_zone: false  # false = use existing zone, true = create new zone
+  enabled: true # true = use DNS for allocator URL, false = IP-only access
+  terraform_managed: true # false = manual DNS records (you create in Route53), true = Terraform creates/destroys records
+  domain: "lablink-template-testing.com" # Base hosted zone domain
+  zone_id: "Z1038183268T83E91AYJF" # (Optional) Hardcode zone ID to skip lookup - leave empty for auto-lookup
+  app_name: "" # Not used with custom pattern
+  pattern: "custom" # Using custom pattern
+  custom_subdomain: "test" # Creates: test.lablink-template-testing.com
+  create_zone: false # false = use existing zone, true = create new zone
 
 eip:
-  strategy: "dynamic"  # "persistent" = reuse EIP with tag {tag_name}-{env}, "dynamic" = create new EIP with tag {tag_name}-{env}
-  tag_name: "lablink-eip"  # Tag prefix for EIP name. Both strategies use {tag_name}-{env} format (e.g., lablink-eip-prod).
+  strategy: "dynamic" # "persistent" = reuse EIP with tag {tag_name}-{env}, "dynamic" = create new EIP with tag {tag_name}-{env}
+  tag_name: "lablink-eip" # Tag prefix for EIP name. Both strategies use {tag_name}-{env} format (e.g., lablink-eip-prod).
 
 ssl:
-  provider: "letsencrypt"  # "letsencrypt" = Caddy auto-SSL, "cloudflare" = CloudFlare proxy, "none" = HTTP only
-  email: "berri104@gmail.com"  # Email for Let's Encrypt notifications
-  staging: true  # true = staging/testing certs (unlimited), false = production Let's Encrypt certs (rate limited)
+  provider: "letsencrypt" # "letsencrypt" = Caddy auto-SSL, "cloudflare" = CloudFlare proxy, "none" = HTTP only
+  email: "berri104@gmail.com" # Email for Let's Encrypt notifications
+  staging: true # true = staging/testing certs (unlimited), false = production Let's Encrypt certs (rate limited)
 
 startup_script:
   enabled: true # true = run custom startup script on client VMs, false = no script
-  path: "config/custom-startup.sh"  # Path to custom startup script (optional)
-  on_error: "continue"  # "continue" = ignore errors, "fail" = stop VM setup on error
+  path: "config/custom-startup.sh" # Path to custom startup script (optional)
+  on_error: "continue" # "continue" = ignore errors, "fail" = stop VM setup on error
 
 # S3 bucket for Terraform state (must be unique globally)
 # Only used for test/prod environments with S3 backend (not used in dev)
 # Template testing uses separate bucket from production deployments
 bucket_name: "tf-state-lablink-template-testing"
+
+monitoring:
+  enabled: true # true = enable monitoring and alerts, false = disable
+  email: "your-email@example.com" # Email to receive budget alerts (must be a valid address for AWS SNS notifications - replace this placeholder before deploying)
+  thresholds:
+    max_instances_per_5min: 10 # Maximum number of instances launched in 5 minutes
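+    max_security_group_changes_per_hour: 10 # Maximum number of security group changes in 1 hour (NOTE(review): the example configs define max_terminations_per_5min here instead - confirm which key Terraform reads)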
+    max_unauthorized_calls_per_15min: 5 # Maximum number of unauthorized API calls in 15 minutes
+  budget:
+    enabled: true
+    monthly_budget_usd: 500
+  cloudtrail:
+    retention_days: 90
diff --git a/lablink-infrastructure/config/dev.example.yaml b/lablink-infrastructure/config/dev.example.yaml
index 7ae748b..bdcba0a 100644
--- a/lablink-infrastructure/config/dev.example.yaml
+++ b/lablink-infrastructure/config/dev.example.yaml
@@ -12,7 +12,7 @@
 db:
   dbname: "lablink_db"
   user: "lablink"
-  password: "PLACEHOLDER_DB_PASSWORD"  # Injected from GitHub secret at deploy time
+  password: "PLACEHOLDER_DB_PASSWORD" # Injected from GitHub secret at deploy time
   host: "localhost"
   port: 5432
   table_name: "vms"
@@ -20,13 +20,13 @@ db:
 
 machine:
   # Client VM instance type (smaller for dev to save costs)
-  machine_type: "t3.medium"  # Smaller, cheaper for testing
+  machine_type: "t3.medium" # Smaller, cheaper for testing
 
   # Docker image for client VMs
   image: "ghcr.io/talmolab/lablink-client-base-image:linux-amd64-latest-test"
 
   # AMI ID for your AWS region (Ubuntu 24.04 with Docker + Nvidia GPU Driver)
-  ami_id: "ami-0601752c11b394251"  # us-west-2 Ubuntu 24.04 custom AMI
+  ami_id: "ami-0601752c11b394251" # us-west-2 Ubuntu 24.04 custom AMI
 
   # Git repository (use test data for dev)
   repository: "https://github.com/talmolab/sleap-tutorial-data.git"
@@ -44,25 +44,25 @@ allocator:
 
 app:
   admin_user: "admin"
-  admin_password: "PLACEHOLDER_ADMIN_PASSWORD"  # Injected from GitHub secret at deploy time
+  admin_password: "PLACEHOLDER_ADMIN_PASSWORD" # Injected from GitHub secret at deploy time
   region: "us-west-2"
 
 dns:
   # Disable DNS for local dev (use IP only)
   enabled: false
   terraform_managed: false
-  domain: "example.com"  # Your base domain (not used when DNS disabled)
+  domain: "example.com" # Your base domain (not used when DNS disabled)
   zone_id: ""
-  app_name: ""  # Not used when DNS disabled
-  pattern: "custom"  # Using custom pattern (workaround for allocator bug - see issue #212)
-  custom_subdomain: "dev"  # Not used when DNS disabled
+  app_name: "" # Not used when DNS disabled
+  pattern: "custom" # Using custom pattern (workaround for allocator bug - see issue #212)
+  custom_subdomain: "dev" # Not used when DNS disabled
   create_zone: false
 
 eip:
   # Always use dynamic for dev (don't reuse EIPs)
   strategy: "dynamic"
   # Use unique tag name to avoid conflicts with other environments
-  tag_name: "lablink-eip-dev-YOURNAME"  # Replace YOURNAME with your username
+  tag_name: "lablink-eip-dev-YOURNAME" # Replace YOURNAME with your username
 
 ssl:
   # No SSL for local dev (HTTP only)
@@ -70,6 +70,24 @@ ssl:
   email: "dev@example.com"
   staging: true
 
+startup_script:
+  enabled: false
+  path: ""
+  on_error: "continue"
+
+monitoring:
+  enabled: false
+  email: ""
+  thresholds:
+    max_instances_per_5min: 10
+    max_terminations_per_5min: 20
+    max_unauthorized_calls_per_15min: 5
+  budget:
+    enabled: false
+    monthly_budget_usd: 500
+  cloudtrail:
+    retention_days: 90
+
 # S3 bucket NOT USED for dev environment (uses local state)
 # Dev uses backend-dev.hcl which stores state in ./terraform.tfstate
 # Leave this as-is - the value doesn't matter for dev
diff --git a/lablink-infrastructure/config/example.config.yaml b/lablink-infrastructure/config/example.config.yaml
index f368a83..5c9a045 100644
--- a/lablink-infrastructure/config/example.config.yaml
+++ b/lablink-infrastructure/config/example.config.yaml
@@ -5,7 +5,7 @@
 db:
   dbname: "lablink_db"
   user: "lablink"
-  password: "PLACEHOLDER_DB_PASSWORD"  # Injected from GitHub secret at deploy time
+  password: "PLACEHOLDER_DB_PASSWORD" # Injected from GitHub secret at deploy time
   host: "localhost"
   port: 5432
   table_name: "vms"
@@ -13,13 +13,13 @@ db:
 
 machine:
   # Client VM instance type (must support your workload)
-  machine_type: "g4dn.xlarge"  # GPU instance for ML workloads
+  machine_type: "g4dn.xlarge" # GPU instance for ML workloads
 
   # Docker image for client VMs (use version tags for production)
   image: "ghcr.io/talmolab/lablink-client-base-image:linux-amd64-latest-test"
 
   # AMI ID for your AWS region (Ubuntu 24.04 with Docker + Nvidia GPU Driver)
-  ami_id: "ami-0601752c11b394251"  # us-west-2 Ubuntu 24.04 custom AMI
+  ami_id: "ami-0601752c11b394251" # us-west-2 Ubuntu 24.04 custom AMI
 
   # Git repository with your data/code (optional - leave empty if not needed)
   repository: "https://github.com/YOUR_ORG/YOUR_REPO.git"
@@ -39,32 +39,45 @@ allocator:
 
 app:
   admin_user: "admin"
-  admin_password: "PLACEHOLDER_ADMIN_PASSWORD"  # Injected from GitHub secret at deploy time
-  region: "us-west-2"  # Your AWS region
+  admin_password: "PLACEHOLDER_ADMIN_PASSWORD" # Injected from GitHub secret at deploy time
+  region: "us-west-2" # Your AWS region
 
 dns:
-  enabled: false  # true = use DNS for allocator URL, false = IP-only access
-  terraform_managed: false  # false = manual DNS records (you create in Route53), true = Terraform creates/destroys records
-  domain: "lablink.example.com"  # Your Route53 hosted zone domain
-  zone_id: ""  # (Optional) Hardcode zone ID to skip lookup - leave empty for auto-lookup
-  app_name: "lablink"  # Used with "auto" pattern
-  pattern: "auto"  # "auto" = {env}.{app_name}.{domain}, "custom" = {custom_subdomain}.{domain}
-  custom_subdomain: ""  # Only used with "custom" pattern
-  create_zone: false  # false = use existing zone, true = create new zone
+  enabled: false # true = use DNS for allocator URL, false = IP-only access
+  terraform_managed: false # false = manual DNS records (you create in Route53), true = Terraform creates/destroys records
+  domain: "lablink.example.com" # Your Route53 hosted zone domain
+  zone_id: "" # (Optional) Hardcode zone ID to skip lookup - leave empty for auto-lookup
+  app_name: "lablink" # Used with "auto" pattern
+  pattern: "auto" # "auto" = {env}.{app_name}.{domain}, "custom" = {custom_subdomain}.{domain}
+  custom_subdomain: "" # Only used with "custom" pattern
+  create_zone: false # false = use existing zone, true = create new zone
 
 eip:
-  strategy: "dynamic"  # "persistent" = reuse EIP with tag {tag_name}-{env}, "dynamic" = create new EIP with tag {tag_name}-{env}
-  tag_name: "lablink-eip"  # Tag prefix for EIP name. Both strategies use {tag_name}-{env} format (e.g., lablink-eip-prod).
+  strategy: "dynamic" # "persistent" = reuse EIP with tag {tag_name}-{env}, "dynamic" = create new EIP with tag {tag_name}-{env}
+  tag_name: "lablink-eip" # Tag prefix for EIP name. Both strategies use {tag_name}-{env} format (e.g., lablink-eip-prod).
 
 ssl:
-  provider: "none"  # "letsencrypt" = Caddy auto-SSL, "cloudflare" = CloudFlare proxy, "none" = HTTP only
-  email: "admin@example.com"  # Email for Let's Encrypt notifications
-  staging: true  # true = staging/testing certs (unlimited), false = production Let's Encrypt certs (rate limited)
+  provider: "none" # "letsencrypt" = Caddy auto-SSL, "cloudflare" = CloudFlare proxy, "none" = HTTP only
+  email: "admin@example.com" # Email for Let's Encrypt notifications
+  staging: true # true = staging/testing certs (unlimited), false = production Let's Encrypt certs (rate limited)
 
 startup_script:
   enabled: true # true = run custom startup script on client VMs, false = no script
-  path: "config/custom-startup.sh"  # Path to custom startup script (optional)
-  on_error: "continue"  # "continue" = ignore errors, "fail" = stop VM setup on error
+  path: "config/custom-startup.sh" # Path to custom startup script (optional)
+  on_error: "continue" # "continue" = ignore errors, "fail" = stop VM setup on error
+
+monitoring:
+  enabled: false
+  email: ""
+  thresholds:
+    max_instances_per_5min: 10
+    max_terminations_per_5min: 20
+    max_unauthorized_calls_per_15min: 5
+  budget:
+    enabled: false
+    monthly_budget_usd: 500
+  cloudtrail:
+    retention_days: 90
 
 # S3 bucket for Terraform state (must be unique globally)
 # Only used for test/prod environments with S3 backend (not used in dev)
diff --git a/lablink-infrastructure/config/prod.example.yaml b/lablink-infrastructure/config/prod.example.yaml
index 9739a54..9a648fd 100644
--- a/lablink-infrastructure/config/prod.example.yaml
+++ b/lablink-infrastructure/config/prod.example.yaml
@@ -13,7 +13,7 @@
 db:
   dbname: "lablink_db"
   user: "lablink"
-  password: "PLACEHOLDER_DB_PASSWORD"  # Injected from GitHub secret at deploy time
+  password: "PLACEHOLDER_DB_PASSWORD" # Injected from GitHub secret at deploy time
   host: "localhost"
   port: 5432
   table_name: "vms"
@@ -21,13 +21,13 @@ db:
 
 machine:
   # Client VM instance type (production workloads)
-  machine_type: "g4dn.xlarge"  # GPU instance for ML workloads
+  machine_type: "g4dn.xlarge" # GPU instance for ML workloads
 
   # Docker image for client VMs (use PINNED version for prod)
-  image: "ghcr.io/talmolab/lablink-client-base-image:linux-amd64-v1.0.0"  # PINNED VERSION
+  image: "ghcr.io/talmolab/lablink-client-base-image:linux-amd64-v1.0.0" # PINNED VERSION
 
   # AMI ID for your AWS region (Ubuntu 24.04 with Docker + Nvidia GPU Driver)
-  ami_id: "ami-0601752c11b394251"  # us-west-2 Ubuntu 24.04 custom AMI
+  ami_id: "ami-0601752c11b394251" # us-west-2 Ubuntu 24.04 custom AMI
 
   # Git repository with your production data/code
   repository: "https://github.com/YOUR_ORG/YOUR_PROD_REPO.git"
@@ -41,35 +41,53 @@ machine:
 # Allocator service configuration
 allocator:
   # IMPORTANT: Use specific version tag for production (not latest!)
-  image_tag: "linux-amd64-v1.0.0"  # PINNED VERSION - update deliberately
+  image_tag: "linux-amd64-v1.0.0" # PINNED VERSION - update deliberately
 
 app:
   admin_user: "admin"
-  admin_password: "PLACEHOLDER_ADMIN_PASSWORD"  # Injected from GitHub secret at deploy time
+  admin_password: "PLACEHOLDER_ADMIN_PASSWORD" # Injected from GitHub secret at deploy time
   region: "us-west-2"
 
 dns:
   # Enable DNS for production
   enabled: true
-  terraform_managed: true  # Let Terraform manage DNS records
-  domain: "example.com"  # Your base domain (Route53 hosted zone)
-  zone_id: ""  # Will be set by setup-aws-infrastructure.sh
-  app_name: ""  # Not used with custom pattern
-  pattern: "custom"  # Using custom pattern (workaround for allocator bug - see issue #212)
-  custom_subdomain: ""  # Leave empty for apex domain: example.com
-  create_zone: false  # Use existing zone
+  terraform_managed: true # Let Terraform manage DNS records
+  domain: "example.com" # Your base domain (Route53 hosted zone)
+  zone_id: "" # Will be set by setup-aws-infrastructure.sh
+  app_name: "" # Not used with custom pattern
+  pattern: "custom" # Using custom pattern (workaround for allocator bug - see issue #212)
+  custom_subdomain: "" # Leave empty for apex domain: example.com
+  create_zone: false # Use existing zone
 
 eip:
   # Consider persistent EIP for production (keeps IP across redeploys)
-  strategy: "persistent"  # Reuses existing EIP with tag
-  tag_name: "lablink-eip"  # Will become lablink-eip-prod
+  strategy: "persistent" # Reuses existing EIP with tag
+  tag_name: "lablink-eip" # Will become lablink-eip-prod
 
 ssl:
   # Use Let's Encrypt production certs
   provider: "letsencrypt"
-  email: "admin@example.com"  # Your email for Let's Encrypt notifications
-  staging: false  # Production certs = trusted HTTPS (rate limited)
+  email: "admin@example.com" # Your email for Let's Encrypt notifications
+  staging: false # Production certs = trusted HTTPS (rate limited)
+
+startup_script:
+  enabled: false
+  path: ""
+  on_error: "continue"
+
+monitoring:
+  enabled: false
+  email: ""
+  thresholds:
+    max_instances_per_5min: 10
+    max_terminations_per_5min: 20
+    max_unauthorized_calls_per_15min: 5
+  budget:
+    enabled: false
+    monthly_budget_usd: 500
+  cloudtrail:
+    retention_days: 90
 
 # S3 bucket for Terraform state (must be unique across ALL of AWS)
 # Example: if your org is "acme", use "tf-state-lablink-acme-prod"
-bucket_name: "tf-state-lablink-YOURORG-prod"  # Replace YOURORG with something unique
+bucket_name: "tf-state-lablink-YOURORG-prod" # Replace YOURORG with something unique
diff --git a/lablink-infrastructure/config/test.example.yaml b/lablink-infrastructure/config/test.example.yaml
index b41b8f5..ec2108f 100644
--- a/lablink-infrastructure/config/test.example.yaml
+++ b/lablink-infrastructure/config/test.example.yaml
@@ -13,7 +13,7 @@
 db:
   dbname: "lablink_db"
   user: "lablink"
-  password: "PLACEHOLDER_DB_PASSWORD"  # Injected from GitHub secret at deploy time
+  password: "PLACEHOLDER_DB_PASSWORD" # Injected from GitHub secret at deploy time
   host: "localhost"
   port: 5432
   table_name: "vms"
@@ -21,13 +21,13 @@ db:
 
 machine:
   # Client VM instance type
-  machine_type: "g4dn.xlarge"  # GPU instance for ML workloads
+  machine_type: "g4dn.xlarge" # GPU instance for ML workloads
 
   # Docker image for client VMs (use test tags)
   image: "ghcr.io/talmolab/lablink-client-base-image:linux-amd64-latest-test"
 
   # AMI ID for your AWS region (Ubuntu 24.04 with Docker + Nvidia GPU Driver)
-  ami_id: "ami-0601752c11b394251"  # us-west-2 Ubuntu 24.04 custom AMI
+  ami_id: "ami-0601752c11b394251" # us-west-2 Ubuntu 24.04 custom AMI
 
   # Git repository with test/tutorial data
   repository: "https://github.com/talmolab/sleap-tutorial-data.git"
@@ -45,31 +45,49 @@ allocator:
 
 app:
   admin_user: "admin"
-  admin_password: "PLACEHOLDER_ADMIN_PASSWORD"  # Injected from GitHub secret at deploy time
+  admin_password: "PLACEHOLDER_ADMIN_PASSWORD" # Injected from GitHub secret at deploy time
   region: "us-west-2"
 
 dns:
   # Enable DNS for test environment
   enabled: true
-  terraform_managed: true  # Let Terraform manage DNS records
-  domain: "example.com"  # Your base domain (Route53 hosted zone)
-  zone_id: ""  # Will be set by setup-aws-infrastructure.sh
-  app_name: ""  # Not used with custom pattern
-  pattern: "custom"  # Using custom pattern (workaround for allocator bug - see issue #212)
-  custom_subdomain: "test"  # Creates: test.example.com
-  create_zone: false  # Use existing zone
+  terraform_managed: true # Let Terraform manage DNS records
+  domain: "example.com" # Your base domain (Route53 hosted zone)
+  zone_id: "" # Will be set by setup-aws-infrastructure.sh
+  app_name: "" # Not used with custom pattern
+  pattern: "custom" # Using custom pattern (workaround for allocator bug - see issue #212)
+  custom_subdomain: "test" # Creates: test.example.com
+  create_zone: false # Use existing zone
 
 eip:
   # Use persistent EIP for test (keeps same IP across redeploys)
-  strategy: "persistent"  # Reuses existing EIP with matching tag
-  tag_name: "lablink-eip"  # Will become lablink-eip-test
+  strategy: "persistent" # Reuses existing EIP with matching tag
+  tag_name: "lablink-eip" # Will become lablink-eip-test
 
 ssl:
   # Use Let's Encrypt staging certs for test (unlimited, not trusted)
   provider: "letsencrypt"
-  email: "admin@example.com"  # Your email for Let's Encrypt notifications
-  staging: true  # Staging certs = HTTP only, no rate limits
+  email: "admin@example.com" # Your email for Let's Encrypt notifications
+  staging: true # Staging certs = HTTP only, no rate limits
+
+startup_script:
+  enabled: false
+  path: ""
+  on_error: "continue"
+
+monitoring:
+  enabled: false
+  email: ""
+  thresholds:
+    max_instances_per_5min: 10
+    max_terminations_per_5min: 20
+    max_unauthorized_calls_per_15min: 5
+  budget:
+    enabled: false
+    monthly_budget_usd: 500
+  cloudtrail:
+    retention_days: 90
 
 # S3 bucket for Terraform state (must be unique across ALL of AWS)
 # Example: if your org is "acme", use "tf-state-lablink-acme"
-bucket_name: "tf-state-lablink-YOURORG"  # Replace YOURORG with something unique
+bucket_name: "tf-state-lablink-YOURORG" # Replace YOURORG with something unique
diff --git a/lablink-infrastructure/main.tf b/lablink-infrastructure/main.tf
index ed8fd0e..9289675 100644
--- a/lablink-infrastructure/main.tf
+++ b/lablink-infrastructure/main.tf
@@ -169,11 +169,11 @@ data "aws_iam_policy_document" "ec2_vm_management_doc" {
       "iam:AttachRolePolicy",
       "iam:DetachRolePolicy",
     ]
-    resources = [  
-      "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/lablink_cloud_watch_agent_role_*"  
+    resources = [
+      "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/lablink_cloud_watch_agent_role_*"
     ]
     condition {
-      test     = "ArnEquals"  
+      test     = "ArnEquals"
       variable = "iam:PolicyArn"
       values   = ["arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"]
     }
@@ -402,7 +402,7 @@ resource "aws_iam_role" "instance_role" {
 }
 
 resource "aws_iam_role_policy_attachment" "attach_ec2_management" {
-  role      = aws_iam_role.instance_role.name
+  role       = aws_iam_role.instance_role.name
   policy_arn = aws_iam_policy.ec2_vm_management_policy.arn
 }