diff --git a/lablink-infrastructure/README.md b/lablink-infrastructure/README.md index 13573b4..c19a6bd 100644 --- a/lablink-infrastructure/README.md +++ b/lablink-infrastructure/README.md @@ -14,8 +14,8 @@ If you plan to deploy via GitHub Actions workflows, you must configure one repos 2. **Click "New repository secret"** 3. **Add the following secret:** - | Name | Value | Description | - |------|-------|-------------| + | Name | Value | Description | + | -------------- | -------------------------------------------------- | ------------------------------------ | | `AWS_ROLE_ARN` | `arn:aws:iam::YOUR-ACCOUNT-ID:role/YOUR-ROLE-NAME` | IAM role ARN for OIDC authentication | **How to create the AWS IAM role for OIDC:** @@ -65,6 +65,7 @@ aws iam get-role \ **Note:** If deploying locally with Terraform (not via GitHub Actions), you don't need this secret. Just configure AWS CLI credentials instead. ### Prerequisites + - AWS account with credentials configured - Terraform installed (v1.6.6+) for local deployments - Docker images available on GHCR (or use public LabLink images) @@ -78,6 +79,7 @@ cp config/example.config.yaml config/config.yaml ``` **Edit `config/config.yaml`:** + - **REQUIRED**: Change `db.password` and `app.admin_password` (security!) - **REQUIRED**: Set `bucket_name` to a globally unique S3 bucket name (for test/prod) - Customize `machine.ami_id` for your AWS region @@ -86,31 +88,32 @@ cp config/example.config.yaml config/config.yaml - Set `app.region` to your AWS region **Example configuration:** + ```yaml db: - password: "YOUR-SECURE-DB-PASSWORD" # CHANGE THIS! + password: "YOUR-SECURE-DB-PASSWORD" # CHANGE THIS! 
machine: machine_type: "g4dn.xlarge" image: "ghcr.io/talmolab/lablink-client-base-image:latest" - ami_id: "ami-0601752c11b394251" # Ubuntu 24.04 with Docker + Nvidia (us-west-2) + ami_id: "ami-0601752c11b394251" # Ubuntu 24.04 with Docker + Nvidia (us-west-2) repository: "https://github.com/talmolab/sleap-tutorial-data.git" software: "sleap" app: - admin_password: "YOUR-SECURE-ADMIN-PASSWORD" # CHANGE THIS! + admin_password: "YOUR-SECURE-ADMIN-PASSWORD" # CHANGE THIS! region: "us-west-2" -bucket_name: "tf-state-lablink-yourname" # Must be globally unique +bucket_name: "tf-state-lablink-yourname" # Must be globally unique dns: enabled: true terraform_managed: true domain: "lablink.yourdomain.com" - zone_id: "Z..." # Your Route 53 hosted zone ID + zone_id: "Z..." # Your Route 53 hosted zone ID ssl: - provider: "letsencrypt" # or "cloudflare" or "none" + provider: "letsencrypt" # or "cloudflare" or "none" email: "admin@yourdomain.com" ``` @@ -131,6 +134,8 @@ terraform plan terraform apply ``` +> **Note**: After running `terraform apply`, check your email and confirm your subscription to receive notifications. + **Option B: Manual Terraform commands** ```bash @@ -145,6 +150,8 @@ terraform plan terraform apply ``` +> **Note**: After running `terraform apply`, check your email and confirm your subscription to receive notifications. + ### 3. Verify Deployment (Optional) After deployment completes, you can verify everything is working: @@ -159,6 +166,7 @@ IP=$(terraform output -raw ec2_public_ip) ``` The verification script checks: + - DNS resolution (if domain configured) - HTTP connectivity - HTTPS/SSL certificate (if Let's Encrypt enabled) @@ -166,12 +174,14 @@ The verification script checks: ### 4. 
Access Your Allocator **With DNS configured:** + ``` Allocator: https://lablink.yourdomain.com Admin UI: https://lablink.yourdomain.com/admin ``` **Without DNS (IP-only):** + ``` Allocator: http://<EC2_PUBLIC_IP>:5000 Admin UI: http://<EC2_PUBLIC_IP>:5000/admin @@ -184,7 +194,7 @@ Admin UI: http://<EC2_PUBLIC_IP>:5000/admin - **Lambda Function**: Processes CloudWatch logs from client VMs - **Route 53 DNS**: Automatic DNS record management (if configured) - **Security Groups**: Network security rules -- **IAM Roles**: +- **IAM Roles**: - **Allocator Instance Role**: A role for the allocator EC2 instance with permissions to manage the lifecycle of client VMs (run, terminate, tag, etc.) and their associated resources (IAM roles, instance profiles). - **CloudWatch Agent Role**: A role for client VMs to send logs to CloudWatch. - **CloudWatch Log Groups**: Centralized logging for troubleshooting @@ -193,22 +203,24 @@ Admin UI: http://<EC2_PUBLIC_IP>:5000/admin LabLink supports three deployment environments: -| Environment | Backend State | Use Case | S3 Bucket Required? | -|-------------|---------------|----------|---------------------| -| `dev` | Local file | Local testing, experimentation | No | -| `test` | S3 | Staging, pre-production testing | Yes | -| `prod` | S3 | Production deployments | Yes | +| Environment | Backend State | Use Case | S3 Bucket Required? | +| ----------- | ------------- | ------------------------------- | ------------------- | +| `dev` | Local file | Local testing, experimentation | No | +| `test` | S3 | Staging, pre-production testing | Yes | +| `prod` | S3 | Production deployments | Yes | Each environment maintains separate Terraform state to avoid conflicts. 
## Configuration Reference ### Database (`db`) + - `password`: PostgreSQL password (**CHANGE THIS!**) - `dbname`: Database name (default: `lablink_db`) - `user`: Database username (default: `lablink`) ### Machine Settings (`machine`) + - `machine_type`: AWS EC2 instance type for client VMs (e.g., `g4dn.xlarge`, `g5.2xlarge`) - `image`: Docker image for client container (e.g., `ghcr.io/talmolab/lablink-client-base-image:latest`) - `ami_id`: Amazon Machine Image for client VMs (region-specific) @@ -216,22 +228,40 @@ Each environment maintains separate Terraform state to avoid conflicts. - `software`: Software identifier (e.g., `sleap`) ### Application (`app`) + - `admin_password`: Admin UI password (**CHANGE THIS!**) - `admin_user`: Admin username (default: `admin`) - `region`: AWS region (e.g., `us-west-2`) ### DNS Configuration (`dns`) + - `enabled`: Enable DNS management (true/false) - `terraform_managed`: Let Terraform manage Route 53 records (true/false) - `domain`: Your domain name (e.g., `lablink.example.com`) - `zone_id`: Route 53 hosted zone ID (required if `terraform_managed: true`) ### SSL Configuration (`ssl`) + - `provider`: SSL provider (`letsencrypt`, `cloudflare`, or `none`) - `email`: Email for Let's Encrypt notifications - `staging`: When `true`, serve HTTP only for unlimited testing. When `false`, serve HTTPS with trusted Let's Encrypt certificates (rate limited to 5 duplicate certificates per week) +### Monitoring (`monitoring`) + +- `enabled`: `true` to enable monitoring and alerts, `false` to disable. +- `email`: Email address to send alerts to. +- `thresholds`: + - `max_instances_per_5min`: Max instances created in 5 minutes. + - `max_terminations_per_5min`: Max instances terminated in 5 minutes. + - `max_unauthorized_calls_per_15min`: Max unauthorized API calls in 15 minutes. +- `budget`: + - `enabled`: `true` to enable budget monitoring, `false` to disable. + - `monthly_budget_usd`: Monthly budget in USD. 
+- `cloudtrail`: + - `retention_days`: Number of days to retain CloudTrail logs. + **Staging Mode:** + - Use `staging: true` for rapid infrastructure testing without SSL complications - Caddy serves HTTP only (no certificates, no redirects) - Client VMs connect via HTTP @@ -239,21 +269,25 @@ Each environment maintains separate Terraform state to avoid conflicts. - **Not for production use** - no encryption **Production Mode:** + - Use `staging: false` for production deployments - Caddy obtains trusted Let's Encrypt certificates - Serves HTTPS with automatic HTTP→HTTPS redirects - Subject to Let's Encrypt rate limits ### Terraform State (`bucket_name`) + - S3 bucket name for Terraform state storage (test/prod only) - Must be globally unique ### Startup Script (`startup_script`) + - `enabled`: `true` to run the custom startup script on client VMs, `false` to disable. - `path`: Path to the custom startup script file. Default: `config/custom-startup.sh`. - `on_error`: Behavior on script error. `continue` (default) ignores errors, `fail` stops VM setup. **Additional Resources:** + - [Configuration Guide](../docs/configuration.md#ssltls-options-ssl) - Detailed SSL configuration reference - [Troubleshooting](../docs/troubleshooting.md#browser-cannot-access-http-staging-mode) - Browser HSTS cache issues - [Security](../docs/security.md#staging-mode-security) - Security implications of staging mode @@ -261,27 +295,33 @@ Each environment maintains separate Terraform state to avoid conflicts. ## Included Scripts ### `init-terraform.sh` (Optional Helper) + Simplifies Terraform initialization by automatically reading the S3 bucket name from your config file. 
**Usage:** + ```bash ../scripts/init-terraform.sh [dev|test|prod|ci-test] ``` **What it does:** + - Reads `bucket_name` from `config/config.yaml` - Runs `terraform init` with appropriate backend configuration - Validates configuration before initializing **Equivalent manual command:** + ```bash terraform init -backend-config=backend-test.hcl -backend-config="bucket=YOUR-BUCKET" ``` ### `verify-deployment.sh` (Optional Manual Verification) + Comprehensive deployment verification script for post-deployment testing. **Usage:** + ```bash ./verify-deployment.sh [domain] [ip] @@ -291,11 +331,13 @@ Comprehensive deployment verification script for post-deployment testing. ``` **What it checks:** + 1. DNS resolution (waits up to 5 minutes for propagation) 2. HTTP connectivity (waits for allocator to start) 3. HTTPS/SSL certificate (waits for Let's Encrypt, if enabled) **When to use:** + - After first deployment to verify everything works - When troubleshooting DNS or SSL issues - To confirm HTTPS certificate was obtained @@ -303,19 +345,23 @@ Comprehensive deployment verification script for post-deployment testing. **Note:** GitHub Actions workflows include automatic verification, so this script is mainly for local deployments or manual troubleshooting. ### `config/custom-startup.sh` (Customizable Client Startup) + The `config/custom-startup.sh` script is a customizable script that is executed upon the startup of a client VM. This script provides a way to automate the setup and configuration of the client environment. **Customization:** You can add custom startup behavior by modifying the `config/custom-startup.sh` script. For example, you could: + - Install additional software packages. - Start additional services. Any changes made to this script will be reflected in the client VMs upon their next startup. ### `user_data.sh` (Automatic - DO NOT RUN MANUALLY) + EC2 instance initialization script embedded in Terraform configuration. 
**What it does:** + - Installs Docker and Caddy on the allocator EC2 instance - Pulls the allocator Docker image - Starts the allocator container @@ -340,23 +386,27 @@ AMI IDs are region-specific. If deploying to a different region: 3. Update `machine.ami_id` in `config/config.yaml` **Pre-configured custom AMIs (us-west-2):** + - Client VM: `ami-0601752c11b394251` (Ubuntu 24.04 + Docker + Nvidia GPU drivers) - Allocator VM: `ami-0bd08c9d4aa9f0bc6` (Ubuntu 24.04 + Docker) ## Using Custom Docker Images ### Option 1: Use LabLink Public Images + ```yaml machine: image: "ghcr.io/talmolab/lablink-client-base-image:latest" ``` **Available tags:** + - `latest` - Latest stable release - `linux-amd64-test` - Latest development build - `0.0.8a0` - Specific version tag ### Option 2: Build Your Own Images + 1. Fork the [LabLink repository](https://github.com/talmolab/lablink) 2. Customize the client package in `packages/client/` 3. Build and publish your images via GitHub Actions @@ -387,21 +437,25 @@ See the workflows in the `.github` directory for automated deployment examples. 
### Common Issues **DNS not resolving:** + - Check Route 53 hosted zone exists and `zone_id` is correct - Wait up to 5 minutes for DNS propagation - Verify domain registrar nameservers point to Route 53 **SSL certificate not obtained:** + - Check DNS resolves correctly first (SSL requires valid DNS) - Verify port 80 and 443 are accessible (Let's Encrypt validation) - Check Caddy logs: `ssh ubuntu@ sudo journalctl -u caddy -f` **Allocator not responding:** + - Check Docker container is running: `ssh ubuntu@ sudo docker ps` - View container logs: `ssh ubuntu@ sudo docker logs $(sudo docker ps -q)` - Verify security group allows inbound traffic on port 5000 **Terraform state locked:** + - Check DynamoDB lock table in AWS console - Manually remove lock if workflow was interrupted - Use `terraform force-unlock ` as last resort @@ -423,6 +477,7 @@ terraform destroy ``` This removes: + - Allocator EC2 instance - Lambda function - Security groups @@ -445,6 +500,7 @@ If `terraform destroy` fails or leaves orphaned resources, use the automated cle ``` The script automatically handles: + - Reading configuration from `config/config.yaml` - Backing up Terraform state files before deletion - Deleting resources in correct dependency order diff --git a/lablink-infrastructure/budget.tf b/lablink-infrastructure/budget.tf new file mode 100644 index 0000000..acfb04f --- /dev/null +++ b/lablink-infrastructure/budget.tf @@ -0,0 +1,59 @@ +resource "aws_budgets_budget" "lablink_monthly" { + count = try(local.config_file.monitoring.budget.enabled, false) ? 
1 : 0 + + name = "lablink-monthly-budget-${var.resource_suffix}" + budget_type = "COST" + limit_amount = try(local.config_file.monitoring.budget.monthly_budget_usd, "100") + limit_unit = "USD" + time_period_start = formatdate("YYYY-MM-01_00:00", timestamp()) + time_unit = "MONTHLY" + + cost_filter { + name = "TagKeyValue" + values = [ + format("user:Environment$%s", var.resource_suffix), # format() instead of "$${", which HCL treats as an escaped literal "${" (no interpolation) + "user:ManagedBy$lablink-allocator-${var.resource_suffix}" + ] + } + + # 50% warning + notification { + comparison_operator = "GREATER_THAN" + threshold = 50 + threshold_type = "PERCENTAGE" + notification_type = "ACTUAL" + subscriber_email_addresses = [local.config_file.monitoring.email] + } + + # 80% urgent + notification { + comparison_operator = "GREATER_THAN" + threshold = 80 + threshold_type = "PERCENTAGE" + notification_type = "ACTUAL" + subscriber_email_addresses = [local.config_file.monitoring.email] + } + + # 100% critical + notification { + comparison_operator = "GREATER_THAN" + threshold = 100 + threshold_type = "PERCENTAGE" + notification_type = "ACTUAL" + subscriber_email_addresses = [local.config_file.monitoring.email] + } + + # 150% severe overage + notification { + comparison_operator = "GREATER_THAN" + threshold = 150 + threshold_type = "PERCENTAGE" + notification_type = "ACTUAL" + subscriber_email_addresses = [local.config_file.monitoring.email] + } + + tags = { + Name = "lablink-monthly-budget-${var.resource_suffix}" + Environment = var.resource_suffix + } +} \ No newline at end of file diff --git a/lablink-infrastructure/cloudtrail.tf b/lablink-infrastructure/cloudtrail.tf new file mode 100644 index 0000000..d920b84 --- /dev/null +++ b/lablink-infrastructure/cloudtrail.tf @@ -0,0 +1,127 @@ +# S3 bucket for CloudTrail logs +resource "aws_s3_bucket" "cloudtrail_logs" { + bucket = "lablink-cloudtrail-${var.resource_suffix}-${data.aws_caller_identity.current.account_id}" + force_destroy = true + + tags = { + Name = "lablink-cloudtrail-${var.resource_suffix}" + Environment = 
var.resource_suffix + } +} + +# S3 bucket encryption +resource "aws_s3_bucket_server_side_encryption_configuration" "cloudtrail_encryption" { + bucket = aws_s3_bucket.cloudtrail_logs.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "AES256" + } + } +} + +# S3 bucket policy for CloudTrail +resource "aws_s3_bucket_policy" "cloudtrail_policy" { + bucket = aws_s3_bucket.cloudtrail_logs.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "AWSCloudTrailAclCheck" + Effect = "Allow" + Principal = { + Service = "cloudtrail.amazonaws.com" + } + Action = "s3:GetBucketAcl" + Resource = aws_s3_bucket.cloudtrail_logs.arn + }, + { + Sid = "AWSCloudTrailWrite" + Effect = "Allow" + Principal = { + Service = "cloudtrail.amazonaws.com" + } + Action = "s3:PutObject" + Resource = "${aws_s3_bucket.cloudtrail_logs.arn}/*" + Condition = { + StringEquals = { + "s3:x-amz-acl" = "bucket-owner-full-control" + } + } + } + ] + }) +} + +# CloudWatch Log Group for CloudTrail +resource "aws_cloudwatch_log_group" "cloudtrail_logs" { + name = "lablink-cloudtrail-${var.resource_suffix}" + retention_in_days = try(local.config_file.monitoring.cloudtrail.retention_days, 90) +} + +# IAM role for CloudTrail to write to CloudWatch +resource "aws_iam_role" "cloudtrail_cloudwatch_role" { + name = "lablink_cloudtrail_cloudwatch_${var.resource_suffix}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "cloudtrail.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) +} + +# IAM policy for CloudTrail CloudWatch access +resource "aws_iam_role_policy" "cloudtrail_cloudwatch_policy" { + name = "lablink_cloudtrail_cloudwatch_policy_${var.resource_suffix}" + role = aws_iam_role.cloudtrail_cloudwatch_role.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "logs:CreateLogStream", + "logs:PutLogEvents" + ] + Resource = 
"${aws_cloudwatch_log_group.cloudtrail_logs.arn}:*" + } + ] + }) +} + +# CloudTrail +resource "aws_cloudtrail" "lablink_trail" { + name = "lablink-trail-${var.resource_suffix}" + s3_bucket_name = aws_s3_bucket.cloudtrail_logs.id + include_global_service_events = true + is_multi_region_trail = true + enable_log_file_validation = true + + cloud_watch_logs_group_arn = "${aws_cloudwatch_log_group.cloudtrail_logs.arn}:*" + cloud_watch_logs_role_arn = aws_iam_role.cloudtrail_cloudwatch_role.arn + + event_selector { + read_write_type = "All" + include_management_events = true + } + + depends_on = [ + aws_s3_bucket_policy.cloudtrail_policy, + aws_cloudwatch_log_group.cloudtrail_logs, + aws_iam_role_policy.cloudtrail_cloudwatch_policy + ] + + tags = { + Name = "lablink-trail-${var.resource_suffix}" + Environment = var.resource_suffix + } +} \ No newline at end of file diff --git a/lablink-infrastructure/cloudwatch_alarms.tf b/lablink-infrastructure/cloudwatch_alarms.tf new file mode 100644 index 0000000..8e57a07 --- /dev/null +++ b/lablink-infrastructure/cloudwatch_alarms.tf @@ -0,0 +1,170 @@ +# SNS Topic for Admin Alerts +resource "aws_sns_topic" "admin_alerts" { + name = "lablink-admin-alerts-${var.resource_suffix}" + + tags = { + Name = "lablink-admin-alerts-${var.resource_suffix}" + Environment = var.resource_suffix + } +} + +# SNS Email Subscription +resource "aws_sns_topic_subscription" "admin_email" { + count = try(local.config_file.monitoring.enabled, false) ? 1 : 0 + topic_arn = aws_sns_topic.admin_alerts.arn + protocol = "email" + endpoint = try(local.config_file.monitoring.email, "") +} + +# Metric Filter: Mass Instance Launches +resource "aws_cloudwatch_log_metric_filter" "run_instances" { + name = "lablink-run-instances-${var.resource_suffix}" + log_group_name = aws_cloudwatch_log_group.cloudtrail_logs.name + + pattern = <