Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
ccb55c5
fix cidr collision
aliziel May 29, 2025
836d9d5
build: add helmfile, certificate issuer manifests, CI deploy
aliziel Jun 1, 2025
7ce0525
build: add eoAPI support chart
aliziel Jun 1, 2025
e771def
build: add backend parameters, local vars to avoid accidental state c…
aliziel Jun 1, 2025
0640437
docs: overviews on initial implementation
aliziel Jun 1, 2025
666a9f4
chore: add PR template
aliziel Jun 1, 2025
9539b4a
fix: rm echo in tofu output cmd
aliziel Jun 1, 2025
0e1303e
docs: misc formatting for readability
aliziel Jun 1, 2025
d24df79
build: update bucket name
aliziel Jun 4, 2025
f67c7b1
fix: remove conditional exec based on modified files
aliziel Jun 5, 2025
d53d075
build: add admin arns for cluster access, use variadic in second arg
aliziel Jun 26, 2025
974d271
fix: use alternative email for certs https://github.com/hotosm/k8s-in…
aliziel Jul 8, 2025
a21775a
feat: use STAC API container from OpenAerialMap (#19)
ceholden Jul 11, 2025
abd4803
build: rm access entries, add host to helm config, set certificate an…
aliziel Jul 1, 2025
d1ba341
build: hold on support chart
aliziel Jul 11, 2025
4f8e5ad
build: reset hostname deploy
aliziel Jul 11, 2025
01ac20b
fix: override STAC API command and disable vector (TiPG) (#20)
ceholden Jul 11, 2025
68970a4
build: rm label-pr workaround (fixed), reset hostname deploy
aliziel Jul 15, 2025
99158c1
build: retry with Route 53 fixes + new subdomain
aliziel Jul 17, 2025
3ddcda1
build: switch to prod ClusterIssuer, cleanup
aliziel Jul 22, 2025
2215638
build: revert support chart condition
aliziel Jul 22, 2025
5afd118
build: add cron manifests
aliziel Jul 22, 2025
808dbaf
build: start on staging certs for support services
aliziel Jul 23, 2025
6b21130
build: switch to prod issuer
aliziel Jul 23, 2025
dfcbdb4
build: ingress gate on prometheus
aliziel Jul 23, 2025
c38e0e2
build: pause crons
aliziel Jul 23, 2025
ce52162
build: replan with addtl admins
aliziel Jul 23, 2025
27e5bdf
build: reapply addtl admins
aliziel Jul 23, 2025
dc900ae
build: diff revrt stac image
aliziel Jul 30, 2025
526e6c5
build: revrt stac image
aliziel Jul 30, 2025
5377c20
build: reset stac image
aliziel Jul 30, 2025
388c527
build: swap stac image
aliziel Jul 31, 2025
ee6bb2e
build: reset stac image
aliziel Jul 31, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
## What type of PR is this? (check all applicable)

- [ ] πŸ• Feature
- [ ] πŸ› Bug Fix
- [ ] πŸ“ Documentation
- [ ] πŸ§‘β€πŸ’» Refactor
- [ ] βœ… Test
- [ ] πŸ€– Build or CI
- [ ] ❓ Other (please specify)

## Related Issue

Example: Fixes #123

## Describe this PR

A brief description of how this solves the issue.

## Screenshots

Please provide screenshots of the change.

## Alternative Approaches Considered

Did you attempt any other approaches that are not documented in code?

## Review Guide

Notes for the reviewer. How to test this change?

## Checklist before requesting a review

- 📖 Read the HOT Contributing Guide: <https://docs.hotosm.org/become-a-contributor/>
- 📖 Read the HOT Code of Conduct: <https://docs.hotosm.org/code-of-conduct>
- 👷‍♀️ Create small PRs. In most cases, this will be possible.
- ✅ Provide tests for your changes.
- 📝 Use descriptive commit messages.
- 📗 Update any related documentation and include any relevant screenshots.
- 🔠 Does this PR introduce or change any environment variables? If so, make sure to specify this change in the description.

## [optional] What gif best describes this PR or how it makes you feel?
24 changes: 21 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Deploy Terraform
name: Deploy Changes
on:
push:
branches:
Expand Down Expand Up @@ -35,12 +35,30 @@ jobs:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE }}
- name: Provision TF
uses: op5dev/tf-via-pr@v13
env:
TF_VAR_cluster_ci_access_role_arn: ${{ secrets.AWS_OIDC_ROLE }}
TF_VAR_cluster_admin_access_role_arns: ${{ secrets.CLUSTER_ADMIN_ACCESS_ROLE_ARNS }}
with:
# command: 'apply'
command: ${{ github.event_name == 'push' && 'apply' || 'plan' }}
tool: tofu
working-directory: terraform
validate: true
format: true
arg-var-file: ${{ env.VAR_FILE }}
arg-var: cluster_ci_access_role_arn=${{ secrets.AWS_OIDC_ROLE }}
label-pr: false
- name: Get TF Outputs
run: |
echo "S3_BACKUP_ROLE=$(tofu -chdir=terraform output -var-file=vars/production.tfvars s3_backup_role)" >> $GITHUB_ENV
echo "CLUSTER_NAME=$(tofu -chdir=terraform output -var-file=vars/production.tfvars cluster_name)" >> $GITHUB_ENV
- name: Pull kubeconfig
run: |
aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }}
- name: Apply manifests
run: |
kubectl apply -f kubernetes/manifests/ ${{ github.event_name == 'pull_request' && '--dry-run' || '' }}
- name: Deploy eoAPI Chart
uses: helmfile/helmfile-action@v2.0.4
with:
helmfile-args: 'apply'
# helmfile-args: ${{ github.event_name == 'push' && 'apply' || 'diff' }}
helmfile-workdirectory: kubernetes/helm
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
AWS_PROFILE ?= default
CLUSTER_NAME = $(shell tofu -chdir=terraform output cluster_name)
S3_BACKUP_ROLE = $(shell tofu -chdir=terraform output s3_backup_role)
CLUSTER_NAME = $(shell tofu -chdir=terraform output -var-file=vars/local.tfvars cluster_name)
S3_BACKUP_ROLE = $(shell tofu -chdir=terraform output -var-file=vars/local.tfvars s3_backup_role)

PGO_CHART_VERSION = 5.7.4
EOAPI_CHART_VERSION = 0.7.1
Expand Down Expand Up @@ -34,4 +34,4 @@ init-eoapi:
## deploy-eoapi: Upgrade or install eoAPI release
deploy-eoapi:
helm repo list | grep "eoapi" >/dev/null 2>&1 || { echo "Not initialized, run 'make init-eoapi' before retrying"; exit 1; }
helm upgrade --install --namespace eoapi --create-namespace eoapi eoapi/eoapi --version $(EOAPI_CHART_VERSION) -f kubernetes/helm/eoapi.yaml --set previousVersion=$(EOAPI_CHART_VERSION) --set postgrescluster.metadata.annotations.eks.amazonaws.com/role-arn=$(S3_BACKUP_ROLE)
helm upgrade --install --namespace eoapi --create-namespace eoapi eoapi/eoapi --version $(EOAPI_CHART_VERSION) -f kubernetes/helm/eoapi-values.yaml --set previousVersion=$(EOAPI_CHART_VERSION) --set postgrescluster.metadata.annotations."eks\.amazonaws\.com/role-arn"=$(S3_BACKUP_ROLE)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is useful to have for manual deploy 👍

I have been having a lot of success with a GitOps approach in my homelab - using ArgoCD to pull deployments from a public repo (rather than having to deploy everything manually, or via a CI/CD push). I think I can configure the same after this PR is merged @dakotabenjamin

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And just as a heads up it might be even safer to do helmfile apply for manual as well. That'll include all of the params above, plus make sure the PostgresOperator CRDs are installed and Ready.

31 changes: 26 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,30 @@ See the [initial proposal](docs/proposal.md) for more background.

#### Required Tools

[AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
[OpenTofu](https://opentofu.org/docs/intro/install/)
[kubectl](https://kubernetes.io/docs/tasks/tools/)
[Helm](https://helm.sh/docs/intro/install/)
- [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
- [OpenTofu](https://opentofu.org/docs/intro/install/)
- [kubectl](https://kubernetes.io/docs/tasks/tools/)
- [Helm](https://helm.sh/docs/intro/install/)

// TODO 🚧

### Areas for Further (Initial) Development

#### Variable Management

- Duplication exists between TF inputs, CI workflows, and local scripts.
- A tool like https://github.com/helmfile/helmfile may help with sourcing variables by environment.
- A basic version has been added to deploy revision deltas, further templating would be required.
- As more HOT applications + services are moved to cluster, this will only grow.

#### Deployment

- Provisioning is currently done in the same workflow (TF, K8s, Helm), mostly as a byproduct of the initial development phase. This can be further refined.
- GitOps tools like ArgoCD are [under consideration](https://github.com/hotosm/k8s-infra/issues/14)
- Flux [Tofu controller](https://github.com/flux-iac/tofu-controller) may be an analog for base infrastructure (further investigation required).

#### Bridging TF and Kubernetes

- TF-managed information often needs to be referenced on the cluster
- ex: PostgresCluster CRD requires the role ARN authorized for backups. Role and bucket are created in TF.
- Global cluster resources are provisioned through TF, but an argument can be made for their management by K8s.
- Ideal solution enables cluster resources to reference, mount, inject, etc. TF-managed information with minimal developer intervention.
151 changes: 151 additions & 0 deletions kubernetes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# Cluster Applications

See [initial migration outline](../docs/proposal.md) for main HOT OSM applications.

Relevant Docs:
- [kubectl]
- [Helm]

## Global

### ClusterIssuer

Issue TLS certificates for the cluster via [cert-manager]. See also [eoAPI TLS section](#transport-layer-security-tls).

Install:
```sh
# ** See helm/eoapi-values.yaml for initial setup **
$ kubectl apply -f kubernetes/manifests/cluster-issuer.yaml
```

## eoAPI

Open source Earth Observation (EO) backend supporting Open Aerial Map (OAM).

Site: https://eoapi.dev/
Chart: https://github.com/developmentseed/eoapi-k8s

Install:
```sh
$ helm upgrade --install --set disable_check_for_upgrades=true pgo oci://registry.developers.crunchydata.com/crunchydata/pgo --version $PGO_VERSION
$ helm repo add eoapi https://devseed.com/eoapi-k8s/
$ helm upgrade --install --namespace eoapi --create-namespace eoapi eoapi/eoapi \
--version $EOAPI_CHART_VERSION \
-f kubernetes/helm/eoapi-values.yaml \
--set previousVersion=$EOAPI_CHART_VERSION \
--set postgrescluster.metadata.annotations."eks\.amazonaws\.com/role-arn"=$S3_BACKUP_ROLE
```

#### helmfile

A basic [helmfile] has been added for GitHub Actions, but it's recommended to use it outside of CI workflows as well to maintain consistency.

```sh
$ helmfile apply
```

Provided the values match, a similar workflow can be achieved with the Makefile commands if the additional install isn't desired.

### Configuration

See [eoAPI chart docs]. The following sections provide a basic outline of overlays, customizations, and considerations specific to HOT's initial implementation.

#### Transport Layer Security (TLS)

See [cert-manager docs] and [eoAPI guidance on cert-manager setup].

- Requires a domain controlled by HOT
- Issuer manifests and chart settings have been made available to provision certificates using [ingress annotations] and [Let's Encrypt]/[ACME]
- Recommend going through staging issuer first to avoid hitting rate limits

#### Backups

Enabled with default settings, see the [PostgresOperator docs] for further customization.

Uses an [OIDC auth setup] to access S3, which requires propagating TF-managed information to K8s.

> [!NOTE]
> Further development to bridge and/or reorganize TF and K8s-provisioned resources may remove the need to set a `role-arn` annotation on each release.

#### Monitoring / Observability / Autoscaling

The eoAPI support chart adds Prometheus and Grafana tooling to enable systems analysis, visualization, and custom metrics for autoscaling.

- [eoAPI support chart setup]: in-depth walkthrough
- [eoAPI chart configuration]: set HPA behavior for services
- [eoAPI support chart dependencies]: explore further customization, provider documentation

_Currently set to install once TLS is enabled in eoAPI._

## Tips + Commands

### Setup

#### Local Context

```sh
$ aws eks update-kubeconfig --name <cluster_name>
```

### Debugging

CLI manual will be most helpful:
```sh
$ kubectl --help
```

#### Examples

Basic cluster overview:
```sh
$ kubectl get pod,svc,deploy -A
```

Shell into default container on pod:
```sh
$ kubectl -n <ns> exec -it <pod> -- bash
# $
```

Inspect ingress details:
```sh
$ kubectl -n <ns> describe ingress/<ingress>
```

Redirect pod log output to file:
```sh
$ kubectl -n <ns> logs <pod> --all-containers=true >> file.log
```

[kubectl]:
https://kubernetes.io/docs/reference/kubectl/
[Helm]:
https://helm.sh/docs/
[Let's Encrypt]:
https://letsencrypt.org/
[cert-manager]:
https://cert-manager.io/
[cert-manager docs]:
https://cert-manager.io/docs/configuration/
[helmfile]:
https://github.com/helmfile/helmfile
[eoAPI chart docs]:
https://github.com/developmentseed/eoapi-k8s/tree/975a26639fa3b8be7d3338220d6ea9c4470d8d15/docs
[iframing]:
https://developmentseed.slack.com/archives/C08B8L61QTT/p1747740182369159?thread_ts=1747314980.658339&cid=C08B8L61QTT
[eoAPI guidance on cert-manager setup]:
https://github.com/developmentseed/eoapi-k8s/blob/main/docs/unified-ingress.md#setting-up-tls-with-cert-manager
[ingress annotations]:
https://cert-manager.io/docs/usage/ingress/
[ACME]:
https://cert-manager.io/docs/configuration/acme/
[PostgresOperator docs]:
https://access.crunchydata.com/documentation/postgres-operator/latest/tutorials/backups-disaster-recovery/backups
[OIDC auth setup]:
https://access.crunchydata.com/documentation/postgres-operator/latest/tutorials/backups-disaster-recovery/backups#using-an-aws-integrated-identity-provider-and-role
[eoAPI support chart setup]:
https://github.com/developmentseed/eoapi-k8s/blob/975a26639fa3b8be7d3338220d6ea9c4470d8d15/docs/autoscaling.md
[eoAPI chart configuration]:
https://github.com/developmentseed/eoapi-k8s/blob/975a26639fa3b8be7d3338220d6ea9c4470d8d15/docs/configuration.md
[eoAPI support chart dependencies]:
https://github.com/developmentseed/eoapi-k8s/blob/975a26639fa3b8be7d3338220d6ea9c4470d8d15/helm-chart/eoapi-support/Chart.yaml
58 changes: 58 additions & 0 deletions kubernetes/helm/eoapi-support-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Value overrides for the eoAPI support chart (Prometheus, Grafana and the
# Prometheus adapter). Top-level keys are the subchart names; everything
# nested beneath a key is passed to that subchart.

# Where the adapter queries metrics from.
# NOTE(review): the URL assumes the release is named `eoapi-support` and is
# installed in the `eoapi-support` namespace - confirm against the helmfile.
prometheus-adapter:
  prometheus:
    url: http://eoapi-support-prometheus-server.eoapi-support.svc.cluster.local

prometheus:
  server:
    # Keep the service cluster-internal; outside access goes through the
    # ingress below.
    service:
      type: ClusterIP
      annotations: {}
    ingress:
      annotations:
        # Gate the Prometheus UI behind HTTP basic auth.
        # NOTE(review): presumably the `eoapi-support-prometheus` secret must
        # already exist in the release namespace - confirm how it is created.
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: eoapi-support-prometheus
        nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
        nginx.ingress.kubernetes.io/enable-cors: "true"
        nginx.ingress.kubernetes.io/enable-access-log: "true"
        # Certificate is requested from the production ClusterIssuer.
        cert-manager.io/cluster-issuer: "letsencrypt-prod"
      enabled: true
      ingressClassName: nginx
      hosts:
        - metrics.k8s-prod.hotosm.org
      tls:
        - secretName: prometheus-server-tls
          hosts:
            - metrics.k8s-prod.hotosm.org
    persistentVolume:
      storageClass: gp2

grafana:
  service:
    type: ClusterIP
    annotations: {}
  ingress:
    annotations:
      nginx.ingress.kubernetes.io/enable-cors: "true"
      nginx.ingress.kubernetes.io/enable-access-log: "true"
      # Certificate is requested from the production ClusterIssuer.
      cert-manager.io/cluster-issuer: "letsencrypt-prod"
    enabled: true
    ingressClassName: nginx
    hosts:
      - dashboard.k8s-prod.hotosm.org
    tls:
      - secretName: grafana-tls
        hosts:
          - dashboard.k8s-prod.hotosm.org
  # Datasource definition pointing Grafana at the in-cluster Prometheus
  # server (same URL the adapter uses above).
  datasources:
    datasources.yaml:
      datasources:
        - name: prometheus
          orgId: 1
          type: prometheus
          url: http://eoapi-support-prometheus-server.eoapi-support.svc.cluster.local
          access: proxy
          jsonData:
            timeInterval: "5s"
          isDefault: true
          editable: true
          # This number should be increased when changes are made to update
          # the datasource.
          version: 1
Loading