Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion contrib/aks/aks.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ resource federatedCrendial 'Microsoft.ManagedIdentity/userAssignedIdentities/fed
'api://AzureADTokenExchange'
]
issuer: aks.properties.oidcIssuerProfile.issuerURL
subject: 'system:serviceaccount:kube-system:azure-acr-identity'
subject: 'system:serviceaccount:default:azure-acr-identity'
}
}

Expand Down
46 changes: 1 addition & 45 deletions contrib/aks/k8s-deploy/wi-image-cred-provider.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,48 +4,4 @@ metadata:
annotations:
azure.workload.identity/client-id: __CLIENT_ID__
name: azure-acr-identity
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: wi-image-cred-provider
namespace: kube-system
spec:
selector:
matchLabels:
app: wi-image-cred-provider
template:
metadata:
labels:
app: wi-image-cred-provider
azure.workload.identity/use: "true"
spec:
serviceAccountName: azure-acr-identity
initContainers:
- name: install
image: openpaistatic.azurecr.io/openai/wi-image-cred-provider:0.2.530
command: ['sh', '-c']
args:
- |
cp /workload-identity-token /host/bin
volumeMounts:
- name: binfolder
mountPath: /host/bin
containers:
- name: provider
image: openpaistatic.azurecr.io/openai/wi-image-cred-provider:0.2.530
command:
- "/wi-image-cred-provider"
volumeMounts:
- name: sock
mountPath: /var/run/
volumes:
- name: sock
hostPath:
path: /var/run/
type: DirectoryOrCreate
- name: binfolder
hostPath:
path: /opt/image-cred-provider/bin
type: DirectoryOrCreate
namespace: default
21 changes: 13 additions & 8 deletions contrib/aks/provisionscript.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ resource aksbootstrapid 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-0
scope: resourceGroup(hubsub, hubgroup)
}

// Reference to an already-deployed user-assigned managed identity named 'aksacr',
// looked up in the resource group given by hubsub/hubgroup. The `existing` keyword
// means this template only reads the identity; it does not create or modify it.
// Its properties.clientId is appended to the kubelet bootstrap script arguments below.
resource aksAcrUai 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' existing = {
name: 'aksacr'
scope: resourceGroup(hubsub, hubgroup)
}

var kubeconfig = base64ToString(aks.listClusterUserCredential().kubeconfigs[0].value)

var kubeletversion = aks.properties.kubernetesVersion
Expand Down Expand Up @@ -44,7 +49,7 @@ var bootstrapscripts = {
'${nvidiacronjobscript} 1215 1410'
'${containerdscript} nvidia'
kubeletmsiscript
'${kubeletscript} Standard_ND96asr_v4 gpu'
'${kubeletscript} Standard_ND96asr_v4 gpu ${tenant().tenantId} ${hubsub} ${hubgroup} ${aksAcrUai.properties.clientId}'
tlsscanscript
blobproxyscript
]
Expand All @@ -57,7 +62,7 @@ var bootstrapscripts = {
'${nvidiacronjobscript} 1593 1410'
'${containerdscript} nvidia'
kubeletmsiscript
'${kubeletscript} Standard_ND96amsr_A100_v4 gpu'
'${kubeletscript} Standard_ND96amsr_A100_v4 gpu ${tenant().tenantId} ${hubsub} ${hubgroup} ${aksAcrUai.properties.clientId}'
tlsscanscript
blobproxyscript
]
Expand All @@ -70,7 +75,7 @@ var bootstrapscripts = {
rocmruntimescript
'${containerdscript} rocm'
kubeletmsiscript
'${kubeletscript} Standard_ND96isr_MI300X_v5 gpu'
'${kubeletscript} Standard_ND96isr_MI300X_v5 gpu ${tenant().tenantId} ${hubsub} ${hubgroup} ${aksAcrUai.properties.clientId}'
tlsscanscript
blobproxyscript
configipoibscript
Expand All @@ -83,7 +88,7 @@ var bootstrapscripts = {
'${nvidianvswitch} 2619 1980'
'${containerdscript} nvidia'
kubeletmsiscript
'${kubeletscript} Standard_ND96isr_H100_v5 gpu'
'${kubeletscript} Standard_ND96isr_H100_v5 gpu ${tenant().tenantId} ${hubsub} ${hubgroup} ${aksAcrUai.properties.clientId}'
tlsscanscript
blobproxyscript
]
Expand All @@ -95,7 +100,7 @@ var bootstrapscripts = {
'${nvidianvswitch} 3201 1980'
'${containerdscript} nvidia'
kubeletmsiscript
'${kubeletscript} Standard_ND96isr_H200_v5 gpu'
'${kubeletscript} Standard_ND96isr_H200_v5 gpu ${tenant().tenantId} ${hubsub} ${hubgroup} ${aksAcrUai.properties.clientId}'
tlsscanscript
blobproxyscript
]
Expand All @@ -104,21 +109,21 @@ var bootstrapscripts = {
waitdnsready
'${containerdscript} runc'
kubeletmsiscript
'${kubeletscript} Standard_E16bs_v5 cpu'
'${kubeletscript} Standard_E16bs_v5 cpu ${tenant().tenantId} ${hubsub} ${hubgroup} ${aksAcrUai.properties.clientId}'
]

Standard_D8s_v3: [
waitdnsready
'${containerdscript} runc'
kubeletmsiscript
'${kubeletscript} Standard_D8s_v3 cpu'
'${kubeletscript} Standard_D8s_v3 cpu ${tenant().tenantId} ${hubsub} ${hubgroup} ${aksAcrUai.properties.clientId}'
]

Standard_E8ds_v4: [
waitdnsready
'${containerdscript} runc'
kubeletmsiscript
'${kubeletscript} Standard_E8ds_v4 cpu'
'${kubeletscript} Standard_E8ds_v4 cpu ${tenant().tenantId} ${hubsub} ${hubgroup} ${aksAcrUai.properties.clientId}'
]
}

Expand Down
44 changes: 34 additions & 10 deletions contrib/aks/scripts/kubelet.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@ AKS_FQDN=$2
KUBE_CA_CERT=$3
INSTANCE_TYPE=$4
PROCESSOR_TYPE=$5
TENANT_ID=$6
SUBSCRIPTION_ID=$7
RESOURCE_GROUP=$8
UAMI_CLIENT_ID_OR_RESOURCE_ID=$9
ARCH_TYPE=$(uname -m)
if [ "$ARCH_TYPE" = "x86_64" ]; then
ARCH_TYPE="amd64"
elif [ "$ARCH_TYPE" = "aarch64" ]; then
ARCH_TYPE="arm64"
else
echo "Unsupported architecture: $ARCH_TYPE"
exit 1
fi

NODE_NAME=$(hostname)

Expand All @@ -36,36 +49,47 @@ KUBELET_SERVER_CERT_PATH="/etc/kubernetes/certs/kubeletserver.crt"
openssl genrsa -out $KUBELET_SERVER_PRIVATE_KEY_PATH 4096
openssl req -new -x509 -days 7300 -key $KUBELET_SERVER_PRIVATE_KEY_PATH -out $KUBELET_SERVER_CERT_PATH -subj "/CN=system:node:${NODE_NAME}"

curl -LO https://dl.k8s.io/v${KUBE_VERSION}/kubernetes-node-linux-amd64.tar.gz
tar -xvzf kubernetes-node-linux-amd64.tar.gz kubernetes/node/bin/kubelet
curl -LO https://dl.k8s.io/v${KUBE_VERSION}/kubernetes-node-linux-${ARCH_TYPE}.tar.gz
tar -xvzf kubernetes-node-linux-${ARCH_TYPE}.tar.gz kubernetes/node/bin/kubelet
mv kubernetes/node/bin/kubelet /usr/local/bin
rm kubernetes-node-linux-amd64.tar.gz
rm kubernetes-node-linux-${ARCH_TYPE}.tar.gz

# setup wicred
mkdir -p /opt/image-cred-provider/config/
mkdir -p /opt/image-cred-provider/bin/

touch /opt/image-cred-provider/bin/workload-identity-token
chmod +x /opt/image-cred-provider/bin/workload-identity-token
curl -L https://github.com/kubernetes-sigs/cloud-provider-azure/releases/download/v${KUBE_VERSION}/azure-acr-credential-provider-linux-${ARCH_TYPE} -o /opt/image-cred-provider/bin/acr-credential-provider
chmod +x /opt/image-cred-provider/bin/acr-credential-provider

sudo tee /etc/kubernetes/azure.json > /dev/null <<EOF
{
"cloud": "AzurePublicCloud",
"tenantId": "${TENANT_ID}",
"subscriptionId": "${SUBSCRIPTION_ID}",
"resourceGroup": "${RESOURCE_GROUP}",
"useManagedIdentityExtension": true,
"userAssignedIdentityID": "${UAMI_CLIENT_ID_OR_RESOURCE_ID}"
}
EOF

tee /opt/image-cred-provider/config/workload-identity-token.yaml > /dev/null <<EOF
tee /opt/image-cred-provider/config/acr-credential-provider.yaml > /dev/null <<EOF
kind: CredentialProviderConfig
apiVersion: kubelet.config.k8s.io/v1
providers:
- name: workload-identity-token
- name: acr-credential-provider
apiVersion: credentialprovider.kubelet.k8s.io/v1
matchImages:
- "*.azurecr.io"
args:
- /var/run/workload-identity-token.sock
defaultCacheDuration: 1m
- /etc/kubernetes/azure.json
defaultCacheDuration: 10m
EOF
# end setup wicred

# adjust flags as desired
tee /etc/default/kubelet > /dev/null <<EOF
KUBELET_NODE_LABELS="kubernetes.azure.com/mode=system,kubernetes.azure.com/role=agent,node.kubernetes.io/exclude-from-external-load-balancers=true,kubernetes.azure.com/managed=false,kubernetes.io/os=linux,node.kubernetes.io/instance-type=$INSTANCE_TYPE,RepairStatus=Validate"
KUBELET_FLAGS="--address=0.0.0.0 --anonymous-auth=false --authentication-token-webhook=true --authorization-mode=Webhook --cgroup-driver=systemd --cgroups-per-qos=true --client-ca-file=/etc/kubernetes/certs/ca.crt --cluster-dns=10.0.0.10 --cluster-domain=cluster.local --enforce-node-allocatable=pods --event-qps=0 --eviction-hard=memory.available<500Mi,nodefs.available<50Gi,imagefs.available<200Gi,nodefs.inodesFree<5% --image-gc-high-threshold=99 --image-gc-low-threshold=90 --kube-reserved=cpu=180m,memory=3399Mi,pid=1000 --kubeconfig=/var/lib/kubelet/kubeconfig --max-pods=110 --node-status-update-frequency=10s --pod-infra-container-image=mcr.microsoft.com/oss/kubernetes/pause:3.6 --protect-kernel-defaults=true --read-only-port=0 --rotate-certificates=true --streaming-connection-idle-timeout=4h --tls-cert-file=/etc/kubernetes/certs/kubeletserver.crt --tls-cipher-suites=TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_GCM_SHA256 --tls-private-key-file=/etc/kubernetes/certs/kubeletserver.key --image-credential-provider-config=/opt/image-cred-provider/config/workload-identity-token.yaml --image-credential-provider-bin-dir=/opt/image-cred-provider/bin --container-log-max-size=5Gi --container-log-max-files=2"
KUBELET_FLAGS="--address=0.0.0.0 --anonymous-auth=false --authentication-token-webhook=true --authorization-mode=Webhook --cgroup-driver=systemd --cgroups-per-qos=true --client-ca-file=/etc/kubernetes/certs/ca.crt --cluster-dns=10.0.0.10 --cluster-domain=cluster.local --enforce-node-allocatable=pods --event-qps=0 --eviction-hard=memory.available<500Mi,nodefs.available<50Gi,imagefs.available<200Gi,nodefs.inodesFree<5% --image-gc-high-threshold=99 --image-gc-low-threshold=90 --kube-reserved=cpu=180m,memory=3399Mi,pid=1000 --kubeconfig=/var/lib/kubelet/kubeconfig --max-pods=110 --node-status-update-frequency=10s --pod-infra-container-image=mcr.microsoft.com/oss/kubernetes/pause:3.6 --protect-kernel-defaults=true --read-only-port=0 --rotate-certificates=true --streaming-connection-idle-timeout=4h --tls-cert-file=/etc/kubernetes/certs/kubeletserver.crt --tls-cipher-suites=TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_GCM_SHA256 --tls-private-key-file=/etc/kubernetes/certs/kubeletserver.key --image-credential-provider-config=/opt/image-cred-provider/config/acr-credential-provider.yaml --image-credential-provider-bin-dir=/opt/image-cred-provider/bin --container-log-max-size=5Gi --container-log-max-files=2"
EOF

# can simplify this + 2 following files by merging together
Expand Down
17 changes: 9 additions & 8 deletions src/cluster-configuration/deploy/start.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,16 @@ pushd $(dirname "$0") > /dev/null
#chmod u+x configmap-create.sh
/bin/bash configmap-create.sh || exit $?

cp secret.yaml secret-default.yaml
sed -i "s/%NAMESPACE%/default/g" secret-default.yaml
kubectl apply --overwrite=true -f secret-default.yaml || exit $?
rm -rf secret-default.yaml
# The pai-secret deployment below is disabled because it is no longer needed
#cp secret.yaml secret-default.yaml
#sed -i "s/%NAMESPACE%/default/g" secret-default.yaml
#kubectl apply --overwrite=true -f secret-default.yaml || exit $?
#rm -rf secret-default.yaml

cp secret.yaml secret-system.yaml
sed -i "s/%NAMESPACE%/kube-system/g" secret-system.yaml
kubectl apply --overwrite=true -f secret-system.yaml || exit $?
rm -rf secret-system.yaml
#cp secret.yaml secret-system.yaml
#sed -i "s/%NAMESPACE%/kube-system/g" secret-system.yaml
#kubectl apply --overwrite=true -f secret-system.yaml || exit $?
#rm -rf secret-system.yaml

# Create priorityClass for PAI daemon
kubectl apply --overwrite=true -f priority-class.yaml || exit $?
Expand Down
8 changes: 6 additions & 2 deletions src/cluster-configuration/deploy/stop.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ if kubectl get configmap | grep -q "k8s-etc-hosts"; then
fi
kubectl delete job write-etc-hosts --ignore-not-found || exit $?

if kubectl get secret | grep -q "{{ cluster_cfg['cluster']['docker-registry']['secret-name'] }}"; then
kubectl delete secret {{ cluster_cfg['cluster']['docker-registry']['secret-name'] }} || exit $?
if kubectl get secret -n default | grep -q "{{ cluster_cfg['cluster']['docker-registry']['secret-name'] }}"; then
kubectl delete secret {{ cluster_cfg['cluster']['docker-registry']['secret-name'] }} -n default || exit $?
fi

if kubectl get secret -n kube-system | grep -q "{{ cluster_cfg['cluster']['docker-registry']['secret-name'] }}"; then
kubectl delete secret {{ cluster_cfg['cluster']['docker-registry']['secret-name'] }} -n kube-system || exit $?
fi

if kubectl get priorityclass | grep -q "pai-daemon-priority"; then
Expand Down
48 changes: 47 additions & 1 deletion src/webportal-dind/build/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,53 @@ fi

log_info "Logging in to Docker registry"
DOCKER_REGISTRY=$(echo "$DOCKER_IMAGE" | cut -d'/' -f1)
echo "$DOCKER_PASS" | docker login "$DOCKER_REGISTRY" -u "$DOCKER_USER" --password-stdin

# Get UAMI Client ID from Azure Workload Identity injected environment variable
if [ -z "$AZURE_CLIENT_ID" ]; then
log_error "AZURE_CLIENT_ID environment variable not found"
log_error "Please ensure azure-acr-identity service account is properly configured with Workload Identity"
exit 1
fi

log_info "Using AZURE_CLIENT_ID: $AZURE_CLIENT_ID"

# For ACR authentication, first request an AAD token from IMDS (issued for the
# https://management.azure.com/ resource); it is exchanged for ACR tokens below
IMDS_URL="http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://management.azure.com/&client_id=${AZURE_CLIENT_ID}"

log_info "Getting AAD token from IMDS for ACR: ${DOCKER_REGISTRY}"
AAD_TOKEN=$(curl -s -H "Metadata: true" "${IMDS_URL}" | jq -r .access_token)
if [ "${AAD_TOKEN}" == "null" ] || [ -z "${AAD_TOKEN}" ]; then
log_error "Failed to get AAD token from IMDS"
exit 1
fi

log_info "Exchanging AAD token for ACR refresh token"
ACR_REFRESH_TOKEN=$(curl -s -X POST \
-H "Content-Type: application/x-www-form-urlencoded" \
-d "grant_type=access_token&service=${DOCKER_REGISTRY}&access_token=${AAD_TOKEN}" \
"https://${DOCKER_REGISTRY}/oauth2/exchange" | jq -r .refresh_token)
if [ "${ACR_REFRESH_TOKEN}" == "null" ] || [ -z "${ACR_REFRESH_TOKEN}" ]; then
log_error "Failed to get ACR refresh token"
exit 1
fi

log_info "Getting ACR access token"
# Extract full repository path (e.g., "luciaopenai/webportal" from "luciaopenpai.azurecr.io/luciaopenai/webportal:test")
DOCKER_REPOSITORY=$(echo "$DOCKER_IMAGE" | cut -d'/' -f2- | cut -d':' -f1)
log_info "Repository path: ${DOCKER_REPOSITORY}"
ACR_ACCESS_TOKEN=$(curl -s -X POST \
-H "Content-Type: application/x-www-form-urlencoded" \
-d "grant_type=refresh_token&service=${DOCKER_REGISTRY}&scope=repository:${DOCKER_REPOSITORY}:pull&refresh_token=${ACR_REFRESH_TOKEN}" \
"https://${DOCKER_REGISTRY}/oauth2/token" | jq -r .access_token)
if [ "${ACR_ACCESS_TOKEN}" == "null" ] || [ -z "${ACR_ACCESS_TOKEN}" ]; then
log_error "Failed to get ACR access token"
exit 1
fi

log_info "Logging in to Docker registry with ACR access token"
echo "${ACR_ACCESS_TOKEN}" | docker login "${DOCKER_REGISTRY}" \
-u 00000000-0000-0000-0000-000000000000 \
--password-stdin

log_info "Pulling webportal Docker image"
docker pull "$DOCKER_IMAGE":"$DOCKER_TAG"
Expand Down
2 changes: 2 additions & 0 deletions src/webportal-dind/build/webportal-dind.common.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ FROM ubuntu:latest

RUN apt-get update && apt-get upgrade -y

RUN apt-get install -y jq

RUN mkdir -p /var/lib/docker-vfs

COPY build/install.sh .
Expand Down
8 changes: 4 additions & 4 deletions src/webportal-dind/deploy/webportal-dind.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ spec:
name: webportal-dind
labels:
app: webportal-dind
azure.workload.identity/use: "true"
spec:
serviceAccountName: azure-acr-identity
hostNetwork: false
hostPID: false
containers:
Expand All @@ -41,10 +43,8 @@ spec:
env:
- name: DOCKER_IMAGE
value: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}webportal
- name: DOCKER_USER
value: {{ cluster_cfg['cluster']['docker-registry']['username'] }}
- name: DOCKER_PASS
value: {{ cluster_cfg['cluster']['docker-registry']['password'] }}
- name: DOCKER_NAMESPACE
value: {{ cluster_cfg['cluster']['docker-registry']['namespace'] }}
- name: DOCKER_TAG
value: {{ cluster_cfg['cluster']['docker-registry']['tag'] }}
- name: LAUNCHER_TYPE
Expand Down
Loading