# https://github.com/openmarmot/aws-cft-llama-cpp
# ! Please view the Readme at the github repo for important usage notes !
# Note that the software install and compile steps in this template take a significant amount of time to complete
# Refs
# llama.cpp repo : https://github.com/ggerganov/llama.cpp
# hugging face : https://huggingface.co/
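#
# Deploying - a minimal AWS CLI sketch. The stack name, vpc/subnet ids, and CIDR below are
# placeholders to replace with your own; --capabilities CAPABILITY_NAMED_IAM is required
# because the template creates a named IAM role.
#
# aws cloudformation create-stack \
#   --stack-name cft-llama-gpu \
#   --template-body file://cft-llama-cpp-gpu.yaml \
#   --capabilities CAPABILITY_NAMED_IAM \
#   --parameters ParameterKey=VPC,ParameterValue=vpc-0123456789abcdef0 \
#                ParameterKey=SubnetId,ParameterValue=subnet-0123456789abcdef0 \
#                ParameterKey=NetAllowedIn,ParameterValue=203.0.113.10/32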
AWSTemplateFormatVersion: '2010-09-09'
Description: AWS CloudFormation template that creates and runs a llama.cpp server. Optimized for P3 GPU instances
Parameters:
  NamePrefix:
    Description: A prefix to use when naming objects created by this template
    Type: String
    Default: 'cft-llama'
  InstanceType:
    Description: EC2 instance type - use a GPU instance with enough GPU memory to hold the model; the default p3.2xlarge has a single V100 with 16 GB
    Type: String
    Default: p3.2xlarge
  AmiId:
    Description: Amazon Linux 2023 AMI ID
    Type: 'AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>'
    Default: '/aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-x86_64'
  VolumeSize:
    Description: Size of the EC2 instance volume in GB
    Type: Number
    Default: 100
  VPC:
    Description: The VPC for the AWS objects
    Type: 'AWS::EC2::VPC::Id'
  SubnetId:
    Description: VPC PUBLIC subnet ID
    Type: 'AWS::EC2::Subnet::Id'
  NetAllowedIn:
    Description: 'CIDR network allowed to access the server (X.X.X.X/XX). This could be your public IP with a /32'
    Type: String
  ModelURL:
    Description: 'URL for the .gguf format model you want to download. The default model is about 4.8 GB.'
    Type: String
    Default: 'https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf'
  ModelFileName:
    Description: 'The file name you want to save the model as (x.gguf)'
    Type: String
    Default: 'model.gguf'
Resources:
  IAMRole:
    Type: 'AWS::IAM::Role'
    Properties:
      RoleName: !Sub '${NamePrefix}-ec2-role'
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: 'Allow'
            Principal:
              Service:
                - 'ec2.amazonaws.com'
            Action:
              - 'sts:AssumeRole'
      Policies:
        - PolicyName: ssm-connect-minimal-permissions
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: 'Allow'
                Action:
                  - 'ssm:UpdateInstanceInformation'
                  - 'ssmmessages:CreateControlChannel'
                  - 'ssmmessages:CreateDataChannel'
                  - 'ssmmessages:OpenControlChannel'
                  - 'ssmmessages:OpenDataChannel'
                Resource: '*'
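  # The minimal policy above is only what SSM Session Manager needs, so you can open a
  # shell on the instance without SSH. A sketch, with a placeholder instance id:
  #   aws ssm start-session --target i-0123456789abcdef0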
  InstanceProfile:
    Type: 'AWS::IAM::InstanceProfile'
    Properties:
      InstanceProfileName: !Sub '${NamePrefix}-instance-profile'
      Roles:
        - Ref: IAMRole
  InstanceSecurityGroup:
    Type: 'AWS::EC2::SecurityGroup'
    Properties:
      GroupDescription: !Sub '${NamePrefix} security group'
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: 80
          ToPort: 80
          CidrIp: !Ref NetAllowedIn
          Description: 'Allow http inbound to the webserver'
      SecurityGroupEgress:
        - IpProtocol: -1
          CidrIp: 0.0.0.0/0
          Description: 'Allow outbound'
      VpcId: !Ref VPC
  LlamaCppServer:
    Type: 'AWS::EC2::Instance'
    Properties:
      InstanceType: !Ref InstanceType
      ImageId: !Ref AmiId
      IamInstanceProfile: !Ref InstanceProfile
      BlockDeviceMappings:
        - DeviceName: /dev/xvda
          Ebs:
            VolumeSize: !Ref VolumeSize
            VolumeType: gp3
            Encrypted: true
      SecurityGroupIds:
        - !GetAtt InstanceSecurityGroup.GroupId
      SubnetId: !Ref SubnetId
      UserData:
        Fn::Base64: !Sub |
          #!/usr/bin/env bash
          # --- general ---
          dnf update -y
          # ---- NVIDIA driver install for AL2023 ----
          # pre-reqs
          dnf install -y dkms kernel-devel kernel-modules-extra
          # repos
          dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo
          dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
          dnf clean expire-cache
          # nvidia driver
          dnf module install -y nvidia-driver:latest-dkms
          # cuda toolkit
          dnf install -y cuda-toolkit
          # optional - install the container toolkit
          #dnf install -y nvidia-container-toolkit
          # tests
          # nvidia-smi
          # /etc/alternatives/cuda/bin/nvcc --version
          # ---- llama.cpp setup ----
          # install pre-reqs
          dnf install -y make automake gcc gcc-c++ kernel-devel git cmake htop
          cd /opt
          git clone --depth 1 --branch b4333 https://github.com/ggerganov/llama.cpp.git
          cd llama.cpp
          # compile llama.cpp
          # look up the CUDA compute capability for your GPU here : https://developer.nvidia.com/cuda-gpus
          # the V100 in P3 instances is compute capability 7.0; the CMake build reads CMAKE_CUDA_ARCHITECTURES
          export PATH=/etc/alternatives/cuda/bin:$PATH
          cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=70
          cmake --build build --config Release
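          # optional manual sanity checks for the CUDA build (sketches, not run by this script):
          # ./build/bin/llama-cli --version   # prints version/build info for the compiled binaries
          # nvidia-smi                        # should list llama-server under Processes once the service is up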
          # download the model file
          wget -O models/${ModelFileName} ${ModelURL}
          # set up the systemd service and start the server
          SERVICE_NAME="llama-server.service"
          # -ngl 99 asks llama-server to offload all model layers to the GPU
          cat <<EOF > /etc/systemd/system/$SERVICE_NAME
          [Unit]
          Description=llama.cpp Server Service
          After=network.target
          [Service]
          Type=simple
          User=root
          WorkingDirectory=/opt/llama.cpp
          ExecStart=/opt/llama.cpp/build/bin/llama-server -m /opt/llama.cpp/models/${ModelFileName} -ngl 99 -c 2048 --host 0.0.0.0 --port 80
          Restart=on-failure
          [Install]
          WantedBy=multi-user.target
          EOF
          systemctl daemon-reload
          systemctl enable $SERVICE_NAME
          systemctl start $SERVICE_NAME
          sleep 10
          systemctl status $SERVICE_NAME
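          # quick local test of the OpenAI-compatible API (a sketch - run manually once the model has loaded):
          # curl -s http://localhost/v1/chat/completions -H 'Content-Type: application/json' \
          #   -d '{"messages":[{"role":"user","content":"Hello"}]}'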
          #--- Signal result to CloudFormation ---
          /opt/aws/bin/cfn-signal -e $? --stack "${AWS::StackName}" --resource "LlamaCppServer" --region "${AWS::Region}"
      Tags:
        - Key: Name
          Value: !Sub '${NamePrefix} llama.cpp server'
    CreationPolicy:
      ResourceSignal:
        Timeout: PT80M
Outputs:
  WebServerURL:
    Description: Access the Web Server Here
    Value: !Join ['', ['http://', !GetAtt LlamaCppServer.PublicIp]]
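# Once the stack reports CREATE_COMPLETE, the WebServerURL output serves the built-in
# llama.cpp web UI in a browser, and the OpenAI-compatible API is on the same port.
# A sketch, with a placeholder public IP:
#   curl -s http://203.0.113.10/v1/chat/completions -H 'Content-Type: application/json' \
#     -d '{"messages":[{"role":"user","content":"Say hello"}]}'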