# https://github.com/openmarmot/aws-cft-llama-cpp
# ! Please view the Readme at the github repo for important usage notes !
# Note that the software install and compile steps in this template take a significant amount of time to complete
# Refs
# llama.cpp repo : https://github.com/ggerganov/llama.cpp
# hugging face : https://huggingface.co/
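#
# Deploying - a minimal AWS CLI sketch. The stack name, vpc/subnet ids, and CIDR below are
# placeholders to replace with your own; --capabilities CAPABILITY_NAMED_IAM is required
# because the template creates a named IAM role.
#
# aws cloudformation create-stack \
#   --stack-name cft-llama-gpu \
#   --template-body file://cft-llama-cpp-gpu.yaml \
#   --capabilities CAPABILITY_NAMED_IAM \
#   --parameters ParameterKey=VPC,ParameterValue=vpc-0123456789abcdef0 \
#                ParameterKey=SubnetId,ParameterValue=subnet-0123456789abcdef0 \
#                ParameterKey=NetAllowedIn,ParameterValue=203.0.113.10/32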
AWSTemplateFormatVersion: '2010-09-09'
Description: AWS CloudFormation template that creates and runs a llama.cpp server. Optimized for P3 GPU instances
Parameters:
  NamePrefix:
    Description: A prefix to use when naming objects created by this template
    Type: String
    Default: 'cft-llama'
  InstanceType:
    Description: EC2 instance type - use a GPU instance with enough GPU memory to hold the model; the default p3.2xlarge has a single V100 with 16 GB
    Type: String
    Default: p3.2xlarge
  AmiId:
    Description: Amazon Linux 2023 AMI ID
    Type: 'AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>'
    Default: '/aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-x86_64'
  VolumeSize:
    Description: Size of the EC2 instance volume in GB
    Type: Number
    Default: 100
  VPC:
    Description: The VPC for the AWS objects
    Type: 'AWS::EC2::VPC::Id'
  SubnetId:
    Description: VPC PUBLIC subnet ID
    Type: 'AWS::EC2::Subnet::Id'
  NetAllowedIn:
    Description: 'CIDR network allowed to access the server (X.X.X.X/XX). This could be your public IP with a /32'
    Type: String
  ModelURL:
    Description: 'URL for the .gguf format model you want to download. The default model is about 4.8 GB.'
    Type: String
    Default: 'https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf'
  ModelFileName:
    Description: 'The file name you want to save the model as (x.gguf)'
    Type: String
    Default: 'model.gguf'
Resources:
  IAMRole:
    Type: 'AWS::IAM::Role'
    Properties:
      RoleName: !Sub '${NamePrefix}-ec2-role'
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: 'Allow'
            Principal:
              Service:
                - 'ec2.amazonaws.com'
            Action:
              - 'sts:AssumeRole'
      Policies:
        - PolicyName: ssm-connect-minimal-permissions
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: 'Allow'
                Action:
                  - 'ssm:UpdateInstanceInformation'
                  - 'ssmmessages:CreateControlChannel'
                  - 'ssmmessages:CreateDataChannel'
                  - 'ssmmessages:OpenControlChannel'
                  - 'ssmmessages:OpenDataChannel'
                Resource: '*'
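  # The minimal policy above is only what SSM Session Manager needs, so you can open a
  # shell on the instance without SSH. A sketch, with a placeholder instance id:
  #   aws ssm start-session --target i-0123456789abcdef0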
  InstanceProfile:
    Type: 'AWS::IAM::InstanceProfile'
    Properties:
      InstanceProfileName: !Sub '${NamePrefix}-instance-profile'
      Roles:
        - Ref: IAMRole
  InstanceSecurityGroup:
    Type: 'AWS::EC2::SecurityGroup'
    Properties:
      GroupDescription: !Sub '${NamePrefix} security group'
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: 80
          ToPort: 80
          CidrIp: !Ref NetAllowedIn
          Description: 'Allow http inbound to the webserver'
      SecurityGroupEgress:
        - IpProtocol: -1
          CidrIp: 0.0.0.0/0
          Description: 'Allow outbound'
      VpcId: !Ref VPC
  LlamaCppServer:
    Type: 'AWS::EC2::Instance'
    Properties:
      InstanceType: !Ref InstanceType
      ImageId: !Ref AmiId
      IamInstanceProfile: !Ref InstanceProfile
      BlockDeviceMappings:
        - DeviceName: /dev/xvda
          Ebs:
            VolumeSize: !Ref VolumeSize
            VolumeType: gp3
            Encrypted: true
      SecurityGroupIds:
        - !GetAtt InstanceSecurityGroup.GroupId
      SubnetId: !Ref SubnetId
      UserData:
        Fn::Base64: !Sub |
          #!/usr/bin/env bash
          # --- general ---
          dnf update -y
          # ---- NVIDIA driver install for AL2023 ----
          # pre-reqs
          dnf install -y dkms kernel-devel kernel-modules-extra
          # repos
          dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/cuda-amzn2023.repo
          dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
          dnf clean expire-cache
          # nvidia driver
          dnf module install -y nvidia-driver:latest-dkms
          # cuda toolkit
          dnf install -y cuda-toolkit
          # optional - install the container toolkit
          #dnf install -y nvidia-container-toolkit
          # tests
          # nvidia-smi
          # /etc/alternatives/cuda/bin/nvcc --version
          # ---- llama.cpp setup ----
          # install pre-reqs
          dnf install -y make automake gcc gcc-c++ kernel-devel git cmake htop
          cd /opt
          git clone --depth 1 --branch b4333 https://github.com/ggerganov/llama.cpp.git
          cd llama.cpp
          # compile llama.cpp
          # look up the CUDA compute capability for your GPU here : https://developer.nvidia.com/cuda-gpus
          # the V100 in P3 instances is compute capability 7.0; the CMake build reads CMAKE_CUDA_ARCHITECTURES
          export PATH=/etc/alternatives/cuda/bin:$PATH
          cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=70
          cmake --build build --config Release
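          # optional manual sanity checks for the CUDA build (sketches, not run by this script):
          # ./build/bin/llama-cli --version   # prints version/build info for the compiled binaries
          # nvidia-smi                        # should list llama-server under Processes once the service is up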
          # download the model file
          wget -O models/${ModelFileName} ${ModelURL}
          # set up the systemd service and start the server
          SERVICE_NAME="llama-server.service"
          # -ngl 99 asks llama-server to offload all model layers to the GPU
          cat <<EOF > /etc/systemd/system/$SERVICE_NAME
          [Unit]
          Description=llama.cpp Server Service
          After=network.target
          [Service]
          Type=simple
          User=root
          WorkingDirectory=/opt/llama.cpp
          ExecStart=/opt/llama.cpp/build/bin/llama-server -m /opt/llama.cpp/models/${ModelFileName} -ngl 99 -c 2048 --host 0.0.0.0 --port 80
          Restart=on-failure
          [Install]
          WantedBy=multi-user.target
          EOF
          systemctl daemon-reload
          systemctl enable $SERVICE_NAME
          systemctl start $SERVICE_NAME
          sleep 10
          systemctl status $SERVICE_NAME
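          # quick local test of the OpenAI-compatible API (a sketch - run manually once the model has loaded):
          # curl -s http://localhost/v1/chat/completions -H 'Content-Type: application/json' \
          #   -d '{"messages":[{"role":"user","content":"Hello"}]}'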
          #--- Signal result to CloudFormation ---
          /opt/aws/bin/cfn-signal -e $? --stack "${AWS::StackName}" --resource "LlamaCppServer" --region "${AWS::Region}"
      Tags:
        - Key: Name
          Value: !Sub '${NamePrefix} llama.cpp server'
    CreationPolicy:
      ResourceSignal:
        Timeout: PT80M
Outputs:
  WebServerURL:
    Description: Access the Web Server Here
    Value: !Join ['', ['http://', !GetAtt LlamaCppServer.PublicIp]]
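# Once the stack reports CREATE_COMPLETE, the WebServerURL output serves the built-in
# llama.cpp web UI in a browser, and the OpenAI-compatible API is on the same port.
# A sketch, with a placeholder public IP:
#   curl -s http://203.0.113.10/v1/chat/completions -H 'Content-Type: application/json' \
#     -d '{"messages":[{"role":"user","content":"Say hello"}]}'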