forked from vllm-project/vllm
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDockerfile_VSCode_Inference_Server
More file actions
64 lines (47 loc) · 1.83 KB
/
Dockerfile_VSCode_Inference_Server
File metadata and controls
64 lines (47 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
# image to build pytorch extensions
FROM dev AS build
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt
# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
# max jobs used by Ninja to build extensions
ENV MAX_JOBS=4
# number of threads used by nvcc
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads
RUN python3 setup.py build_ext --inplace
# image to run unit testing suite
FROM dev AS test
# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
# downloading model
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
RUN apt-get install git-lfs -y
RUN git lfs install
# currently one one I've tested
RUN git clone https://huggingface.co/TheBloke/CodeLlama-7B-Python-AWQ model
COPY llm-vscode-inference-server/api_server.py api_server.py
# EXPOSE 8000
ENTRYPOINT ["python3", "/workspace/api_server.py", "--trust-remote-code", "--model", "/workspace/model", "--quantization", "awq", "--dtype", "half", "--max-model-len", "4096", "--max-num-batched-tokens", "4096"]
# CMD ["/bin/sh"]