Commit ccebd99

Merge pull request #145 from NillionNetwork/feat/add_gpt_oss
feat: Add GPT OSS 20B and 120B
2 parents: a2d815e + a33ab07

File tree: 9 files changed, +101 −78 lines

.env.ci

Lines changed: 3 additions & 0 deletions

```diff
@@ -44,3 +44,6 @@ ETCD_PORT = 2379
 # Grafana Docker Compose Config
 GF_SECURITY_ADMIN_USER = "admin"
 GF_SECURITY_ADMIN_PASSWORD = "password"
+
+# WebSearch Settings
+BRAVE_SEARCH_API = "Your API here"
```
.env.sample

Lines changed: 3 additions & 0 deletions

```diff
@@ -44,3 +44,6 @@ ETCD_PORT = 2379
 # Grafana Docker Compose Config
 GF_SECURITY_ADMIN_USER = "admin"
 GF_SECURITY_ADMIN_PASSWORD = "password"
+
+# WebSearch Settings
+BRAVE_SEARCH_API = "Your API here"
```
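
For illustration, a minimal sketch of how a service might consume the new setting at startup; the helper below is hypothetical and not part of this PR:

```python
import os


def brave_search_api_key() -> str:
    # Hypothetical helper; the PR only adds the env var, not this code.
    key = os.environ.get("BRAVE_SEARCH_API", "")
    if not key or key == "Your API here":
        raise RuntimeError("BRAVE_SEARCH_API is unset or still the sample placeholder")
    return key
```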
Lines changed: 43 additions & 0 deletions (new file)

```yaml
services:
  gpt_120b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model openai/gpt-oss-120b
      --gpu-memory-utilization 0.95
      --max-model-len 100000
      --max-num-batched-tokens 100000
      --tensor-parallel-size 1
      --uvicorn-log-level warning
    environment:
      - SVC_HOST=gpt_120b_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface # cache models
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s
      timeout: 10s
volumes:
  hugging_face_models:
```
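
To sanity-check the new service, one can poll the same endpoint the compose healthcheck curls. A minimal sketch, assuming the container's port 8000 has been published to the host (the file above defines no `ports:` mapping on its own):

```python
import urllib.request

# Assumes port 8000 is reachable from the host; adjust the URL otherwise.
with urllib.request.urlopen("http://localhost:8000/health", timeout=10) as resp:
    print("healthy" if resp.status == 200 else f"unexpected status {resp.status}")
```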
Lines changed: 43 additions & 0 deletions (new file)

```yaml
services:
  gpt_20b_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model openai/gpt-oss-20b
      --gpu-memory-utilization 0.85
      --max-model-len 100000
      --max-num-batched-tokens 100000
      --tensor-parallel-size 1
      --uvicorn-log-level warning
    environment:
      - SVC_HOST=gpt_20b_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface # cache models
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 60s
      timeout: 10s
volumes:
  hugging_face_models:
```
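
Since vLLM serves an OpenAI-compatible API, the 20B service can be exercised with the standard client once it is reachable; the base URL, published port, and dummy key below are assumptions about a local setup, not part of this PR:

```python
from openai import OpenAI  # pip install openai

# Assumes the gpt_20b_gpu container's port 8000 is published on localhost
# and no API key is enforced (vLLM's default unless --api-key is set).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

resp = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(resp.choices[0].message.content)
```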

nilai-api/src/nilai_api/config/config.yaml

Lines changed: 2 additions & 0 deletions

```diff
@@ -7,6 +7,8 @@ model_concurrent_rate_limit:
   cognitivecomputations/Dolphin3.0-Llama3.1-8B: 30
   deepseek-ai/DeepSeek-R1-Distill-Qwen-14B: 5
   hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4: 5
+  openai/gpt-oss-20b: 50
+  default: 50
 
 user_rate_limit_minute: null
 user_rate_limit_hour: null
```
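
A minimal sketch of reading this map outside the API, assuming PyYAML and the repo-relative path; nilai-api's actual config loader is not shown in this diff:

```python
import yaml  # pip install pyyaml

with open("nilai-api/src/nilai_api/config/config.yaml") as f:
    config = yaml.safe_load(f)

limits = config["model_concurrent_rate_limit"]
print(limits["openai/gpt-oss-20b"])  # 50
print(limits["default"])             # 50
```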

nilai-api/src/nilai_api/routers/private.py

Lines changed: 3 additions & 4 deletions

```diff
@@ -109,10 +109,9 @@ async def chat_completion_concurrent_rate_limit(request: Request) -> Tuple[int,
     except ValueError:
         raise HTTPException(status_code=400, detail="Invalid request body")
     key = f"chat:{chat_request.model}"
-    try:
-        limit = MODEL_CONCURRENT_RATE_LIMIT[chat_request.model]
-    except KeyError:
-        raise HTTPException(status_code=400, detail="Invalid model name")
+    limit = MODEL_CONCURRENT_RATE_LIMIT.get(
+        chat_request.model, MODEL_CONCURRENT_RATE_LIMIT.get("default", 50)
+    )
     return limit, key
```
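The behavioral change: an unknown model name no longer yields HTTP 400 but falls back to the `default` limit (or 50 if that key is also absent). A standalone sketch of that contract, with the limits map inlined for illustration:

```python
MODEL_CONCURRENT_RATE_LIMIT = {"openai/gpt-oss-20b": 50, "default": 50}


def resolve_limit(model: str) -> int:
    # Mirrors the new lookup: per-model limit, else "default", else 50.
    return MODEL_CONCURRENT_RATE_LIMIT.get(
        model, MODEL_CONCURRENT_RATE_LIMIT.get("default", 50)
    )


assert resolve_limit("openai/gpt-oss-20b") == 50
assert resolve_limit("no/such-model") == 50  # previously raised HTTP 400
```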
nilai-api/src/nilai_api/state.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -45,6 +45,8 @@ async def models(self) -> Dict[str, ModelEndpoint]:
         return await self.discovery_service.discover_models()
 
     async def get_model(self, model_id: str) -> Optional[ModelEndpoint]:
+        if model_id is None or len(model_id) == 0:
+            return None
         return await self.discovery_service.get_model(model_id)
```
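The new guard short-circuits empty or missing model ids before they reach the discovery service. A hedged illustration of that contract, with a stub standing in for the real discovery service:

```python
import asyncio
from typing import Optional


class StubDiscovery:
    # Stand-in for the real discovery service used by state.py.
    async def get_model(self, model_id: str) -> Optional[str]:
        return f"endpoint:{model_id}"


class State:
    def __init__(self) -> None:
        self.discovery_service = StubDiscovery()

    async def get_model(self, model_id: str) -> Optional[str]:
        if model_id is None or len(model_id) == 0:
            return None  # new guard: skip discovery for empty ids
        return await self.discovery_service.get_model(model_id)


async def main() -> None:
    state = State()
    assert await state.get_model("") is None
    assert await state.get_model("openai/gpt-oss-20b") == "endpoint:openai/gpt-oss-20b"


asyncio.run(main())
```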
scripts/wait_for_ci_services.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@
 API_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-api 2>/dev/null)
 MODEL_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-llama_1b_gpu 2>/dev/null)
 NUC_API_HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' nilai-nuc-api 2>/dev/null)
-MAX_ATTEMPTS=20
+MAX_ATTEMPTS=30
 ATTEMPT=1
 
 while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
```

uv.lock

Lines changed: 1 addition & 73 deletions
Generated lockfile; diff not rendered.
