8 changes: 7 additions & 1 deletion README.md
@@ -82,7 +82,13 @@ matrix deploy_applications --action add --applications "[{'model_name': 'meta-ll
matrix deploy_applications --applications ''
```
### Adjust Model Args
-vLLM Engine [Aruments](https://docs.vllm.ai/en/latest/serving/engine_args.html) can be specified in the deploy_applications arguments. The default values for popular models are in this [config](matrix/app_server/llm/llm_config.py). To scale the deployment, `min_replia` and `max_replica` can be added based on num of workers.
+vLLM Engine [Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html) can be specified in the deploy_applications arguments. The default values for popular models are in [llm_config.py](matrix/app_server/llm/llm_config.py). Other useful args, combined in the sketch after this list:
+* `model_name`: a Hugging Face model name or a directory containing checkpoints.
+* `name`: the default app_name.
+* `model_size`: maps a non-Hugging Face model to the defaults in the config file.
+* `max_ongoing_requests`: the maximum number of concurrent requests to each replica.
+* `min_replica` and `max_replica`: the replica count range, auto-scaled based on the number of Ray workers.
+* `use_grpc`: enable gRPC by adding `{'use_grpc': 'true'}`.
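
A minimal sketch combining these args in one deployment. The model, replica range, and request cap below are illustrative assumptions, not project defaults, and a `max_replica` of 4 presumes at least four GPU workers in the Ray cluster:

```
matrix deploy_applications --action add --applications "[{'model_name': 'meta-llama/Llama-3.1-8B-Instruct', 'min_replica': 1, 'max_replica': 4, 'max_ongoing_requests': 100, 'use_grpc': 'true'}]"
```

Under these settings the deployment can scale between 1 and 4 replicas as workers come and go, with each replica admitting at most 100 concurrent requests.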

### OpenAI Azure Model
- Note: no GPU is required; in `start_workers`, you can add `--slurm "{'gpus_per_node': 0}"`, as in the sketch below.
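
A minimal sketch under that assumption (any other `start_workers` arguments your cluster requires are omitted here):

```
matrix start_workers --slurm "{'gpus_per_node': 0}"
```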
12 changes: 6 additions & 6 deletions matrix/app_server/llm/llm_config.py
@@ -6,7 +6,7 @@

# default model parameters can be overwritten from command line
llm_model_default_parameters = {
-    "meta-llama/Meta-Llama-3.1-3B-Instruct": {
+    "meta-llama/Llama-3.1-3B-Instruct": {
        "name": "3B",
        "tensor-parallel-size": 1,
        "pipeline-parallel-size": 1,
@@ -15,7 +15,7 @@
        "max-model-len": 131072,
        "gpu-memory-utilization": 0.8,
    },
-    "meta-llama/Meta-Llama-3.1-8B-Instruct": {
+    "meta-llama/Llama-3.1-8B-Instruct": {
        "name": "8B",
        "tensor-parallel-size": 1,
        "pipeline-parallel-size": 1,
@@ -24,7 +24,7 @@
        "max-model-len": 131072,
        "gpu-memory-utilization": 0.8,
    },
-    "meta-llama/Meta-Llama-3.1-70B-Instruct": {
+    "meta-llama/Llama-3.1-70B-Instruct": {
        "name": "70B",
        "tensor-parallel-size": 4,
        "pipeline-parallel-size": 1,
@@ -33,7 +33,7 @@
        "gpu-memory-utilization": 0.8,
        "max_ongoing_requests": 100,
    },
-    "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8": {
+    "meta-llama/Llama-3.1-405B-Instruct-FP8": {
        "name": "405B-FP8",
        "tensor-parallel-size": 8,
        "pipeline-parallel-size": 1,
@@ -42,7 +42,7 @@
        "gpu-memory-utilization": 0.8,
        "max_ongoing_requests": 50,
    },
-    "meta-llama/Meta-Llama-3.1-405B-Instruct": {
+    "meta-llama/Llama-3.1-405B-Instruct": {
        "name": "405B",
        "tensor-parallel-size": 8,
        "pipeline-parallel-size": 2,
@@ -51,7 +51,7 @@
        "gpu-memory-utilization": 0.8,
        "max_ongoing_requests": 50,
    },
-    "meta-llama/Meta-Llama-3.3-70B-Instruct": {
+    "meta-llama/Llama-3.3-70B-Instruct": {
        "name": "3_3_70B",
        "tensor-parallel-size": 4,
        "pipeline-parallel-size": 1,
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -73,7 +73,7 @@ classifiers=[
]
vllm_083 = [
    "vllm==v0.8.3",
-    "ray[serve]>=2.43.0",
+    "ray[serve]==2.43.0",
    "torch>=2.6.0",
]
