@@ -94,20 +94,22 @@ int llama_server(int argc, char ** argv) {
9494 const bool is_router_server = params.model .path .empty ();
9595 common_params_print_info (params, !is_router_server);
9696
97- // validate batch size for embeddings
98- // embeddings require all tokens to be processed in a single ubatch
99- // see https://github.com/ggml-org/llama.cpp/issues/12836
100- if (params.embedding && params.n_batch > params.n_ubatch ) {
101- SRV_WRN (" embeddings enabled with n_batch (%d) > n_ubatch (%d)\n " , params.n_batch , params.n_ubatch );
102- SRV_WRN (" setting n_batch = n_ubatch = %d to avoid assertion failure\n " , params.n_ubatch );
103- params.n_batch = params.n_ubatch ;
104- }
97+ if (!is_router_server) {
98+ // validate batch size for embeddings
99+ // embeddings require all tokens to be processed in a single ubatch
100+ // see https://github.com/ggml-org/llama.cpp/issues/12836
101+ if (params.embedding && params.n_batch > params.n_ubatch ) {
102+ SRV_WRN (" embeddings enabled with n_batch (%d) > n_ubatch (%d)\n " , params.n_batch , params.n_ubatch );
103+ SRV_WRN (" setting n_batch = n_ubatch = %d to avoid assertion failure\n " , params.n_ubatch );
104+ params.n_batch = params.n_ubatch ;
105+ }
105106
106- if (params.n_parallel < 0 ) {
107- SRV_INF (" %s" , " n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n " );
107+ if (params.n_parallel < 0 ) {
108+ SRV_INF (" %s" , " n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n " );
108109
109- params.n_parallel = 4 ;
110- params.kv_unified = true ;
110+ params.n_parallel = 4 ;
111+ params.kv_unified = true ;
112+ }
111113 }
112114
113115 // for consistency between server router mode and single-model mode, we set the same model name as alias
0 commit comments