Commit a99e77b

feat: prioritize running local inference jobs on skynet nodes with vLLM (#208)

1 parent: b746722

File tree: 4 files changed, +52 −7 lines

skynet/auth/user_info.py

Lines changed: 2 additions & 1 deletion
@@ -13,9 +13,10 @@
 
 
 class CredentialsType(Enum):
-    OPENAI = 'OPENAI'
     AZURE_OPENAI = 'AZURE_OPENAI'
+    LOCAL = 'LOCAL'
     OCI = 'OCI'
+    OPENAI = 'OPENAI'
 
 
 async def open_yaml(file_path):
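For reference, a minimal sketch of how a credentials `type` string from config maps onto the reordered enum; the enum body is copied from the diff above, everything else is illustrative:

```python
from enum import Enum

class CredentialsType(Enum):
    AZURE_OPENAI = 'AZURE_OPENAI'
    LOCAL = 'LOCAL'
    OCI = 'OCI'
    OPENAI = 'OPENAI'

# A credentials entry whose `type` is 'LOCAL' now resolves to the new
# member; unrecognized strings still raise ValueError as before.
assert CredentialsType('LOCAL') is CredentialsType.LOCAL
```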

skynet/modules/ttt/llm_selector.py

Lines changed: 4 additions & 4 deletions
@@ -47,11 +47,12 @@ def get_job_processor(customer_id: str, job_id: Optional[str] = None) -> Process
         elif api_type == CredentialsType.AZURE_OPENAI.value:
             return Processors.AZURE
 
+        if api_type == CredentialsType.LOCAL.value:
+            return Processors.LOCAL
+
         if oci_available:
             return Processors.OCI
 
-        log.warning(f'OCI is not available, falling back to local processing for customer {customer_id}')
-
         return Processors.LOCAL
 
     @staticmethod
@@ -109,8 +110,7 @@ def select(
                 service_endpoint=oci_service_endpoint,
             )
         else:
-            if customer_id:
-                log.info(f'Customer {customer_id} has no API key configured, falling back to local processing')
+            log.info(f'Forwarding inference to local LLM for customer {customer_id}')
 
             return ChatOpenAI(
                 api_key='placeholder',  # use a placeholder value to bypass validation
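Read end-to-end, the precedence after this change is: customer OpenAI key, then Azure, then an explicit LOCAL opt-in, then OCI when available, then the local fallback (now silent, since the warning was dropped). A flattened sketch of just that decision ladder; the OPENAI branch is inferred from the `elif` context line above, and plain strings stand in for skynet's enums:

```python
def pick_processor(api_type: str | None, oci_available: bool) -> str:
    # Illustrative reconstruction of LLMSelector.get_job_processor's
    # branching after this change; not the repo's actual code.
    if api_type == 'OPENAI':
        return 'OPENAI'
    elif api_type == 'AZURE_OPENAI':
        return 'AZURE'
    if api_type == 'LOCAL':      # new: explicit opt-in to local inference
        return 'LOCAL'
    if oci_available:
        return 'OCI'
    return 'LOCAL'               # fallback, no longer logged as a warning
```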

skynet/modules/ttt/processor_test.py

Lines changed: 27 additions & 0 deletions
@@ -137,6 +137,33 @@ async def test_process_with_azure_open_ai(self, process_fixture):
 
         LLMSelector.select.assert_called_once()
 
+    @pytest.mark.asyncio
+    async def test_process_with_local(self, process_fixture):
+        '''Test that a job is sent for local inference if there is a customer id configured for it.'''
+
+        from skynet.modules.ttt.llm_selector import LLMSelector
+        from skynet.modules.ttt.processor import process
+
+        process_fixture.patch(
+            'skynet.modules.ttt.llm_selector.get_credentials',
+            return_value={'type': 'LOCAL'},
+        )
+        process_fixture.patch('skynet.modules.ttt.llm_selector.oci_available', True)
+
+        job = Job(
+            payload=DocumentPayload(
+                text="Andrew: Hello. Beatrix: Honey? It’s me . . . Andrew: Where are you? Beatrix: At the station. I missed my train."
+            ),
+            metadata=DocumentMetadata(customer_id='test'),
+            type=JobType.SUMMARY,
+        )
+
+        assert LLMSelector.get_job_processor(job.metadata.customer_id, job.id) == Processors.LOCAL
+
+        await process(job)
+
+        LLMSelector.select.assert_called_once()
+
     @pytest.mark.asyncio
     async def test_process_with_oci(self, process_fixture):
         '''Test that a job is sent for inference to oci if there is a customer id configured for it.'''

skynet/modules/ttt/summaries/jobs.py

Lines changed: 19 additions & 2 deletions
@@ -3,7 +3,7 @@
 import time
 
 from skynet.constants import ERROR_JOBS_KEY, PENDING_JOBS_KEY, RUNNING_JOBS_KEY
-from skynet.env import enable_batching, job_timeout, max_concurrency, modules, redis_exp_seconds
+from skynet.env import enable_batching, job_timeout, max_concurrency, modules, redis_exp_seconds, use_vllm
 from skynet.logs import get_logger
 from skynet.modules.monitoring import (
     OPENAI_API_RESTART_COUNTER,
@@ -82,6 +82,10 @@ async def create_job(job_type: JobType, payload: DocumentPayload, metadata: Docu
     """Create a job and add it to the db queue if it can't be started immediately."""
 
     job = Job(payload=payload, type=job_type, metadata=metadata)
+    processor = LLMSelector.get_job_processor(metadata.customer_id)
+
+    # encode the processor in the job id to avoid having to retrieve the whole job object
+    job.id += f':{processor.value}'
    job_id = job.id
 
     await db.set(job_id, Job.model_dump_json(job))
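With the processor encoded as a `:`-suffix on the job id, consumers can recover it with a string split instead of fetching the whole job body from Redis. A hypothetical helper showing the idea; the function name and the Processors stub are mine, not from the repo:

```python
from enum import Enum

class Processors(Enum):  # stub standing in for skynet's Processors enum
    AZURE = 'AZURE'
    LOCAL = 'LOCAL'
    OCI = 'OCI'
    OPENAI = 'OPENAI'

def processor_from_job_id(job_id: str) -> Processors:
    # create_job appends f':{processor.value}', so the processor is the
    # last colon-separated segment of the id
    return Processors(job_id.rsplit(':', 1)[-1])

assert processor_from_job_id('1f2e3d:LOCAL') is Processors.LOCAL
```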
@@ -198,7 +202,20 @@ async def maybe_run_next_job() -> None:
     if not can_run_next_job():
         return
 
-    next_job_id = await db.lpop(PENDING_JOBS_KEY)
+    next_job_id = None
+
+    if use_vllm:
+        pending_jobs_keys = await db.lrange(PENDING_JOBS_KEY, 0, -1)
+
+        for job_id in pending_jobs_keys:
+            if job_id.endswith(Processors.LOCAL.value):
+                next_job_id = job_id
+                await db.lrem(PENDING_JOBS_KEY, 0, job_id)
+
+                break
+
+    if not next_job_id:
+        next_job_id = await db.lpop(PENDING_JOBS_KEY)
 
     await update_summary_queue_metric()
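The new dequeue step scans the pending list front-to-back for the first id ending in the LOCAL suffix, removes it with LREM, and only falls back to a plain LPOP when no local job is waiting. The same policy against an in-memory list, as a self-contained sketch (names are mine):

```python
def pick_next_job(pending: list[str], use_vllm: bool) -> str | None:
    """Mirror maybe_run_next_job's selection against a plain Python list."""
    if use_vllm:
        for job_id in pending:
            if job_id.endswith('LOCAL'):  # suffix appended by create_job
                pending.remove(job_id)    # stands in for LREM
                return job_id
    return pending.pop(0) if pending else None  # stands in for LPOP

queue = ['a:OCI', 'b:LOCAL', 'c:AZURE']
assert pick_next_job(queue, use_vllm=True) == 'b:LOCAL'  # local job jumps ahead
assert pick_next_job(queue, use_vllm=False) == 'a:OCI'   # plain FIFO otherwise
```

One caveat worth noting: LRANGE followed by LREM is two round trips, so the scan is not atomic. That seems acceptable here since can_run_next_job gates how many jobs run, but multiple competing workers would need an atomic claim (a Lua script, for instance).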
