
Commit 461d5f4

impl

# What does this PR do?
## Test Plan

1 parent 047303e

42 files changed: +600 −7 lines (only a subset of the changed files is shown below)


llama_stack/distribution/datatypes.py (8 additions, 0 deletions)
```diff
@@ -25,6 +25,7 @@
 from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.providers.datatypes import Api, ProviderSpec
+from llama_stack.providers.utils.inference.inference_store import InferenceStoreConfig
 from llama_stack.providers.utils.kvstore.config import KVStoreConfig
 
 LLAMA_STACK_BUILD_CONFIG_VERSION = "2"
@@ -297,6 +298,13 @@ class StackRunConfig(BaseModel):
         a default SQLite store will be used.""",
     )
 
+    inference_store: InferenceStoreConfig | None = Field(
+        default=None,
+        description="""
+Configuration for the persistence store used by the inference API. If not specified,
+a default SQLite store will be used.""",
+    )
+
     # registry of "resources" in the distribution
     models: list[ModelInput] = Field(default_factory=list)
     shields: list[ShieldInput] = Field(default_factory=list)
```
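For illustration, a minimal sketch of what the new field accepts; `SqliteInferenceStoreConfig` is defined later in this commit, while the `db_path` value and standalone construction here are hypothetical:

```python
# Hypothetical usage of the new StackRunConfig store config (db_path is illustrative).
from llama_stack.providers.utils.inference.inference_store import SqliteInferenceStoreConfig

store_config = SqliteInferenceStoreConfig(db_path="/tmp/inference_store.db")
# The discriminator field defaults to "sqlite", the only store type in this commit.
print(store_config.model_dump())  # {'type': 'sqlite', 'db_path': '/tmp/inference_store.db'}
```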

llama_stack/distribution/resolver.py (8 additions, 4 deletions)
```diff
@@ -140,7 +140,7 @@ async def resolve_impls(
 
     sorted_providers = sort_providers_by_deps(providers_with_specs, run_config)
 
-    return await instantiate_providers(sorted_providers, router_apis, dist_registry)
+    return await instantiate_providers(sorted_providers, router_apis, dist_registry, run_config)
 
 
 def specs_for_autorouted_apis(apis_to_serve: list[str] | set[str]) -> dict[str, dict[str, ProviderWithSpec]]:
@@ -243,7 +243,10 @@ def sort_providers_by_deps(
 
 
 async def instantiate_providers(
-    sorted_providers: list[tuple[str, ProviderWithSpec]], router_apis: set[Api], dist_registry: DistributionRegistry
+    sorted_providers: list[tuple[str, ProviderWithSpec]],
+    router_apis: set[Api],
+    dist_registry: DistributionRegistry,
+    run_config: StackRunConfig,
 ) -> dict:
     """Instantiates providers asynchronously while managing dependencies."""
     impls: dict[Api, Any] = {}
@@ -258,7 +261,7 @@ async def instantiate_providers(
         if isinstance(provider.spec, RoutingTableProviderSpec):
             inner_impls = inner_impls_by_provider_id[f"inner-{provider.spec.router_api.value}"]
 
-        impl = await instantiate_provider(provider, deps, inner_impls, dist_registry)
+        impl = await instantiate_provider(provider, deps, inner_impls, dist_registry, run_config)
 
         if api_str.startswith("inner-"):
            inner_impls_by_provider_id[api_str][provider.provider_id] = impl
@@ -308,6 +311,7 @@ async def instantiate_provider(
     deps: dict[Api, Any],
     inner_impls: dict[str, Any],
     dist_registry: DistributionRegistry,
+    run_config: StackRunConfig,
 ):
     provider_spec = provider.spec
     if not hasattr(provider_spec, "module"):
@@ -327,7 +331,7 @@
         method = "get_auto_router_impl"
 
         config = None
-        args = [provider_spec.api, deps[provider_spec.routing_table_api], deps]
+        args = [provider_spec.api, deps[provider_spec.routing_table_api], deps, run_config]
     elif isinstance(provider_spec, RoutingTableProviderSpec):
         method = "get_routing_table_impl"
 
```

llama_stack/distribution/routers/__init__.py (8 additions, 1 deletion)
```diff
@@ -7,8 +7,10 @@
 from typing import Any
 
 from llama_stack.distribution.datatypes import RoutedProtocol
+from llama_stack.distribution.stack import StackRunConfig
 from llama_stack.distribution.store import DistributionRegistry
 from llama_stack.providers.datatypes import Api, RoutingTable
+from llama_stack.providers.utils.inference.inference_store import inference_store_impl
 
 from .routing_tables import (
     BenchmarksRoutingTable,
@@ -45,7 +47,9 @@ async def get_routing_table_impl(
     return impl
 
 
-async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: dict[str, Any]) -> Any:
+async def get_auto_router_impl(
+    api: Api, routing_table: RoutingTable, deps: dict[str, Any], run_config: StackRunConfig
+) -> Any:
     from .routers import (
         DatasetIORouter,
         EvalRouter,
@@ -76,6 +80,9 @@ async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: dict
         if dep_api in deps:
             api_to_dep_impl[dep_name] = deps[dep_api]
 
+    if api == Api.inference and run_config.inference_store:
+        api_to_dep_impl["store"] = await inference_store_impl(run_config.inference_store)
+
     impl = api_to_routers[api.value](routing_table, **api_to_dep_impl)
     await impl.initialize()
     return impl
```

llama_stack/distribution/routers/routers.py (34 additions, 2 deletions)
```diff
@@ -32,8 +32,11 @@
     EmbeddingsResponse,
     EmbeddingTaskType,
     Inference,
+    ListOpenAIChatCompletionResponse,
     LogProbConfig,
     Message,
+    OpenAICompletionWithInputMessages,
+    Order,
     ResponseFormat,
     SamplingParams,
     StopReason,
@@ -73,6 +76,8 @@
 from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
+from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack.providers.utils.inference.stream_utils import stream_and_store_openai_completion
 from llama_stack.providers.utils.telemetry.tracing import get_current_span
 
 logger = get_logger(name=__name__, category="core")
@@ -141,10 +146,12 @@ def __init__(
         self,
         routing_table: RoutingTable,
         telemetry: Telemetry | None = None,
+        store: InferenceStore | None = None,
     ) -> None:
         logger.debug("Initializing InferenceRouter")
         self.routing_table = routing_table
         self.telemetry = telemetry
+        self.store = store
         if self.telemetry:
             self.tokenizer = Tokenizer.get_instance()
             self.formatter = ChatFormat(self.tokenizer)
@@ -607,9 +614,34 @@ async def openai_chat_completion(
 
         provider = self.routing_table.get_provider_impl(model_obj.identifier)
         if stream:
-            return await provider.openai_chat_completion(**params)
+            response_stream = await provider.openai_chat_completion(**params)
+            if self.store:
+                return stream_and_store_openai_completion(response_stream, model, self.store, messages)
+            else:
+                return response_stream
         else:
-            return await self._nonstream_openai_chat_completion(provider, params)
+            response = await self._nonstream_openai_chat_completion(provider, params)
+            if self.store:
+                await self.store.store_chat_completion(response, messages)
+            return response
+
+    async def list_chat_completions(
+        self,
+        after: str | None = None,
+        limit: int | None = 20,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIChatCompletionResponse:
+        if self.store:
+            return await self.store.list_chat_completions(after, limit, model, order)
+        else:
+            raise NotImplementedError("List chat completions is not supported: inference store is not configured.")
+
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        if self.store:
+            return await self.store.get_chat_completion(completion_id)
+        else:
+            raise NotImplementedError("Get chat completion is not supported: inference store is not configured.")
 
     async def _nonstream_openai_chat_completion(self, provider: Inference, params: dict) -> OpenAIChatCompletion:
         response = await provider.openai_chat_completion(**params)
```
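A minimal sketch of the router's new read APIs when no store is configured; constructing `InferenceRouter` with a bare routing table is an illustrative shortcut, not how the stack wires it:

```python
# Sketch: without an inference store, the new read APIs raise NotImplementedError.
import asyncio

from llama_stack.distribution.routers.routers import InferenceRouter

async def main():
    router = InferenceRouter(routing_table=None)  # no telemetry, no store
    try:
        await router.list_chat_completions()
    except NotImplementedError as e:
        print(e)  # List chat completions is not supported: inference store is not configured.

asyncio.run(main())
```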
llama_stack/providers/utils/inference/inference_store.py (new file, 76 additions)
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from enum import Enum
from typing import Annotated, Literal, Protocol

from pydantic import BaseModel, Field

from llama_stack.apis.inference import (
    ListOpenAIChatCompletionResponse,
    OpenAIChatCompletion,
    OpenAICompletionWithInputMessages,
    OpenAIMessageParam,
    Order,
)
from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR


class InferenceStoreType(Enum):
    sqlite = "sqlite"


class SqliteInferenceStoreConfig(BaseModel):
    type: Literal["sqlite"] = InferenceStoreType.sqlite.value
    db_path: str = Field(
        default=(RUNTIME_BASE_DIR / "inference_store.db").as_posix(),
        description="File path for the sqlite database",
    )

    @classmethod
    def sample_run_config(cls, __distro_dir__: str, db_name: str = "inference_store.db"):
        return {
            "type": "sqlite",
            "db_path": "${env.SQLITE_STORE_DIR:" + __distro_dir__ + "}/" + db_name,
        }


InferenceStoreConfig = Annotated[
    SqliteInferenceStoreConfig,
    Field(discriminator="type", default=InferenceStoreType.sqlite.value),
]


class InferenceStore(Protocol):
    async def initialize(self) -> None: ...

    async def store_chat_completion(
        self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
    ) -> None: ...

    async def list_chat_completions(
        self,
        after: str | None = None,
        limit: int | None = 20,
        model: str | None = None,
        order: Order | None = Order.desc,
    ) -> ListOpenAIChatCompletionResponse: ...

    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages: ...


async def inference_store_impl(config: InferenceStoreConfig) -> InferenceStore:
    if config.type == InferenceStoreType.sqlite.value:
        from .stores.sqlite import SqliteInferenceStore

        impl = SqliteInferenceStore(config.db_path)
    else:
        # Enum classes have no .values() method; enumerate the members instead.
        raise ValueError(
            f"Unknown inference store type {config.type}: available types are {[t.value for t in InferenceStoreType]}"
        )

    await impl.initialize()
    return impl
```
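To make the factory concrete, a minimal sketch of building and initializing a store from its config; the path is illustrative, and the empty listing assumes a fresh database:

```python
# Sketch: construct a sqlite-backed store via the factory in this file and query it.
import asyncio

from llama_stack.providers.utils.inference.inference_store import (
    SqliteInferenceStoreConfig,
    inference_store_impl,
)

async def main():
    config = SqliteInferenceStoreConfig(db_path="/tmp/demo_inference_store.db")
    store = await inference_store_impl(config)  # initialize() creates the table
    listing = await store.list_chat_completions(limit=5)
    print(listing.data)  # [] on a fresh database

asyncio.run(main())
```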
llama_stack/providers/utils/inference/stores/sqlite.py (new file, 132 additions)
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import json
import os

import aiosqlite

from llama_stack.apis.inference import (
    ListOpenAIChatCompletionResponse,
    OpenAIChatCompletion,
    OpenAICompletionWithInputMessages,
    OpenAIMessageParam,
    Order,
)

from ..inference_store import InferenceStore


class SqliteInferenceStore(InferenceStore):
    def __init__(self, conn_string: str):
        self.conn_string = conn_string

    async def initialize(self):
        """Create the necessary tables if they don't exist."""
        # Create the parent directory if the path has one; makedirs("") would raise.
        parent_dir = os.path.dirname(self.conn_string)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        async with aiosqlite.connect(self.conn_string) as conn:
            await conn.execute(
                """
                CREATE TABLE IF NOT EXISTS chat_completions (
                    id TEXT PRIMARY KEY,
                    created INTEGER,
                    model TEXT,
                    choices TEXT,
                    input_messages TEXT
                )
                """
            )
            await conn.commit()

    async def store_chat_completion(
        self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
    ) -> None:
        data = chat_completion.model_dump()

        async with aiosqlite.connect(self.conn_string) as conn:
            await conn.execute(
                """
                INSERT INTO chat_completions (id, created, model, choices, input_messages)
                VALUES (?, ?, ?, ?, ?)
                """,
                (
                    data["id"],
                    data["created"],
                    data["model"],
                    json.dumps(data["choices"]),
                    json.dumps([message.model_dump() for message in input_messages]),
                ),
            )
            await conn.commit()

    async def list_chat_completions(
        self,
        after: str | None = None,
        limit: int | None = 20,
        model: str | None = None,
        order: Order | None = Order.desc,
    ) -> ListOpenAIChatCompletionResponse:
        """
        List chat completions from the database.

        :param after: The ID of the last chat completion to return.
        :param limit: The maximum number of chat completions to return.
        :param model: The model to filter by.
        :param order: The order to sort the chat completions by.
        """
        # TODO: support after
        if after:
            raise NotImplementedError("After is not supported for SQLite")
        if not order:
            order = Order.desc

        async with aiosqlite.connect(self.conn_string) as conn:
            conn.row_factory = aiosqlite.Row
            # Bind the model filter and limit as parameters; interpolating the model
            # name directly would produce invalid SQL (an unquoted string literal)
            # and open an injection hole. order.value is a fixed enum value
            # ("asc"/"desc"), so interpolating it is safe.
            where_clause = "WHERE model = ?" if model else ""
            params: list = [model] if model else []
            params.append(limit)
            cursor = await conn.execute(
                f"""
                SELECT * FROM chat_completions
                {where_clause}
                ORDER BY created {order.value}
                LIMIT ?
                """,
                params,
            )
            rows = await cursor.fetchall()

            data = [
                OpenAICompletionWithInputMessages(
                    id=row["id"],
                    created=row["created"],
                    model=row["model"],
                    choices=json.loads(row["choices"]),
                    input_messages=json.loads(row["input_messages"]),
                )
                for row in rows
            ]
            return ListOpenAIChatCompletionResponse(
                data=data,
                # TODO: implement has_more
                has_more=False,
                # Guard against an empty result set, where data[0] would raise IndexError.
                first_id=data[0].id if data else "",
                last_id=data[-1].id if data else "",
            )

    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
        async with aiosqlite.connect(self.conn_string) as conn:
            conn.row_factory = aiosqlite.Row
            cursor = await conn.execute("SELECT * FROM chat_completions WHERE id = ?", (completion_id,))
            row = await cursor.fetchone()
            if row is None:
                raise ValueError(f"Chat completion with id {completion_id} not found")
            return OpenAICompletionWithInputMessages(
                id=row["id"],
                created=row["created"],
                model=row["model"],
                choices=json.loads(row["choices"]),
                input_messages=json.loads(row["input_messages"]),
            )
```
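Finally, a small sketch exercising the store directly; the path and completion id are illustrative:

```python
# Sketch: initialize the sqlite store and probe the error path for a missing id.
import asyncio

from llama_stack.providers.utils.inference.stores.sqlite import SqliteInferenceStore

async def main():
    store = SqliteInferenceStore("/tmp/demo_inference_store.db")
    await store.initialize()  # creates chat_completions if it does not exist
    try:
        await store.get_chat_completion("chatcmpl-does-not-exist")
    except ValueError as e:
        print(e)  # Chat completion with id chatcmpl-does-not-exist not found

asyncio.run(main())
```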
