Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
7dc331f
Fix: include usage tokens in streaming LLM calls to avoid inaccurate …
Kingsuperyzy Mar 19, 2026
820eca4
Merge remote-tracking branch 'origin/main'
Mar 20, 2026
e3ba810
feat: per-conversation token usage tracking for model requests
Kingsuperyzy Mar 20, 2026
cfa8965
feat: per-conversation token usage tracking for model requests
Kingsuperyzy Mar 20, 2026
39997db
Merge branch 'main' into main
Kingsuperyzy Mar 23, 2026
39399ee
Fix: include usage tokens in streaming LLM calls to avoid inaccurate …
Kingsuperyzy Mar 24, 2026
554f92d
Merge branch 'main' into main
Kingsuperyzy Mar 25, 2026
7bf2a05
Merge branch 'main' into main
JinHai-CN Mar 25, 2026
065cb10
Merge branch 'main' into main
Kingsuperyzy Mar 25, 2026
ddd5898
Fix: include usage tokens in streaming LLM calls to avoid inaccurate …
Kingsuperyzy Mar 25, 2026
aafb99c
Merge branch 'infiniflow:main' into main
Kingsuperyzy Mar 30, 2026
e5d9840
Merge branch 'infiniflow:main' into main
Kingsuperyzy Mar 31, 2026
8a60314
Merge branch 'infiniflow:main' into main
Kingsuperyzy Mar 31, 2026
b416cd4
Merge branch 'infiniflow:main' into main
Kingsuperyzy Apr 2, 2026
936564c
Merge branch 'infiniflow:main' into main
Kingsuperyzy Apr 2, 2026
7f9e277
Passing `biz_type` and `biz_id` when instantiating `LLMBundle`
Kingsuperyzy Apr 2, 2026
610b8dc
Passing `biz_type` and `biz_id` when instantiating `LLMBundle`
Kingsuperyzy Apr 2, 2026
0720425
Passing `biz_type` and `biz_id` when instantiating `LLMBundle`
Kingsuperyzy Apr 2, 2026
f82dd96
Passing `biz_type` and `biz_id` when instantiating `LLMBundle`
Kingsuperyzy Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions api/db/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,30 @@ class Meta:
)


class LLMUsageLog(DataBaseModel):
    """Per-call LLM usage ledger.

    One row per model invocation, recording token consumption and billed
    cost together with the business context (tenant, user, biz object)
    that triggered the call.  Written best-effort by
    ``LLMUsageLogService.create``; rows are never updated afterwards.
    """
    # 32-char hex primary key (uuid4().hex, assigned by the service layer).
    id = CharField(max_length=32, primary_key=True)
    # Owning tenant; always present and indexed for per-tenant aggregation.
    tenant_id = CharField(max_length=32, null=False, index=True)
    # User who triggered the call, when known (nullable for system-initiated calls).
    user_id = CharField(max_length=32, null=True, index=True)
    # Business category of the call (see help_text for the known values).
    biz_type = CharField(max_length=32, null=False, index=True,
                         help_text="业务类型: dialog/agent/document_parse/graphrag/raptor/other")
    # ID of the concrete business object (session/canvas/document), when known.
    biz_id = CharField(max_length=32, null=True, index=True,
                       help_text="业务对象 ID: session_id/canvas_id/document_id 等")
    # Foreign reference (by value, no FK constraint) to TenantLLM.id.
    tenant_llm_id = IntegerField(null=False, index=True,
                                 help_text="关联 TenantLLM.id")
    # Model category the call went through.
    model_type = CharField(max_length=32, null=False, index=True,
                           help_text="chat/embedding/rerank/image2text/speech2text")
    # Token breakdown; non-chat calls may only populate total_tokens.
    prompt_tokens = IntegerField(default=0)
    completion_tokens = IntegerField(default=0)
    total_tokens = IntegerField(default=0)
    # Cost of the call in USD (0.0 when the provider reports no pricing).
    cost = FloatField(default=0.0, help_text="USD")
    # Creation time as a Unix timestamp in milliseconds; indexed for range queries.
    created_at = BigIntegerField(null=False, index=True,
                                 help_text="Unix 毫秒时间戳")

    class Meta:
        db_table = "llm_usage_log"


class TenantLangfuse(DataBaseModel):
tenant_id = CharField(max_length=32, null=False, primary_key=True)
secret_key = CharField(max_length=2048, null=False, help_text="SECRET KEY", index=True)
Expand Down
84 changes: 66 additions & 18 deletions api/db/services/llm_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@

from api.db.db_models import LLM
from api.db.services.common_service import CommonService
from api.db.services.llm_usage_log_service import LLMUsageLogService
from api.db.services.tenant_llm_service import LLM4Tenant, TenantLLMService
from common.constants import LLMType
from common.token_utils import num_tokens_from_string
from common.token_utils import LLMUsage, num_tokens_from_string


class LLMService(CommonService):
Expand Down Expand Up @@ -83,8 +84,31 @@ def get_init_tenant_llm(user_id):


class LLMBundle(LLM4Tenant):
def __init__(self, tenant_id: str, model_config: dict, lang="Chinese", **kwargs):
def __init__(self, tenant_id: str, model_config: dict, lang="Chinese",
             user_id: str = None, biz_type: str = None, biz_id: str = None,
             **kwargs):
    """Build the bundle and remember the caller's business context.

    ``user_id`` / ``biz_type`` / ``biz_id`` are stored on the instance so
    that every subsequent model call can be attributed in the usage log.
    When no ``biz_type`` is supplied the call is filed under "other".
    All remaining construction is delegated to ``LLM4Tenant``.
    """
    super().__init__(tenant_id, model_config, lang, **kwargs)
    self.biz_id = biz_id
    self.user_id = user_id
    self.biz_type = "other" if not biz_type else biz_type

def _log_usage(self, model_type: str, usage: LLMUsage):
    """Record one model call's token usage in the detail ledger.

    Persistence is delegated to ``LLMUsageLogService.create``, which logs
    and swallows its own failures, so this never interrupts the caller's
    main flow.  Calls whose model configuration carries no ``id`` (i.e. no
    TenantLLM row to attribute the usage to) are skipped entirely.
    """
    config_id = self.model_config.get("id")
    if not config_id:
        # Nothing to attribute this call to — skip logging.
        return
    token_fields = {
        "prompt_tokens": usage.prompt_tokens,
        "completion_tokens": usage.completion_tokens,
        "total_tokens": usage.total_tokens,
        "cost": usage.cost,
    }
    LLMUsageLogService.create(
        tenant_id=self.tenant_id,
        tenant_llm_id=config_id,
        model_type=model_type,
        user_id=self.user_id,
        biz_type=self.biz_type,
        biz_id=self.biz_id,
        **token_fields,
    )

def bind_tools(self, toolcall_session, tools):
if not self.is_tools:
Expand Down Expand Up @@ -115,6 +139,7 @@ def encode(self, texts: list):
generation.update(usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("embedding", LLMUsage(total_tokens=used_tokens))
return embeddings, used_tokens

def encode_queries(self, query: str):
Expand All @@ -131,6 +156,7 @@ def encode_queries(self, query: str):
generation.update(usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("embedding", LLMUsage(total_tokens=used_tokens))
return emd, used_tokens

def similarity(self, query: str, texts: list):
Expand All @@ -145,6 +171,7 @@ def similarity(self, query: str, texts: list):
generation.update(usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("rerank", LLMUsage(total_tokens=used_tokens))
return sim, used_tokens

def describe(self, image, max_tokens=300):
Expand All @@ -159,6 +186,7 @@ def describe(self, image, max_tokens=300):
generation.update(output={"output": txt}, usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("image2text", LLMUsage(total_tokens=used_tokens))
return txt

def describe_with_prompt(self, image, prompt):
Expand All @@ -173,6 +201,7 @@ def describe_with_prompt(self, image, prompt):
generation.update(output={"output": txt}, usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("image2text", LLMUsage(total_tokens=used_tokens))
return txt

def transcription(self, audio):
Expand All @@ -187,6 +216,7 @@ def transcription(self, audio):
generation.update(output={"output": txt}, usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("speech2text", LLMUsage(total_tokens=used_tokens))
return txt

def stream_transcription(self, audio):
Expand Down Expand Up @@ -225,6 +255,8 @@ def stream_transcription(self, audio):
)
generation.end()

self._log_usage("speech2text", LLMUsage(total_tokens=used_tokens))

return

if self.langfuse:
Expand All @@ -245,6 +277,7 @@ def stream_transcription(self, audio):
)
generation.end()

self._log_usage("speech2text", LLMUsage(total_tokens=used_tokens))
yield {
"event": "final",
"text": full_text,
Expand Down Expand Up @@ -382,7 +415,7 @@ async def async_chat(self, system: str, history: list, gen_conf: dict = {}, **kw
use_kwargs = self._clean_param(chat_partial, **kwargs)

try:
txt, used_tokens = await chat_partial(**use_kwargs)
txt, usage = await chat_partial(**use_kwargs)
except Exception as e:
if generation:
generation.update(output={"error": str(e)})
Expand All @@ -393,17 +426,22 @@ async def async_chat(self, system: str, history: list, gen_conf: dict = {}, **kw
if not self.verbose_tool_use:
txt = re.sub(r"<tool_call>.*?</tool_call>", "", txt, flags=re.DOTALL)

if used_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], used_tokens):
logging.error("LLMBundle.async_chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], used_tokens))
if usage.total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], usage.total_tokens):
logging.error("LLMBundle.async_chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], usage.total_tokens))

if generation:
generation.update(output={"output": txt}, usage_details={"total_tokens": used_tokens})
generation.update(output={"output": txt}, usage_details={
"prompt_tokens": usage.prompt_tokens,
"completion_tokens": usage.completion_tokens,
"total_tokens": usage.total_tokens,
})
generation.end()

self._log_usage("chat", usage)
return txt

async def async_chat_streamly(self, system: str, history: list, gen_conf: dict = {}, **kwargs):
total_tokens = 0
usage = LLMUsage()
ans = ""
_bundle_is_tools = self.is_tools
_mdl_is_tools = getattr(self.mdl, "is_tools", False)
Expand All @@ -424,8 +462,8 @@ async def async_chat_streamly(self, system: str, history: list, gen_conf: dict =
use_kwargs = self._clean_param(chat_partial, **kwargs)
try:
async for txt in chat_partial(**use_kwargs):
if isinstance(txt, int):
total_tokens = txt
if isinstance(txt, LLMUsage):
usage = txt
break

if txt.endswith("</think>") and ans.endswith("</think>"):
Expand All @@ -441,15 +479,20 @@ async def async_chat_streamly(self, system: str, history: list, gen_conf: dict =
generation.update(output={"error": str(e)})
generation.end()
raise
if total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], total_tokens):
logging.error("LLMBundle.async_chat_streamly can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], total_tokens))
if usage.total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], usage.total_tokens):
logging.error("LLMBundle.async_chat_streamly can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], usage.total_tokens))
if generation:
generation.update(output={"output": ans}, usage_details={"total_tokens": total_tokens})
generation.update(output={"output": ans}, usage_details={
"prompt_tokens": usage.prompt_tokens,
"completion_tokens": usage.completion_tokens,
"total_tokens": usage.total_tokens,
})
generation.end()
self._log_usage("chat", usage)
return

async def async_chat_streamly_delta(self, system: str, history: list, gen_conf: dict = {}, **kwargs):
total_tokens = 0
usage = LLMUsage()
ans = ""
if self.is_tools and getattr(self.mdl, "is_tools", False) and hasattr(self.mdl, "async_chat_streamly_with_tools"):
stream_fn = getattr(self.mdl, "async_chat_streamly_with_tools", None)
Expand All @@ -467,8 +510,8 @@ async def async_chat_streamly_delta(self, system: str, history: list, gen_conf:
use_kwargs = self._clean_param(chat_partial, **kwargs)
try:
async for txt in chat_partial(**use_kwargs):
if isinstance(txt, int):
total_tokens = txt
if isinstance(txt, LLMUsage):
usage = txt
break

if txt.endswith("</think>") and ans.endswith("</think>"):
Expand All @@ -484,9 +527,14 @@ async def async_chat_streamly_delta(self, system: str, history: list, gen_conf:
generation.update(output={"error": str(e)})
generation.end()
raise
if total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], total_tokens):
logging.error("LLMBundle.async_chat_streamly can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], total_tokens))
if usage.total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], usage.total_tokens):
logging.error("LLMBundle.async_chat_streamly can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], usage.total_tokens))
if generation:
generation.update(output={"output": ans}, usage_details={"total_tokens": total_tokens})
generation.update(output={"output": ans}, usage_details={
"prompt_tokens": usage.prompt_tokens,
"completion_tokens": usage.completion_tokens,
"total_tokens": usage.total_tokens,
})
generation.end()
self._log_usage("chat", usage)
return
60 changes: 60 additions & 0 deletions api/db/services/llm_usage_log_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import logging
import time
from uuid import uuid4

from api.db.db_models import DB, LLMUsageLog
from api.db.services.common_service import CommonService


class LLMUsageLogService(CommonService):
    """Service layer for the ``llm_usage_log`` ledger table."""

    model = LLMUsageLog

    @classmethod
    @DB.connection_context()
    def create(
        cls,
        tenant_id: str,
        tenant_llm_id: int,
        model_type: str,
        total_tokens: int,
        user_id: str = None,
        biz_type: str = "other",
        biz_id: str = None,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
        cost: float = 0.0,
    ):
        """Insert a single LLM call-detail record.

        Args:
            tenant_id: Tenant that owns the call.
            tenant_llm_id: ``TenantLLM.id`` of the model configuration used.
            model_type: Model category, e.g. "chat"/"embedding"/"rerank".
            total_tokens: Total tokens consumed by this call.
            user_id: Optional ID of the user who triggered the call.
            biz_type: Business context, e.g. "dialog"/"agent"/"document_parse".
            biz_id: Business object ID (session_id/canvas_id/document_id, ...).
            prompt_tokens: Input tokens (populated for chat calls, else 0).
            completion_tokens: Output tokens (populated for chat calls, else 0).
            cost: Call cost in USD (populated in LiteLLM mode, else 0).

        Any persistence failure is logged with a full traceback and
        swallowed, so usage accounting can never break the calling request.
        """
        try:
            row = {
                "id": uuid4().hex,                      # 32-char hex primary key
                "tenant_id": tenant_id,
                "user_id": user_id,
                "biz_type": biz_type,
                "biz_id": biz_id,
                "tenant_llm_id": tenant_llm_id,
                "model_type": model_type,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
                "cost": cost,
                "created_at": int(time.time() * 1000),  # Unix epoch, milliseconds
            }
            cls.model.create(**row)
        except Exception:
            logging.exception(
                "LLMUsageLogService.create failed for tenant_id=%s, biz_type=%s, biz_id=%s",
                tenant_id, biz_type, biz_id,
            )
17 changes: 17 additions & 0 deletions common/token_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@


import os
from dataclasses import dataclass

import tiktoken

from common.file_utils import get_project_base_directory
Expand All @@ -26,6 +28,21 @@
encoder = tiktoken.get_encoding("cl100k_base")


@dataclass
class LLMUsage:
    """Token consumption and billing information for one LLM call.

    Replaces the bare ``int total_tokens`` previously passed around, so
    that the input/output token breakdown and the cost travel together.

    Population by call mode (per the original author's notes):
    - Chat mode: ``prompt_tokens`` / ``completion_tokens`` / ``cost`` are set.
    - Embedding / rerank mode: ``completion_tokens`` is 0; ``cost`` is 0 for now.
    - Native SDK mode (Mistral, Baidu, ...): only ``total_tokens`` is set.
    """
    prompt_tokens: int = 0       # input-side tokens
    completion_tokens: int = 0   # output-side tokens
    total_tokens: int = 0        # provider-reported total for the call
    cost: float = 0.0            # call cost in USD


def num_tokens_from_string(string: str) -> int:
"""Returns the number of tokens in a text string."""
try:
Expand Down
Loading
Loading