Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
7dc331f
Fix: include usage tokens in streaming LLM calls to avoid inaccurate …
Kingsuperyzy Mar 19, 2026
820eca4
Merge remote-tracking branch 'origin/main'
Mar 20, 2026
e3ba810
feat: per-conversation token usage tracking for model requests
Kingsuperyzy Mar 20, 2026
cfa8965
feat: per-conversation token usage tracking for model requests
Kingsuperyzy Mar 20, 2026
39997db
Merge branch 'main' into main
Kingsuperyzy Mar 23, 2026
39399ee
Fix: include usage tokens in streaming LLM calls to avoid inaccurate …
Kingsuperyzy Mar 24, 2026
554f92d
Merge branch 'main' into main
Kingsuperyzy Mar 25, 2026
7bf2a05
Merge branch 'main' into main
JinHai-CN Mar 25, 2026
065cb10
Merge branch 'main' into main
Kingsuperyzy Mar 25, 2026
ddd5898
Fix: include usage tokens in streaming LLM calls to avoid inaccurate …
Kingsuperyzy Mar 25, 2026
aafb99c
Merge branch 'infiniflow:main' into main
Kingsuperyzy Mar 30, 2026
e5d9840
Merge branch 'infiniflow:main' into main
Kingsuperyzy Mar 31, 2026
8a60314
Merge branch 'infiniflow:main' into main
Kingsuperyzy Mar 31, 2026
b416cd4
Merge branch 'infiniflow:main' into main
Kingsuperyzy Apr 2, 2026
936564c
Merge branch 'infiniflow:main' into main
Kingsuperyzy Apr 2, 2026
7f9e277
Passing `biz_type` and `biz_id` when instantiating `LLMBundle`
Kingsuperyzy Apr 2, 2026
610b8dc
Passing `biz_type` and `biz_id` when instantiating `LLMBundle`
Kingsuperyzy Apr 2, 2026
0720425
Passing `biz_type` and `biz_id` when instantiating `LLMBundle`
Kingsuperyzy Apr 2, 2026
f82dd96
Passing `biz_type` and `biz_id` when instantiating `LLMBundle`
Kingsuperyzy Apr 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions api/db/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,30 @@ class Meta:
)


class LLMUsageLog(DataBaseModel):
    """Per-call LLM usage ledger.

    One row per model invocation, recording token consumption and billed
    cost together with the business context (tenant, user, biz object)
    that triggered the call.  Written best-effort by
    ``LLMUsageLogService.create``; rows are never updated afterwards.
    """
    # 32-char hex primary key (uuid4().hex, assigned by the service layer).
    id = CharField(max_length=32, primary_key=True)
    # Owning tenant; always present and indexed for per-tenant aggregation.
    tenant_id = CharField(max_length=32, null=False, index=True)
    # User who triggered the call, when known (nullable for system-initiated calls).
    user_id = CharField(max_length=32, null=True, index=True)
    # Business category of the call (see help_text for the known values).
    biz_type = CharField(max_length=32, null=False, index=True,
                         help_text="业务类型: dialog/agent/document_parse/graphrag/raptor/other")
    # ID of the concrete business object (session/canvas/document), when known.
    biz_id = CharField(max_length=32, null=True, index=True,
                       help_text="业务对象 ID: session_id/canvas_id/document_id 等")
    # Foreign reference (by value, no FK constraint) to TenantLLM.id.
    tenant_llm_id = IntegerField(null=False, index=True,
                                 help_text="关联 TenantLLM.id")
    # Model category the call went through.
    model_type = CharField(max_length=32, null=False, index=True,
                           help_text="chat/embedding/rerank/image2text/speech2text")
    # Token breakdown; non-chat calls may only populate total_tokens.
    prompt_tokens = IntegerField(default=0)
    completion_tokens = IntegerField(default=0)
    total_tokens = IntegerField(default=0)
    # Cost of the call in USD (0.0 when the provider reports no pricing).
    cost = FloatField(default=0.0, help_text="USD")
    # Creation time as a Unix timestamp in milliseconds; indexed for range queries.
    created_at = BigIntegerField(null=False, index=True,
                                 help_text="Unix 毫秒时间戳")

    class Meta:
        db_table = "llm_usage_log"


class TenantLangfuse(DataBaseModel):
tenant_id = CharField(max_length=32, null=False, primary_key=True)
secret_key = CharField(max_length=2048, null=False, help_text="SECRET KEY", index=True)
Expand Down
84 changes: 66 additions & 18 deletions api/db/services/llm_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@

from api.db.db_models import LLM
from api.db.services.common_service import CommonService
from api.db.services.llm_usage_log_service import LLMUsageLogService
from api.db.services.tenant_llm_service import LLM4Tenant, TenantLLMService
from common.constants import LLMType
from common.token_utils import num_tokens_from_string
from common.token_utils import LLMUsage, num_tokens_from_string


class LLMService(CommonService):
Expand Down Expand Up @@ -83,8 +84,31 @@ def get_init_tenant_llm(user_id):


class LLMBundle(LLM4Tenant):
def __init__(self, tenant_id: str, model_config: dict, lang="Chinese", **kwargs):
def __init__(self, tenant_id: str, model_config: dict, lang="Chinese",
             user_id: str = None, biz_type: str = None, biz_id: str = None,
             **kwargs):
    """Build the bundle and remember the caller's business context.

    ``user_id`` / ``biz_type`` / ``biz_id`` are stored on the instance so
    that every subsequent model call can be attributed in the usage log.
    When no ``biz_type`` is supplied the call is filed under "other".
    All remaining construction is delegated to ``LLM4Tenant``.
    """
    super().__init__(tenant_id, model_config, lang, **kwargs)
    self.biz_id = biz_id
    self.user_id = user_id
    self.biz_type = "other" if not biz_type else biz_type

def _log_usage(self, model_type: str, usage: LLMUsage):
    """Record one model call's token usage in the detail ledger.

    Persistence is delegated to ``LLMUsageLogService.create``, which logs
    and swallows its own failures, so this never interrupts the caller's
    main flow.  Calls whose model configuration carries no ``id`` (i.e. no
    TenantLLM row to attribute the usage to) are skipped entirely.
    """
    config_id = self.model_config.get("id")
    if not config_id:
        # Nothing to attribute this call to — skip logging.
        return
    token_fields = {
        "prompt_tokens": usage.prompt_tokens,
        "completion_tokens": usage.completion_tokens,
        "total_tokens": usage.total_tokens,
        "cost": usage.cost,
    }
    LLMUsageLogService.create(
        tenant_id=self.tenant_id,
        tenant_llm_id=config_id,
        model_type=model_type,
        user_id=self.user_id,
        biz_type=self.biz_type,
        biz_id=self.biz_id,
        **token_fields,
    )

def bind_tools(self, toolcall_session, tools):
if not self.is_tools:
Expand Down Expand Up @@ -115,6 +139,7 @@ def encode(self, texts: list):
generation.update(usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("embedding", LLMUsage(total_tokens=used_tokens))
return embeddings, used_tokens

def encode_queries(self, query: str):
Expand All @@ -131,6 +156,7 @@ def encode_queries(self, query: str):
generation.update(usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("embedding", LLMUsage(total_tokens=used_tokens))
return emd, used_tokens

def similarity(self, query: str, texts: list):
Expand All @@ -145,6 +171,7 @@ def similarity(self, query: str, texts: list):
generation.update(usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("rerank", LLMUsage(total_tokens=used_tokens))
return sim, used_tokens

def describe(self, image, max_tokens=300):
Expand All @@ -159,6 +186,7 @@ def describe(self, image, max_tokens=300):
generation.update(output={"output": txt}, usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("image2text", LLMUsage(total_tokens=used_tokens))
return txt

def describe_with_prompt(self, image, prompt):
Expand All @@ -173,6 +201,7 @@ def describe_with_prompt(self, image, prompt):
generation.update(output={"output": txt}, usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("image2text", LLMUsage(total_tokens=used_tokens))
return txt

def transcription(self, audio):
Expand All @@ -187,6 +216,7 @@ def transcription(self, audio):
generation.update(output={"output": txt}, usage_details={"total_tokens": used_tokens})
generation.end()

self._log_usage("speech2text", LLMUsage(total_tokens=used_tokens))
return txt

def stream_transcription(self, audio):
Expand Down Expand Up @@ -225,6 +255,8 @@ def stream_transcription(self, audio):
)
generation.end()

self._log_usage("speech2text", LLMUsage(total_tokens=used_tokens))

return

if self.langfuse:
Expand All @@ -245,6 +277,7 @@ def stream_transcription(self, audio):
)
generation.end()

self._log_usage("speech2text", LLMUsage(total_tokens=used_tokens))
yield {
"event": "final",
"text": full_text,
Expand Down Expand Up @@ -382,7 +415,7 @@ async def async_chat(self, system: str, history: list, gen_conf: dict = {}, **kw
use_kwargs = self._clean_param(chat_partial, **kwargs)

try:
txt, used_tokens = await chat_partial(**use_kwargs)
txt, usage = await chat_partial(**use_kwargs)
except Exception as e:
if generation:
generation.update(output={"error": str(e)})
Expand All @@ -393,17 +426,22 @@ async def async_chat(self, system: str, history: list, gen_conf: dict = {}, **kw
if not self.verbose_tool_use:
txt = re.sub(r"<tool_call>.*?</tool_call>", "", txt, flags=re.DOTALL)

if used_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], used_tokens):
logging.error("LLMBundle.async_chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], used_tokens))
if usage.total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], usage.total_tokens):
logging.error("LLMBundle.async_chat can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], usage.total_tokens))

if generation:
generation.update(output={"output": txt}, usage_details={"total_tokens": used_tokens})
generation.update(output={"output": txt}, usage_details={
"prompt_tokens": usage.prompt_tokens,
"completion_tokens": usage.completion_tokens,
"total_tokens": usage.total_tokens,
})
generation.end()

self._log_usage("chat", usage)
return txt

async def async_chat_streamly(self, system: str, history: list, gen_conf: dict = {}, **kwargs):
total_tokens = 0
usage = LLMUsage()
ans = ""
_bundle_is_tools = self.is_tools
_mdl_is_tools = getattr(self.mdl, "is_tools", False)
Expand All @@ -424,8 +462,8 @@ async def async_chat_streamly(self, system: str, history: list, gen_conf: dict =
use_kwargs = self._clean_param(chat_partial, **kwargs)
try:
async for txt in chat_partial(**use_kwargs):
if isinstance(txt, int):
total_tokens = txt
if isinstance(txt, LLMUsage):
usage = txt
break

if txt.endswith("</think>") and ans.endswith("</think>"):
Expand All @@ -441,15 +479,20 @@ async def async_chat_streamly(self, system: str, history: list, gen_conf: dict =
generation.update(output={"error": str(e)})
generation.end()
raise
if total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], total_tokens):
logging.error("LLMBundle.async_chat_streamly can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], total_tokens))
if usage.total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], usage.total_tokens):
logging.error("LLMBundle.async_chat_streamly can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], usage.total_tokens))
if generation:
generation.update(output={"output": ans}, usage_details={"total_tokens": total_tokens})
generation.update(output={"output": ans}, usage_details={
"prompt_tokens": usage.prompt_tokens,
"completion_tokens": usage.completion_tokens,
"total_tokens": usage.total_tokens,
})
generation.end()
self._log_usage("chat", usage)
return

async def async_chat_streamly_delta(self, system: str, history: list, gen_conf: dict = {}, **kwargs):
total_tokens = 0
usage = LLMUsage()
ans = ""
if self.is_tools and getattr(self.mdl, "is_tools", False) and hasattr(self.mdl, "async_chat_streamly_with_tools"):
stream_fn = getattr(self.mdl, "async_chat_streamly_with_tools", None)
Expand All @@ -467,8 +510,8 @@ async def async_chat_streamly_delta(self, system: str, history: list, gen_conf:
use_kwargs = self._clean_param(chat_partial, **kwargs)
try:
async for txt in chat_partial(**use_kwargs):
if isinstance(txt, int):
total_tokens = txt
if isinstance(txt, LLMUsage):
usage = txt
break

if txt.endswith("</think>") and ans.endswith("</think>"):
Expand All @@ -484,9 +527,14 @@ async def async_chat_streamly_delta(self, system: str, history: list, gen_conf:
generation.update(output={"error": str(e)})
generation.end()
raise
if total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], total_tokens):
logging.error("LLMBundle.async_chat_streamly can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], total_tokens))
if usage.total_tokens and not TenantLLMService.increase_usage_by_id(self.model_config["id"], usage.total_tokens):
logging.error("LLMBundle.async_chat_streamly can't update token usage for {}/CHAT llm_name: {}, used_tokens: {}".format(self.tenant_id, self.model_config["llm_name"], usage.total_tokens))
if generation:
generation.update(output={"output": ans}, usage_details={"total_tokens": total_tokens})
generation.update(output={"output": ans}, usage_details={
"prompt_tokens": usage.prompt_tokens,
"completion_tokens": usage.completion_tokens,
"total_tokens": usage.total_tokens,
})
generation.end()
self._log_usage("chat", usage)
return
60 changes: 60 additions & 0 deletions api/db/services/llm_usage_log_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import logging
import time
from uuid import uuid4

from api.db.db_models import DB, LLMUsageLog
from api.db.services.common_service import CommonService


class LLMUsageLogService(CommonService):
    """Service layer for the ``llm_usage_log`` ledger table."""

    model = LLMUsageLog

    @classmethod
    @DB.connection_context()
    def create(
        cls,
        tenant_id: str,
        tenant_llm_id: int,
        model_type: str,
        total_tokens: int,
        user_id: str = None,
        biz_type: str = "other",
        biz_id: str = None,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
        cost: float = 0.0,
    ):
        """Insert a single LLM call-detail record.

        Args:
            tenant_id: Tenant that owns the call.
            tenant_llm_id: ``TenantLLM.id`` of the model configuration used.
            model_type: Model category, e.g. "chat"/"embedding"/"rerank".
            total_tokens: Total tokens consumed by this call.
            user_id: Optional ID of the user who triggered the call.
            biz_type: Business context, e.g. "dialog"/"agent"/"document_parse".
            biz_id: Business object ID (session_id/canvas_id/document_id, ...).
            prompt_tokens: Input tokens (populated for chat calls, else 0).
            completion_tokens: Output tokens (populated for chat calls, else 0).
            cost: Call cost in USD (populated in LiteLLM mode, else 0).

        Any persistence failure is logged with a full traceback and
        swallowed, so usage accounting can never break the calling request.
        """
        try:
            row = {
                "id": uuid4().hex,                      # 32-char hex primary key
                "tenant_id": tenant_id,
                "user_id": user_id,
                "biz_type": biz_type,
                "biz_id": biz_id,
                "tenant_llm_id": tenant_llm_id,
                "model_type": model_type,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
                "cost": cost,
                "created_at": int(time.time() * 1000),  # Unix epoch, milliseconds
            }
            cls.model.create(**row)
        except Exception:
            logging.exception(
                "LLMUsageLogService.create failed for tenant_id=%s, biz_type=%s, biz_id=%s",
                tenant_id, biz_type, biz_id,
            )
17 changes: 17 additions & 0 deletions common/token_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@


import os
from dataclasses import dataclass

import tiktoken

from common.file_utils import get_project_base_directory
Expand All @@ -26,6 +28,21 @@
encoder = tiktoken.get_encoding("cl100k_base")


@dataclass
class LLMUsage:
    """Token consumption and billing information for one LLM call.

    Replaces the bare ``int total_tokens`` previously passed around, so
    that the input/output token breakdown and the cost travel together.

    Population by call mode (per the original author's notes):
    - Chat mode: ``prompt_tokens`` / ``completion_tokens`` / ``cost`` are set.
    - Embedding / rerank mode: ``completion_tokens`` is 0; ``cost`` is 0 for now.
    - Native SDK mode (Mistral, Baidu, ...): only ``total_tokens`` is set.
    """
    prompt_tokens: int = 0       # input-side tokens
    completion_tokens: int = 0   # output-side tokens
    total_tokens: int = 0        # provider-reported total for the call
    cost: float = 0.0            # call cost in USD


def num_tokens_from_string(string: str) -> int:
"""Returns the number of tokens in a text string."""
try:
Expand Down
Loading
Loading