Skip to content

Commit 4b06e17

Browse files
committed
Move token related functions to common
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
1 parent 44f2d6f commit 4b06e17

25 files changed

+529
-78
lines changed

api/db/services/dialog_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from rag.nlp.search import index_name
4242
from rag.prompts.generator import chunks_format, citation_prompt, cross_languages, full_question, kb_prompt, keyword_extraction, message_fit_in, \
4343
gen_meta_filter, PROMPT_JINJA_ENV, ASK_SUMMARY
44-
from rag.utils import num_tokens_from_string
44+
from common.token_utils import num_tokens_from_string
4545
from rag.utils.tavily_conn import Tavily
4646
from common.string_utils import remove_redundant_spaces
4747

api/db/services/llm_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import inspect
1717
import logging
1818
import re
19-
from rag.utils import num_tokens_from_string
19+
from common.token_utils import num_tokens_from_string
2020
from functools import partial
2121
from typing import Generator
2222
from api.db.db_models import LLM

common/token_utils.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#
2+
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
18+
import os
import tiktoken

from common.file_utils import get_project_base_directory

# Redirect tiktoken's cache to the project root — presumably so a pre-seeded
# BPE cache shipped with the repo is found instead of fetching it at runtime;
# TODO confirm the cache files actually live there.
tiktoken_cache_dir = get_project_base_directory()
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
# encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
# Shared module-level encoder used by every helper below (cl100k_base vocabulary).
encoder = tiktoken.get_encoding("cl100k_base")
27+
28+
29+
def num_tokens_from_string(string: str) -> int:
    """Return how many cl100k_base tokens *string* encodes to.

    Best-effort: any encoding failure yields 0 rather than propagating.
    """
    try:
        return len(encoder.encode(string))
    except Exception:
        return 0
36+
37+
def total_token_count_from_response(resp):
    """Best-effort extraction of the total token count from an LLM response.

    Recognized shapes, in order of precedence:
      * objects exposing ``resp.usage.total_tokens``
      * objects exposing ``resp.usage_metadata.total_tokens``
      * mappings with ``resp["usage"]["total_tokens"]``
      * mappings with ``resp["usage"]["input_tokens"] + resp["usage"]["output_tokens"]``
      * mappings with ``resp["meta"]["tokens"]["input_tokens"] + ...["output_tokens"]``

    Returns:
        The token count, or 0 when *resp* is None or carries no recognized
        counter. This helper must never raise.
    """
    if resp is None:
        return 0

    # Attribute-style responses (SDK objects).
    usage = getattr(resp, "usage", None)
    if usage is not None and hasattr(usage, "total_tokens"):
        try:
            return usage.total_tokens
        except Exception:
            pass

    usage_meta = getattr(resp, "usage_metadata", None)
    if usage_meta is not None and hasattr(usage_meta, "total_tokens"):
        try:
            return usage_meta.total_tokens
        except Exception:
            pass

    # Mapping-style responses. The original membership tests ('usage' in resp)
    # were unguarded, so a plain object lacking the attributes above crashed
    # with TypeError instead of falling through to 0 — guard the whole section.
    try:
        if "usage" in resp:
            usage = resp["usage"]
            if "total_tokens" in usage:
                return usage["total_tokens"]
            if "input_tokens" in usage and "output_tokens" in usage:
                return usage["input_tokens"] + usage["output_tokens"]
        if "meta" in resp and "tokens" in resp["meta"]:
            tokens = resp["meta"]["tokens"]
            if "input_tokens" in tokens and "output_tokens" in tokens:
                return tokens["input_tokens"] + tokens["output_tokens"]
    except (TypeError, KeyError):
        # resp is not mapping-like, or a value had an unexpected shape.
        pass

    return 0
71+
72+
73+
def truncate(string: str, max_len: int) -> str:
    """Return the text decoded from at most the first *max_len* tokens of *string*."""
    token_ids = encoder.encode(string)
    clipped = token_ids[:max_len]
    return encoder.decode(clipped)
76+

deepdoc/parser/txt_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import re
1818

1919
from deepdoc.parser.utils import get_text
20-
from rag.nlp import num_tokens_from_string
20+
from common.token_utils import num_tokens_from_string
2121

2222

2323
class RAGFlowTxtParser:

graphrag/general/community_reports_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from graphrag.general.leiden import add_community_info2graph
2222
from rag.llm.chat_model import Base as CompletionLLM
2323
from graphrag.utils import perform_variable_replacements, dict_has_keys_with_types, chat_limiter
24-
from rag.utils import num_tokens_from_string
24+
from common.token_utils import num_tokens_from_string
2525
import trio
2626

2727

graphrag/general/extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
)
3939
from rag.llm.chat_model import Base as CompletionLLM
4040
from rag.prompts.generator import message_fit_in
41-
from rag.utils import truncate
41+
from common.token_utils import truncate
4242

4343
GRAPH_FIELD_SEP = "<SEP>"
4444
DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event", "category"]

graphrag/general/graph_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from graphrag.utils import ErrorHandlerFn, perform_variable_replacements, chat_limiter, split_string_by_multi_markers
1717
from rag.llm.chat_model import Base as CompletionLLM
1818
import networkx as nx
19-
from rag.utils import num_tokens_from_string
19+
from common.token_utils import num_tokens_from_string
2020

2121
DEFAULT_TUPLE_DELIMITER = "<|>"
2222
DEFAULT_RECORD_DELIMITER = "##"

graphrag/general/index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ async def run_graphrag_for_kb(
165165
return {"ok_docs": [], "failed_docs": [], "total_docs": 0, "total_chunks": 0, "seconds": 0.0}
166166

167167
def load_doc_chunks(doc_id: str) -> list[str]:
168-
from rag.utils import num_tokens_from_string
168+
from common.token_utils import num_tokens_from_string
169169

170170
chunks = []
171171
current_chunk = ""

graphrag/general/mind_map_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from rag.llm.chat_model import Base as CompletionLLM
2828
import markdown_to_json
2929
from functools import reduce
30-
from rag.utils import num_tokens_from_string
30+
from common.token_utils import num_tokens_from_string
3131

3232

3333
@dataclass

graphrag/light/graph_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from graphrag.light.graph_prompt import PROMPTS
1818
from graphrag.utils import chat_limiter, pack_user_ass_to_openai_messages, split_string_by_multi_markers
1919
from rag.llm.chat_model import Base as CompletionLLM
20-
from rag.utils import num_tokens_from_string
20+
from common.token_utils import num_tokens_from_string
2121

2222

2323
@dataclass

0 commit comments

Comments
 (0)