fix: read the tokenizer from the storage instances' global_config instead of passing global_config through the query-context helpers

drahnreb · drahnreb · commit 3ff9dee9f63c · 2025-04-17T16:57:53.000+02:00
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
@@ -7,7 +7,7 @@
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import Any, AsyncIterator, Callable, Iterator, cast, final, Literal
+from typing import Any, AsyncIterator, Callable, Iterator, cast, final, Literal, Optional, List, Dict
 
 from lightrag.kg import (
     STORAGES,
diff --git a/lightrag/operate.py b/lightrag/operate.py
@@ -116,7 +116,6 @@ async def _handle_entity_relation_summary(
     use_llm_func: callable = global_config["llm_model_func"]
     tokenizer: Tokenizer = global_config["tokenizer"]
     llm_max_tokens = global_config["llm_model_max_token_size"]
-    tiktoken_model_name = global_config["tiktoken_model_name"]
     summary_max_tokens = global_config["summary_to_max_tokens"]
 
     language = global_config["addon_params"].get(
@@ -842,7 +841,6 @@ async def kg_query(
         relationships_vdb,
         text_chunks_db,
         query_param,
-        global_config,
     )
 
     if query_param.only_need_context:
@@ -1114,7 +1112,6 @@ async def get_kg_context():
                 relationships_vdb,
                 text_chunks_db,
                 query_param,
-                global_config,
             )
 
             return context
@@ -1269,7 +1266,6 @@ async def _build_query_context(
     relationships_vdb: BaseVectorStorage,
     text_chunks_db: BaseKVStorage,
     query_param: QueryParam,
-    global_config: dict[str, str],
 ):
     logger.info(f"Process {os.getpid()} buidling query context...")
     if query_param.mode == "local":
@@ -1279,7 +1275,6 @@ async def _build_query_context(
             entities_vdb,
             text_chunks_db,
             query_param,
-            global_config,
         )
     elif query_param.mode == "global":
         entities_context, relations_context, text_units_context = await _get_edge_data(
@@ -1288,7 +1283,6 @@ async def _build_query_context(
             relationships_vdb,
             text_chunks_db,
             query_param,
-            global_config,
         )
     else:  # hybrid mode
         ll_data = await _get_node_data(
@@ -1297,15 +1291,13 @@ async def _build_query_context(
             entities_vdb,
             text_chunks_db,
             query_param,
-            global_config,
         )
         hl_data = await _get_edge_data(
             hl_keywords,
             knowledge_graph_inst,
             relationships_vdb,
             text_chunks_db,
             query_param,
-            global_config,
         )
 
         (
@@ -1352,7 +1344,6 @@ async def _get_node_data(
     entities_vdb: BaseVectorStorage,
     text_chunks_db: BaseKVStorage,
     query_param: QueryParam,
-    global_config: dict[str, str],
 ):
     # get similar entities
     logger.info(
@@ -1389,13 +1380,13 @@ async def _get_node_data(
     ]  # what is this text_chunks_db doing.  dont remember it in airvx.  check the diagram.
     # get entitytext chunk
     use_text_units = await _find_most_related_text_unit_from_entities(
-        node_datas, query_param, text_chunks_db, knowledge_graph_inst, global_config
+        node_datas, query_param, text_chunks_db, knowledge_graph_inst,
     )
     use_relations = await _find_most_related_edges_from_entities(
-        node_datas, query_param, knowledge_graph_inst, global_config
+        node_datas, query_param, knowledge_graph_inst,
     )
 
-    tokenizer: Tokenizer = global_config["tokenizer"]
+    tokenizer: Tokenizer = text_chunks_db.global_config.get("tokenizer")
     len_node_datas = len(node_datas)
     node_datas = truncate_list_by_token_size(
         node_datas,
@@ -1495,7 +1486,6 @@ async def _find_most_related_text_unit_from_entities(
     query_param: QueryParam,
     text_chunks_db: BaseKVStorage,
     knowledge_graph_inst: BaseGraphStorage,
-    global_config: dict[str, str],
 ):
     text_units = [
         split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
@@ -1577,7 +1567,7 @@ async def _find_most_related_text_unit_from_entities(
         logger.warning("No valid text units found")
         return []
 
-    tokenizer: Tokenizer = global_config["tokenizer"]
+    tokenizer: Tokenizer = text_chunks_db.global_config.get("tokenizer")
     all_text_units = sorted(
         all_text_units, key=lambda x: (x["order"], -x["relation_counts"])
     )
@@ -1600,7 +1590,6 @@ async def _find_most_related_edges_from_entities(
     node_datas: list[dict],
     query_param: QueryParam,
     knowledge_graph_inst: BaseGraphStorage,
-    global_config: dict[str, str],
 ):
     node_names = [dp["entity_name"] for dp in node_datas]
     batch_edges_dict = await knowledge_graph_inst.get_nodes_edges_batch(node_names)
@@ -1640,7 +1629,7 @@ async def _find_most_related_edges_from_entities(
             }
             all_edges_data.append(combined)
 
-    tokenizer: Tokenizer = global_config["tokenizer"]
+    tokenizer: Tokenizer = knowledge_graph_inst.global_config.get("tokenizer")
     all_edges_data = sorted(
         all_edges_data, key=lambda x: (x["rank"], x["weight"]), reverse=True
     )
@@ -1664,7 +1653,6 @@ async def _get_edge_data(
     relationships_vdb: BaseVectorStorage,
     text_chunks_db: BaseKVStorage,
     query_param: QueryParam,
-    global_config: dict[str, str],
 ):
     logger.info(
         f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
@@ -1705,7 +1693,7 @@ async def _get_edge_data(
             }
             edge_datas.append(combined)
 
-    tokenizer: Tokenizer = global_config["tokenizer"]
+    tokenizer: Tokenizer = text_chunks_db.global_config.get("tokenizer")
     edge_datas = sorted(
         edge_datas, key=lambda x: (x["rank"], x["weight"]), reverse=True
     )
@@ -1717,10 +1705,10 @@ async def _get_edge_data(
     )
     use_entities, use_text_units = await asyncio.gather(
         _find_most_related_entities_from_relationships(
-            edge_datas, query_param, knowledge_graph_inst, global_config
+            edge_datas, query_param, knowledge_graph_inst,
         ),
         _find_related_text_unit_from_relationships(
-            edge_datas, query_param, text_chunks_db, knowledge_graph_inst, global_config
+            edge_datas, query_param, text_chunks_db, knowledge_graph_inst,
         ),
     )
     logger.info(
@@ -1800,7 +1788,6 @@ async def _find_most_related_entities_from_relationships(
     edge_datas: list[dict],
     query_param: QueryParam,
     knowledge_graph_inst: BaseGraphStorage,
-    global_config: dict[str, str],
 ):
     entity_names = []
     seen = set()
@@ -1831,7 +1818,7 @@ async def _find_most_related_entities_from_relationships(
         combined = {**node, "entity_name": entity_name, "rank": degree}
         node_datas.append(combined)
 
-    tokenizer: Tokenizer = global_config["tokenizer"]
+    tokenizer: Tokenizer = knowledge_graph_inst.global_config.get("tokenizer")
     len_node_datas = len(node_datas)
     node_datas = truncate_list_by_token_size(
         node_datas,
@@ -1851,7 +1838,6 @@ async def _find_related_text_unit_from_relationships(
     query_param: QueryParam,
     text_chunks_db: BaseKVStorage,
     knowledge_graph_inst: BaseGraphStorage,
-    global_config: dict[str, str],
 ):
     text_units = [
         split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
@@ -1893,7 +1879,7 @@ async def fetch_chunk_data(c_id, index):
         logger.warning("No valid text chunks after filtering")
         return []
 
-    tokenizer: Tokenizer = global_config["tokenizer"]
+    tokenizer: Tokenizer = text_chunks_db.global_config.get("tokenizer")
     truncated_text_units = truncate_list_by_token_size(
         valid_text_units,
         key=lambda x: x["data"]["content"],
@@ -2130,7 +2116,6 @@ async def kg_query_with_keywords(
         relationships_vdb,
         text_chunks_db,
         query_param,
-        global_config,
     )
     if not context:
         return PROMPTS["fail_response"]