research_graph.py
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage
import json
import uuid
import asyncio
from datetime import datetime, timezone
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from typing import Literal
from langgraph.graph import StateGraph, START, END
from IPython.display import display, Image
from langgraph.checkpoint.memory import InMemorySaver
from graph_states import DataExtractionState, Queries, Extractor, Analyzer

load_dotenv()
# One chat model is reused across all nodes; "minimal" reasoning keeps latency low.
research_model = ChatOpenAI(model="gpt-5-nano", reasoning_effort="minimal")
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
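
# Pipeline overview (mirrors the edges wired up in create_research_graph below):
#   START -> Subqueries node -> Search Subqueries -> Create VectorStore
#     -> Batch Extract (only when the combined results are large) -> Analyzer
#     -> Query Refiner -> Search Subqueries (loop, bounded by query_limit) or END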
def generate_subqueries(state):
    """Expand the user query into focused sub-queries and mint a fresh vector-store collection name."""
    prompt = f"""You are an assistant that expands a user's query into a set of focused, well-structured sub-queries for web search and research.

Goal:
Given any user query, generate a list of 5 sub-queries that explore the topic from multiple perspectives, ensure comprehensive coverage, and surface context the user may not have considered.

Instructions:
Understand the user's original query and identify its core intent.
Queries must be up to date with the current time and date {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')}.
Break it into smaller, precise, search-ready sub-queries.
Include different angles such as:
- Background and foundational concepts
- Technical details
- Related problems or challenges
- Historical context
- Current trends or recent developments
- Tools, methods, or frameworks
- Comparisons and alternatives
- Case studies and practical examples
- Controversies, limitations, or open research questions
Avoid redundancy; each sub-query should explore a unique dimension.
Use concise, clear phrasing suitable for search engines."""
    messages = [
        SystemMessage(content=prompt),
        HumanMessage(content=state["query"]),
    ]
    structured_model = research_model.with_structured_output(Queries)
    output = structured_model.invoke(messages)
    # A random UUID names the Chroma collection for this run.
    return {"subqueries": output.queries, "vector_store": str(uuid.uuid4())}
def search_subqueries(state):
    """Run each sub-query through DuckDuckGo news search and store the raw results as JSON strings."""
    wrapper = DuckDuckGoSearchAPIWrapper(time="d", max_results=10)
    search = DuckDuckGoSearchResults(api_wrapper=wrapper, source="news", output_format="list")
    search_results = []
    for query in state["subqueries"]:
        result = search.invoke(query)
        search_results.append(json.dumps(result))
    return {"results": search_results}
def create_vectorstore_ddg(state):
    """Convert search snippets into Documents, chunk them, and index them in Chroma."""
    # Accumulate documents across all results before indexing, so every
    # sub-query's snippets end up in the store, not just the last batch.
    documents = []
    for result in state["results"]:
        for item in json.loads(result):
            documents.append(
                Document(
                    page_content=item.get("snippet", ""),
                    metadata={
                        "source": item.get("link", ""),
                        "title": item.get("title", "No Title"),
                    },
                )
            )
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=128, chunk_overlap=24
    )
    texts = splitter.split_documents(documents)
    vector_store = Chroma(
        collection_name=state["vector_store"],
        embedding_function=embedding_model,
        persist_directory="./chroma_langchain_db",
        tenant="default_tenant",
    )
    vector_store.add_documents(texts)
    return state
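
# Hedged sketch (not wired into the graph): the collection persisted above is
# never queried in this module. A downstream consumer could pull context roughly
# like this; the helper name, query text, and k are illustrative assumptions.
def _example_retrieve_context(state, query, k=4):
    store = Chroma(
        collection_name=state["vector_store"],
        embedding_function=embedding_model,
        persist_directory="./chroma_langchain_db",
    )
    return store.similarity_search(query, k=k)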
async def batch_extract(state):
    """Extract only query-relevant facts from each sub-query's raw results, in one async batch."""
    prompt = """Role:
You are a precision extraction model. Your job is to read the provided web-scraped content and extract only the information relevant to the user's query. You do not interpret, judge, or analyze correctness; you only extract.

Instructions:
Read the user's query.
Read the provided scraped content.
Identify all information that is directly relevant to fulfilling the query.
Remove everything irrelevant: filler text, ads, disclaimers, unrelated sections, partial sentences, duplicates, noise, formatting artifacts, navigation menus, etc.
When information appears multiple times, keep the clearest version.
Preserve original factual wording when possible.
If a required piece of information is missing, output "not found" for that field.
Do not guess or invent content under any circumstances.
Output the result in a compact, structured format.

Inputs:
User Query: {QUERY}

Your task: Return only the structured extraction described above."""
    structured_model = research_model.with_structured_output(Extractor.model_json_schema())
    batches = []
    for query, result in zip(state["subqueries"], state["results"]):
        batches.append([
            SystemMessage(content=prompt.format(QUERY=query)),
            HumanMessage(content=f"Scraped {result}"),
        ])
    extracted = await structured_model.abatch(batches)
    return {"subquery_extracted_data": extracted}
async def analyzer(state):
    """Judge, per sub-query, whether the extracted (or raw) results satisfy it."""
    structured_model = research_model.with_structured_output(Analyzer)
    # Note: "query_statisfied" (sic) matches the field name defined on the
    # Analyzer model in graph_states, so the spelling is kept throughout.
    ANALYZER_PROMPT = """
You are an analysis model. Your job is to decide whether the extracted information satisfies the query.

Rules:
1. Use ONLY the extracted information. Do not add or guess any facts.
2. Determine if the query is satisfied: True or False.
3. Provide short, logical reasoning.
4. If any required info is missing or uncertain, list it. Otherwise return an empty list.
5. Schema explanation:
{{
  "query_statisfied": <true_or_false>,
  "reasoning": "<short explanation>",
  "missing_or_uncertain_information": [
    "<missing or uncertain item>",
    ...
  ]
}}

User Query:
{query}

Extracted Information:
{extracted}
"""
    # Fall back to the raw search results when the extraction step was skipped.
    data = state.get("subquery_extracted_data") or state["results"]
    batches = []
    for query, extracted in zip(state["subqueries"], data):
        batches.append([
            SystemMessage(content=ANALYZER_PROMPT.format(query=query, extracted=json.dumps(extracted)))
        ])
    results = await structured_model.abatch(batches)
    return {"subquery_reviews": results}
async def query_refiner(state):
    """Turn unsatisfied reviews into one refined search query each, and decrement the retry budget."""
    context_dicts = [review.model_dump() for review in state["subquery_reviews"]]
    context_json_string = json.dumps(context_dicts)
    prompt = f"""
# Role
You are an expert Query Refinement Engine. Your goal is to analyze failed search attempts and generate a single, highly optimized search query to retrieve the missing information.
Queries must be up to date with the current time and date {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')}.

# Input Context
You will receive a list of JSON objects representing the analysis of previous search attempts. Each object follows this schema:
- `query_statisfied` (bool): Whether the intent was met.
- `query` (str): The original query attempted.
- `reasoning` (str): Context on why it failed or succeeded.
- `missing_or_uncertain_information` (list[str]): Specific data points that are still needed.

# Task
Iterate through the input list. **Only** process entries where `query_statisfied` is `False`.
For each unsatisfied entry, generate **ONE** (1) refined query.
1. **Consolidate:** If multiple pieces of information are missing for a single entry, attempt to combine them into one specific, high-density query.
2. **Precision:** Use the `reasoning` field to identify why the previous attempt failed (e.g., broadness, wrong domain) and adjust keywords accordingly.
3. **Self-Correction:** If the previous query was a question, try a keyword string. If it was keywords, try a natural language question.

# Constraints
- **Do not** generate multiple queries for a single unsatisfied entry unless the missing information items are completely unrelated topics.
- **Maximum limit:** Strictly produce no more than 1 refined query per unsatisfied input entry.
- If a query failed due to being "too broad", the refined query must be narrower.

# Context:
{context_json_string}
"""
    structured_model = research_model.with_structured_output(Queries)
    # Use the async client here so this coroutine does not block the event loop.
    response = await structured_model.ainvoke([SystemMessage(content=prompt)])
    return {"subqueries": response.queries, "query_limit": state["query_limit"] - 1}
async def refinement_check(state) -> Literal["Query Refiner", END]:
    """Route back to the refiner while any review is unsatisfied and retries remain."""
    for review in state["subquery_reviews"]:
        if (not review.query_statisfied) and state["query_limit"] > 0:
            return "Query Refiner"
    return END
def check_result_size(state) -> Literal["Batch Extract", "Analyzer"]:
    """Skip the extraction step when the combined search results are already small."""
    data = "".join(state["results"])
    # 12,000 characters is a rough "fits comfortably in context" heuristic.
    if len(data) > 12000:
        return "Batch Extract"
    return "Analyzer"
async def create_research_graph():
    """Wire the nodes into a LangGraph state machine and compile it with an in-memory checkpointer."""
    blueprint = StateGraph(DataExtractionState)
    blueprint.add_node("Subqueries node", generate_subqueries)
    blueprint.add_node("Search Subqueries", search_subqueries)
    blueprint.add_node("Create VectorStore", create_vectorstore_ddg)
    blueprint.add_node("Batch Extract", batch_extract)
    blueprint.add_node("Analyzer", analyzer)
    blueprint.add_node("Query Refiner", query_refiner)
    blueprint.add_edge(START, "Subqueries node")
    blueprint.add_edge("Subqueries node", "Search Subqueries")
    blueprint.add_edge("Search Subqueries", "Create VectorStore")
    blueprint.add_edge("Batch Extract", "Analyzer")
    blueprint.add_conditional_edges("Create VectorStore", check_result_size)
    blueprint.add_conditional_edges("Analyzer", refinement_check)
    blueprint.add_edge("Query Refiner", "Search Subqueries")
    graph = blueprint.compile(checkpointer=InMemorySaver())
    display(Image(graph.get_graph().draw_mermaid_png()))
    return graph
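
# Note: display(Image(...)) above only renders inside IPython/Jupyter. When
# running this file as a plain script, one illustrative alternative is to save
# the diagram to disk instead:
#     with open("research_graph.png", "wb") as f:
#         f.write(graph.get_graph().draw_mermaid_png())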
if __name__ == "__main__":
    async def main():
        research_graph = await create_research_graph()
        thread_id = str(uuid.uuid4())
        config = {"configurable": {"thread_id": thread_id}}
        initial_state: DataExtractionState = {
            "query": "What are the latest advancements in renewable energy technologies in 2024?",
            "subqueries": [],
            "subquery_extracted_data": None,
            "subquery_reviews": [],
            "results": [],
            "vector_store": "",
            "query_limit": 1,
        }
        # The config must be passed: a graph compiled with a checkpointer
        # refuses to run without a thread_id.
        final_state = await research_graph.ainvoke(initial_state, config)
        print("Final State:", final_state)

    asyncio.run(main())
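
# Hedged alternative (illustrative, not part of the original flow): inside
# main(), per-node updates could be streamed instead of awaiting the final state:
#     async for update in research_graph.astream(initial_state, config):
#         print(update)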