Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions databao/executors/lighthouse/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def get_query_ids_mapping(messages: list[BaseMessage]) -> dict[str, ToolMessage]


class ExecuteSubmit:
"""Simple graph with two tools: run_sql_query and submit_query_id.
"""Simple graph with two tools: run_sql_query and submit_result.
All context must be in the SystemMessage."""

MAX_TOOL_ROWS = 12
Expand Down Expand Up @@ -69,7 +69,7 @@ def get_result(self, state: AgentState) -> ExecutionResult:
if last_ai_message is None:
raise RuntimeError("No AI message found in message log")
if len(last_ai_message.tool_calls) == 0:
# Sometimes models don't call the submit_query_id tool, but we still want to return some dataframe.
# Sometimes models don't call the submit_result tool, but we still want to return some dataframe.
sql = state.get("sql", "")
df = state.get("df") # Latest df result (usually from run_sql_query)
visualization_prompt = state.get("visualization_prompt")
Expand All @@ -85,9 +85,9 @@ def get_result(self, state: AgentState) -> ExecutionResult:
)
elif len(last_ai_message.tool_calls) > 1:
raise RuntimeError("Expected exactly one tool call in AI message")
elif last_ai_message.tool_calls[0]["name"] != "submit_query_id":
elif last_ai_message.tool_calls[0]["name"] != "submit_result":
raise RuntimeError(
f"Expected submit_query_id tool call in AI message, got {last_ai_message.tool_calls[0]['name']}"
f"Expected submit_result tool call in AI message, got {last_ai_message.tool_calls[0]['name']}"
)
else:
sql = state.get("sql", "")
Expand Down Expand Up @@ -130,7 +130,7 @@ def run_sql_query(sql: str, graph_state: Annotated[AgentState, InjectedState]) -
return {"error": exception_to_string(e)}

@tool(parse_docstring=True)
def submit_query_id(
def submit_result(
query_id: str,
result_description: str,
visualization_prompt: str,
Expand All @@ -149,7 +149,7 @@ def submit_query_id(
"""
return f"Query {query_id} submitted successfully. Your response is now visible to the user."

tools = [run_sql_query, submit_query_id]
tools = [run_sql_query, submit_result]
return tools

def compile(self, model_config: LLMConfig) -> CompiledStateGraph[Any]:
Expand All @@ -170,11 +170,11 @@ def tool_executor_node(state: AgentState) -> dict[str, Any]:

tool_calls = last_message.tool_calls

is_ready_for_user = any(tc["name"] == "submit_query_id" for tc in tool_calls)
is_ready_for_user = any(tc["name"] == "submit_result" for tc in tool_calls)
if is_ready_for_user:
if len(tool_calls) > 1:
tool_messages = [
ToolMessage("submit_query_id must be the only tool call.", tool_call_id=tool_call["id"])
ToolMessage("submit_result must be the only tool call.", tool_call_id=tool_call["id"])
for tool_call in tool_calls
]
return {"messages": tool_messages, "ready_for_user": False}
Expand Down Expand Up @@ -244,14 +244,14 @@ def tool_executor_node(state: AgentState) -> dict[str, Any]:
tool_call_id=tool_call_id,
artifact=result,
)
elif name == "submit_query_id":
elif name == "submit_result":
content = str(result)
query_id = tool_call["args"]["query_id"]
visualization_prompt = tool_call["args"].get("visualization_prompt", "")
sql = state["query_ids"][query_id].artifact["sql"]
df = state["query_ids"][query_id].artifact["df"]
tool_messages.append(ToolMessage(content=content, tool_call_id=tool_call_id, artifact=result))
if name == "submit_query_id":
if name == "submit_result":
return {
"messages": tool_messages,
"sql": sql,
Expand All @@ -276,7 +276,7 @@ def should_continue(state: AgentState) -> Literal["tool_executor", "end"]:
return "end"

def should_finish(state: AgentState) -> Literal["llm_node", "end"]:
# Check if we just executed submit_query_id - if so, end the conversation
# Check if we just executed submit_result - if so, end the conversation
if state.get("ready_for_user", False):
return "end"
return "llm_node"
Expand Down
15 changes: 10 additions & 5 deletions databao/executors/lighthouse/system_prompt.jinja
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
You are an agent that has direct access to the database. You generate SQL requests, which are executed on a DB client with no changes.
You are a "Databao" agent that has direct access to the database. You generate SQL requests, which are executed on a DB client with no changes.
User can connect several databases and DataFrames to your internal DuckDB instance. DataFrames are available as tables with "temp.main" prefix.
The task is to request all necessary data and answer the user question.
You can answer with
- text (using plain text with no tool call, or via the result_description parameter of the submit_result tool)
- a table (using SQL requests and query_id parameter of submit_result tool). It will be visible as a DataFrame.
- a plot (using visualization parameter of submit_result tool)
or a combination of these.

Today's date is: {{ date }} (YYYY-MM-DD).

Expand All @@ -11,14 +17,13 @@ Today's date is: {{ date }} (YYYY-MM-DD).
- Get DB schema in the 'Database schema' section. Don't waste tool call for it.
- Pay attention to SQL dialect specific commands (DuckDB is used)
- Cross joins are allowed only for tables that are guaranteed small (< 5 rows), such as enums or static dictionaries.
- Use 'today()' instead of 'now()' to get the current date
- When calculating percentages like (a - b) / a * 100, you must make multiplication first to prevent number rounding. Use 100 * (a - b) / a.
- When comparing an unfinished period like the current year to a finished one like last year, use the same date range. Never compare unfinished periods to finished ones.
- Make sure the submitted query answers the user's question and it is not-empty
- Result description of submitted query should contain definitions being used, important decisions and analysis of resulting data
- Make sure the submitted result answers the user's question and is non-empty
- Result description of submitted result should contain definitions being used, important decisions and analysis of resulting data
- Leave the visualization prompt empty if you don't want to visualize the result. A table with few values or with heterogeneous data doesn't need visualization
- Time series require visualization
- The user will see only the submitted result of submit_query_id. The user will not see intermediate results
- The user will see only the submitted result - final SQL and DataFrame. The user will not see intermediate results


# Database schema
Expand Down
Loading