Skip to content

Commit 4a6f725

Browse files
authored
Merge pull request #158 from mindsdb/extract-query-from-text2sql-agent
Extract query from text2sql agent
2 parents d587a0f + 341bc38 commit 4a6f725

File tree

4 files changed

+208
-18
lines changed

4 files changed

+208
-18
lines changed

examples/using_agents_with_text2sql.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
con = mindsdb_sdk.connect()
66

77
open_ai_key = os.getenv('OPENAI_API_KEY')
8-
model_name = 'gpt-4'
8+
model_name = 'gpt-4o'
99

1010
# Now create an agent that will use the model we just created.
1111
agent = con.agents.create(name=f'mindsdb_sql_agent_{model_name}_{uuid4().hex}',
12-
model='gpt-4')
12+
model=model_name)
1313

1414

1515
# Set up a Postgres data source with our new agent.
Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,19 @@
1+
import logging
2+
13
import mindsdb_sdk
24
from uuid import uuid4
35
import os
46

7+
from mindsdb_sdk.utils.agents import MindsDBSQLStreamParser
8+
59
con = mindsdb_sdk.connect()
610

711
open_ai_key = os.getenv('OPENAI_API_KEY')
8-
model_name = 'gpt-4'
12+
model_name = 'gpt-4o'
913

1014
# Now create an agent that will use the model we just created.
1115
agent = con.agents.create(name=f'mindsdb_sql_agent_{model_name}_{uuid4().hex}',
12-
model='gpt-4')
13-
16+
model=model_name)
1417

1518
# Set up a Postgres data source with our new agent.
1619
data_source = 'postgres'
@@ -32,23 +35,13 @@
3235
# Actually connect the agent to the datasource.
3336
agent.add_database(database.name, [], description)
3437

35-
3638
question = 'How many three-bedroom houses were sold in 2008?'
3739

3840
completion_stream = agent.completion_stream([{'question': question, 'answer': None}])
3941

40-
# Process the streaming response
41-
full_response = ""
42-
for chunk in completion_stream:
43-
print(chunk) # Print the entire chunk for debugging
44-
if isinstance(chunk, dict):
45-
if 'output' in chunk:
46-
full_response += chunk['output']
47-
elif isinstance(chunk, str):
48-
full_response += chunk
49-
50-
print("\n\nFull response:")
51-
print(full_response)
42+
# Default logging level is INFO; change it to DEBUG for more detailed logs and the full agent steps
43+
mdb_parser = MindsDBSQLStreamParser()
44+
full_response, sql_query = mdb_parser.process_stream(completion_stream)
5245

5346
con.databases.drop(database.name)
5447
con.agents.drop(agent.name)

mindsdb_sdk/utils/agents.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import re
2+
import json
3+
import logging
4+
from typing import Dict, Any, Generator, Optional, Tuple
5+
6+
7+
class MindsDBSQLStreamParser:
    """
    A utility class for parsing SQL queries from MindsDB completion streams.

    This class provides methods to process completion streams, extract SQL queries,
    and accumulate full responses.

    Attributes:
        logger (logging.Logger): The logger instance for this class.
    """

    def __init__(self, log_level: int = logging.INFO):
        """
        Initialize the MindsDBSQLStreamParser.

        Args:
            log_level (int, optional): The logging level to use. Defaults to logging.INFO.
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(log_level)

        # Attach a console handler only if the logger has none yet: the module
        # logger is shared, so unconditionally adding a StreamHandler on every
        # instantiation caused each record to be emitted once per parser created.
        if not self.logger.handlers:
            ch = logging.StreamHandler()
            ch.setLevel(log_level)

            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            ch.setFormatter(formatter)

            self.logger.addHandler(ch)

    def stream_and_parse_sql_query(self, completion_stream: Generator[Dict[str, Any], None, None]) -> Generator[
        Dict[str, Optional[str]], None, None]:
        """
        Stream and parse the completion stream, yielding output and SQL queries.

        This generator function processes each chunk of the completion stream,
        extracts any output and SQL queries, and yields the results.

        Args:
            completion_stream (Generator[Dict[str, Any], None, None]): The input completion stream.
                Chunks may be dicts (with optional 'output', 'type'/'content',
                'messages' and 'quick_response' keys) or plain strings.

        Yields:
            Dict[str, Optional[str]]: A dictionary containing 'output' and 'sql_query' keys.
                - 'output': The extracted output string from the chunk, if any.
                - 'sql_query': The extracted SQL query string, if found in the chunk.

        Note:
            This function will only yield the first SQL query it finds in the stream;
            subsequent 'sql' chunks yield 'sql_query': None.
        """
        sql_query_found = False

        for chunk in completion_stream:
            output = ""
            sql_query = None

            # Log full chunk at DEBUG level; default=str keeps this from
            # raising TypeError on chunks with non-JSON-serializable values.
            self.logger.debug(f"Processing chunk: {json.dumps(chunk, indent=2, default=str)}")

            # Log important info at INFO level
            if isinstance(chunk, dict):
                if 'quick_response' in chunk:
                    self.logger.info(f"Quick response received: {json.dumps(chunk, default=str)}")

                output = chunk.get('output', '')
                if output:
                    self.logger.info(f"Chunk output: {output}")

                if 'messages' in chunk:
                    for message in chunk['messages']:
                        if message.get('role') == 'assistant':
                            self.logger.info(f"Assistant message: {message.get('content', '')}")

                # Honor the documented contract: only the first SQL chunk is
                # surfaced (the flag was previously set but never consulted).
                # .get('content') avoids a KeyError on a malformed 'sql' chunk.
                if not sql_query_found and chunk.get('type') == 'sql':
                    sql_query = chunk.get('content')
                    sql_query_found = sql_query is not None
                    self.logger.info(f"Generated SQL: {sql_query}")

            elif isinstance(chunk, str):
                output = chunk
                self.logger.info(f"String chunk received: {chunk}")

            yield {
                'output': output,
                'sql_query': sql_query
            }

    def process_stream(self, completion_stream: Generator[Dict[str, Any], None, None]) -> Tuple[str, Optional[str]]:
        """
        Process the completion stream and extract the SQL query.

        This method iterates through the stream, accumulates the full response,
        logs outputs, and extracts the SQL query when found.

        Args:
            completion_stream (Generator[Dict[str, Any], None, None]): The input completion stream.

        Returns:
            Tuple[str, Optional[str]]: A tuple containing:
                - The full accumulated response as a string.
                - The extracted SQL query as a string, or None if no query was found.
        """
        full_response = ""
        sql_query = None

        self.logger.info("Starting to process completion stream...")

        for result in self.stream_and_parse_sql_query(completion_stream):
            if result['output']:
                self.logger.info(f"Output: {result['output']}")
                full_response += result['output']

            # Keep only the first query seen, mirroring stream_and_parse_sql_query.
            if result['sql_query'] and sql_query is None:
                sql_query = result['sql_query']
                self.logger.info(f"Extracted SQL Query: {sql_query}")

        self.logger.info(f"Full Response: {full_response}")
        self.logger.info(f"Final SQL Query: {sql_query}")

        return full_response, sql_query

tests/test_agent_stream_process.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import pytest
2+
import logging
3+
4+
from mindsdb_sdk.utils.agents import MindsDBSQLStreamParser
5+
6+
@pytest.fixture
def parser():
    """Provide a MindsDBSQLStreamParser configured at INFO level."""
    instance = MindsDBSQLStreamParser(log_level=logging.INFO)
    return instance
9+
10+
def test_initialization(parser):
    """Constructing the parser yields the right type with a logger at the requested level."""
    expected_level = logging.INFO
    assert isinstance(parser, MindsDBSQLStreamParser)
    assert parser.logger.level == expected_level
13+
14+
def test_stream_and_parse_sql_query_with_dict(parser):
    """Dict chunks yield their output text, and an 'sql' chunk surfaces its query."""
    chunks = [
        {'output': 'Test output', 'type': 'text'},
        {'type': 'sql', 'content': 'SELECT * FROM table'},
        {'output': 'More output'},
    ]

    parsed = [item for item in parser.stream_and_parse_sql_query(iter(chunks))]

    expected = [
        {'output': 'Test output', 'sql_query': None},
        {'output': '', 'sql_query': 'SELECT * FROM table'},
        {'output': 'More output', 'sql_query': None},
    ]
    assert parsed == expected
28+
29+
def test_stream_and_parse_sql_query_with_string(parser):
    """Plain string chunks pass through as output with no SQL query attached."""
    chunks = ['String chunk 1', 'String chunk 2']

    parsed = list(parser.stream_and_parse_sql_query(iter(chunks)))

    assert parsed == [
        {'output': 'String chunk 1', 'sql_query': None},
        {'output': 'String chunk 2', 'sql_query': None},
    ]
38+
39+
40+
def test_process_stream(parser, caplog):
    """process_stream accumulates outputs, extracts the SQL query, and logs its progress."""
    chunks = [
        {'output': 'First output'},
        {'type': 'sql', 'content': 'SELECT * FROM users'},
        {'output': 'Second output'},
    ]

    with caplog.at_level(logging.INFO):
        response, query = parser.process_stream(iter(chunks))

    assert response == 'First outputSecond output'
    assert query == 'SELECT * FROM users'

    # Each milestone of the stream processing should appear in the captured log.
    expected_messages = [
        'Starting to process completion stream...',
        'Output: First output',
        'Extracted SQL Query: SELECT * FROM users',
        'Output: Second output',
        f'Full Response: {response}',
        f'Final SQL Query: {query}',
    ]
    for message in expected_messages:
        assert message in caplog.text
60+
61+
def test_process_stream_no_sql(parser):
    """A stream without any 'sql' chunk returns the joined output and a None query."""
    chunks = [{'output': 'First output'}, {'output': 'Second output'}]

    response, query = parser.process_stream(iter(chunks))

    assert response == 'First outputSecond output'
    assert query is None

0 commit comments

Comments
 (0)