feat: robust json parsing & entity extraction progress log (#55)

rangehow · web-flow · commit 50218fb32727 · 2024-10-19T14:09:30.000+08:00
* robust json parsing & entity extraction progress log

* remove loguru in test
diff --git a/nano_graphrag/_op.py b/nano_graphrag/_op.py
@@ -373,7 +373,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
             already_processed % len(PROMPTS["process_tickers"])
         ]
         print(
-            f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
+            f"{now_ticks} Processed {already_processed}({already_processed*100//len(ordered_chunks)}%) chunks,  {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
             end="",
             flush=True,
         )
diff --git a/nano_graphrag/_utils.py b/nano_graphrag/_utils.py
@@ -16,7 +16,6 @@
 logger = logging.getLogger("nano-graphrag")
 ENCODER = None
 
-
 def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
     try:
         # If there is already an event loop, use it.
@@ -29,24 +28,93 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
     return loop
 
 
-def locate_json_string_body_from_string(content: str) -> Union[str, None]:
-    """Locate the JSON string body from a string"""
-    maybe_json_str = re.search(r"{.*}", content, re.DOTALL)
-    if maybe_json_str is not None:
-        return maybe_json_str.group(0)
-    else:
+def extract_first_complete_json(s: str):
+    """Extract the first complete JSON object from the string using a stack to track braces."""
+    stack = []
+    first_json_start = None
+    
+    for i, char in enumerate(s):
+        if char == '{':
+            stack.append(i)
+            if first_json_start is None:
+                first_json_start = i
+        elif char == '}':
+            if stack:
+                start = stack.pop()
+                if not stack:
+                    first_json_str = s[first_json_start:i+1]
+                    try:
+                        # Attempt to parse the JSON string
+                        return json.loads(first_json_str.replace("\n", ""))
+                    except json.JSONDecodeError as e:
+                        logger.error(f"JSON decoding failed: {e}. Attempted string: {first_json_str[:50]}...")
+                        return None
+                    finally:
+                        first_json_start = None
+    logger.warning("No complete JSON object found in the input string.")
+    return None
+
+def parse_value(value: str):
+    """Convert a string value to its appropriate type (int, float, bool, None, or keep as string). Work as a more broad 'eval()'"""
+    value = value.strip()
+
+    if value == "null":
         return None
+    elif value == "true":
+        return True
+    elif value == "false":
+        return False
+    else:
+        # Try to convert to int or float
+        try:
+            if '.' in value:  # If there's a dot, it might be a float
+                return float(value)
+            else:
+                return int(value)
+        except ValueError:
+            # If conversion fails, return the value as-is (likely a string)
+            return value.strip('"')  # Remove surrounding quotes if they exist
+
+def extract_values_from_json(json_string, keys=["reasoning", "answer", "data"], allow_no_quotes=False):
+    """Extract key values from a non-standard or malformed JSON string, handling nested objects."""
+    extracted_values = {}
+    
+    # Enhanced pattern to match both quoted and unquoted values, as well as nested objects
+    regex_pattern = r'(?P<key>"?\w+"?)\s*:\s*(?P<value>{[^}]*}|".*?"|[^,}]+)'
+    
+    for match in re.finditer(regex_pattern, json_string, re.DOTALL):
+        key = match.group('key').strip('"')  # Strip quotes from key
+        value = match.group('value').strip()
+
+        # If the value is another nested JSON (starts with '{' and ends with '}'), recursively parse it
+        if value.startswith('{') and value.endswith('}'):
+            extracted_values[key] = extract_values_from_json(value)
+        else:
+            # Parse the value into the appropriate type (int, float, bool, etc.)
+            extracted_values[key] = parse_value(value)
+
+    if not extracted_values:
+        logger.warning("No values could be extracted from the string.")
+    
+    return extracted_values
 
 
 def convert_response_to_json(response: str) -> dict:
-    json_str = locate_json_string_body_from_string(response)
-    assert json_str is not None, f"Unable to parse JSON from response: {response}"
-    try:
-        data = json.loads(json_str)
-        return data
-    except json.JSONDecodeError as e:
-        logger.error(f"Failed to parse JSON: {json_str}")
-        raise e from None
+    """Convert response string to JSON, with error handling and fallback to non-standard JSON extraction."""
+    prediction_json = extract_first_complete_json(response)
+    
+    if prediction_json is None:
+        logger.info("Attempting to extract values from a non-standard JSON string...")
+        prediction_json = extract_values_from_json(response, allow_no_quotes=True)
+    
+    if not prediction_json:
+        logger.error("Unable to extract meaningful data from the response.")
+    else:
+        logger.info("JSON data successfully extracted.")
+    
+    return prediction_json
+
+
 
 
 def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"):
diff --git a/tests/test_json_parsing.py b/tests/test_json_parsing.py
@@ -0,0 +1,131 @@
+import unittest
+# from loguru import logger
+from nano_graphrag._utils import convert_response_to_json  
+
+class TestJSONExtraction(unittest.TestCase):
+
+    def setUp(self):
+        """Set up runs before each test case."""
+        ...
+
+    def test_standard_json(self):
+        """Test standard JSON extraction."""
+        response = '''
+        {
+            "reasoning": "This is a test.",
+            "answer": 42,
+            "data": {"key1": "value1", "key2": "value2"}
+        }
+        '''
+        expected = {
+            "reasoning": "This is a test.",
+            "answer": 42,
+            "data": {"key1": "value1", "key2": "value2"}
+        }
+        self.assertEqual(convert_response_to_json(response), expected)
+
+    def test_non_standard_json_without_quotes(self):
+        """Test non-standard JSON without quotes on numbers and booleans."""
+        response = '''
+        {
+            "reasoning": "Boolean and numbers test.",
+            "answer": 42,
+            "isCorrect": true,
+            "data": {key1: value1}
+        }
+        '''
+        expected = {
+            "reasoning": "Boolean and numbers test.",
+            "answer": 42,
+            "isCorrect": True,
+            "data": {"key1": "value1"}
+        }
+        self.assertEqual(convert_response_to_json(response), expected)
+
+    def test_nested_json(self):
+        """Test extraction of nested JSON objects."""
+        response = '''
+        {
+            "reasoning": "Nested structure.",
+            "answer": 42,
+            "data": {"nested": {"key": "value"}}
+        }
+        '''
+        expected = {
+            "reasoning": "Nested structure.",
+            "answer": 42,
+            "data": {
+                "nested": {"key": "value"}
+            }
+        }
+        self.assertEqual(convert_response_to_json(response), expected)
+
+    def test_malformed_json(self):
+        """Test handling of malformed JSON."""
+        response = '''
+        Some text before JSON
+        {
+            "reasoning": "This is malformed.",
+            "answer": 42,
+            "data": {"key": "value"}
+        }
+        Some text after JSON
+        '''
+        expected = {
+            "reasoning": "This is malformed.",
+            "answer": 42,
+            "data": {"key": "value"}
+        }
+        self.assertEqual(convert_response_to_json(response), expected)
+
+    def test_incomplete_json(self):
+        """Test handling of incomplete JSON."""
+        response = '''
+        {
+            "reasoning": "Incomplete structure",
+            "answer": 42
+        '''
+        expected = {
+            "reasoning": "Incomplete structure",
+            "answer": 42
+        }
+        self.assertEqual(convert_response_to_json(response), expected)
+
+    def test_value_with_special_characters(self):
+        """Test JSON with special characters in values."""
+        response = '''
+        {
+            "reasoning": "Special characters !@#$%^&*()",
+            "answer": 42,
+            "data": {"key": "value with special characters !@#$%^&*()"}
+        }
+        '''
+        expected = {
+            "reasoning": "Special characters !@#$%^&*()",
+            "answer": 42,
+            "data": {"key": "value with special characters !@#$%^&*()"}
+        }
+        self.assertEqual(convert_response_to_json(response), expected)
+
+    def test_boolean_and_null_values(self):
+        """Test JSON with boolean and null values."""
+        response = '''
+        {
+            "reasoning": "Boolean and null test.",
+            "isCorrect": true,
+            "isWrong": false,
+            "unknown": null,
+            "answer": 42
+        }
+        '''
+        expected = {
+            "reasoning": "Boolean and null test.",
+            "isCorrect": True,
+            "isWrong": False,
+            "unknown": None,
+            "answer": 42
+        }
+        self.assertEqual(convert_response_to_json(response), expected)
+
+if __name__ == "__main__":
+    unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -373,7 +373,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):`
`373`	`373`	`already_processed % len(PROMPTS["process_tickers"])`
`374`	`374`	`]`
`375`	`375`	`print(`
`376`		`- f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",`
	`376`	`+ f"{now_ticks} Processed {already_processed}({already_processed*100//len(ordered_chunks)}%) chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",`
`377`	`377`	`end="",`
`378`	`378`	`flush=True,`
`379`	`379`	`)`