Skip to content

Commit 50218fb

Browse files
authored
feat: robust json parsing & entity extraction progress log (#55)
* robust json parsing & entity extraction progress log * remove loguru in test
1 parent a96899b commit 50218fb

File tree

3 files changed

+215
-16
lines changed

3 files changed

+215
-16
lines changed

nano_graphrag/_op.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
373373
already_processed % len(PROMPTS["process_tickers"])
374374
]
375375
print(
376-
f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
376+
f"{now_ticks} Processed {already_processed}({already_processed*100//len(ordered_chunks)}%) chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
377377
end="",
378378
flush=True,
379379
)

nano_graphrag/_utils.py

Lines changed: 83 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
logger = logging.getLogger("nano-graphrag")
1717
ENCODER = None
1818

19-
2019
def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
2120
try:
2221
# If there is already an event loop, use it.
@@ -29,24 +28,93 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
2928
return loop
3029

3130

32-
def locate_json_string_body_from_string(content: str) -> Union[str, None]:
33-
"""Locate the JSON string body from a string"""
34-
maybe_json_str = re.search(r"{.*}", content, re.DOTALL)
35-
if maybe_json_str is not None:
36-
return maybe_json_str.group(0)
37-
else:
31+
def extract_first_complete_json(s: str):
    """Extract and parse the first complete top-level JSON object found in ``s``.

    The string is scanned for the first ``{`` and the ``}`` that balances it.
    Braces that appear inside double-quoted JSON string literals (including
    literals containing escaped quotes) are ignored, so a value such as
    ``"}"`` does not terminate the object prematurely.

    Args:
        s: Arbitrary text that may contain a JSON object, possibly surrounded
            by other text.

    Returns:
        The decoded object when the first balanced ``{...}`` span is valid
        JSON (after newline characters are removed), otherwise ``None``.
        ``None`` is also returned when no balanced span exists.
    """
    depth = 0  # Number of currently open, unquoted '{'.
    start = None  # Index of the '{' that opened the top-level object.
    in_string = False  # True while inside a double-quoted literal.
    escaped = False  # True when the previous character was a backslash.

    for i, char in enumerate(s):
        if char == "\n":
            # Newlines are stripped before decoding, so the scanner skips
            # them too and sees the same text the decoder will see.
            continue

        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            # Quotes in the text before the object are not JSON strings;
            # only track string state once an object has been opened.
            if depth:
                in_string = True
        elif char == "{":
            if depth == 0:
                start = i
            depth += 1
        elif char == "}" and depth:
            depth -= 1
            if depth == 0:
                candidate = s[start:i + 1]
                try:
                    # Attempt to parse the JSON string
                    return json.loads(candidate.replace("\n", ""))
                except json.JSONDecodeError as e:
                    logger.error(f"JSON decoding failed: {e}. Attempted string: {candidate[:50]}...")
                    return None

    logger.warning("No complete JSON object found in the input string.")
    return None
56+
57+
def parse_value(value: str):
    """Interpret a raw textual value as a Python scalar.

    After trimming surrounding whitespace the text is mapped as follows:
    ``null`` -> ``None``, ``true`` -> ``True``, ``false`` -> ``False``;
    text containing a dot is tried as ``float``, anything else as ``int``.
    When the numeric conversion fails, the text is returned as a string with
    any leading and trailing double quotes removed.
    """
    text = value.strip()

    literals = {"null": None, "true": True, "false": False}
    if text in literals:
        return literals[text]

    # A dot signals a possible float; everything else is tried as an int.
    convert = float if "." in text else int
    try:
        return convert(text)
    except ValueError:
        # Not numeric: treat as a string and drop surrounding quotes.
        return text.strip('"')
77+
78+
def extract_values_from_json(json_string, keys=["reasoning", "answer", "data"], allow_no_quotes=False):
    """Best-effort extraction of ``key: value`` pairs from JSON-like text.

    A regular expression finds pairs whose key is a word (optionally quoted)
    and whose value is either a ``{...}`` span, a double-quoted string, or a
    bare token running up to the next comma or closing brace. ``{...}``
    values are processed recursively; all other values go through
    ``parse_value``. When a key occurs more than once, the last match wins.

    Args:
        json_string: The text to scan.
        keys: Accepted for interface compatibility; not used by the body.
        allow_no_quotes: Accepted for interface compatibility; not used by
            the body.

    Returns:
        A dict of the extracted pairs. It is empty (and a warning is logged)
        when nothing matched.
    """
    # Matches quoted and unquoted keys/values, as well as brace-delimited objects.
    pair_pattern = r'(?P<key>"?\w+"?)\s*:\s*(?P<value>{[^}]*}|".*?"|[^,}]+)'

    found = {}
    for pair in re.finditer(pair_pattern, json_string, re.DOTALL):
        name = pair.group("key").strip('"')
        raw = pair.group("value").strip()

        looks_nested = raw.startswith("{") and raw.endswith("}")
        if looks_nested:
            # Brace-delimited value: extract its own pairs recursively.
            found[name] = extract_values_from_json(raw)
        else:
            found[name] = parse_value(raw)

    if not found:
        logger.warning("No values could be extracted from the string.")

    return found
39100

40101

41102
def convert_response_to_json(response: str) -> dict:
    """Turn a response string into a dict, tolerating non-standard JSON.

    The first complete JSON object in ``response`` is parsed strictly via
    ``extract_first_complete_json``. If that yields ``None``, the response is
    handed to ``extract_values_from_json`` for lenient, regex-based
    extraction. The outcome is logged: an error when the final result is
    empty, an info message otherwise.

    Returns:
        The parsed data; this may be an empty dict when nothing could be
        extracted.
    """
    result = extract_first_complete_json(response)

    if result is None:
        logger.info("Attempting to extract values from a non-standard JSON string...")
        result = extract_values_from_json(response, allow_no_quotes=True)

    if result:
        logger.info("JSON data successfully extracted.")
    else:
        logger.error("Unable to extract meaningful data from the response.")

    return result
116+
117+
50118

51119

52120
def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"):

tests/test_json_parsing.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import unittest
2+
# from loguru import logger
3+
from nano_graphrag._utils import convert_response_to_json
4+
5+
class TestJSONExtraction(unittest.TestCase):
    """Checks ``convert_response_to_json`` against well-formed and irregular inputs."""

    def setUp(self):
        """Runs before each test case; no fixtures are required."""

    def _check(self, response, expected):
        """Assert that converting ``response`` produces ``expected``."""
        self.assertEqual(convert_response_to_json(response), expected)

    def test_standard_json(self):
        """A well-formed JSON object is parsed as-is."""
        self._check(
            '''
{
"reasoning": "This is a test.",
"answer": 42,
"data": {"key1": "value1", "key2": "value2"}
}
''',
            {
                "reasoning": "This is a test.",
                "answer": 42,
                "data": {"key1": "value1", "key2": "value2"},
            },
        )

    def test_non_standard_json_without_quotes(self):
        """A nested object with an unquoted key and value is still extracted."""
        self._check(
            '''
{
"reasoning": "Boolean and numbers test.",
"answer": 42,
"isCorrect": true,
"data": {key1: value1}
}
''',
            {
                "reasoning": "Boolean and numbers test.",
                "answer": 42,
                "isCorrect": True,
                "data": {"key1": "value1"},
            },
        )

    def test_nested_json(self):
        """Objects nested two levels deep are preserved."""
        self._check(
            '''
{
"reasoning": "Nested structure.",
"answer": 42,
"data": {"nested": {"key": "value"}}
}
''',
            {
                "reasoning": "Nested structure.",
                "answer": 42,
                "data": {"nested": {"key": "value"}},
            },
        )

    def test_malformed_json(self):
        """A JSON object surrounded by unrelated text is located and parsed."""
        self._check(
            '''
Some text before JSON
{
"reasoning": "This is malformed.",
"answer": 42,
"data": {"key": "value"}
}
Some text after JSON
''',
            {
                "reasoning": "This is malformed.",
                "answer": 42,
                "data": {"key": "value"},
            },
        )

    def test_incomplete_json(self):
        """An object missing its closing brace still yields its key/value pairs."""
        self._check(
            '''
{
"reasoning": "Incomplete structure",
"answer": 42
''',
            {
                "reasoning": "Incomplete structure",
                "answer": 42,
            },
        )

    def test_value_with_special_characters(self):
        """Punctuation inside string values is kept verbatim."""
        self._check(
            '''
{
"reasoning": "Special characters !@#$%^&*()",
"answer": 42,
"data": {"key": "value with special characters !@#$%^&*()"}
}
''',
            {
                "reasoning": "Special characters !@#$%^&*()",
                "answer": 42,
                "data": {"key": "value with special characters !@#$%^&*()"},
            },
        )

    def test_boolean_and_null_values(self):
        """``true``, ``false`` and ``null`` map to ``True``, ``False`` and ``None``."""
        self._check(
            '''
{
"reasoning": "Boolean and null test.",
"isCorrect": true,
"isWrong": false,
"unknown": null,
"answer": 42
}
''',
            {
                "reasoning": "Boolean and null test.",
                "isCorrect": True,
                "isWrong": False,
                "unknown": None,
                "answer": 42,
            },
        )
130+
# Run the test suite through unittest's runner when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)