Skip to content

Commit 38650dc

Browse files
authored
Merge pull request #12 from mindsdb/feat/verify-chat-content
feat: verify chat content and improve logs
2 parents 2035ba4 + cf688a7 commit 38650dc

File tree

2 files changed

+111
-54
lines changed

2 files changed

+111
-54
lines changed

src/aipdf/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .ocr import ocr, ocr_async
22

3-
__version__ = "0.0.6"
3+
__version__ = "0.0.6.2"
44

55
__all__ = ["__version__", "ocr", "ocr_async"]

src/aipdf/ocr.py

Lines changed: 110 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,6 @@
99
from openai import OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI
1010

1111

12-
# Set up logging
13-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
14-
1512
DEFAULT_PROMPT = """
1613
Extract the full markdown text from the given image, following these guidelines:
1714
- Respond only with markdown, no additional commentary.
@@ -56,6 +53,80 @@ def get_openai_client(api_key=None, base_url='https://api.openai.com/v1', is_asy
5653
return OpenAI(api_key=api_key, base_url=base_url, **kwargs)
5754

5855

56+
def _prepare_image_messages(file_object, prompt, mime_type="image/jpeg"):
    """
    Helper function to prepare messages for OpenAI API call.

    Reads the remaining bytes of ``file_object``, base64-encodes them and
    embeds them as a ``data:`` URL next to the text prompt in a single
    ``user`` message.

    Args:
        file_object (io.BytesIO): The image file object. It is read from its
            current position; this function does not rewind it.
        prompt (str): The prompt to send to the API.
        mime_type (str, optional): MIME type declared in the data URL.
            Defaults to "image/jpeg", the value that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        list: The messages list for the API call.
    """
    base64_image = base64.b64encode(file_object.read()).decode('utf-8')

    text_part = {
        "type": "text",
        "text": prompt,
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:{mime_type};base64,{base64_image}",
        },
    }

    return [
        {
            "role": "user",
            "content": [text_part, image_part],
        }
    ]
86+
87+
88+
def _validate_and_extract_content(response):
    """
    Helper function to validate OpenAI API response and extract content.

    Walks ``response.choices[0].message.content`` one step at a time so that
    a missing or empty piece is logged and reported as ``None`` rather than
    surfacing as an ``AttributeError`` or ``IndexError``.

    Args:
        response: The response object from OpenAI API.

    Returns:
        str or None: The extracted content, or None if validation fails
        (including when the content itself is empty or None).
    """
    if not response:
        logging.error("Received empty response from OpenAI API: %s", response)
        return None

    # A missing attribute, None and an empty list are all falsy, so a single
    # check covers every "no choices" case; no separate length test is needed.
    choices = getattr(response, 'choices', None)
    if not choices:
        logging.error(
            "Response does not contain choices or choices is empty. Response: %s",
            response,
        )
        return None

    first_choice = choices[0]
    message = getattr(first_choice, 'message', None)
    if not message:
        logging.error(
            "Response choice does not contain message. First choice: %s",
            first_choice,
        )
        return None

    if not hasattr(message, 'content'):
        logging.error("Response message does not contain content. Message: %s", message)
        return None

    markdown_content = message.content

    # Empty content is a warning rather than an error: the call itself
    # succeeded but produced no text.
    if not markdown_content:
        logging.warning("Response content is empty or None. Content: %r", markdown_content)
        return None

    return markdown_content
128+
129+
59130
def image_to_markdown(file_object, client, model="gpt-4o", prompt=DEFAULT_PROMPT):
60131
"""
61132
Process a single image file and convert its content to markdown using OpenAI's API.
@@ -70,36 +141,24 @@ def image_to_markdown(file_object, client, model="gpt-4o", prompt=DEFAULT_PROMP
70141
str: The markdown representation of the image content, or None if an error occurs.
71142
"""
72143
# Log that we're about to process a page
73-
logging.info("About to process a page")
144+
logging.debug("About to process a page")
74145

75-
base64_image = base64.b64encode(file_object.read()).decode('utf-8')
146+
messages = _prepare_image_messages(file_object, prompt)
76147

77148
try:
78149
response = client.chat.completions.create(
79150
model=model,
80-
messages=[
81-
{
82-
"role": "user",
83-
"content": [
84-
{
85-
"type": "text",
86-
"text": prompt
87-
},
88-
{
89-
"type": "image_url",
90-
"image_url": {
91-
"url": f"data:image/jpeg;base64,{base64_image}"
92-
}
93-
}
94-
]
95-
}
96-
]
151+
messages=messages
97152
)
98153

99-
# Extract the markdown content from the response
100-
markdown_content = response.choices[0].message.content
101-
logging.info("Page processed successfully")
102-
return markdown_content
154+
markdown_content = _validate_and_extract_content(response)
155+
156+
if markdown_content:
157+
logging.debug("Page processed successfully")
158+
return markdown_content
159+
else:
160+
logging.warning("Page is empty or contains no text.")
161+
return None
103162

104163
except Exception as e:
105164
logging.error(f"An error occurred while processing the image: {e}")
@@ -120,36 +179,24 @@ async def image_to_markdown_async(file_object, client, model="gpt-4o", prompt=DE
120179
tuple: A tuple containing the page number and the markdown representation of the image content, or None if an error occurs.
121180
"""
122181
# Log that we're about to process a page
123-
logging.info("About to process a page")
182+
logging.debug("About to process a page")
124183

125-
base64_image = base64.b64encode(file_object.read()).decode('utf-8')
184+
messages = _prepare_image_messages(file_object, prompt)
126185

127186
try:
128187
response = await client.chat.completions.create(
129188
model=model,
130-
messages=[
131-
{
132-
"role": "user",
133-
"content": [
134-
{
135-
"type": "text",
136-
"text": prompt
137-
},
138-
{
139-
"type": "image_url",
140-
"image_url": {
141-
"url": f"data:image/jpeg;base64,{base64_image}"
142-
}
143-
}
144-
]
145-
}
146-
]
189+
messages=messages
147190
)
148191

149-
# Extract the markdown content from the response
150-
markdown_content = response.choices[0].message.content
151-
logging.info("Page processed successfully")
152-
return markdown_content
192+
markdown_content = _validate_and_extract_content(response)
193+
194+
if markdown_content:
195+
logging.debug("Page processed successfully")
196+
return markdown_content
197+
else:
198+
logging.warning("Page is empty or contains no text.")
199+
return None
153200

154201
except Exception as e:
155202
logging.error(f"An error occurred while processing the image: {e}")
@@ -273,7 +320,7 @@ def process_pages(pdf_file, pages_list=None, use_llm_for_all=False, drawing_area
273320
for page_num in pages_list:
274321
page = doc.load_page(page_num - 1)
275322
if not use_llm_for_all and not is_visual_page(page, drawing_area_threshold=drawing_area_threshold):
276-
logging.info(f"The content of Page {page.number + 1} will be extracted using text parsing.")
323+
logging.debug(f"The content of Page {page.number + 1} will be extracted using text parsing.")
277324
# Extract text using traditional OCR
278325
markdown_content = page_to_markdown(page, gap_threshold=gap_threshold)
279326
if markdown_content:
@@ -283,7 +330,7 @@ def process_pages(pdf_file, pages_list=None, use_llm_for_all=False, drawing_area
283330
markdown_pages[page_num - 1] = f"Page {page.number + 1} is empty or contains no text."
284331

285332
else:
286-
logging.info(f"The content of page {page.number + 1} will be extracted using the LLM.")
333+
logging.debug(f"The content of page {page.number + 1} will be extracted using the LLM.")
287334
# Convert page to image
288335
image_file = page_to_image(page)
289336
image_files[page_num - 1] = io.BytesIO(image_file)
@@ -301,6 +348,7 @@ def ocr(
301348
use_llm_for_all=False,
302349
drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD,
303350
gap_threshold=DEFAULT_GAP_THRESHOLD,
351+
logging_level=logging.INFO,
304352
**kwargs
305353
):
306354
"""
@@ -318,17 +366,21 @@ def ocr(
318366
use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. Defaults to False.
319367
drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual.
320368
gap_threshold (int): The threshold for vertical gaps between text blocks.
369+
logging_level (int): The logging level. Defaults to logging.INFO.
321370
**kwargs: Additional keyword arguments.
322371
323372
Returns:
324373
list: A list of strings, each containing the markdown representation of a PDF page.
325374
"""
375+
# Set up logging
376+
logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s')
377+
326378
client = get_openai_client(api_key=api_key, base_url=base_url, **kwargs)
327379

328380
# Identify the maximum number of workers for parallel processing
329381
max_workers = os.getenv("AIPDF_MAX_CONCURRENT_REQUESTS", None)
330382
if max_workers:
331-
logging.info("The maximum number of concurrent requests is set to %s", max_workers)
383+
logging.debug("The maximum number of concurrent requests is set to %s", max_workers)
332384
max_workers = int(max_workers)
333385

334386
markdown_pages, image_files = process_pages(
@@ -377,6 +429,7 @@ async def ocr_async(
377429
use_llm_for_all=False,
378430
drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD,
379431
gap_threshold=DEFAULT_GAP_THRESHOLD,
432+
logging_level=logging.INFO,
380433
**kwargs
381434
):
382435
"""
@@ -394,18 +447,22 @@ async def ocr_async(
394447
use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. Defaults to False.
395448
drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual.
396449
gap_threshold (int): The threshold for vertical gaps between text blocks.
450+
logging_level (int): The logging level. Defaults to logging.INFO.
397451
**kwargs: Additional keyword arguments.
398452
399453
Returns:
400454
list: A list of strings, each containing the markdown representation of a PDF page.
401455
"""
456+
# Set up logging
457+
logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s')
458+
402459
client = get_openai_client(api_key=api_key, base_url=base_url, is_async=True, **kwargs)
403460

404461
# Set up a semaphore for limiting concurrent requests if specified
405462
semaphore = None
406463
max_concurrent_requests = os.getenv("AIPDF_MAX_CONCURRENT_REQUESTS", None)
407464
if max_concurrent_requests:
408-
logging.info("The maximum number of concurrent requests is set to %s", max_concurrent_requests)
465+
logging.debug("The maximum number of concurrent requests is set to %s", max_concurrent_requests)
409466
max_concurrent_requests = int(max_concurrent_requests)
410467
semaphore = asyncio.Semaphore(max_concurrent_requests)
411468

0 commit comments

Comments
 (0)