 from openai import OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI


-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
 DEFAULT_PROMPT = """
 Extract the full markdown text from the given image, following these guidelines:
 - Respond only with markdown, no additional commentary.
@@ -56,6 +53,80 @@ def get_openai_client(api_key=None, base_url='https://api.openai.com/v1', is_asy
     return OpenAI(api_key=api_key, base_url=base_url, **kwargs)


+def _prepare_image_messages(file_object, prompt):
+    """
+    Helper function to prepare messages for OpenAI API call.
+
+    Args:
+        file_object (io.BytesIO): The image file object.
+        prompt (str): The prompt to send to the API.
+
+    Returns:
+        list: The messages list for the API call.
+    """
+    base64_image = base64.b64encode(file_object.read()).decode('utf-8')
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": prompt
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{base64_image}"
+                    }
+                }
+            ]
+        }
+    ]
+
+
+def _validate_and_extract_content(response):
+    """
+    Helper function to validate OpenAI API response and extract content.
+
+    Args:
+        response: The response object from OpenAI API.
+
+    Returns:
+        str or None: The extracted content, or None if validation fails.
+    """
+    # Validate the response structure before accessing choices
+    if not response:
+        logging.error(f"Received empty response from OpenAI API: {response}")
+        return None
+
+    if not hasattr(response, 'choices') or not response.choices:
+        logging.error(f"Response does not contain choices or choices is empty. Response: {response}")
+        return None
+
+    if len(response.choices) == 0:
+        logging.error(f"Response choices list is empty. Response: {response}")
+        return None
+
+    first_choice = response.choices[0]
+    if not hasattr(first_choice, 'message') or not first_choice.message:
+        logging.error(f"Response choice does not contain message. First choice: {first_choice}")
+        return None
+
+    if not hasattr(first_choice.message, 'content'):
+        logging.error(f"Response message does not contain content. Message: {first_choice.message}")
+        return None
+
+    markdown_content = first_choice.message.content
+
+    # Additional check for empty or None content
+    if not markdown_content:
+        logging.warning(f"Response content is empty or None. Content: {repr(markdown_content)}")
+        return None
+
+    return markdown_content
+
+
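For illustration only, a minimal sketch of how the new validation helper behaves; the SimpleNamespace stand-ins below are hypothetical objects, not part of this change:

from types import SimpleNamespace

# Hypothetical response stand-ins, used only to exercise _validate_and_extract_content.
empty_response = SimpleNamespace(choices=[])
good_response = SimpleNamespace(
    choices=[SimpleNamespace(message=SimpleNamespace(content="# Page 1\n\nSome extracted text."))]
)

assert _validate_and_extract_content(empty_response) is None  # logs an error instead of raising
assert _validate_and_extract_content(good_response).startswith("# Page 1")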
 def image_to_markdown(file_object, client, model="gpt-4o", prompt=DEFAULT_PROMPT):
     """
     Process a single image file and convert its content to markdown using OpenAI's API.
@@ -70,36 +141,24 @@ def image_to_markdown(file_object, client, model="gpt-4o", prompt=DEFAULT_PROMP
         str: The markdown representation of the image content, or None if an error occurs.
     """
     # Log that we're about to process a page
-    logging.info("About to process a page")
+    logging.debug("About to process a page")

-    base64_image = base64.b64encode(file_object.read()).decode('utf-8')
+    messages = _prepare_image_messages(file_object, prompt)

     try:
         response = client.chat.completions.create(
             model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": prompt
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/jpeg;base64,{base64_image}"
-                            }
-                        }
-                    ]
-                }
-            ]
+            messages=messages
         )

-        # Extract the markdown content from the response
-        markdown_content = response.choices[0].message.content
-        logging.info("Page processed successfully")
-        return markdown_content
+        markdown_content = _validate_and_extract_content(response)
+
+        if markdown_content:
+            logging.debug("Page processed successfully")
+            return markdown_content
+        else:
+            logging.warning("Page is empty or contains no text.")
+            return None

     except Exception as e:
         logging.error(f"An error occurred while processing the image: {e}")
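As a rough usage sketch of the synchronous path (the file name, API key, and model below are placeholders, not values taken from this change):

import io

# Hypothetical caller: one JPEG page converted to markdown.
client = get_openai_client(api_key="sk-...", base_url="https://api.openai.com/v1")

with open("page.jpg", "rb") as f:
    page_image = io.BytesIO(f.read())

markdown = image_to_markdown(page_image, client, model="gpt-4o")
if markdown is None:
    print("Page could not be processed or contained no text.")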
@@ -120,36 +179,24 @@ async def image_to_markdown_async(file_object, client, model="gpt-4o", prompt=DE
         tuple: A tuple containing the page number and the markdown representation of the image content, or None if an error occurs.
     """
     # Log that we're about to process a page
-    logging.info("About to process a page")
+    logging.debug("About to process a page")

-    base64_image = base64.b64encode(file_object.read()).decode('utf-8')
+    messages = _prepare_image_messages(file_object, prompt)

     try:
         response = await client.chat.completions.create(
             model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": prompt
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/jpeg;base64,{base64_image}"
-                            }
-                        }
-                    ]
-                }
-            ]
+            messages=messages
         )

-        # Extract the markdown content from the response
-        markdown_content = response.choices[0].message.content
-        logging.info("Page processed successfully")
-        return markdown_content
+        markdown_content = _validate_and_extract_content(response)
+
+        if markdown_content:
+            logging.debug("Page processed successfully")
+            return markdown_content
+        else:
+            logging.warning("Page is empty or contains no text.")
+            return None

     except Exception as e:
         logging.error(f"An error occurred while processing the image: {e}")
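A similar hedged sketch for the async path, assuming it is driven with asyncio.run (again with a placeholder file name and key):

import asyncio
import io

async def extract_one_page():
    # Hypothetical async caller, mirroring the synchronous example above.
    client = get_openai_client(api_key="sk-...", is_async=True)
    with open("page.jpg", "rb") as f:
        page_image = io.BytesIO(f.read())
    return await image_to_markdown_async(page_image, client, model="gpt-4o")

print(asyncio.run(extract_one_page()))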
@@ -273,7 +320,7 @@ def process_pages(pdf_file, pages_list=None, use_llm_for_all=False, drawing_area
     for page_num in pages_list:
         page = doc.load_page(page_num - 1)
         if not use_llm_for_all and not is_visual_page(page, drawing_area_threshold=drawing_area_threshold):
-            logging.info(f"The content of Page {page.number + 1} will be extracted using text parsing.")
+            logging.debug(f"The content of Page {page.number + 1} will be extracted using text parsing.")
             # Extract text using traditional OCR
             markdown_content = page_to_markdown(page, gap_threshold=gap_threshold)
             if markdown_content:
@@ -283,7 +330,7 @@ def process_pages(pdf_file, pages_list=None, use_llm_for_all=False, drawing_area
                 markdown_pages[page_num - 1] = f"Page {page.number + 1} is empty or contains no text."

         else:
-            logging.info(f"The content of page {page.number + 1} will be extracted using the LLM.")
+            logging.debug(f"The content of page {page.number + 1} will be extracted using the LLM.")
             # Convert page to image
             image_file = page_to_image(page)
             image_files[page_num - 1] = io.BytesIO(image_file)
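For orientation, a condensed sketch of the per-page routing shown above; the PyMuPDF import name and the sample file are assumptions, not taken from this diff:

import io
import fitz  # PyMuPDF, assumed from the doc.load_page / page.number usage above

doc = fitz.open("sample.pdf")  # hypothetical input file
page = doc.load_page(0)

if is_visual_page(page, drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD):
    # Visual page: render it to an image now, let the LLM extract the markdown later
    image_file = io.BytesIO(page_to_image(page))
else:
    # Text-heavy page: extract markdown directly with the text parser
    markdown_content = page_to_markdown(page, gap_threshold=DEFAULT_GAP_THRESHOLD)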
@@ -301,6 +348,7 @@ def ocr(
     use_llm_for_all=False,
     drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD,
     gap_threshold=DEFAULT_GAP_THRESHOLD,
+    logging_level=logging.INFO,
     **kwargs
 ):
     """
@@ -318,17 +366,21 @@ def ocr(
         use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. Defaults to False.
         drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual.
         gap_threshold (int): The threshold for vertical gaps between text blocks.
+        logging_level (int): The logging level. Defaults to logging.INFO.
         **kwargs: Additional keyword arguments.

     Returns:
         list: A list of strings, each containing the markdown representation of a PDF page.
     """
+    # Set up logging
+    logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s')
+
     client = get_openai_client(api_key=api_key, base_url=base_url, **kwargs)

     # Identify the maximum number of workers for parallel processing
     max_workers = os.getenv("AIPDF_MAX_CONCURRENT_REQUESTS", None)
     if max_workers:
-        logging.info("The maximum number of concurrent requests is set to %s", max_workers)
+        logging.debug("The maximum number of concurrent requests is set to %s", max_workers)
         max_workers = int(max_workers)

     markdown_pages, image_files = process_pages(
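A hedged end-to-end sketch of the synchronous entry point; the first positional argument of ocr() is not visible in this hunk, so passing the PDF content directly is an assumption (mirroring process_pages(pdf_file, ...)), and the file name and key are placeholders:

import logging

with open("document.pdf", "rb") as f:
    pdf_bytes = f.read()

pages = ocr(
    pdf_bytes,                    # assumed: the PDF content as the first argument
    api_key="sk-...",
    use_llm_for_all=False,
    logging_level=logging.DEBUG,  # logging level is now configurable per call
)

for page_markdown in pages:
    print(page_markdown)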
@@ -377,6 +429,7 @@ async def ocr_async(
     use_llm_for_all=False,
     drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD,
     gap_threshold=DEFAULT_GAP_THRESHOLD,
+    logging_level=logging.INFO,
     **kwargs
 ):
     """
@@ -394,18 +447,22 @@ async def ocr_async(
         use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. Defaults to False.
         drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual.
         gap_threshold (int): The threshold for vertical gaps between text blocks.
+        logging_level (int): The logging level. Defaults to logging.INFO.
         **kwargs: Additional keyword arguments.

     Returns:
         list: A list of strings, each containing the markdown representation of a PDF page.
     """
+    # Set up logging
+    logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s')
+
     client = get_openai_client(api_key=api_key, base_url=base_url, is_async=True, **kwargs)

     # Set up a semaphore for limiting concurrent requests if specified
     semaphore = None
     max_concurrent_requests = os.getenv("AIPDF_MAX_CONCURRENT_REQUESTS", None)
     if max_concurrent_requests:
-        logging.info("The maximum number of concurrent requests is set to %s", max_concurrent_requests)
+        logging.debug("The maximum number of concurrent requests is set to %s", max_concurrent_requests)
         max_concurrent_requests = int(max_concurrent_requests)
         semaphore = asyncio.Semaphore(max_concurrent_requests)
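Finally, a sketch of how the AIPDF_MAX_CONCURRENT_REQUESTS cap interacts with the async entry point; as with the synchronous example, the first argument to ocr_async() and the file name are assumptions:

import asyncio
import os

# Cap concurrent LLM requests; the value is read back via os.getenv above.
os.environ["AIPDF_MAX_CONCURRENT_REQUESTS"] = "4"

async def run_ocr():
    with open("document.pdf", "rb") as f:
        pdf_bytes = f.read()
    # Assumed signature, mirroring the synchronous sketch above.
    return await ocr_async(pdf_bytes, api_key="sk-...", use_llm_for_all=True)

pages = asyncio.run(run_ocr())
print(f"Extracted {len(pages)} pages of markdown.")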