4040 logger .warning ("pdf2image not available. PDF processing will be limited. Install with: pip install pdf2image" )
4141
4242
def _check_poppler_available() -> tuple[bool, str]:
    """
    Check if poppler-utils is installed and available.

    The lookup proceeds in this order and stops at the first hit:

    1. ``pdfinfo`` then ``pdftoppm`` on PATH.
    2. ``pdfinfo`` in a few common Linux/macOS install locations.
    3. ``pdfinfo`` in ``bin`` directories one and two levels above the
       directory of the ``python3``/``python`` executable found on PATH.

    NOTE(review): hits from steps 2 and 3 are reported as available even
    though the binary is not on PATH. A consumer that resolves poppler via
    PATH only may still fail unless it is given the directory explicitly
    (pdf2image exposes a ``poppler_path`` argument for this) - confirm.

    Returns:
        Tuple of (is_available: bool, diagnostic_message: str). On success
        the message names the binary that was found; on failure it contains
        the number of PATH entries and installation hints.
    """
    # Local imports keep the helper self-contained, matching the block's
    # existing function-scope import style.
    import os
    import shutil
    from pathlib import Path

    def _is_executable_file(candidate: Path) -> bool:
        # A directory or a non-executable file with the right name is not a
        # usable poppler binary, so a bare existence check is not enough.
        return candidate.is_file() and os.access(candidate, os.X_OK)

    # Check for pdfinfo in PATH, with pdftoppm as the alternative
    for tool in ("pdfinfo", "pdftoppm"):
        tool_path = shutil.which(tool)
        if tool_path:
            return True, f"Found {tool} at: {tool_path}"

    # Check common installation locations
    common_paths = [
        "/usr/bin/pdfinfo",
        "/usr/local/bin/pdfinfo",
        "/opt/homebrew/bin/pdfinfo",  # macOS Homebrew on Apple Silicon
        "/usr/local/opt/poppler/bin/pdfinfo",  # macOS Homebrew
    ]
    for path in common_paths:
        if _is_executable_file(Path(path)):
            return True, f"Found pdfinfo at: {path} (not in PATH)"

    # Check if we're in a virtual environment and poppler might be elsewhere
    python_path = shutil.which("python3") or shutil.which("python")
    if python_path:
        python_dir = Path(python_path).parent
        # Probe the bin directories of the two enclosing prefixes
        for parent in (python_dir.parent, python_dir.parent.parent):
            potential_path = parent / "bin" / "pdfinfo"
            if _is_executable_file(potential_path):
                return True, f"Found pdfinfo at: {potential_path} (not in PATH)"

    # Diagnostic information. Split on os.pathsep (":" on POSIX, ";" on
    # Windows) and drop empty entries so an unset/empty PATH reports 0.
    path_env = os.getenv("PATH", "")
    path_entries = [entry for entry in path_env.split(os.pathsep) if entry]
    diagnostic = (
        "poppler-utils not found in PATH. "
        f"PATH contains: {len(path_entries)} directories. "
        "Install with: sudo apt-get install poppler-utils (Ubuntu/Debian) "
        "or brew install poppler (macOS). "
        "If already installed, ensure it's in your PATH environment variable."
    )

    return False, diagnostic
5296
5397
5498class NeMoRetrieverPreprocessor :
@@ -217,13 +261,14 @@ async def _extract_pdf_images(self, file_path: str) -> List[Image.Image]:
217261 )
218262
219263 # Check if poppler-utils is available before attempting conversion
220- if not _check_poppler_available ():
221- raise RuntimeError (
222- "poppler-utils is not installed or not in PATH. "
223- "Install it with: sudo apt-get install poppler-utils (Ubuntu/Debian) "
224- "or brew install poppler (macOS). "
225- "This is required for PDF to image conversion."
226- )
264+ poppler_available , diagnostic_msg = _check_poppler_available ()
265+ if not poppler_available :
266+ logger .warning (f"Poppler check failed: { diagnostic_msg } " )
267+ # Still try to proceed - pdf2image might work if poppler is in a non-standard location
268+ # or the check might be too strict. pdf2image will raise a clearer error if it fails.
269+ logger .info ("Attempting PDF conversion anyway - pdf2image will provide detailed error if poppler is truly missing" )
270+ else :
271+ logger .debug (f"Poppler check passed: { diagnostic_msg } " )
227272
228273 logger .info (f"Converting PDF to images: { file_path } " )
229274
@@ -233,13 +278,30 @@ async def _extract_pdf_images(self, file_path: str) -> List[Image.Image]:
233278 # Convert PDF pages to PIL Images
234279 # dpi=150 provides good quality for OCR processing
235280 # first_page and last_page limit the number of pages processed
236- pdf_images = convert_from_path (
237- file_path ,
238- dpi = 150 ,
239- first_page = 1 ,
240- last_page = max_pages ,
241- fmt = 'png'
242- )
281+ try :
282+ pdf_images = convert_from_path (
283+ file_path ,
284+ dpi = 150 ,
285+ first_page = 1 ,
286+ last_page = max_pages ,
287+ fmt = 'png'
288+ )
289+ except Exception as pdf_error :
290+ # Check if it's a poppler-related error
291+ error_str = str (pdf_error ).lower ()
292+ if "poppler" in error_str or "pdfinfo" in error_str or "not installed" in error_str :
293+ raise RuntimeError (
294+ f"poppler-utils is required for PDF processing but is not available. "
295+ f"Error: { pdf_error } \n \n "
296+ f"Installation instructions:\n "
297+ f" Ubuntu/Debian: sudo apt-get install poppler-utils\n "
298+ f" macOS: brew install poppler\n "
299+ f" Windows: Download from http://blog.alivate.com.au/poppler-windows/ or use: choco install poppler\n \n "
300+ f"After installation, ensure poppler-utils binaries are in your PATH. "
301+ f"You may need to restart your application or terminal."
302+ ) from pdf_error
303+ # Re-raise other errors as-is
304+ raise
243305
244306 total_pages = len (pdf_images )
245307 logger .info (f"Converted { total_pages } pages from PDF" )
@@ -248,6 +310,9 @@ async def _extract_pdf_images(self, file_path: str) -> List[Image.Image]:
248310 images = pdf_images
249311 logger .info (f"Extracted { len (images )} pages from PDF" )
250312
313+ except RuntimeError :
314+ # Re-raise RuntimeError (our improved poppler error) as-is
315+ raise
251316 except Exception as e :
252317 logger .error (f"PDF image extraction failed: { e } " , exc_info = True )
253318 raise
0 commit comments