Skip to content

Commit 784cac6

Browse files
committed
fix(document): improve poppler-utils detection and error handling
- Enhance _check_poppler_available() to check multiple binaries and common paths
  - Check for pdfinfo and pdftoppm in PATH
  - Search common installation locations (/usr/bin, /usr/local/bin, Homebrew paths)
  - Return diagnostic messages to help troubleshoot PATH issues
- Make the check more lenient: attempt conversion even if the check fails
- Add specific error handling for pdf2image poppler-related errors
- Provide clear installation instructions in error messages
- Log warnings instead of failing immediately if the check fails

This helps diagnose cases where poppler-utils is installed but not in PATH, or when the application needs to be restarted after installation.
1 parent 4826efe commit 784cac6

1 file changed

Lines changed: 83 additions & 18 deletions

File tree

src/api/agents/document/preprocessing/nemo_retriever.py

Lines changed: 83 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,59 @@
4040
logger.warning("pdf2image not available. PDF processing will be limited. Install with: pip install pdf2image")
4141

4242

43-
def _check_poppler_available() -> tuple[bool, str]:
    """
    Check if poppler-utils is installed and available.

    The lookup proceeds in three steps and stops at the first hit:

    1. ``pdfinfo`` / ``pdftoppm`` resolved through the PATH environment variable.
    2. The same binaries in common installation directories (system paths
       and Homebrew prefixes), in case they are installed but not in PATH.
    3. The same binaries in ``bin`` directories next to the ``python3`` /
       ``python`` interpreter found in PATH (e.g. an environment prefix).

    Returns:
        Tuple of (is_available: bool, diagnostic_message: str). On success the
        message names the binary that was found and where; on failure it
        summarises PATH and gives installation instructions.
    """
    # Imported locally, like the other helpers this function needs, so the
    # check does not depend on what the module imports at top level.
    import os
    import shutil
    from pathlib import Path

    # pdf2image shells out to both tools; finding either one is taken as
    # evidence that poppler-utils is installed.
    binaries = ("pdfinfo", "pdftoppm")

    # Step 1: regular PATH lookup.
    for binary in binaries:
        found = shutil.which(binary)
        if found:
            return True, f"Found {binary} at: {found}"

    # Step 2: common installation locations that may be missing from PATH.
    fallback_dirs = [
        Path("/usr/bin"),
        Path("/usr/local/bin"),
        Path("/opt/homebrew/bin"),  # macOS Homebrew on Apple Silicon
        Path("/usr/local/opt/poppler/bin"),  # macOS Homebrew
    ]

    # Step 3: if we're in a virtual environment, poppler might live in a
    # ``bin`` directory near the interpreter instead.
    python_path = shutil.which("python3") or shutil.which("python")
    if python_path:
        python_dir = Path(python_path).parent
        for parent in (python_dir.parent, python_dir.parent.parent):
            fallback_dirs.append(parent / "bin")

    for directory in fallback_dirs:
        for binary in binaries:
            candidate = directory / binary
            # is_file() rather than exists(): a directory with a matching
            # name must not count as an installed binary.
            if candidate.is_file():
                return True, f"Found {binary} at: {candidate} (not in PATH)"

    # Diagnostic information. Split on os.pathsep (":" on POSIX, ";" on
    # Windows) and drop empty entries so an unset/empty PATH reports 0.
    path_env = os.getenv("PATH", "")
    path_dirs = [entry for entry in path_env.split(os.pathsep) if entry]
    diagnostic = (
        "poppler-utils not found in PATH. "
        f"PATH contains: {len(path_dirs)} directories. "
        "Install with: sudo apt-get install poppler-utils (Ubuntu/Debian) "
        "or brew install poppler (macOS). "
        "If already installed, ensure it's in your PATH environment variable."
    )

    return False, diagnostic
5296

5397

5498
class NeMoRetrieverPreprocessor:
@@ -217,13 +261,14 @@ async def _extract_pdf_images(self, file_path: str) -> List[Image.Image]:
217261
)
218262

219263
# Check if poppler-utils is available before attempting conversion
220-
if not _check_poppler_available():
221-
raise RuntimeError(
222-
"poppler-utils is not installed or not in PATH. "
223-
"Install it with: sudo apt-get install poppler-utils (Ubuntu/Debian) "
224-
"or brew install poppler (macOS). "
225-
"This is required for PDF to image conversion."
226-
)
264+
poppler_available, diagnostic_msg = _check_poppler_available()
265+
if not poppler_available:
266+
logger.warning(f"Poppler check failed: {diagnostic_msg}")
267+
# Still try to proceed - pdf2image might work if poppler is in a non-standard location
268+
# or the check might be too strict. pdf2image will raise a clearer error if it fails.
269+
logger.info("Attempting PDF conversion anyway - pdf2image will provide detailed error if poppler is truly missing")
270+
else:
271+
logger.debug(f"Poppler check passed: {diagnostic_msg}")
227272

228273
logger.info(f"Converting PDF to images: {file_path}")
229274

@@ -233,13 +278,30 @@ async def _extract_pdf_images(self, file_path: str) -> List[Image.Image]:
233278
# Convert PDF pages to PIL Images
234279
# dpi=150 provides good quality for OCR processing
235280
# first_page and last_page limit the number of pages processed
236-
pdf_images = convert_from_path(
237-
file_path,
238-
dpi=150,
239-
first_page=1,
240-
last_page=max_pages,
241-
fmt='png'
242-
)
281+
try:
282+
pdf_images = convert_from_path(
283+
file_path,
284+
dpi=150,
285+
first_page=1,
286+
last_page=max_pages,
287+
fmt='png'
288+
)
289+
except Exception as pdf_error:
290+
# Check if it's a poppler-related error
291+
error_str = str(pdf_error).lower()
292+
if "poppler" in error_str or "pdfinfo" in error_str or "not installed" in error_str:
293+
raise RuntimeError(
294+
f"poppler-utils is required for PDF processing but is not available. "
295+
f"Error: {pdf_error}\n\n"
296+
f"Installation instructions:\n"
297+
f" Ubuntu/Debian: sudo apt-get install poppler-utils\n"
298+
f" macOS: brew install poppler\n"
299+
f" Windows: Download from http://blog.alivate.com.au/poppler-windows/ or use: choco install poppler\n\n"
300+
f"After installation, ensure poppler-utils binaries are in your PATH. "
301+
f"You may need to restart your application or terminal."
302+
) from pdf_error
303+
# Re-raise other errors as-is
304+
raise
243305

244306
total_pages = len(pdf_images)
245307
logger.info(f"Converted {total_pages} pages from PDF")
@@ -248,6 +310,9 @@ async def _extract_pdf_images(self, file_path: str) -> List[Image.Image]:
248310
images = pdf_images
249311
logger.info(f"Extracted {len(images)} pages from PDF")
250312

313+
except RuntimeError:
314+
# Re-raise RuntimeError (our improved poppler error) as-is
315+
raise
251316
except Exception as e:
252317
logger.error(f"PDF image extraction failed: {e}", exc_info=True)
253318
raise

0 commit comments

Comments
 (0)