@@ -462,91 +462,71 @@ async def _prep_document(self, document_data=None) -> List[str]:
         ]
         ALLOWED_EXTENSIONS += IMAGE_EXTENSIONS  # Include images
         MAX_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB
-
-        async def get_url_info(url):
-            """Get content type and filename from URL via HEAD request"""
-            async with aiohttp.ClientSession() as session:
-                async with session.head(url, allow_redirects=True) as response:
-                    content_type = response.headers.get("Content-Type", "").lower()
-                    content_disposition = response.headers.get(
-                        "Content-Disposition", ""
-                    )
-                    content_length = response.headers.get("Content-Length")
-                    if content_length and int(content_length) > MAX_SIZE_BYTES:
-                        raise ValidationError("Document exceeds maximum size of 50MB.")
-                    filename_match = re.findall('filename="(.+)"', content_disposition)
-                    filename = (
-                        filename_match[0]
-                        if filename_match
-                        else os.path.basename(urllib.parse.urlparse(url).path)
-                    )
-                    return content_type, filename
-
-        async def fetch_document(url):
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url) as response:
-                    if response.status != 200:
-                        raise ValidationError("Failed to fetch document from URL")
-                    return await response.read()
-
         # Step 1: Get the document data
+        filename = None  # document.pdf or document.docx
         extension = None
-        filename = None
-        # every block should give back a document_data, extension, and filename
-        if self.s3_file and not document_data:
+        # here we should have a document_data and filename
+        if document_data:
+            logger.info("Document data provided.")
+            # Get MIME type from magic
+            mime = magic.Magic(mime=True)
+            mime_type = mime.from_buffer(document_data)
+            extension = get_extension_from_mime(mime_type).lstrip(".")
+            filename = f"document.{extension}"
+
+        # every block should give back a document_data, and filename w/ extension
+        elif self.s3_file:
             logger.info(f"Fetching document from S3: {self.s3_file.name}")
-            extension = os.path.splitext(self.s3_file.name)[1][1:].lower()
-            logger.info(f"Document extension: {extension}")
             with self.s3_file.open("rb") as f:
                 document_data = f.read()
             filename = os.path.basename(self.s3_file.name)
+            logger.info(f"Document filename: {filename}")

-        elif self.url and not document_data:
-            content_type, filename = await get_url_info(self.url)
+        elif self.url:
+            content_type, filename = await self._get_url_info()
             if "text/html" in content_type:
                 logger.info("Document is a webpage.")
                 # It's a webpage, convert to PDF
                 document_data = await self._convert_url_to_pdf(self.url)
                 logger.info("Successfully converted URL to PDF.")
-                extension = "pdf"
+                filename = f"{filename}.pdf"
             else:
                 # It's a regular file
                 logger.info(f"Fetching document from URL: {self.url}")
-                document_data = await fetch_document(self.url)
+                document_data = await self._fetch_document()
                 if "application/pdf" in content_type:
                     extension = "pdf"
                 else:
                     extension = get_extension_from_mime(content_type).lstrip(".")
-                logger.info(f"Document extension: {extension}")
+                assert filename, "Filename should be set"
+                name = os.path.splitext(filename)[0]
+                filename = f"{name}.{extension}"
+                logger.info(f"Document filename: {filename}")
+        else:
+            raise ValidationError(
+                "Document data is missing. Please provide a document or a URL."
+            )

-        # here we should have a document_data and extension
-        if document_data and not extension and not filename:
-            # Get MIME type from magic
-            mime = magic.Magic(mime=True)
-            mime_type = mime.from_buffer(document_data)
-            extension = get_extension_from_mime(mime_type).lstrip(".")
-            filename = f"document.{extension}"
+        # make sure we have the document data and filename
+        assert document_data, "Document data should be set"
+        assert filename, "Filename should be set"

-        # Validate the document
-        if not document_data or not extension or not filename:
-            raise ValidationError("Document data is missing.")
+        if not extension:
+            extension = os.path.splitext(filename)[1].lstrip(".")

         if len(document_data) > MAX_SIZE_BYTES:
             raise ValidationError("Document exceeds maximum size of 50MB.")

         if extension not in ALLOWED_EXTENSIONS:
             raise ValidationError(f"File extension .{extension} is not allowed.")

-        logger.info(f"Document extension: {extension}")
-
         # Determine if the document is an image or PDF
         is_image = extension in IMAGE_EXTENSIONS
         is_pdf = extension == "pdf"
         # Step 2: Convert to PDF if necessary
         if not is_image and not is_pdf:
             logger.info(f"Converting document to PDF. Extension: {extension}")
             # Use Gotenberg to convert to PDF
-            filename = f"{filename}.{extension}"
             pdf_data = await self._convert_to_pdf(document_data, filename)
         elif is_pdf:
             logger.info("Document is already a PDF.")
@@ -559,7 +539,12 @@ async def fetch_document(url):

         # here all documents are converted to pdf
         # Step 3: Turn the PDF into images via pdf2image
-        images = convert_from_bytes(pdf_data)
+        try:
+            images = convert_from_bytes(pdf_data)
+        except Exception:
+            raise ValidationError(
+                "Failed to convert PDF to images. The PDF may be corrupted, which sometimes happens with URLs. Try downloading the document and sending us the base64."
+            )
         logger.info(f"Successfully converted PDF to {len(images)} images.")

         # here all documents are converted to images
@@ -575,6 +560,40 @@ async def fetch_document(url):
         # Step 5: returning the base64 images
         return base64_images

+    async def _get_url_info(self):
+        """Get content type and filename from URL via HEAD request"""
+        MAX_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB
+        async with aiohttp.ClientSession() as session:
+            async with session.head(self.url, allow_redirects=True) as response:
+                # handle when the response is not 200
+                if response.status != 200:
+                    raise ValidationError(
+                        "Failed to fetch document info from URL. Some documents are protected by anti-scraping measures. We recommend you download them and send us the base64."
+                    )
+                content_type = response.headers.get("Content-Type", "").lower()
+                content_disposition = response.headers.get("Content-Disposition", "")
+                content_length = response.headers.get("Content-Length")
+                if content_length and int(content_length) > MAX_SIZE_BYTES:
+                    raise ValidationError("Document exceeds maximum size of 50MB.")
+                filename_match = re.findall('filename="(.+)"', content_disposition)
+                filename = (
+                    filename_match[0]
+                    if filename_match
+                    else os.path.basename(urllib.parse.urlparse(self.url).path)
+                )
+                if not filename:
+                    filename = "downloaded_file"
+                return content_type, filename
+
+    async def _fetch_document(self):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(self.url) as response:
+                if response.status != 200:
+                    raise ValidationError(
+                        "Failed to fetch document from URL. Some documents are protected by anti-scraping measures. We recommend you download them and send us the base64."
+                    )
+                return await response.read()
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_fixed(2),
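For readers who want to try the new byte-input path outside the service, here is a minimal, self-contained sketch of the same flow: MIME sniffing with python-magic when only raw bytes are provided, then pdf2image rasterization with base64 output. The helper names, the small extension mapping standing in for the project's get_extension_from_mime(), and the PNG encoding are assumptions for illustration, not code from this PR.

```python
# Standalone sketch of the byte-input path this PR adds: sniff the MIME type
# with python-magic, synthesize a filename, then rasterize PDF bytes with
# pdf2image. Helper names here are illustrative only; they are not part of
# the PR.
import base64
import io

import magic                      # python-magic
from pdf2image import convert_from_bytes


def sniff_filename(document_data: bytes) -> str:
    """Mirror the new 'if document_data:' branch: MIME -> extension -> name."""
    mime_type = magic.Magic(mime=True).from_buffer(document_data)
    # The PR uses a project helper get_extension_from_mime(); a plain
    # mapping stands in for it here.
    extension = {"application/pdf": "pdf", "image/png": "png"}.get(mime_type, "bin")
    return f"document.{extension}"


def pdf_bytes_to_base64_images(pdf_data: bytes) -> list[str]:
    """Mirror step 3: each PDF page becomes a base64-encoded PNG string."""
    images = convert_from_bytes(pdf_data)  # raises if the PDF is corrupted
    encoded = []
    for page in images:
        buf = io.BytesIO()
        page.save(buf, format="PNG")
        encoded.append(base64.b64encode(buf.getvalue()).decode())
    return encoded


if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:
        data = f.read()
    print(sniff_filename(data))                      # -> document.pdf
    print(len(pdf_bytes_to_base64_images(data)), "pages encoded")
```

The sketch keeps the same ordering as the patched method: resolve a filename and extension first, validate, then convert pages to images, so failures surface as early as possible.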