Skip to content

Commit b4cf4e0

Browse files
Merge pull request #79 from tjmlabs/main-better-errors
Main better errors
2 parents 4f4e452 + 5146ed2 commit b4cf4e0

File tree

2 files changed

+151
-58
lines changed

2 files changed

+151
-58
lines changed

web/api/models.py

Lines changed: 70 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -462,91 +462,71 @@ async def _prep_document(self, document_data=None) -> List[str]:
462462
]
463463
ALLOWED_EXTENSIONS += IMAGE_EXTENSIONS # Include images
464464
MAX_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB
465-
466-
async def get_url_info(url):
467-
"""Get content type and filename from URL via HEAD request"""
468-
async with aiohttp.ClientSession() as session:
469-
async with session.head(url, allow_redirects=True) as response:
470-
content_type = response.headers.get("Content-Type", "").lower()
471-
content_disposition = response.headers.get(
472-
"Content-Disposition", ""
473-
)
474-
content_length = response.headers.get("Content-Length")
475-
if content_length and int(content_length) > MAX_SIZE_BYTES:
476-
raise ValidationError("Document exceeds maximum size of 50MB.")
477-
filename_match = re.findall('filename="(.+)"', content_disposition)
478-
filename = (
479-
filename_match[0]
480-
if filename_match
481-
else os.path.basename(urllib.parse.urlparse(url).path)
482-
)
483-
return content_type, filename
484-
485-
async def fetch_document(url):
486-
async with aiohttp.ClientSession() as session:
487-
async with session.get(url) as response:
488-
if response.status != 200:
489-
raise ValidationError("Failed to fetch document from URL")
490-
return await response.read()
491-
492465
# Step 1: Get the document data
466+
filename = None # document.pdf or document.docx
493467
extension = None
494-
filename = None
495-
# every block should give back a document_data, extension, and filename
496-
if self.s3_file and not document_data:
468+
# here we should have a document_data and filename
469+
if document_data:
470+
logger.info("Document data provided.")
471+
# Get MIME type from magic
472+
mime = magic.Magic(mime=True)
473+
mime_type = mime.from_buffer(document_data)
474+
extension = get_extension_from_mime(mime_type).lstrip(".")
475+
filename = f"document.{extension}"
476+
477+
# every block should give back a document_data, and filename w/ extension
478+
elif self.s3_file:
497479
logger.info(f"Fetching document from S3: {self.s3_file.name}")
498-
extension = os.path.splitext(self.s3_file.name)[1][1:].lower()
499-
logger.info(f"Document extension: {extension}")
500480
with self.s3_file.open("rb") as f:
501481
document_data = f.read()
502482
filename = os.path.basename(self.s3_file.name)
483+
logger.info(f"Document filename: {filename}")
503484

504-
elif self.url and not document_data:
505-
content_type, filename = await get_url_info(self.url)
485+
elif self.url:
486+
content_type, filename = await self._get_url_info()
506487
if "text/html" in content_type:
507488
logger.info("Document is a webpage.")
508489
# It's a webpage, convert to PDF
509490
document_data = await self._convert_url_to_pdf(self.url)
510491
logger.info("Successfully converted URL to PDF.")
511-
extension = "pdf"
492+
filename = f"{filename}.pdf"
512493
else:
513494
# It's a regular file
514495
logger.info(f"Fetching document from URL: {self.url}")
515-
document_data = await fetch_document(self.url)
496+
document_data = await self._fetch_document()
516497
if "application/pdf" in content_type:
517498
extension = "pdf"
518499
else:
519500
extension = get_extension_from_mime(content_type).lstrip(".")
520-
logger.info(f"Document extension: {extension}")
501+
assert filename, "Filename should be set"
502+
name = os.path.splitext(filename)[0]
503+
filename = f"{name}.{extension}"
504+
logger.info(f"Document filename: {filename}")
505+
else:
506+
raise ValidationError(
507+
"Document data is missing. Please provide a document or a URL."
508+
)
521509

522-
# here we should have a document_data and extension
523-
if document_data and not extension and not filename:
524-
# Get MIME type from magic
525-
mime = magic.Magic(mime=True)
526-
mime_type = mime.from_buffer(document_data)
527-
extension = get_extension_from_mime(mime_type).lstrip(".")
528-
filename = f"document.{extension}"
510+
# make sure we have the document data and filename
511+
assert document_data, "Document data should be set"
512+
assert filename, "Filename should be set"
529513

530-
# Validate the document
531-
if not document_data or not extension or not filename:
532-
raise ValidationError("Document data is missing.")
514+
if not extension:
515+
extension = os.path.splitext(filename)[1].lstrip(".")
533516

534517
if len(document_data) > MAX_SIZE_BYTES:
535518
raise ValidationError("Document exceeds maximum size of 50MB.")
536519

537520
if extension not in ALLOWED_EXTENSIONS:
538521
raise ValidationError(f"File extension .{extension} is not allowed.")
539522

540-
logger.info(f"Document extension: {extension}")
541-
542523
# Determine if the document is an image or PDF
543524
is_image = extension in IMAGE_EXTENSIONS
544525
is_pdf = extension == "pdf"
545526
# Step 2: Convert to PDF if necessary
546527
if not is_image and not is_pdf:
547528
logger.info(f"Converting document to PDF. Extension: {extension}")
548529
# Use Gotenberg to convert to PDF
549-
filename = f"{filename}.{extension}"
550530
pdf_data = await self._convert_to_pdf(document_data, filename)
551531
elif is_pdf:
552532
logger.info("Document is already a PDF.")
@@ -559,7 +539,12 @@ async def fetch_document(url):
559539

560540
# here all documents are converted to pdf
561541
# Step 3: Turn the PDF into images via pdf2image
562-
images = convert_from_bytes(pdf_data)
542+
try:
543+
images = convert_from_bytes(pdf_data)
544+
except Exception:
545+
raise ValidationError(
546+
"Failed to convert PDF to images. The PDF may be corrupted, which sometimes happens with URLs. Try downloading the document and sending us the base64."
547+
)
563548
logger.info(f"Successfully converted PDF to {len(images)} images.")
564549

565550
# here all documents are converted to images
@@ -575,6 +560,40 @@ async def fetch_document(url):
575560
# Step 5: returning the base64 images
576561
return base64_images
577562

563+
async def _get_url_info(self):
564+
"""Get content type and filename from URL via HEAD request"""
565+
MAX_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB
566+
async with aiohttp.ClientSession() as session:
567+
async with session.head(self.url, allow_redirects=True) as response:
568+
# handle when the response is not 200
569+
if response.status != 200:
570+
raise ValidationError(
571+
"Failed to fetch document info from URL. Some documents are protected by anti-scrapping measures. We recommend you download them and send us base64."
572+
)
573+
content_type = response.headers.get("Content-Type", "").lower()
574+
content_disposition = response.headers.get("Content-Disposition", "")
575+
content_length = response.headers.get("Content-Length")
576+
if content_length and int(content_length) > MAX_SIZE_BYTES:
577+
raise ValidationError("Document exceeds maximum size of 50MB.")
578+
filename_match = re.findall('filename="(.+)"', content_disposition)
579+
filename = (
580+
filename_match[0]
581+
if filename_match
582+
else os.path.basename(urllib.parse.urlparse(self.url).path)
583+
)
584+
if not filename:
585+
filename = "downloaded_file"
586+
return content_type, filename
587+
588+
async def _fetch_document(self):
589+
async with aiohttp.ClientSession() as session:
590+
async with session.get(self.url) as response:
591+
if response.status != 200:
592+
raise ValidationError(
593+
"Failed to fetch document info from URL. Some documents are protected by anti-scrapping measures. We recommend you download them and send us base64."
594+
)
595+
return await response.read()
596+
578597
@retry(
579598
stop=stop_after_attempt(3),
580599
wait=wait_fixed(2),

web/api/tests/tests.py

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,12 @@ async def test_list_collection(async_client, user, collection):
198198
)
199199
assert response.status_code == 200
200200
assert response.json() == [
201-
{"id": 1, "name": "Test Collection Fixture", "metadata": {"key": "value"}, "num_documents": 0}
201+
{
202+
"id": 1,
203+
"name": "Test Collection Fixture",
204+
"metadata": {"key": "value"},
205+
"num_documents": 0,
206+
}
202207
]
203208

204209

@@ -1534,18 +1539,17 @@ async def test_document_fetch_failure_async(async_client, user):
15341539
await asyncio.gather(*pending_tasks)
15351540

15361541
# Assert that both HEAD and GET were called
1537-
mock_head.assert_called_once_with(
1538-
"https://example.com/nonexistent.pdf", allow_redirects=True
1539-
)
1540-
mock_get.assert_called_once_with("https://example.com/nonexistent.pdf")
1542+
mock_head.assert_called_once()
1543+
mock_get.assert_called_once()
15411544

15421545
# Assert that the email was sent
15431546
MockEmailMessage.assert_called_once_with(
15441547
subject="Document Upsertion Failed",
1545-
body="There was an error processing your document: ['Failed to fetch document from URL']",
1548+
body="There was an error processing your document: ['Failed to fetch document info from URL. Some documents are protected by anti-scrapping measures. We recommend you download them and send us base64.']",
15461549
to=[user.email, "[email protected]"],
15471550
from_email="[email protected]",
15481551
)
1552+
15491553
mock_email_instance.send.assert_called_once()
15501554

15511555

@@ -1598,7 +1602,7 @@ async def test_document_file_too_big(async_client, user):
15981602
assert response.status_code == 400
15991603

16001604

1601-
async def test_gotenberg_service_down(async_client, user):
1605+
async def test_gotenberg_service_down_with_file(async_client, user):
16021606
GOTENBERG_POST_PATH = "api.models.aiohttp.ClientSession.post"
16031607
# Create a mock response object with status 500
16041608
mock_response = AsyncMock()
@@ -1639,6 +1643,29 @@ async def test_gotenberg_service_down(async_client, user):
16391643
assert response.status_code == 400
16401644

16411645

1646+
async def test_gotenberg_service_down_with_url(async_client, user):
1647+
GOTENBERG_POST_PATH = "api.models.aiohttp.ClientSession.post"
1648+
# Create a mock response object with status 500
1649+
mock_response = AsyncMock()
1650+
mock_response.status = 500
1651+
mock_response.json.return_value = AsyncMock(return_value={"error": "Service Down"})
1652+
# Mock the context manager __aenter__ to return the mock_response
1653+
mock_response.__aenter__.return_value = mock_response
1654+
# Patch the aiohttp.ClientSession.post method to return the mock_response
1655+
1656+
with patch(GOTENBERG_POST_PATH, return_value=mock_response):
1657+
response = await async_client.post(
1658+
"/documents/upsert-document/",
1659+
json={
1660+
"name": "Test Document Fixture",
1661+
"url": "https://example.com/largefile.pdf",
1662+
"wait": True,
1663+
},
1664+
headers={"Authorization": f"Bearer {user.token}"},
1665+
)
1666+
assert response.status_code == 400
1667+
1668+
16421669
async def test_prep_document_document_data_too_large():
16431670
# Initialize Document without a URL or base64 (assuming document_data is handled internally)
16441671
doc = Document()
@@ -1650,6 +1677,17 @@ async def test_prep_document_document_data_too_large():
16501677
await doc._prep_document(document_data=document_data)
16511678

16521679

1680+
async def test_prep_document_pdf_conversion_failure():
1681+
CONVERT_FROM_BYTES_PATH = "api.models.convert_from_bytes"
1682+
1683+
document = Document() #
1684+
pdf_data = b"corrupted_pdf_data"
1685+
1686+
with patch(CONVERT_FROM_BYTES_PATH, side_effect=Exception("PDF conversion failed")):
1687+
with pytest.raises(DjangoValidationError):
1688+
await document._prep_document(document_data=pdf_data)
1689+
1690+
16531691
async def test_prep_document_with_disallowed_extension(collection):
16541692
content = "bad base64 string"
16551693
content_bytes = content.encode("utf-8")
@@ -1714,3 +1752,39 @@ async def test_unknown_mime_type(collection):
17141752

17151753
# Cleanup
17161754
await document.delete_s3_file()
1755+
1756+
1757+
async def test_get_url_info_non_200_response():
1758+
AIOHTTP_HEAD_PATH = "api.models.aiohttp.ClientSession.head"
1759+
1760+
# Mock response with non-200 status
1761+
mock_response = AsyncMock()
1762+
mock_response.status = 404
1763+
mock_response.__aenter__.return_value = mock_response
1764+
1765+
document = Document(url="https://example.com/doc.pdf")
1766+
1767+
with patch(AIOHTTP_HEAD_PATH, return_value=mock_response):
1768+
with pytest.raises(DjangoValidationError):
1769+
await document._get_url_info()
1770+
1771+
1772+
async def test_get_url_info_empty_filename_fallback():
1773+
AIOHTTP_HEAD_PATH = "api.models.aiohttp.ClientSession.head"
1774+
1775+
# Mock response with empty filename
1776+
mock_response = AsyncMock()
1777+
mock_response.status = 200
1778+
mock_response.headers = {
1779+
"Content-Type": "application/pdf",
1780+
"Content-Disposition": "", # Empty content disposition
1781+
"Content-Length": "1000",
1782+
}
1783+
mock_response.__aenter__.return_value = mock_response
1784+
1785+
document = Document(url="https://example.com/") # URL with no filename
1786+
1787+
with patch(AIOHTTP_HEAD_PATH, return_value=mock_response):
1788+
content_type, filename = await document._get_url_info()
1789+
1790+
assert filename == "downloaded_file"

0 commit comments

Comments
 (0)