Skip to content

Commit

Permalink
Merge pull request #79 from tjmlabs/main-better-errors
Browse files Browse the repository at this point in the history
Main better errors
  • Loading branch information
Jonathan-Adly authored Nov 8, 2024
2 parents 4f4e452 + 5146ed2 commit b4cf4e0
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 58 deletions.
121 changes: 70 additions & 51 deletions web/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,91 +462,71 @@ async def _prep_document(self, document_data=None) -> List[str]:
]
ALLOWED_EXTENSIONS += IMAGE_EXTENSIONS # Include images
MAX_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB

async def get_url_info(url):
"""Get content type and filename from URL via HEAD request"""
async with aiohttp.ClientSession() as session:
async with session.head(url, allow_redirects=True) as response:
content_type = response.headers.get("Content-Type", "").lower()
content_disposition = response.headers.get(
"Content-Disposition", ""
)
content_length = response.headers.get("Content-Length")
if content_length and int(content_length) > MAX_SIZE_BYTES:
raise ValidationError("Document exceeds maximum size of 50MB.")
filename_match = re.findall('filename="(.+)"', content_disposition)
filename = (
filename_match[0]
if filename_match
else os.path.basename(urllib.parse.urlparse(url).path)
)
return content_type, filename

async def fetch_document(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
if response.status != 200:
raise ValidationError("Failed to fetch document from URL")
return await response.read()

# Step 1: Get the document data
filename = None # document.pdf or document.docx
extension = None
filename = None
# every block should give back a document_data, extension, and filename
if self.s3_file and not document_data:
# here we should have a document_data and filename
if document_data:
logger.info("Document data provided.")
# Get MIME type from magic
mime = magic.Magic(mime=True)
mime_type = mime.from_buffer(document_data)
extension = get_extension_from_mime(mime_type).lstrip(".")
filename = f"document.{extension}"

# every block should give back a document_data, and filename w/ extension
elif self.s3_file:
logger.info(f"Fetching document from S3: {self.s3_file.name}")
extension = os.path.splitext(self.s3_file.name)[1][1:].lower()
logger.info(f"Document extension: {extension}")
with self.s3_file.open("rb") as f:
document_data = f.read()
filename = os.path.basename(self.s3_file.name)
logger.info(f"Document filename: {filename}")

elif self.url and not document_data:
content_type, filename = await get_url_info(self.url)
elif self.url:
content_type, filename = await self._get_url_info()
if "text/html" in content_type:
logger.info("Document is a webpage.")
# It's a webpage, convert to PDF
document_data = await self._convert_url_to_pdf(self.url)
logger.info("Successfully converted URL to PDF.")
extension = "pdf"
filename = f"{filename}.pdf"
else:
# It's a regular file
logger.info(f"Fetching document from URL: {self.url}")
document_data = await fetch_document(self.url)
document_data = await self._fetch_document()
if "application/pdf" in content_type:
extension = "pdf"
else:
extension = get_extension_from_mime(content_type).lstrip(".")
logger.info(f"Document extension: {extension}")
assert filename, "Filename should be set"
name = os.path.splitext(filename)[0]
filename = f"{name}.{extension}"
logger.info(f"Document filename: {filename}")
else:
raise ValidationError(
"Document data is missing. Please provide a document or a URL."
)

# here we should have a document_data and extension
if document_data and not extension and not filename:
# Get MIME type from magic
mime = magic.Magic(mime=True)
mime_type = mime.from_buffer(document_data)
extension = get_extension_from_mime(mime_type).lstrip(".")
filename = f"document.{extension}"
# make sure we have the document data and filename
assert document_data, "Document data should be set"
assert filename, "Filename should be set"

# Validate the document
if not document_data or not extension or not filename:
raise ValidationError("Document data is missing.")
if not extension:
extension = os.path.splitext(filename)[1].lstrip(".")

if len(document_data) > MAX_SIZE_BYTES:
raise ValidationError("Document exceeds maximum size of 50MB.")

if extension not in ALLOWED_EXTENSIONS:
raise ValidationError(f"File extension .{extension} is not allowed.")

logger.info(f"Document extension: {extension}")

# Determine if the document is an image or PDF
is_image = extension in IMAGE_EXTENSIONS
is_pdf = extension == "pdf"
# Step 2: Convert to PDF if necessary
if not is_image and not is_pdf:
logger.info(f"Converting document to PDF. Extension: {extension}")
# Use Gotenberg to convert to PDF
filename = f"{filename}.{extension}"
pdf_data = await self._convert_to_pdf(document_data, filename)
elif is_pdf:
logger.info("Document is already a PDF.")
Expand All @@ -559,7 +539,12 @@ async def fetch_document(url):

# here all documents are converted to pdf
# Step 3: Turn the PDF into images via pdf2image
images = convert_from_bytes(pdf_data)
try:
images = convert_from_bytes(pdf_data)
except Exception:
raise ValidationError(
"Failed to convert PDF to images. The PDF may be corrupted, which sometimes happens with URLs. Try downloading the document and sending us the base64."
)
logger.info(f"Successfully converted PDF to {len(images)} images.")

# here all documents are converted to images
Expand All @@ -575,6 +560,40 @@ async def fetch_document(url):
# Step 5: returning the base64 images
return base64_images

async def _get_url_info(self):
    """Return ``(content_type, filename)`` for ``self.url`` via a HEAD request.

    The content type is the lower-cased ``Content-Type`` header ("" when the
    header is absent). The filename comes from the quoted ``filename="..."``
    parameter of ``Content-Disposition`` when present, otherwise from the last
    path segment of the URL, and finally falls back to ``"downloaded_file"``.

    Raises:
        ValidationError: if the HEAD response status is not 200, or if the
            advertised ``Content-Length`` exceeds 50 MB.
    """
    MAX_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB
    async with aiohttp.ClientSession() as session:
        async with session.head(self.url, allow_redirects=True) as response:
            # Anything other than 200 is reported to the user as a fetch failure.
            if response.status != 200:
                raise ValidationError(
                    "Failed to fetch document info from URL. Some documents are protected by anti-scrapping measures. We recommend you download them and send us base64."
                )
            content_type = response.headers.get("Content-Type", "").lower()
            content_disposition = response.headers.get("Content-Disposition", "")
            content_length = response.headers.get("Content-Length")

            # Content-Length is server-controlled; an unparsable value is
            # treated as "size unknown" instead of crashing with ValueError.
            try:
                declared_size = int(content_length) if content_length else None
            except (TypeError, ValueError):
                declared_size = None
            if declared_size is not None and declared_size > MAX_SIZE_BYTES:
                raise ValidationError("Document exceeds maximum size of 50MB.")

            # [^"]+ stops at the closing quote, so additional quoted parameters
            # after the filename are not swallowed into the captured name.
            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
            if filename_match:
                filename = filename_match.group(1)
            else:
                filename = os.path.basename(urllib.parse.urlparse(self.url).path)
            if not filename:
                # e.g. a URL whose path ends in "/" and no Content-Disposition.
                filename = "downloaded_file"
            return content_type, filename

async def _fetch_document(self):
    """Download ``self.url`` with a GET request and return the body as bytes.

    Raises:
        ValidationError: if the response status is anything other than 200.
    """
    async with aiohttp.ClientSession() as http:
        async with http.get(self.url) as reply:
            if response_ok := (reply.status == 200):
                return await reply.read()
            raise ValidationError(
                "Failed to fetch document info from URL. Some documents are protected by anti-scrapping measures. We recommend you download them and send us base64."
            )

@retry(
stop=stop_after_attempt(3),
wait=wait_fixed(2),
Expand Down
88 changes: 81 additions & 7 deletions web/api/tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,12 @@ async def test_list_collection(async_client, user, collection):
)
assert response.status_code == 200
assert response.json() == [
{"id": 1, "name": "Test Collection Fixture", "metadata": {"key": "value"}, "num_documents": 0}
{
"id": 1,
"name": "Test Collection Fixture",
"metadata": {"key": "value"},
"num_documents": 0,
}
]


Expand Down Expand Up @@ -1534,18 +1539,17 @@ async def test_document_fetch_failure_async(async_client, user):
await asyncio.gather(*pending_tasks)

# Assert that both HEAD and GET were called
mock_head.assert_called_once_with(
"https://example.com/nonexistent.pdf", allow_redirects=True
)
mock_get.assert_called_once_with("https://example.com/nonexistent.pdf")
mock_head.assert_called_once()
mock_get.assert_called_once()

# Assert that the email was sent
MockEmailMessage.assert_called_once_with(
subject="Document Upsertion Failed",
body="There was an error processing your document: ['Failed to fetch document from URL']",
body="There was an error processing your document: ['Failed to fetch document info from URL. Some documents are protected by anti-scrapping measures. We recommend you download them and send us base64.']",
to=[user.email, "[email protected]"],
from_email="[email protected]",
)

mock_email_instance.send.assert_called_once()


Expand Down Expand Up @@ -1598,7 +1602,7 @@ async def test_document_file_too_big(async_client, user):
assert response.status_code == 400


async def test_gotenberg_service_down(async_client, user):
async def test_gotenberg_service_down_with_file(async_client, user):
GOTENBERG_POST_PATH = "api.models.aiohttp.ClientSession.post"
# Create a mock response object with status 500
mock_response = AsyncMock()
Expand Down Expand Up @@ -1639,6 +1643,29 @@ async def test_gotenberg_service_down(async_client, user):
assert response.status_code == 400


async def test_gotenberg_service_down_with_url(async_client, user):
    """Upserting a document by URL yields 400 when ClientSession.post returns a 500."""
    # NOTE(review): only ClientSession.post is patched in this test; the HEAD/GET
    # requests for the URL do not appear to be mocked here -- confirm the test
    # does not rely on real network access.
    post_target = "api.models.aiohttp.ClientSession.post"

    # Fake response reporting a server error; it also acts as its own async
    # context manager so `async with session.post(...)` hands it back.
    failing_reply = AsyncMock()
    failing_reply.status = 500
    failing_reply.json.return_value = AsyncMock(return_value={"error": "Service Down"})
    failing_reply.__aenter__.return_value = failing_reply

    payload = {
        "name": "Test Document Fixture",
        "url": "https://example.com/largefile.pdf",
        "wait": True,
    }
    auth_headers = {"Authorization": f"Bearer {user.token}"}

    with patch(post_target, return_value=failing_reply):
        response = await async_client.post(
            "/documents/upsert-document/",
            json=payload,
            headers=auth_headers,
        )
    assert response.status_code == 400


async def test_prep_document_document_data_too_large():
# Initialize Document without a URL or base64 (assuming document_data is handled internally)
doc = Document()
Expand All @@ -1650,6 +1677,17 @@ async def test_prep_document_document_data_too_large():
await doc._prep_document(document_data=document_data)


async def test_prep_document_pdf_conversion_failure():
    """_prep_document raises a validation error while convert_from_bytes is patched to fail."""
    doc = Document()
    broken_pdf = b"corrupted_pdf_data"

    # Make the PDF -> image step blow up with a generic exception.
    failing_converter = patch(
        "api.models.convert_from_bytes",
        side_effect=Exception("PDF conversion failed"),
    )
    with failing_converter, pytest.raises(DjangoValidationError):
        await doc._prep_document(document_data=broken_pdf)


async def test_prep_document_with_disallowed_extension(collection):
content = "bad base64 string"
content_bytes = content.encode("utf-8")
Expand Down Expand Up @@ -1714,3 +1752,39 @@ async def test_unknown_mime_type(collection):

# Cleanup
await document.delete_s3_file()


async def test_get_url_info_non_200_response():
    """_get_url_info raises a validation error when the HEAD response is a 404."""
    # Fake HEAD response with a non-200 status; it is its own async context value.
    not_found = AsyncMock()
    not_found.status = 404
    not_found.__aenter__.return_value = not_found

    doc = Document(url="https://example.com/doc.pdf")

    patched_head = patch("api.models.aiohttp.ClientSession.head", return_value=not_found)
    with patched_head, pytest.raises(DjangoValidationError):
        await doc._get_url_info()


async def test_get_url_info_empty_filename_fallback():
    """_get_url_info falls back to "downloaded_file" when no filename can be derived."""
    # Successful HEAD response whose headers carry no usable filename.
    ok_reply = AsyncMock()
    ok_reply.status = 200
    ok_reply.headers = {
        "Content-Type": "application/pdf",
        "Content-Disposition": "",  # Empty content disposition
        "Content-Length": "1000",
    }
    ok_reply.__aenter__.return_value = ok_reply

    # The URL path ends in "/", so it contributes no filename either.
    doc = Document(url="https://example.com/")

    with patch("api.models.aiohttp.ClientSession.head", return_value=ok_reply):
        _content_type, resolved_name = await doc._get_url_info()

    assert resolved_name == "downloaded_file"

0 comments on commit b4cf4e0

Please sign in to comment.