@@ -462,91 +462,71 @@ async def _prep_document(self, document_data=None) -> List[str]:
         ]
         ALLOWED_EXTENSIONS += IMAGE_EXTENSIONS  # Include images
         MAX_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB
-
-        async def get_url_info(url):
-            """Get content type and filename from URL via HEAD request"""
-            async with aiohttp.ClientSession() as session:
-                async with session.head(url, allow_redirects=True) as response:
-                    content_type = response.headers.get("Content-Type", "").lower()
-                    content_disposition = response.headers.get(
-                        "Content-Disposition", ""
-                    )
-                    content_length = response.headers.get("Content-Length")
-                    if content_length and int(content_length) > MAX_SIZE_BYTES:
-                        raise ValidationError("Document exceeds maximum size of 50MB.")
-                    filename_match = re.findall('filename="(.+)"', content_disposition)
-                    filename = (
-                        filename_match[0]
-                        if filename_match
-                        else os.path.basename(urllib.parse.urlparse(url).path)
-                    )
-                    return content_type, filename
-
-        async def fetch_document(url):
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url) as response:
-                    if response.status != 200:
-                        raise ValidationError("Failed to fetch document from URL")
-                    return await response.read()
-
         # Step 1: Get the document data
+        filename = None  # document.pdf or document.docx
         extension = None
-        filename = None
-        # every block should give back a document_data, extension, and filename
-        if self.s3_file and not document_data:
+        # here we should have a document_data and filename
+        if document_data:
+            logger.info("Document data provided.")
+            # Get MIME type from magic
+            mime = magic.Magic(mime=True)
+            mime_type = mime.from_buffer(document_data)
+            extension = get_extension_from_mime(mime_type).lstrip(".")
+            filename = f"document.{extension}"
+
+        # every block should give back a document_data, and filename w/ extension
+        elif self.s3_file:
             logger.info(f"Fetching document from S3: {self.s3_file.name}")
-            extension = os.path.splitext(self.s3_file.name)[1][1:].lower()
-            logger.info(f"Document extension: {extension}")
             with self.s3_file.open("rb") as f:
                 document_data = f.read()
             filename = os.path.basename(self.s3_file.name)
+            logger.info(f"Document filename: {filename}")

-        elif self.url and not document_data:
-            content_type, filename = await get_url_info(self.url)
+        elif self.url:
+            content_type, filename = await self._get_url_info()
             if "text/html" in content_type:
                 logger.info("Document is a webpage.")
                 # It's a webpage, convert to PDF
                 document_data = await self._convert_url_to_pdf(self.url)
                 logger.info("Successfully converted URL to PDF.")
-                extension = "pdf"
+                filename = f"{filename}.pdf"
             else:
                 # It's a regular file
                 logger.info(f"Fetching document from URL: {self.url}")
-                document_data = await fetch_document(self.url)
+                document_data = await self._fetch_document()
                 if "application/pdf" in content_type:
                     extension = "pdf"
                 else:
                     extension = get_extension_from_mime(content_type).lstrip(".")
-                logger.info(f"Document extension: {extension}")
+                assert filename, "Filename should be set"
+                name = os.path.splitext(filename)[0]
+                filename = f"{name}.{extension}"
+                logger.info(f"Document filename: {filename}")
+        else:
+            raise ValidationError(
+                "Document data is missing. Please provide a document or a URL."
+            )

-        # here we should have a document_data and extension
-        if document_data and not extension and not filename:
-            # Get MIME type from magic
-            mime = magic.Magic(mime=True)
-            mime_type = mime.from_buffer(document_data)
-            extension = get_extension_from_mime(mime_type).lstrip(".")
-            filename = f"document.{extension}"
+        # make sure we have the document data and filename
+        assert document_data, "Document data should be set"
+        assert filename, "Filename should be set"

-        # Validate the document
-        if not document_data or not extension or not filename:
-            raise ValidationError("Document data is missing.")
+        if not extension:
+            extension = os.path.splitext(filename)[1].lstrip(".")

         if len(document_data) > MAX_SIZE_BYTES:
             raise ValidationError("Document exceeds maximum size of 50MB.")

         if extension not in ALLOWED_EXTENSIONS:
             raise ValidationError(f"File extension .{extension} is not allowed.")

-        logger.info(f"Document extension: {extension}")
-
         # Determine if the document is an image or PDF
         is_image = extension in IMAGE_EXTENSIONS
         is_pdf = extension == "pdf"
         # Step 2: Convert to PDF if necessary
         if not is_image and not is_pdf:
             logger.info(f"Converting document to PDF. Extension: {extension}")
             # Use Gotenberg to convert to PDF
-            filename = f"{filename}.{extension}"
             pdf_data = await self._convert_to_pdf(document_data, filename)
         elif is_pdf:
             logger.info("Document is already a PDF.")
@@ -559,7 +539,12 @@ async def fetch_document(url):

         # here all documents are converted to pdf
         # Step 3: Turn the PDF into images via pdf2image
-        images = convert_from_bytes(pdf_data)
+        try:
+            images = convert_from_bytes(pdf_data)
+        except Exception:
+            raise ValidationError(
+                "Failed to convert PDF to images. The PDF may be corrupted, which sometimes happens with URLs. Try downloading the document and sending us the base64."
+            )
         logger.info(f"Successfully converted PDF to {len(images)} images.")

         # here all documents are converted to images
@@ -575,6 +560,40 @@ async def fetch_document(url):
         # Step 5: returning the base64 images
         return base64_images

+    async def _get_url_info(self):
+        """Get content type and filename from URL via HEAD request"""
+        MAX_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB
+        async with aiohttp.ClientSession() as session:
+            async with session.head(self.url, allow_redirects=True) as response:
+                # handle when the response is not 200
+                if response.status != 200:
+                    raise ValidationError(
+                        "Failed to fetch document info from URL. Some documents are protected by anti-scraping measures. We recommend you download them and send us the base64."
+                    )
+                content_type = response.headers.get("Content-Type", "").lower()
+                content_disposition = response.headers.get("Content-Disposition", "")
+                content_length = response.headers.get("Content-Length")
+                if content_length and int(content_length) > MAX_SIZE_BYTES:
+                    raise ValidationError("Document exceeds maximum size of 50MB.")
+                filename_match = re.findall('filename="(.+)"', content_disposition)
+                filename = (
+                    filename_match[0]
+                    if filename_match
+                    else os.path.basename(urllib.parse.urlparse(self.url).path)
+                )
+                if not filename:
+                    filename = "downloaded_file"
+                return content_type, filename
+
+    async def _fetch_document(self):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(self.url) as response:
+                if response.status != 200:
+                    raise ValidationError(
+                        "Failed to fetch document from URL. Some documents are protected by anti-scraping measures. We recommend you download them and send us the base64."
+                    )
+                return await response.read()
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_fixed(2),
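For readers who want to try the new byte-input path outside the service, here is a minimal, self-contained sketch of the same flow: MIME sniffing with python-magic when only raw bytes are provided, then pdf2image rasterization with base64 output. The helper names, the small extension mapping standing in for the project's get_extension_from_mime(), and the PNG encoding are assumptions for illustration, not code from this PR.

```python
# Standalone sketch of the byte-input path this PR adds: sniff the MIME type
# with python-magic, synthesize a filename, then rasterize PDF bytes with
# pdf2image. Helper names here are illustrative only; they are not part of
# the PR.
import base64
import io

import magic                      # python-magic
from pdf2image import convert_from_bytes


def sniff_filename(document_data: bytes) -> str:
    """Mirror the new 'if document_data:' branch: MIME -> extension -> name."""
    mime_type = magic.Magic(mime=True).from_buffer(document_data)
    # The PR uses a project helper get_extension_from_mime(); a plain
    # mapping stands in for it here.
    extension = {"application/pdf": "pdf", "image/png": "png"}.get(mime_type, "bin")
    return f"document.{extension}"


def pdf_bytes_to_base64_images(pdf_data: bytes) -> list[str]:
    """Mirror step 3: each PDF page becomes a base64-encoded PNG string."""
    images = convert_from_bytes(pdf_data)  # raises if the PDF is corrupted
    encoded = []
    for page in images:
        buf = io.BytesIO()
        page.save(buf, format="PNG")
        encoded.append(base64.b64encode(buf.getvalue()).decode())
    return encoded


if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:
        data = f.read()
    print(sniff_filename(data))                      # -> document.pdf
    print(len(pdf_bytes_to_base64_images(data)), "pages encoded")
```

The sketch keeps the same ordering as the patched method: resolve a filename and extension first, validate, then convert pages to images, so failures surface as early as possible.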