From 6ca3590e659a7c4f170385fbe3cfcf2cc1d5daa6 Mon Sep 17 00:00:00 2001 From: Abdullah Nassar Date: Thu, 5 Dec 2024 21:48:38 +0200 Subject: [PATCH 1/7] proxy support --- web/api/models.py | 43 +++++++------ web/api/tests/tests.py | 135 ++++++----------------------------------- web/api/views.py | 11 ++-- web/config/settings.py | 16 +++++ 4 files changed, 64 insertions(+), 141 deletions(-) diff --git a/web/api/models.py b/web/api/models.py index 844ce8b..b22c50d 100644 --- a/web/api/models.py +++ b/web/api/models.py @@ -6,7 +6,7 @@ import re import urllib.parse from io import BytesIO -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import aiohttp import magic @@ -20,7 +20,8 @@ from django_stubs_ext.db.models import TypedModelMeta from pdf2image import convert_from_bytes from pgvector.django import HalfVectorField -from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed +from tenacity import (retry, retry_if_exception_type, stop_after_attempt, + wait_fixed) logger = logging.getLogger(__name__) @@ -167,7 +168,7 @@ async def get_url(self) -> str: return self.s3_file.url return self.url - async def embed_document(self) -> None: + async def embed_document(self, use_proxy: Optional[bool] = False) -> None: """ Process a document by embedding its pages and storing the results. @@ -247,7 +248,7 @@ async def send_batch( ) return out["output"]["data"] - base64_images = await self._prep_document() + base64_images = await self._prep_document(use_proxy=use_proxy) logger.info(f"Successfully prepped document {self.name}") # Split the images into batches batches = [ @@ -326,7 +327,9 @@ async def send_batch( return - async def _prep_document(self, document_data=None) -> List[str]: + async def _prep_document( + self, document_data=None, use_proxy: Optional[bool] = False + ) -> List[str]: """ The goal of this method is to take a document and convert it into a series of base64 images. 
Steps: @@ -492,7 +495,9 @@ async def _prep_document(self, document_data=None) -> List[str]: logger.info(f"Document filename: {filename}") elif self.url: - content_type, filename = await self._get_url_info() + content_type, filename, document_data = await self._fetch_document( + use_proxy + ) if "text/html" in content_type: logger.info("Document is a webpage.") # It's a webpage, convert to PDF @@ -502,7 +507,7 @@ async def _prep_document(self, document_data=None) -> List[str]: else: # It's a regular file logger.info(f"Fetching document from URL: {self.url}") - document_data = await self._fetch_document() + # document_data = await self._fetch_document() if "application/pdf" in content_type: extension = "pdf" else: @@ -569,13 +574,20 @@ async def _prep_document(self, document_data=None) -> List[str]: # Step 5: returning the base64 images return base64_images - async def _get_url_info(self): - """Get content type and filename from URL via HEAD request""" + async def _fetch_document(self, use_proxy: Optional[bool] = False): + proxy = None + if use_proxy: + proxy = settings.PROXY_URL + # replace https with http for the proxy + self.url = self.url.replace("https://", "http://") + logger.info("Using proxy to fetch document.") + MAX_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB async with aiohttp.ClientSession() as session: - async with session.head(self.url, allow_redirects=True) as response: + async with session.get(self.url, proxy=proxy) as response: # handle when the response is not 200 if response.status != 200: + logger.info(f"response status: {response.status}") raise ValidationError( "Failed to fetch document info from URL. Some documents are protected by anti-scrapping measures. We recommend you download them and send us base64." 
) @@ -592,16 +604,7 @@ async def _get_url_info(self): ) if not filename: filename = "downloaded_file" - return content_type, filename - - async def _fetch_document(self): - async with aiohttp.ClientSession() as session: - async with session.get(self.url) as response: - if response.status != 200: - raise ValidationError( - "Failed to fetch document info from URL. Some documents are protected by anti-scrapping measures. We recommend you download them and send us base64." - ) - return await response.read() + return content_type, filename, await response.read() @retry( stop=stop_after_attempt(3), diff --git a/web/api/tests/tests.py b/web/api/tests/tests.py index 14f39d7..674106e 100644 --- a/web/api/tests/tests.py +++ b/web/api/tests/tests.py @@ -6,15 +6,8 @@ from accounts.models import CustomUser from api.middleware import add_slash from api.models import Collection, Document, Page, PageEmbedding -from api.views import ( - Bearer, - QueryFilter, - QueryIn, - filter_collections, - filter_documents, - filter_query, - router, -) +from api.views import (Bearer, QueryFilter, QueryIn, filter_collections, + filter_documents, filter_query, router) from django.core.exceptions import ValidationError as DjangoValidationError from django.core.files.uploadedfile import SimpleUploadedFile from django.test import override_settings @@ -1950,16 +1943,6 @@ async def test_embed_document_arxiv_async(async_client, user): async def test_document_fetch_failure_await(async_client, user): AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" - AIOHTTP_HEAD_PATH = "api.models.aiohttp.ClientSession.head" - - # Mock for HEAD request - mock_head_response = AsyncMock() - mock_head_response.status = 200 - mock_head_response.headers = { - "Content-Type": "application/pdf", - "Content-Length": "1000", - } - mock_head_response.__aenter__.return_value = mock_head_response # Mock for GET request mock_get_response = AsyncMock() @@ -1968,10 +1951,8 @@ async def 
test_document_fetch_failure_await(async_client, user): mock_get_response.read = AsyncMock(return_value=b"") mock_get_response.__aenter__.return_value = mock_get_response - # Patch both HEAD and GET methods - with patch(AIOHTTP_HEAD_PATH, return_value=mock_head_response) as mock_head, patch( - AIOHTTP_GET_PATH, return_value=mock_get_response - ) as mock_get: + # Patch GET method + with patch(AIOHTTP_GET_PATH, return_value=mock_get_response) as mock_get: response = await async_client.post( "/documents/upsert-document/", json={ @@ -1982,11 +1963,10 @@ async def test_document_fetch_failure_await(async_client, user): headers={"Authorization": f"Bearer {user.token}"}, ) - # Assert both HEAD and GET were called - mock_head.assert_called_once_with( - "https://example.com/nonexistent.pdf", allow_redirects=True + # Assert GET was called + mock_get.assert_called_once_with( + "https://example.com/nonexistent.pdf", proxy=None ) - mock_get.assert_called_once_with("https://example.com/nonexistent.pdf") # Assert that the response status code reflects the failure assert response.status_code == 400 @@ -1994,16 +1974,6 @@ async def test_document_fetch_failure_await(async_client, user): async def test_document_fetch_failure_async(async_client, user): AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" - AIOHTTP_HEAD_PATH = "api.models.aiohttp.ClientSession.head" - - # Mock for HEAD request - mock_head_response = AsyncMock() - mock_head_response.status = 200 - mock_head_response.headers = { - "Content-Type": "application/pdf", - "Content-Length": "1000", - } - mock_head_response.__aenter__.return_value = mock_head_response # Mock for GET request (failing response) mock_get_response = AsyncMock() @@ -2012,10 +1982,8 @@ async def test_document_fetch_failure_async(async_client, user): mock_get_response.read = AsyncMock(return_value=b"") mock_get_response.__aenter__.return_value = mock_get_response - # Patch both HEAD and GET methods - with patch(AIOHTTP_HEAD_PATH, 
return_value=mock_head_response) as mock_head, patch( - AIOHTTP_GET_PATH, return_value=mock_get_response - ) as mock_get: + # Patch GET method + with patch(AIOHTTP_GET_PATH, return_value=mock_get_response) as mock_get: # Mock EmailMessage with patch("api.views.EmailMessage") as MockEmailMessage: mock_email_instance = MockEmailMessage.return_value @@ -2042,8 +2010,7 @@ async def test_document_fetch_failure_async(async_client, user): ] await asyncio.gather(*pending_tasks) - # Assert that both HEAD and GET were called - mock_head.assert_called_once() + # Assert that GET was called mock_get.assert_called_once() # Assert that the email was sent @@ -2060,16 +2027,6 @@ async def test_document_fetch_failure_async(async_client, user): async def test_document_fetch_failure_async_webhook(async_client, user): AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" - AIOHTTP_HEAD_PATH = "api.models.aiohttp.ClientSession.head" - - # Mock for HEAD request - mock_head_response = AsyncMock() - mock_head_response.status = 200 - mock_head_response.headers = { - "Content-Type": "application/pdf", - "Content-Length": "1000", - } - mock_head_response.__aenter__.return_value = mock_head_response # Mock for GET request (failing response) mock_get_response = AsyncMock() @@ -2078,10 +2035,8 @@ async def test_document_fetch_failure_async_webhook(async_client, user): mock_get_response.read = AsyncMock(return_value=b"") mock_get_response.__aenter__.return_value = mock_get_response - # Patch both HEAD and GET methods - with patch(AIOHTTP_HEAD_PATH, return_value=mock_head_response) as mock_head, patch( - AIOHTTP_GET_PATH, return_value=mock_get_response - ) as mock_get: + # Patch GET method + with patch(AIOHTTP_GET_PATH, return_value=mock_get_response) as mock_get: # Define a mock webhook URL webhook_url = "http://localhost:8000/webhook-receive" @@ -2154,8 +2109,7 @@ async def test_document_fetch_failure_async_webhook(async_client, user): ] await asyncio.gather(*pending_tasks) - # Assert that 
both HEAD and GET were called - mock_head.assert_called_once() + # Assert that GET was called mock_get.assert_called_once() # Verify that Svix message.create was called @@ -2167,19 +2121,9 @@ async def test_document_fetch_failure_async_webhook(async_client, user): async def test_document_file_too_big(async_client, user): AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" - AIOHTTP_HEAD_PATH = "api.models.aiohttp.ClientSession.head" MAX_SIZE_BYTES = 50 * 1024 * 1024 - # Mock for HEAD request - mock_head_response = AsyncMock() - mock_head_response.status = 200 - mock_head_response.headers = { - "Content-Type": "application/pdf", - "Content-Length": str(MAX_SIZE_BYTES + 1), # 50MB + 1 byte - } - mock_head_response.__aenter__.return_value = mock_head_response - - # Mock for GET request (shouldn't be called due to size check in HEAD) + # Mock for GET request mock_get_response = AsyncMock() mock_get_response.status = 200 mock_get_response.headers = { @@ -2188,10 +2132,8 @@ async def test_document_file_too_big(async_client, user): mock_get_response.read = AsyncMock(return_value=b"x" * (MAX_SIZE_BYTES + 1)) mock_get_response.__aenter__.return_value = mock_get_response - # Patch both HEAD and GET methods - with patch(AIOHTTP_HEAD_PATH, return_value=mock_head_response) as mock_head, patch( - AIOHTTP_GET_PATH, return_value=mock_get_response - ) as mock_get: + # Patch GET method + with patch(AIOHTTP_GET_PATH, return_value=mock_get_response) as mock_get: response = await async_client.post( "/documents/upsert-document/", json={ @@ -2202,13 +2144,8 @@ async def test_document_file_too_big(async_client, user): headers={"Authorization": f"Bearer {user.token}"}, ) - # Assert that HEAD was called - mock_head.assert_called_once_with( - "https://example.com/largefile.pdf", allow_redirects=True - ) - - # Assert that GET was never called (should fail at HEAD check) - mock_get.assert_not_called() + # Assert that GET was called + mock_get.assert_called_once() # Assert that the 
response status code reflects the failure assert response.status_code == 400 @@ -2364,39 +2301,3 @@ async def test_unknown_mime_type(collection): # Cleanup await document.delete_s3_file() - - -async def test_get_url_info_non_200_response(): - AIOHTTP_HEAD_PATH = "api.models.aiohttp.ClientSession.head" - - # Mock response with non-200 status - mock_response = AsyncMock() - mock_response.status = 404 - mock_response.__aenter__.return_value = mock_response - - document = Document(url="https://example.com/doc.pdf") - - with patch(AIOHTTP_HEAD_PATH, return_value=mock_response): - with pytest.raises(DjangoValidationError): - await document._get_url_info() - - -async def test_get_url_info_empty_filename_fallback(): - AIOHTTP_HEAD_PATH = "api.models.aiohttp.ClientSession.head" - - # Mock response with empty filename - mock_response = AsyncMock() - mock_response.status = 200 - mock_response.headers = { - "Content-Type": "application/pdf", - "Content-Disposition": "", # Empty content disposition - "Content-Length": "1000", - } - mock_response.__aenter__.return_value = mock_response - - document = Document(url="https://example.com/") # URL with no filename - - with patch(AIOHTTP_HEAD_PATH, return_value=mock_response): - content_type, filename = await document._get_url_info() - - assert filename == "downloaded_file" diff --git a/web/api/views.py b/web/api/views.py index 6e03191..f6598a6 100644 --- a/web/api/views.py +++ b/web/api/views.py @@ -18,7 +18,8 @@ from ninja.security import HttpBearer from pgvector.utils import HalfVector from pydantic import Field, model_validator -from svix.api import ApplicationIn, EndpointIn, EndpointUpdate, MessageIn, SvixAsync +from svix.api import (ApplicationIn, EndpointIn, EndpointUpdate, MessageIn, + SvixAsync) from typing_extensions import Self from .models import Collection, Document, MaxSim, Page @@ -320,6 +321,7 @@ class DocumentIn(Schema): url: Optional[str] = None base64: Optional[str] = None wait: Optional[bool] = False + use_proxy: 
Optional[bool] = False @model_validator(mode="after") def base64_or_url(self) -> Self: @@ -355,6 +357,7 @@ class DocumentInPatch(Schema): ) url: Optional[str] = None base64: Optional[str] = None + use_proxy: Optional[bool] = False @model_validator(mode="after") def at_least_one_field(self) -> Self: @@ -402,7 +405,7 @@ async def process_upsert_document( await document.save_base64_to_s3(payload.base64) # this method will embed the document and save it to the database - await document.embed_document() + await document.embed_document(payload.use_proxy) document = ( await Document.objects.select_related("collection") .annotate(num_pages=Count("pages")) @@ -750,14 +753,14 @@ async def partial_update_document( document.name = payload.name or document.name # we want to delete the old pages, since we will re-embed the document await document.pages.all().adelete() - await document.embed_document() + await document.embed_document(payload.use_proxy) elif payload.base64: document.metadata = payload.metadata or document.metadata document.name = payload.name or document.name await document.save_base64_to_s3(payload.base64) await document.pages.all().adelete() - await document.embed_document() + await document.embed_document(payload.use_proxy) else: document.name = payload.name or document.name diff --git a/web/config/settings.py b/web/config/settings.py index f35aed7..0e6a07f 100644 --- a/web/config/settings.py +++ b/web/config/settings.py @@ -286,6 +286,22 @@ # Svix SVIX_TOKEN = env("SVIX_TOKEN", default="") +# SENTRY +SENTRY_DSN = env("SENTRY_DSN", default=None) + +if SENTRY_DSN: + import sentry_sdk + + sentry_sdk.init( + dsn=SENTRY_DSN, + traces_sample_rate=0.1, + ) + + logger.info("Sentry is enabled.") + +# PROXY +PROXY_URL = env("PROXY_URL", default=None) + STORAGES = { "default": { "BACKEND": "storages.backends.s3.S3Storage", From 5075e2b0529033f9dbf944860092ca8f7c499d8f Mon Sep 17 00:00:00 2001 From: Abdullah Nassar Date: Thu, 5 Dec 2024 21:57:21 +0200 Subject: [PATCH 2/7] 
add test --- web/api/tests/tests.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/web/api/tests/tests.py b/web/api/tests/tests.py index 674106e..1c66ff0 100644 --- a/web/api/tests/tests.py +++ b/web/api/tests/tests.py @@ -1972,6 +1972,36 @@ async def test_document_fetch_failure_await(async_client, user): assert response.status_code == 400 +async def test_document_fetch_failure_await_proxy(async_client, user): + AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" + + # Mock for GET request + mock_get_response = AsyncMock() + mock_get_response.status = 500 + mock_get_response.headers = {} + mock_get_response.read = AsyncMock(return_value=b"") + mock_get_response.__aenter__.return_value = mock_get_response + + # Patch GET method + with patch(AIOHTTP_GET_PATH, return_value=mock_get_response) as mock_get: + response = await async_client.post( + "/documents/upsert-document/", + json={ + "name": "Test Document Fetch Failure", + "url": "https://example.com/nonexistent.pdf", + "wait": True, + "use_proxy": True, + }, + headers={"Authorization": f"Bearer {user.token}"}, + ) + + # Assert GET was called + mock_get.assert_called_once() + + # Assert that the response status code reflects the failure + assert response.status_code == 400 + + async def test_document_fetch_failure_async(async_client, user): AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" From 2a4d5a3ed170475ece5c374b3af60850f98d579a Mon Sep 17 00:00:00 2001 From: Abdullah Nassar Date: Thu, 5 Dec 2024 22:20:40 +0200 Subject: [PATCH 3/7] add test --- web/api/tests/tests.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/web/api/tests/tests.py b/web/api/tests/tests.py index 1c66ff0..624ece2 100644 --- a/web/api/tests/tests.py +++ b/web/api/tests/tests.py @@ -1894,6 +1894,39 @@ async def test_embeddings_service_down(async_client, user): } +async def test_embeddings_service_error(async_client, user): + EMBEDDINGS_POST_PATH = 
"api.models.aiohttp.ClientSession.post" + # Create a mock response object with status 200 with an error message + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.json.return_value = AsyncMock(return_value={"error": "Service Down"}) + + # Mock the context manager __aenter__ to return the mock_response + mock_response.__aenter__.return_value = mock_response + + # Patch the aiohttp.ClientSession.post method to return the mock_response + with patch(EMBEDDINGS_POST_PATH, return_value=mock_response) as mock_post: + # Perform the POST request to trigger embed_document + response = await async_client.post( + "/documents/upsert-document/", + json={ + "name": "Test Document Fixture", + "url": "https://pdfobject.com/pdf/sample.pdf", + "wait": True, + }, + headers={"Authorization": f"Bearer {user.token}"}, + ) + + args, kwargs = mock_post.call_args + assert kwargs["json"]["input"]["task"] == "image" + assert "Authorization" in kwargs["headers"] + + # Assert that the response status code reflects the failure + assert ( + response.status_code == 400 + ) # Assuming your view returns 400 on ValidationError + + async def test_embedding_service_down_query(async_client, user): EMBEDDINGS_POST_PATH = "api.views.aiohttp.ClientSession.post" # Create a mock response object with status 500 From 5ff45e8601848a5be82fc22512c3b28ea201a812 Mon Sep 17 00:00:00 2001 From: Abdullah Nassar Date: Thu, 5 Dec 2024 22:30:23 +0200 Subject: [PATCH 4/7] add test --- web/api/tests/tests.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/web/api/tests/tests.py b/web/api/tests/tests.py index 624ece2..33f9d03 100644 --- a/web/api/tests/tests.py +++ b/web/api/tests/tests.py @@ -1153,6 +1153,34 @@ async def test_patch_document_url(async_client, user, collection, document): await Document.objects.all().adelete() +async def test_patch_document_url_proxy(async_client, user, collection, document): + # we update the URL of the document + response = await 
async_client.patch( + f"/documents/{document.name}/", + json={ + "name": "Test Document Update", + "url": "https://www.w3schools.com/w3css/img_lights.jpg", + "collection_name": collection.name, + "use_proxy": True, + }, + headers={"Authorization": f"Bearer {user.token}"}, + ) + assert response.status_code == 200 + response_data = response.json() + assert response_data["id"] == 1 + assert response_data["name"] == "Test Document Update" + assert response_data["metadata"] == {"important": True} + assert ( + response_data["url"] == "http://www.w3schools.com/w3css/img_lights.jpg" + ) # converted to http because of the proxy + assert response_data["num_pages"] == 1 + assert response_data["collection_name"] == "Test Collection Fixture" + assert response_data["pages"] is None + + # now check if the document was actually updated + await Document.objects.all().adelete() + + async def test_patch_document_url_and_base64(async_client, user, collection, document): # we updated the base64 string of the page response = await async_client.patch( From edc44e2b9d093582676847046e4931ed443245f0 Mon Sep 17 00:00:00 2001 From: Abdullah Nassar Date: Thu, 5 Dec 2024 22:41:09 +0200 Subject: [PATCH 5/7] add test --- web/api/tests/tests.py | 65 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/web/api/tests/tests.py b/web/api/tests/tests.py index 33f9d03..1adc1b9 100644 --- a/web/api/tests/tests.py +++ b/web/api/tests/tests.py @@ -2033,6 +2033,37 @@ async def test_document_fetch_failure_await(async_client, user): assert response.status_code == 400 +async def test_document_fetch_missing_output(async_client, user): + AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" + + # Mock for GET request + mock_get_response = AsyncMock() + mock_get_response.status = 200 + mock_get_response.headers = {} + mock_get_response.read = AsyncMock(return_value=b"") + mock_get_response.__aenter__.return_value = mock_get_response + + # Patch GET method + with 
patch(AIOHTTP_GET_PATH, return_value=mock_get_response) as mock_get: + response = await async_client.post( + "/documents/upsert-document/", + json={ + "name": "Test Document Fetch Failure", + "url": "https://example.com/nonexistent.pdf", + "wait": True, + }, + headers={"Authorization": f"Bearer {user.token}"}, + ) + + # Assert GET was called + mock_get.assert_called_once_with( + "https://example.com/nonexistent.pdf", proxy=None + ) + + # Assert that the response status code reflects the failure + assert response.status_code == 400 + + async def test_document_fetch_failure_await_proxy(async_client, user): AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" @@ -2242,6 +2273,40 @@ async def test_document_file_too_big(async_client, user): assert response.status_code == 400 +async def test_document_file_good_size(async_client, user): + AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" + MAX_SIZE_BYTES = 50 * 1024 * 1024 + + # Mock for GET request + mock_get_response = AsyncMock() + mock_get_response.status = 200 + mock_get_response.headers = { + "Content-Length": str(MAX_SIZE_BYTES - 1), + } + mock_get_response.read = AsyncMock(return_value=b"x" * (MAX_SIZE_BYTES + 1)) + mock_get_response.__aenter__.return_value = mock_get_response + + # Patch GET method + with patch(AIOHTTP_GET_PATH, return_value=mock_get_response) as mock_get: + response = await async_client.post( + "/documents/upsert-document/", + json={ + "name": "Test Document File Too Large", + "url": "https://example.com/largefile.pdf", + "wait": True, + }, + headers={"Authorization": f"Bearer {user.token}"}, + ) + + # Assert that GET was called + mock_get.assert_called_once() + + # Assert that the response status code reflects the failure + assert ( + response.status_code == 400 + ) # still fails because we are not actually downloading the file + + async def test_gotenberg_service_down_with_file(async_client, user): GOTENBERG_POST_PATH = "api.models.aiohttp.ClientSession.post" # Create a mock 
response object with status 500 From 271e0f7fe823623785d6a932199d9abf45f0ce79 Mon Sep 17 00:00:00 2001 From: Abdullah Nassar Date: Thu, 5 Dec 2024 22:58:43 +0200 Subject: [PATCH 6/7] add test --- web/api/tests/tests.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/web/api/tests/tests.py b/web/api/tests/tests.py index 1adc1b9..806db5e 100644 --- a/web/api/tests/tests.py +++ b/web/api/tests/tests.py @@ -2457,3 +2457,24 @@ async def test_unknown_mime_type(collection): # Cleanup await document.delete_s3_file() + + +async def test_prep_document_no_data(): + document = Document() + with pytest.raises(DjangoValidationError): + await document._prep_document() + + +async def test_convert_url_non_200_response(): + AIOHTTP_POST_PATH = "api.models.aiohttp.ClientSession.post" + + # Mock response with non-200 status + mock_response = AsyncMock() + mock_response.status = 404 + mock_response.__aenter__.return_value = mock_response + + document = Document() + + with patch(AIOHTTP_POST_PATH, return_value=mock_response): + with pytest.raises(DjangoValidationError): + await document._convert_url_to_pdf("https://example.com/doc.pdf") From 28e2b1b599064a77669627ca55bef46920ddd787 Mon Sep 17 00:00:00 2001 From: Abdullah Nassar Date: Thu, 5 Dec 2024 23:08:13 +0200 Subject: [PATCH 7/7] update tests --- web/api/models.py | 1 - web/api/tests/tests.py | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/web/api/models.py b/web/api/models.py index b22c50d..a8e4077 100644 --- a/web/api/models.py +++ b/web/api/models.py @@ -507,7 +507,6 @@ async def _prep_document( else: # It's a regular file logger.info(f"Fetching document from URL: {self.url}") - # document_data = await self._fetch_document() if "application/pdf" in content_type: extension = "pdf" else: diff --git a/web/api/tests/tests.py b/web/api/tests/tests.py index 806db5e..ea8657b 100644 --- a/web/api/tests/tests.py +++ b/web/api/tests/tests.py @@ -2478,3 +2478,25 @@ async 
def test_convert_url_non_200_response(): with patch(AIOHTTP_POST_PATH, return_value=mock_response): with pytest.raises(DjangoValidationError): await document._convert_url_to_pdf("https://example.com/doc.pdf") + + +async def test_fetch_document_200_response(): + AIOHTTP_GET_PATH = "api.models.aiohttp.ClientSession.get" + + # Mock a successful (200) response with an empty Content-Disposition header + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.headers = { + "Content-Type": "application/pdf", + "Content-Disposition": "", # Empty content disposition + "Content-Length": "1000", + } + mock_response.__aenter__.return_value = mock_response + + document = Document(url="https://examplepdf.com") + + with patch(AIOHTTP_GET_PATH, return_value=mock_response): + content_type, filename, data = await document._fetch_document() + + assert content_type == "application/pdf" + assert filename == "downloaded_file"