diff --git a/.github/workflows/build-container-images.yaml b/.github/workflows/build-container-images.yaml index 49223de8..79861726 100644 --- a/.github/workflows/build-container-images.yaml +++ b/.github/workflows/build-container-images.yaml @@ -42,6 +42,9 @@ jobs: - name: reis containerfile: services/reis/Dockerfile context: services/reis/ + - name: confluence-importer + containerfile: services/confluence-importer/Dockerfile + context: services/confluence-importer/ steps: - uses: actions/checkout@v4 - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3 diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 5957a31f..6d4d3dea 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -83,6 +83,7 @@ jobs: - frontend - backend - reis + - confluence-importer needs: - upstream-workflows - verify-version-strings diff --git a/services/confluence-importer/Dockerfile b/services/confluence-importer/Dockerfile index 2881f25c..89a81cb3 100644 --- a/services/confluence-importer/Dockerfile +++ b/services/confluence-importer/Dockerfile @@ -11,7 +11,7 @@ FROM python:3.13-slim WORKDIR /app -COPY --from=builder --chown=app:app /app/.venv /app/.venv +COPY --from=builder /app/.venv /app/.venv COPY confluence_importer/ confluence_importer/ COPY main.py . diff --git a/services/confluence-importer/confluence_importer/c4.py b/services/confluence-importer/confluence_importer/c4.py index 235f9f63..c1d738c5 100644 --- a/services/confluence-importer/confluence_importer/c4.py +++ b/services/confluence-importer/confluence_importer/c4.py @@ -11,6 +11,58 @@ def clear_previous_ingests() -> None: """ Clears all previously ingested files from the C4 bucket. 
""" + logger.info("Starting deletion of all Confluence pages from c4", bucket_id=bucket_id) + + deletion_counter = {"success": 0, "error": 0} + + files = fetch_bucket_files_list() + + for index, item in enumerate(files): + num_items = len(files) + file_name = item.get("fileName") + + is_confluence_page_file = file_name.startswith("confluence_page_") and file_name.endswith(".md") + + if is_confluence_page_file: + try: + delete_confluence_page(item.get("id")) + except Exception as e: + deletion_counter["error"] += 1 + logger.error( + "Error deleting Confluence page from c4", + bucket_id=bucket_id, + file_name=file_name, + progress=f"{index + 1}/{num_items}", + status="error", + error=str(e), + ) + else: + deletion_counter["success"] += 1 + logger.info( + "Delete Confluence page in c4", + bucket_id=bucket_id, + file_name=file_name, + progress=f"{index + 1}/{num_items}", + status="success", + ) + + if deletion_counter["error"] > 0: + logger.error( + "Deletion of Confluence pages from c4 completed with errors! 
See log for more information.", + bucket_id=bucket_id, + deletion_counter=deletion_counter, + ) + else: + logger.info( + "Deletion of Confluence pages from c4 completed.", bucket_id=bucket_id, deletion_counter=deletion_counter + ) + + +def delete_confluence_page(file_id): + requests.delete(f"{c4_base_url}/api/buckets/{bucket_id}/files/{file_id}", headers={"x-api-key": config.c4_token}) + + +def fetch_bucket_files_list(): page = 1 batch_size = 50 @@ -21,32 +73,18 @@ def clear_previous_ingests() -> None: response = requests.get(f"{c4_base_url}/api/buckets/{bucket_id}/files", headers={"x-api-key": config.c4_token}) total = response.json().get("total") + items_in_page = response.json().get("items") - items.extend(response.json().get("items")) + items.extend(items_in_page) - if page * batch_size >= total: + if page * batch_size >= total or len(items_in_page) == 0: break else: page += 1 logger.info("Full list of files in c4 fetched", bucket_id=bucket_id, num_files=total) - for index, item in enumerate(items): - num_items = len(items) - file_name = item.get("fileName") - - if file_name.startswith("confluence_page_") and file_name.endswith(".md"): - requests.delete( - f"{c4_base_url}/api/buckets/{bucket_id}/files/{item.get('id')}", headers={"x-api-key": config.c4_token} - ) - logger.info( - "Delete Confluence page in c4", - bucket_id=bucket_id, - file_name=file_name, - progress=f"{index + 1}/{num_items}", - status="success", - ) - logger.info("All Confluence pages deleted from c4", bucket_id=bucket_id) + return items def import_confluence_page(page_id: int, page_markdown: str) -> None: diff --git a/services/confluence-importer/confluence_importer/confluence.py b/services/confluence-importer/confluence_importer/confluence.py index b2c21937..f81f06fc 100644 --- a/services/confluence-importer/confluence_importer/confluence.py +++ b/services/confluence-importer/confluence_importer/confluence.py @@ -50,13 +50,14 @@ def get_pages_for_space(space_key: str) -> 
Generator[ConfluencePage]: A list of ConfluencePage dataclasses containing the page information and content as HTML """ crawling_done = False - batch_size = 100 + batch_size = 100 # Don't change. See comment regarding `get_all_pages_from_space_as_generator()` below. offset = 0 while not crawling_done: logger.debug("Fetch Pages for Confluence Space", space_key=space_key, offset=offset, limit=batch_size) - # It seems that limit is broken in `atlassian-python-api`. It always defaults to 100? TODO figure out whats up. + # It seems that the `limit` parameter is broken and is always 100. + # This is fine as long as we keep our `batch_size` at 100. result = confluence_api.get_all_pages_from_space_as_generator( space_key, start=offset, diff --git a/services/confluence-importer/main.py b/services/confluence-importer/main.py index 545a6c03..58c43d8a 100644 --- a/services/confluence-importer/main.py +++ b/services/confluence-importer/main.py @@ -9,35 +9,72 @@ page_ids = config.confluence_page_ids_to_import -def main(): - logger.info("Starting synchronization Confluence to c4") - clear_previous_ingests() - +def process_confluence_spaces(page_import_counter): logger.info("Starting import of Confluence Spaces", num_spaces=len(space_keys)) + for space_key in space_keys: logger.info("Starting import of Confluence Space", space_key=space_key) pages = confluence.get_pages_for_space(space_key) for index, page in enumerate(pages, start=1): - page_markdown = html_to_markdown(page) - import_confluence_page(page.id, page_markdown) - logger.info("Import Confluence page", space_key=space_key, page_id=page.id, page_count=f"{index}") + try: + page_markdown = html_to_markdown(page) + import_confluence_page(page.id, page_markdown) + page_import_counter["success"] += 1 + logger.info("Import Confluence page", space_key=space_key, page_id=page.id, page_count=f"{index}") + except Exception as e: + page_import_counter["error"] += 1 + logger.error( + "Error importing Confluence page", + 
error=str(e), + space_key=space_key, + page_id=page.id, + page_count=f"{index}", + ) logger.info("Import of Confluence Space completed", space_key=space_key) logger.info("Import of all Confluence Spaces completed") + +def process_individual_pages(page_import_counter): num_pages = len(page_ids) logger.info("Starting import of individual Confluence pages", num_pages=num_pages) + for index, page_id in enumerate(page_ids): - page = confluence.get_page(page_id) - page_markdown = html_to_markdown(page) - import_confluence_page(page_id, page_markdown) - print(f"Ingested individual Confluence page {index + 1}/{num_pages}.") - logger.info("Importing Confluence page", page_id=page_id, progress=f"{index + 1}/{num_pages}") + try: + page = confluence.get_page(page_id) + page_markdown = html_to_markdown(page) + import_confluence_page(page_id, page_markdown) + page_import_counter["success"] += 1 + logger.info("Import Confluence page", page_id=page_id, progress=f"{index + 1}/{num_pages}") + except Exception as e: + page_import_counter["error"] += 1 + logger.error( + "Error importing Confluence page", error=str(e), page_id=page_id, progress=f"{index + 1}/{num_pages}" + ) logger.info("Import of individual Confluence pages completed") - logger.info("Synchronization Confluence to c4 completed") + +def log_final_results(page_import_counter): + if page_import_counter["error"] > 0: + logger.error( + "Synchronization Confluence to c4 completed with errors! 
See log for more information.", + page_import_counter=page_import_counter, + ) + else: + logger.info("Synchronization Confluence to c4 completed.", page_import_counter=page_import_counter) + + +def main(): + logger.info("Starting synchronization Confluence to c4") + + clear_previous_ingests() + + page_import_counter = {"error": 0, "success": 0} + process_confluence_spaces(page_import_counter) + process_individual_pages(page_import_counter) + log_final_results(page_import_counter) main() diff --git a/services/confluence-importer/tests/test_c4.py b/services/confluence-importer/tests/test_c4.py new file mode 100644 index 00000000..0b7e6a59 --- /dev/null +++ b/services/confluence-importer/tests/test_c4.py @@ -0,0 +1,197 @@ +from pytest_mock import MockerFixture + +from confluence_importer.c4 import ( + clear_previous_ingests, + delete_confluence_page, + fetch_bucket_files_list, + import_confluence_page, +) + + +class TestC4: + def test_delete_confluence_page(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + file_id = "test-file-id" + + # act + delete_confluence_page(file_id) + + # assert + mock_requests.delete.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files/test-file-id", headers={"x-api-key": "test-token"} + ) + + def test_fetch_bucket_files_list_single_page(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + + mock_response = mocker.MagicMock() + 
mock_response.json.return_value = { + "total": 2, + "items": [ + {"id": "file1", "fileName": "confluence_page_1.md"}, + {"id": "file2", "fileName": "confluence_page_2.md"}, + ], + } + mock_requests.get.return_value = mock_response + + # act + result = fetch_bucket_files_list() + + # assert + mock_requests.get.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files", headers={"x-api-key": "test-token"} + ) + assert len(result) == 2 + assert result[0]["id"] == "file1" + assert result[1]["id"] == "file2" + mock_logger.info.assert_called_once() + + def test_fetch_bucket_files_list_multiple_pages(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + + first_response = mocker.MagicMock() + first_response.json.return_value = { + "total": 3, + "items": [ + {"id": "file1", "fileName": "confluence_page_1.md"}, + {"id": "file2", "fileName": "confluence_page_2.md"}, + ], + } + + second_response = mocker.MagicMock() + second_response.json.return_value = {"total": 3, "items": [{"id": "file3", "fileName": "confluence_page_3.md"}]} + + mock_requests.get.return_value = first_response + + # act + result = fetch_bucket_files_list() + + # assert + mock_requests.get.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files", headers={"x-api-key": "test-token"} + ) + assert len(result) == 2 + assert result[0]["id"] == "file1" + assert result[1]["id"] == "file2" + + def test_import_confluence_page_success(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + 
mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + + page_id = 12345 + page_markdown = "# Test Page" + + mock_response = mocker.MagicMock() + mock_response.status_code = 201 + mock_requests.post.return_value = mock_response + + # act + import_confluence_page(page_id, page_markdown) + + # assert + mock_requests.post.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files", + files={"file": (f"confluence_page_{page_id}.md", page_markdown, "text/markdown")}, + headers={"x-api-key": "test-token"}, + ) + + mock_logger.debug.assert_called_once() + mock_logger.error.assert_not_called() + + def test_import_confluence_page_error(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + + page_id = 12345 + page_markdown = "# Test Page" + + mock_response = mocker.MagicMock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + mock_requests.post.return_value = mock_response + + # act + import_confluence_page(page_id, page_markdown) + + # assert + mock_requests.post.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files", + files={"file": (f"confluence_page_{page_id}.md", page_markdown, "text/markdown")}, + headers={"x-api-key": "test-token"}, + ) + mock_logger.debug.assert_not_called() + mock_logger.error.assert_called_once() + + def test_clear_previous_ingests(self, mocker: MockerFixture): + # arrange + mock_fetch_bucket_files = mocker.patch( + "confluence_importer.c4.fetch_bucket_files_list", + return_value=[ + {"id": "file1", "fileName": "confluence_page_1.md"}, + {"id": "file2", "fileName": 
"other_file.txt"}, + {"id": "file3", "fileName": "confluence_page_2.md"}, + ], + ) + mock_delete_confluence_page = mocker.patch("confluence_importer.c4.delete_confluence_page") + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + + # act + clear_previous_ingests() + + # assert + mock_fetch_bucket_files.assert_called_once() + assert mock_delete_confluence_page.call_count == 2 + mock_delete_confluence_page.assert_any_call("file1") + mock_delete_confluence_page.assert_any_call("file3") + mock_logger.info.assert_called() + + def test_clear_previous_ingests_with_error(self, mocker: MockerFixture): + # arrange + mock_fetch_bucket_files = mocker.patch( + "confluence_importer.c4.fetch_bucket_files_list", + return_value=[ + {"id": "file1", "fileName": "confluence_page_1.md"}, + {"id": "file2", "fileName": "confluence_page_2.md"}, + ], + ) + + def delete_side_effect(file_id): + if file_id == "file2": + raise Exception("Delete failed") + + mock_delete_confluence_page = mocker.patch( + "confluence_importer.c4.delete_confluence_page", side_effect=delete_side_effect + ) + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + + # act + clear_previous_ingests() + + # assert + mock_fetch_bucket_files.assert_called_once() + assert mock_delete_confluence_page.call_count == 2 + mock_logger.error.assert_called() diff --git a/services/confluence-importer/tests/test_confluence.py b/services/confluence-importer/tests/test_confluence.py index d196423e..9aae689f 100644 --- a/services/confluence-importer/tests/test_confluence.py +++ b/services/confluence-importer/tests/test_confluence.py @@ -1,22 +1,144 @@ from pytest_mock import MockerFixture -from confluence_importer.confluence import get_page +from confluence_importer.confluence import get_page, get_pages_for_space, ConfluencePage, confluence_url class TestConfluence: def 
test_get_page(self, mocker: MockerFixture): # arrange + page_id = 123456 + mock_page_data = { + "body": {"storage": {"value": "

Test Page

"}}, + "history": {"lastUpdated": {"when": "2025-07-29T13:56:00.000Z"}}, + "_links": {"webui": "/rest/api/content/123456"}, + } + mock_get_page_by_id = mocker.patch( "confluence_importer.confluence.confluence_api.get_page_by_id", - return_value={ - "body": {"storage": {"value": "

Test Page

"}}, + return_value=mock_page_data, + ) + + # act + result = get_page(page_id) + + # assert + mock_get_page_by_id.assert_called_once_with(page_id, expand="body.storage,history.lastUpdated") + assert isinstance(result, ConfluencePage) + assert result.id == page_id + assert result.last_updated == "2025-07-29T13:56:00.000Z" + assert result.url == f"{confluence_url}/rest/api/content/123456" + assert result.html_content == "

Test Page

" + + def test_get_pages_for_space(self, mocker: MockerFixture): + # arrange + space_key = "TEST" + mock_pages = [ + { + "id": "123456", "history": {"lastUpdated": {"when": "2025-07-29T13:56:00.000Z"}}, - "_links": {"webui": "https://confluence.example.com/rest/api/content/123456"}, + "_links": {"webui": "/rest/api/content/123456"}, + "body": {"storage": {"value": "

Test Page 1

"}}, }, + { + "id": "789012", + "history": {"lastUpdated": {"when": "2025-07-30T10:15:00.000Z"}}, + "_links": {"webui": "/rest/api/content/789012"}, + "body": {"storage": {"value": "

Test Page 2

"}}, + }, + ] + + mock_generator = mocker.patch( + "confluence_importer.confluence.confluence_api.get_all_pages_from_space_as_generator", + return_value=mock_pages, ) + mocker.patch("confluence_importer.confluence.logger.debug") + mocker.patch("confluence_importer.confluence.logger.info") + # act - get_page(123456) + results = list(get_pages_for_space(space_key)) # assert - mock_get_page_by_id.assert_called_once_with(123456, expand="body.storage,history.lastUpdated") + mock_generator.assert_called_once_with( + space_key, + start=0, + limit=100, + content_type="page", + expand="body.storage,history.lastUpdated", + status="current", + ) + + assert len(results) == 2 + + assert isinstance(results[0], ConfluencePage) + assert results[0].id == "123456" + assert results[0].last_updated == "2025-07-29T13:56:00.000Z" + assert results[0].url == f"{confluence_url}/rest/api/content/123456" + assert results[0].html_content == "

Test Page 1

" + + assert isinstance(results[1], ConfluencePage) + assert results[1].id == "789012" + assert results[1].last_updated == "2025-07-30T10:15:00.000Z" + assert results[1].url == f"{confluence_url}/rest/api/content/789012" + assert results[1].html_content == "

Test Page 2

" + + def test_get_pages_for_space_pagination(self, mocker: MockerFixture): + # arrange + space_key = "TEST" + + first_batch = [ + { + "id": f"{i}", + "history": {"lastUpdated": {"when": "2025-07-29T13:56:00.000Z"}}, + "_links": {"webui": f"/rest/api/content/{i}"}, + "body": {"storage": {"value": f"

Page {i}

"}}, + } + for i in range(100) + ] + + second_batch = [ + { + "id": f"{i + 100}", + "history": {"lastUpdated": {"when": "2025-07-30T10:15:00.000Z"}}, + "_links": {"webui": f"/rest/api/content/{i + 100}"}, + "body": {"storage": {"value": f"

Page {i + 100}

"}}, + } + for i in range(50) + ] + + mock_generator = mocker.patch( + "confluence_importer.confluence.confluence_api.get_all_pages_from_space_as_generator", + side_effect=[first_batch, second_batch], + ) + + mocker.patch("confluence_importer.confluence.logger.debug") + mocker.patch("confluence_importer.confluence.logger.info") + + # act + results = list(get_pages_for_space(space_key)) + + # assert + assert mock_generator.call_count == 2 + assert mock_generator.call_args_list[0][0] == (space_key,) + assert mock_generator.call_args_list[0][1] == { + "start": 0, + "limit": 100, + "content_type": "page", + "expand": "body.storage,history.lastUpdated", + "status": "current", + } + + assert mock_generator.call_args_list[1][0] == (space_key,) + assert mock_generator.call_args_list[1][1] == { + "start": 100, + "limit": 100, + "content_type": "page", + "expand": "body.storage,history.lastUpdated", + "status": "current", + } + + assert len(results) == 150 + assert results[0].id == "0" + assert results[99].id == "99" + assert results[100].id == "100" + assert results[149].id == "149"