From 2c936acc2a3ecc2e0131f1ffee0f13640f5d0d50 Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 15:17:09 +0200 Subject: [PATCH 01/15] fix(confluence-importer): remove `--chown` flag from `.venv` copy command --- services/confluence-importer/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/confluence-importer/Dockerfile b/services/confluence-importer/Dockerfile index 2881f25c..89a81cb3 100644 --- a/services/confluence-importer/Dockerfile +++ b/services/confluence-importer/Dockerfile @@ -11,7 +11,7 @@ FROM python:3.13-slim WORKDIR /app -COPY --from=builder --chown=app:app /app/.venv /app/.venv +COPY --from=builder /app/.venv /app/.venv COPY confluence_importer/ confluence_importer/ COPY main.py . From cbf6a3e264a77b609cc35f6e188b70e68564d92d Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 15:20:07 +0200 Subject: [PATCH 02/15] ci(confluence-importer): add confluence-importer to `build-container-images` workflow --- .github/workflows/build-container-images.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-container-images.yaml b/.github/workflows/build-container-images.yaml index 49223de8..79861726 100644 --- a/.github/workflows/build-container-images.yaml +++ b/.github/workflows/build-container-images.yaml @@ -42,6 +42,9 @@ jobs: - name: reis containerfile: services/reis/Dockerfile context: services/reis/ + - name: confluence-importer + containerfile: services/confluence-importer/Dockerfile + context: services/confluence-importer/ steps: - uses: actions/checkout@v4 - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3 From 439a426ca61b107f4b12508953196487974fb5bf Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 15:24:56 +0200 Subject: [PATCH 03/15] ci(confluence-importer): add confluence-importer to `publish` workflow --- .github/workflows/publish.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 5957a31f..6d4d3dea 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -83,6 +83,7 @@ jobs: - frontend - backend - reis + - confluence-importer needs: - upstream-workflows - verify-version-strings From eea74a40b91b7c552a2a70839e78cac930e29676 Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 15:36:43 +0200 Subject: [PATCH 04/15] fix(confluence-importer): handle errors during page import and track success/error counts --- services/confluence-importer/main.py | 39 +++++++++++++++++++++------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/services/confluence-importer/main.py b/services/confluence-importer/main.py index 545a6c03..a5b48aef 100644 --- a/services/confluence-importer/main.py +++ b/services/confluence-importer/main.py @@ -11,6 +11,9 @@ def main(): logger.info("Starting synchronization Confluence to c4") + + page_import_counter = {"error": 0, "success": 0} + clear_previous_ingests() logger.info("Starting import of Confluence Spaces", num_spaces=len(space_keys)) @@ -19,9 +22,17 @@ def main(): pages = confluence.get_pages_for_space(space_key) for index, page in enumerate(pages, start=1): - page_markdown = html_to_markdown(page) - import_confluence_page(page.id, page_markdown) - logger.info("Import Confluence page", space_key=space_key, page_id=page.id, page_count=f"{index}") + try: + page_markdown = html_to_markdown(page) + import_confluence_page(page.id, page_markdown) + except Exception as e: + 
page_import_counter["error"] += 1 + logger.error( + "Error importing Confluence page", error=str(e), space_key=space_key, page_id=page.id, page_count=f"{index}" + ) + else: + page_import_counter["success"] += 1 + logger.info("Import Confluence page", space_key=space_key, page_id=page.id, page_count=f"{index}") logger.info("Import of Confluence Space completed", space_key=space_key) logger.info("Import of all Confluence Spaces completed") @@ -29,15 +40,25 @@ def main(): num_pages = len(page_ids) logger.info("Starting import of individual Confluence pages", num_pages=num_pages) for index, page_id in enumerate(page_ids): - page = confluence.get_page(page_id) - page_markdown = html_to_markdown(page) - import_confluence_page(page_id, page_markdown) - print(f"Ingested individual Confluence page {index + 1}/{num_pages}.") - logger.info("Importing Confluence page", page_id=page_id, progress=f"{index + 1}/{num_pages}") + try: + page = confluence.get_page(page_id) + page_markdown = html_to_markdown(page) + import_confluence_page(page_id, page_markdown) + except Exception as e: + page_import_counter["error"] += 1 + logger.error( + "Error importing Confluence page", error=str(e), page_id=page_id, progress=f"{index + 1}/{num_pages}" + ) + else: + page_import_counter["success"] += 1 + logger.info("Import Confluence page", page_id=page_id, progress=f"{index + 1}/{num_pages}") logger.info("Import of individual Confluence pages completed") - logger.info("Synchronization Confluence to c4 completed") + if page_import_counter["error"] > 0: + logger.error("Synchronization Confluence to c4 completed with errors! See log for more information.", page_import_counter=page_import_counter) + else: + logger.info("Synchronization Confluence to c4 completed.", page_import_counter) main() From 7782240a07b9fb56e2a6555e2eedb7b758449633 Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 15:54:12 +0200 Subject: [PATCH 05/15] refactor(confluence-importer): extract logic into modular functions for improved readability and maintainability --- services/confluence-importer/main.py | 41 ++++++++++++++++++---------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/services/confluence-importer/main.py b/services/confluence-importer/main.py index a5b48aef..a7c69665 100644 --- a/services/confluence-importer/main.py +++ b/services/confluence-importer/main.py @@ -9,14 +9,9 @@ page_ids = config.confluence_page_ids_to_import -def main(): - logger.info("Starting synchronization Confluence to c4") - - page_import_counter = {"error": 0, "success": 0} - - clear_previous_ingests() - +def process_confluence_spaces(page_import_counter): logger.info("Starting import of Confluence Spaces", num_spaces=len(space_keys)) + for space_key in space_keys: logger.info("Starting import of Confluence Space", space_key=space_key) pages = confluence.get_pages_for_space(space_key) @@ -25,40 +20,56 @@ def main(): try: page_markdown = html_to_markdown(page) import_confluence_page(page.id, page_markdown) + page_import_counter["success"] += 1 + logger.info("Import Confluence page", space_key=space_key, page_id=page.id, page_count=f"{index}") except Exception as e: page_import_counter["error"] += 1 logger.error( - "Error importing Confluence page", error=str(e), space_key=space_key, page_id=page.id, page_count=f"{index}" + "Error importing Confluence page", error=str(e), space_key=space_key, page_id=page.id, + page_count=f"{index}" ) - else: - page_import_counter["success"] += 1 - logger.info("Import Confluence page", 
space_key=space_key, page_id=page.id, page_count=f"{index}") logger.info("Import of Confluence Space completed", space_key=space_key) logger.info("Import of all Confluence Spaces completed") + +def process_individual_pages(page_import_counter): num_pages = len(page_ids) logger.info("Starting import of individual Confluence pages", num_pages=num_pages) + for index, page_id in enumerate(page_ids): try: page = confluence.get_page(page_id) page_markdown = html_to_markdown(page) import_confluence_page(page_id, page_markdown) + page_import_counter["success"] += 1 + logger.info("Import Confluence page", page_id=page_id, progress=f"{index + 1}/{num_pages}") except Exception as e: page_import_counter["error"] += 1 logger.error( "Error importing Confluence page", error=str(e), page_id=page_id, progress=f"{index + 1}/{num_pages}" ) - else: - page_import_counter["success"] += 1 - logger.info("Import Confluence page", page_id=page_id, progress=f"{index + 1}/{num_pages}") logger.info("Import of individual Confluence pages completed") + +def log_final_results(page_import_counter): if page_import_counter["error"] > 0: - logger.error("Synchronization Confluence to c4 completed with errors! See log for more information.", page_import_counter=page_import_counter) + logger.error("Synchronization Confluence to c4 completed with errors! See log for more information.", + page_import_counter=page_import_counter) else: logger.info("Synchronization Confluence to c4 completed.", page_import_counter) +def main(): + logger.info("Starting synchronization Confluence to c4") + + clear_previous_ingests() + + page_import_counter = {"error": 0, "success": 0} + process_confluence_spaces(page_import_counter) + process_individual_pages(page_import_counter) + log_final_results(page_import_counter) + + main() From ccfac4f114e69908325c12793004098dc075f77b Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 15:55:29 +0200 Subject: [PATCH 06/15] style(confluence-importer): reformat using ruff --- services/confluence-importer/main.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/services/confluence-importer/main.py b/services/confluence-importer/main.py index a7c69665..58c43d8a 100644 --- a/services/confluence-importer/main.py +++ b/services/confluence-importer/main.py @@ -25,8 +25,11 @@ def process_confluence_spaces(page_import_counter): except Exception as e: page_import_counter["error"] += 1 logger.error( - "Error importing Confluence page", error=str(e), space_key=space_key, page_id=page.id, - page_count=f"{index}" + "Error importing Confluence page", + error=str(e), + space_key=space_key, + page_id=page.id, + page_count=f"{index}", ) logger.info("Import of Confluence Space completed", space_key=space_key) @@ -55,8 +58,10 @@ def process_individual_pages(page_import_counter): def log_final_results(page_import_counter): if page_import_counter["error"] > 0: - logger.error("Synchronization Confluence to c4 completed with errors! See log for more information.", - page_import_counter=page_import_counter) + logger.error( + "Synchronization Confluence to c4 completed with errors! 
See log for more information.", + page_import_counter=page_import_counter, + ) else: logger.info("Synchronization Confluence to c4 completed.", page_import_counter) From a369ef106de25dffeba880fbbe3c78c6f8df3437 Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 16:04:39 +0200 Subject: [PATCH 07/15] test(confluence-importer): additional unit tests for confluence.py --- .../tests/test_confluence.py | 134 +++++++++++++++++- 1 file changed, 128 insertions(+), 6 deletions(-) diff --git a/services/confluence-importer/tests/test_confluence.py b/services/confluence-importer/tests/test_confluence.py index d196423e..9aae689f 100644 --- a/services/confluence-importer/tests/test_confluence.py +++ b/services/confluence-importer/tests/test_confluence.py @@ -1,22 +1,144 @@ from pytest_mock import MockerFixture -from confluence_importer.confluence import get_page +from confluence_importer.confluence import get_page, get_pages_for_space, ConfluencePage, confluence_url class TestConfluence: def test_get_page(self, mocker: MockerFixture): # arrange + page_id = 123456 + mock_page_data = { + "body": {"storage": {"value": "
<p>Test Page</p>
"}}, + "history": {"lastUpdated": {"when": "2025-07-29T13:56:00.000Z"}}, + "_links": {"webui": "/rest/api/content/123456"}, + } + mock_get_page_by_id = mocker.patch( "confluence_importer.confluence.confluence_api.get_page_by_id", - return_value={ - "body": {"storage": {"value": "
<p>Test Page</p>
"}}, + return_value=mock_page_data, + ) + + # act + result = get_page(page_id) + + # assert + mock_get_page_by_id.assert_called_once_with(page_id, expand="body.storage,history.lastUpdated") + assert isinstance(result, ConfluencePage) + assert result.id == page_id + assert result.last_updated == "2025-07-29T13:56:00.000Z" + assert result.url == f"{confluence_url}/rest/api/content/123456" + assert result.html_content == "
<p>Test Page</p>
" + + def test_get_pages_for_space(self, mocker: MockerFixture): + # arrange + space_key = "TEST" + mock_pages = [ + { + "id": "123456", "history": {"lastUpdated": {"when": "2025-07-29T13:56:00.000Z"}}, - "_links": {"webui": "https://confluence.example.com/rest/api/content/123456"}, + "_links": {"webui": "/rest/api/content/123456"}, + "body": {"storage": {"value": "
<p>Test Page 1</p>
"}}, }, + { + "id": "789012", + "history": {"lastUpdated": {"when": "2025-07-30T10:15:00.000Z"}}, + "_links": {"webui": "/rest/api/content/789012"}, + "body": {"storage": {"value": "
<p>Test Page 2</p>
"}}, + }, + ] + + mock_generator = mocker.patch( + "confluence_importer.confluence.confluence_api.get_all_pages_from_space_as_generator", + return_value=mock_pages, ) + mocker.patch("confluence_importer.confluence.logger.debug") + mocker.patch("confluence_importer.confluence.logger.info") + # act - get_page(123456) + results = list(get_pages_for_space(space_key)) # assert - mock_get_page_by_id.assert_called_once_with(123456, expand="body.storage,history.lastUpdated") + mock_generator.assert_called_once_with( + space_key, + start=0, + limit=100, + content_type="page", + expand="body.storage,history.lastUpdated", + status="current", + ) + + assert len(results) == 2 + + assert isinstance(results[0], ConfluencePage) + assert results[0].id == "123456" + assert results[0].last_updated == "2025-07-29T13:56:00.000Z" + assert results[0].url == f"{confluence_url}/rest/api/content/123456" + assert results[0].html_content == "
<p>Test Page 1</p>
" + + assert isinstance(results[1], ConfluencePage) + assert results[1].id == "789012" + assert results[1].last_updated == "2025-07-30T10:15:00.000Z" + assert results[1].url == f"{confluence_url}/rest/api/content/789012" + assert results[1].html_content == "
<p>Test Page 2</p>
" + + def test_get_pages_for_space_pagination(self, mocker: MockerFixture): + # arrange + space_key = "TEST" + + first_batch = [ + { + "id": f"{i}", + "history": {"lastUpdated": {"when": "2025-07-29T13:56:00.000Z"}}, + "_links": {"webui": f"/rest/api/content/{i}"}, + "body": {"storage": {"value": f"
<p>Page {i}</p>
"}}, + } + for i in range(100) + ] + + second_batch = [ + { + "id": f"{i + 100}", + "history": {"lastUpdated": {"when": "2025-07-30T10:15:00.000Z"}}, + "_links": {"webui": f"/rest/api/content/{i + 100}"}, + "body": {"storage": {"value": f"
<p>Page {i + 100}</p>
"}}, + } + for i in range(50) + ] + + mock_generator = mocker.patch( + "confluence_importer.confluence.confluence_api.get_all_pages_from_space_as_generator", + side_effect=[first_batch, second_batch], + ) + + mocker.patch("confluence_importer.confluence.logger.debug") + mocker.patch("confluence_importer.confluence.logger.info") + + # act + results = list(get_pages_for_space(space_key)) + + # assert + assert mock_generator.call_count == 2 + assert mock_generator.call_args_list[0][0] == (space_key,) + assert mock_generator.call_args_list[0][1] == { + "start": 0, + "limit": 100, + "content_type": "page", + "expand": "body.storage,history.lastUpdated", + "status": "current", + } + + assert mock_generator.call_args_list[1][0] == (space_key,) + assert mock_generator.call_args_list[1][1] == { + "start": 100, + "limit": 100, + "content_type": "page", + "expand": "body.storage,history.lastUpdated", + "status": "current", + } + + assert len(results) == 150 + assert results[0].id == "0" + assert results[99].id == "99" + assert results[100].id == "100" + assert results[149].id == "149" From 09933bcc7785f5ac8be16fa0960632f0fc77defb Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 16:12:09 +0200 Subject: [PATCH 08/15] refactor(confluence-importer): extract `fetch_bucket_files_list()` method --- .../confluence_importer/c4.py | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/services/confluence-importer/confluence_importer/c4.py b/services/confluence-importer/confluence_importer/c4.py index 235f9f63..4e1552fa 100644 --- a/services/confluence-importer/confluence_importer/c4.py +++ b/services/confluence-importer/confluence_importer/c4.py @@ -11,6 +11,27 @@ def clear_previous_ingests() -> None: """ Clears all previously ingested files from the C4 bucket. 
""" + files = fetch_bucket_files_list() + + for index, item in enumerate(files): + num_items = len(files) + file_name = item.get("fileName") + + if file_name.startswith("confluence_page_") and file_name.endswith(".md"): + requests.delete( + f"{c4_base_url}/api/buckets/{bucket_id}/files/{item.get('id')}", headers={"x-api-key": config.c4_token} + ) + logger.info( + "Delete Confluence page in c4", + bucket_id=bucket_id, + file_name=file_name, + progress=f"{index + 1}/{num_items}", + status="success", + ) + logger.info("All Confluence pages deleted from c4", bucket_id=bucket_id) + + +def fetch_bucket_files_list(): page = 1 batch_size = 50 @@ -31,22 +52,7 @@ def clear_previous_ingests() -> None: logger.info("Full list of files in c4 fetched", bucket_id=bucket_id, num_files=total) - for index, item in enumerate(items): - num_items = len(items) - file_name = item.get("fileName") - - if file_name.startswith("confluence_page_") and file_name.endswith(".md"): - requests.delete( - f"{c4_base_url}/api/buckets/{bucket_id}/files/{item.get('id')}", headers={"x-api-key": config.c4_token} - ) - logger.info( - "Delete Confluence page in c4", - bucket_id=bucket_id, - file_name=file_name, - progress=f"{index + 1}/{num_items}", - status="success", - ) - logger.info("All Confluence pages deleted from c4", bucket_id=bucket_id) + return items def import_confluence_page(page_id: int, page_markdown: str) -> None: From 820597dc7795816fb1d881eb744af4f303127eee Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 16:14:55 +0200 Subject: [PATCH 09/15] refactor(confluence-importer): introduce descriptive variable for deletion condition --- services/confluence-importer/confluence_importer/c4.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/confluence-importer/confluence_importer/c4.py b/services/confluence-importer/confluence_importer/c4.py index 4e1552fa..6e53abe7 100644 --- a/services/confluence-importer/confluence_importer/c4.py +++ b/services/confluence-importer/confluence_importer/c4.py @@ -17,7 +17,9 @@ def clear_previous_ingests() -> None: num_items = len(files) file_name = item.get("fileName") - if file_name.startswith("confluence_page_") and file_name.endswith(".md"): + is_confluence_page_file = file_name.startswith("confluence_page_") and file_name.endswith(".md") + + if is_confluence_page_file: requests.delete( f"{c4_base_url}/api/buckets/{bucket_id}/files/{item.get('id')}", headers={"x-api-key": config.c4_token} ) From 7728517f677a41d7582afd1de86403d79566dbc7 Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 16:17:02 +0200 Subject: [PATCH 10/15] refactor(confluence-importer): extract `delete_confluence_page()` method --- services/confluence-importer/confluence_importer/c4.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/services/confluence-importer/confluence_importer/c4.py b/services/confluence-importer/confluence_importer/c4.py index 6e53abe7..2ff2f3a7 100644 --- a/services/confluence-importer/confluence_importer/c4.py +++ b/services/confluence-importer/confluence_importer/c4.py @@ -20,9 +20,7 @@ def clear_previous_ingests() -> None: is_confluence_page_file = file_name.startswith("confluence_page_") and file_name.endswith(".md") if is_confluence_page_file: - requests.delete( - f"{c4_base_url}/api/buckets/{bucket_id}/files/{item.get('id')}", headers={"x-api-key": config.c4_token} - ) + delete_confluence_page(item.get("id")) logger.info( "Delete Confluence page in c4", bucket_id=bucket_id, @@ -33,6 +31,12 @@ 
def clear_previous_ingests() -> None: logger.info("All Confluence pages deleted from c4", bucket_id=bucket_id) +def delete_confluence_page(file_id): + requests.delete( + f"{c4_base_url}/api/buckets/{bucket_id}/files/{file_id}", headers={"x-api-key": config.c4_token} + ) + + def fetch_bucket_files_list(): page = 1 batch_size = 50 From 47c5a95181923d13ac47d8100a2394d99736045e Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 16:19:17 +0200 Subject: [PATCH 11/15] chore(confluence-importer): handle errors during page deletion --- .../confluence_importer/c4.py | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/services/confluence-importer/confluence_importer/c4.py b/services/confluence-importer/confluence_importer/c4.py index 2ff2f3a7..a1d3c8d2 100644 --- a/services/confluence-importer/confluence_importer/c4.py +++ b/services/confluence-importer/confluence_importer/c4.py @@ -20,21 +20,31 @@ def clear_previous_ingests() -> None: is_confluence_page_file = file_name.startswith("confluence_page_") and file_name.endswith(".md") if is_confluence_page_file: - delete_confluence_page(item.get("id")) - logger.info( - "Delete Confluence page in c4", - bucket_id=bucket_id, - file_name=file_name, - progress=f"{index + 1}/{num_items}", - status="success", - ) + try: + delete_confluence_page(item.get("id")) + except Exception as e: + logger.error( + "Error deleting Confluence page from c4", + bucket_id=bucket_id, + file_name=file_name, + progress=f"{index + 1}/{num_items}", + status="error", + error=str(e), + ) + else: + logger.info( + "Delete Confluence page in c4", + bucket_id=bucket_id, + file_name=file_name, + progress=f"{index + 1}/{num_items}", + status="success", + ) + logger.info("All Confluence pages deleted from c4", bucket_id=bucket_id) def delete_confluence_page(file_id): - requests.delete( - f"{c4_base_url}/api/buckets/{bucket_id}/files/{file_id}", headers={"x-api-key": config.c4_token} - ) + requests.delete(f"{c4_base_url}/api/buckets/{bucket_id}/files/{file_id}", headers={"x-api-key": config.c4_token}) def fetch_bucket_files_list(): From 62b53d860609d67e1f6442187136c3559d117eef Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 16:24:17 +0200 Subject: [PATCH 12/15] chore(confluence-importer): track success/error counts during page deletion and log summary upon completion --- .../confluence_importer/c4.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/services/confluence-importer/confluence_importer/c4.py b/services/confluence-importer/confluence_importer/c4.py index a1d3c8d2..608baca6 100644 --- a/services/confluence-importer/confluence_importer/c4.py +++ b/services/confluence-importer/confluence_importer/c4.py @@ -11,6 +11,10 @@ def clear_previous_ingests() -> None: """ Clears all previously ingested files from the C4 bucket. 
""" + logger.info("Starting deletion of all Confluence pages from c4", bucket_id=bucket_id) + + deletion_counter = {"success": 0, "error": 0} + files = fetch_bucket_files_list() for index, item in enumerate(files): @@ -23,6 +27,7 @@ def clear_previous_ingests() -> None: try: delete_confluence_page(item.get("id")) except Exception as e: + deletion_counter["error"] += 1 logger.error( "Error deleting Confluence page from c4", bucket_id=bucket_id, @@ -32,6 +37,7 @@ def clear_previous_ingests() -> None: error=str(e), ) else: + deletion_counter["success"] += 1 logger.info( "Delete Confluence page in c4", bucket_id=bucket_id, @@ -40,7 +46,16 @@ def clear_previous_ingests() -> None: status="success", ) - logger.info("All Confluence pages deleted from c4", bucket_id=bucket_id) + if deletion_counter["error"] > 0: + logger.error( + "Deletion of Confluence pages from c4 completed with errors! See log for more information.", + bucket_id=bucket_id, + deletion_counter=deletion_counter, + ) + else: + logger.info( + "Deletion of Confluence pages from c4 completed.", bucket_id=bucket_id, deletion_counter=deletion_counter + ) def delete_confluence_page(file_id): From b83218e2f7fd1239d8b2455fbc7cd4f67f7ccdcb Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 16:53:44 +0200 Subject: [PATCH 13/15] test(confluence-importer): tests for `c4.py` --- services/confluence-importer/tests/test_c4.py | 197 ++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 services/confluence-importer/tests/test_c4.py diff --git a/services/confluence-importer/tests/test_c4.py b/services/confluence-importer/tests/test_c4.py new file mode 100644 index 00000000..0b7e6a59 --- /dev/null +++ b/services/confluence-importer/tests/test_c4.py @@ -0,0 +1,197 @@ +from pytest_mock import MockerFixture + +from confluence_importer.c4 import ( + clear_previous_ingests, + delete_confluence_page, + fetch_bucket_files_list, + import_confluence_page, +) + + +class TestC4: + def test_delete_confluence_page(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + file_id = "test-file-id" + + # act + delete_confluence_page(file_id) + + # assert + mock_requests.delete.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files/test-file-id", headers={"x-api-key": "test-token"} + ) + + def test_fetch_bucket_files_list_single_page(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + + mock_response = mocker.MagicMock() + mock_response.json.return_value = { + "total": 2, + "items": [ + {"id": "file1", "fileName": "confluence_page_1.md"}, + {"id": "file2", "fileName": "confluence_page_2.md"}, + ], + } + mock_requests.get.return_value = mock_response + + # act + result = fetch_bucket_files_list() + + # assert + mock_requests.get.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files", headers={"x-api-key": "test-token"} + ) + assert len(result) == 2 + assert result[0]["id"] == "file1" + assert result[1]["id"] == 
"file2" + mock_logger.info.assert_called_once() + + def test_fetch_bucket_files_list_multiple_pages(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + + first_response = mocker.MagicMock() + first_response.json.return_value = { + "total": 3, + "items": [ + {"id": "file1", "fileName": "confluence_page_1.md"}, + {"id": "file2", "fileName": "confluence_page_2.md"}, + ], + } + + second_response = mocker.MagicMock() + second_response.json.return_value = {"total": 3, "items": [{"id": "file3", "fileName": "confluence_page_3.md"}]} + + mock_requests.get.return_value = first_response + + # act + result = fetch_bucket_files_list() + + # assert + mock_requests.get.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files", headers={"x-api-key": "test-token"} + ) + assert len(result) == 2 + assert result[0]["id"] == "file1" + assert result[1]["id"] == "file2" + + def test_import_confluence_page_success(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + + page_id = 12345 + page_markdown = "# Test Page" + + mock_response = mocker.MagicMock() + mock_response.status_code = 201 + mock_requests.post.return_value = mock_response + + # act + import_confluence_page(page_id, page_markdown) + + # assert + mock_requests.post.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files", + files={"file": (f"confluence_page_{page_id}.md", page_markdown, "text/markdown")}, + headers={"x-api-key": "test-token"}, + ) + + mock_logger.debug.assert_called_once() + mock_logger.error.assert_not_called() + + def test_import_confluence_page_error(self, mocker: MockerFixture): + # arrange + mock_requests = mocker.patch("confluence_importer.c4.requests") + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.c4_base_url", "http://test-url") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + mocker.patch("confluence_importer.c4.config.c4_token", "test-token") + + page_id = 12345 + page_markdown = "# Test Page" + + mock_response = mocker.MagicMock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + mock_requests.post.return_value = mock_response + + # act + import_confluence_page(page_id, page_markdown) + + # assert + mock_requests.post.assert_called_once_with( + "http://test-url/api/buckets/test-bucket/files", + files={"file": (f"confluence_page_{page_id}.md", page_markdown, "text/markdown")}, + headers={"x-api-key": "test-token"}, + ) + mock_logger.debug.assert_not_called() + mock_logger.error.assert_called_once() + + def test_clear_previous_ingests(self, mocker: MockerFixture): + # arrange + mock_fetch_bucket_files = mocker.patch( + "confluence_importer.c4.fetch_bucket_files_list", + return_value=[ + {"id": "file1", "fileName": "confluence_page_1.md"}, + {"id": "file2", "fileName": "other_file.txt"}, + {"id": "file3", "fileName": "confluence_page_2.md"}, + ], + ) + 
mock_delete_confluence_page = mocker.patch("confluence_importer.c4.delete_confluence_page") + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + + # act + clear_previous_ingests() + + # assert + mock_fetch_bucket_files.assert_called_once() + assert mock_delete_confluence_page.call_count == 2 + mock_delete_confluence_page.assert_any_call("file1") + mock_delete_confluence_page.assert_any_call("file3") + mock_logger.info.assert_called() + + def test_clear_previous_ingests_with_error(self, mocker: MockerFixture): + # arrange + mock_fetch_bucket_files = mocker.patch( + "confluence_importer.c4.fetch_bucket_files_list", + return_value=[ + {"id": "file1", "fileName": "confluence_page_1.md"}, + {"id": "file2", "fileName": "confluence_page_2.md"}, + ], + ) + + def delete_side_effect(file_id): + if file_id == "file2": + raise Exception("Delete failed") + + mock_delete_confluence_page = mocker.patch( + "confluence_importer.c4.delete_confluence_page", side_effect=delete_side_effect + ) + mock_logger = mocker.patch("confluence_importer.c4.logger") + mocker.patch("confluence_importer.c4.bucket_id", "test-bucket") + + # act + clear_previous_ingests() + + # assert + mock_fetch_bucket_files.assert_called_once() + assert mock_delete_confluence_page.call_count == 2 + mock_logger.error.assert_called() From cdfbdfa04ef05eedbe574b6cf0131a6811c1aba9 Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 18:35:27 +0200 Subject: [PATCH 14/15] chore(confluence-importer): add second loop breaking condition for robustness --- services/confluence-importer/confluence_importer/c4.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/services/confluence-importer/confluence_importer/c4.py b/services/confluence-importer/confluence_importer/c4.py index 608baca6..c1d738c5 100644 --- a/services/confluence-importer/confluence_importer/c4.py +++ b/services/confluence-importer/confluence_importer/c4.py @@ -73,10 +73,11 @@ def fetch_bucket_files_list(): response = requests.get(f"{c4_base_url}/api/buckets/{bucket_id}/files", headers={"x-api-key": config.c4_token}) total = response.json().get("total") + items_in_page = response.json().get("items") - items.extend(response.json().get("items")) + items.extend(items_in_page) - if page * batch_size >= total: + if page * batch_size >= total or len(items_in_page) == 0: break else: page += 1 From 1f130b1386794f4ce9eff85b522fc24deace91ce Mon Sep 17 00:00:00 2001 From: Marcus Legendre Date: Thu, 31 Jul 2025 18:43:17 +0200 Subject: [PATCH 15/15] docs(confluence-importer): replace TODO message with a clear warning --- .../confluence-importer/confluence_importer/confluence.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/services/confluence-importer/confluence_importer/confluence.py b/services/confluence-importer/confluence_importer/confluence.py index b2c21937..f81f06fc 100644 --- a/services/confluence-importer/confluence_importer/confluence.py +++ b/services/confluence-importer/confluence_importer/confluence.py @@ -50,13 +50,14 @@ def get_pages_for_space(space_key: str) -> Generator[ConfluencePage]: A list of ConfluencePage dataclasses containing the page information and content as HTML """ crawling_done = False - batch_size = 100 + batch_size = 100 # Don't change. See comment regarding `get_all_pages_from_space_as_generator()` below. 
offset = 0 while not crawling_done: logger.debug("Fetch Pages for Confluence Space", space_key=space_key, offset=offset, limit=batch_size) - # It seems that limit is broken in `atlassian-python-api`. It always defaults to 100? TODO figure out whats up. + # It seems that the `limit` parameter is broken and is always 100. + # This is fine as long as we keep our `batch_size` at 100. result = confluence_api.get_all_pages_from_space_as_generator( space_key, start=offset,