3 changes: 3 additions & 0 deletions .github/workflows/build-container-images.yaml
@@ -42,6 +42,9 @@ jobs:
          - name: reis
            containerfile: services/reis/Dockerfile
            context: services/reis/
          - name: confluence-importer
            containerfile: services/confluence-importer/Dockerfile
            context: services/confluence-importer/
    steps:
      - uses: actions/checkout@v4
      - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3
1 change: 1 addition & 0 deletions .github/workflows/publish.yaml
@@ -83,6 +83,7 @@ jobs:
          - frontend
          - backend
          - reis
          - confluence-importer
    needs:
      - upstream-workflows
      - verify-version-strings
2 changes: 1 addition & 1 deletion services/confluence-importer/Dockerfile
@@ -11,7 +11,7 @@ FROM python:3.13-slim

WORKDIR /app

COPY --from=builder --chown=app:app /app/.venv /app/.venv
COPY --from=builder /app/.venv /app/.venv
COPY confluence_importer/ confluence_importer/

COPY main.py .
74 changes: 56 additions & 18 deletions services/confluence-importer/confluence_importer/c4.py
@@ -11,6 +11,58 @@ def clear_previous_ingests() -> None:
    """
    Clears all previously ingested files from the C4 bucket.
    """
    logger.info("Starting deletion of all Confluence pages from c4", bucket_id=bucket_id)

    deletion_counter = {"success": 0, "error": 0}

    files = fetch_bucket_files_list()
    num_items = len(files)

    for index, item in enumerate(files):
        file_name = item.get("fileName")

        is_confluence_page_file = file_name.startswith("confluence_page_") and file_name.endswith(".md")

        if is_confluence_page_file:
            try:
                delete_confluence_page(item.get("id"))
            except Exception as e:
                deletion_counter["error"] += 1
                logger.error(
                    "Error deleting Confluence page from c4",
                    bucket_id=bucket_id,
                    file_name=file_name,
                    progress=f"{index + 1}/{num_items}",
                    status="error",
                    error=str(e),
                )
            else:
                deletion_counter["success"] += 1
                logger.info(
                    "Delete Confluence page in c4",
                    bucket_id=bucket_id,
                    file_name=file_name,
                    progress=f"{index + 1}/{num_items}",
                    status="success",
                )

    if deletion_counter["error"] > 0:
        logger.error(
            "Deletion of Confluence pages from c4 completed with errors! See log for more information.",
            bucket_id=bucket_id,
            deletion_counter=deletion_counter,
        )
    else:
        logger.info(
            "Deletion of Confluence pages from c4 completed.", bucket_id=bucket_id, deletion_counter=deletion_counter
        )


def delete_confluence_page(file_id):
    response = requests.delete(
        f"{c4_base_url}/api/buckets/{bucket_id}/files/{file_id}", headers={"x-api-key": config.c4_token}
    )
    # Raise on HTTP error status so the caller's try/except can count the failure;
    # requests does not raise on 4xx/5xx by itself.
    response.raise_for_status()


def fetch_bucket_files_list():
    page = 1
    batch_size = 50

@@ -21,32 +73,18 @@ def clear_previous_ingests() -> None:
        response = requests.get(f"{c4_base_url}/api/buckets/{bucket_id}/files", headers={"x-api-key": config.c4_token})

        total = response.json().get("total")
        items_in_page = response.json().get("items")

        items.extend(response.json().get("items"))
        items.extend(items_in_page)

        if page * batch_size >= total:
        if page * batch_size >= total or len(items_in_page) == 0:
            break
        else:
            page += 1

    logger.info("Full list of files in c4 fetched", bucket_id=bucket_id, num_files=total)

    for index, item in enumerate(items):
        num_items = len(items)
        file_name = item.get("fileName")

        if file_name.startswith("confluence_page_") and file_name.endswith(".md"):
            requests.delete(
                f"{c4_base_url}/api/buckets/{bucket_id}/files/{item.get('id')}", headers={"x-api-key": config.c4_token}
            )
            logger.info(
                "Delete Confluence page in c4",
                bucket_id=bucket_id,
                file_name=file_name,
                progress=f"{index + 1}/{num_items}",
                status="success",
            )
    logger.info("All Confluence pages deleted from c4", bucket_id=bucket_id)
    return items
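The new `or len(items_in_page) == 0` stop condition in `fetch_bucket_files_list()` guards against trusting the server-reported `total`: if files are deleted while the crawl is running, `page * batch_size` might never reach `total` and the loop would never terminate. A minimal sketch of the same defensive pattern, where `fetch_page` is a hypothetical stand-in for the c4 API call:

```python
def fetch_all(fetch_page, batch_size=50):
    """Collect every item from a paginated endpoint, stopping on an empty page."""
    items = []
    page = 1

    while True:
        # `fetch_page` is assumed (hypothetically) to return (items_in_page, reported_total).
        items_in_page, total = fetch_page(page)
        items.extend(items_in_page)

        # Stop on the reported total, but also on an empty page in case
        # `total` is stale and the offset arithmetic would never terminate.
        if page * batch_size >= total or len(items_in_page) == 0:
            break
        page += 1

    return items
```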


def import_confluence_page(page_id: int, page_markdown: str) -> None:
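One detail worth noting in the deletion loop of `clear_previous_ingests()` above: the `else` clause of a `try` statement runs only when the `try` body raised nothing, so the success counter can never tick for a failed deletion. A generic, runnable illustration of the pattern (not c4-specific):

```python
# try/except/else: `else` runs only when the `try` body raised nothing.
counter = {"success": 0, "error": 0}

for action in (lambda: None, lambda: 1 / 0):
    try:
        action()
    except Exception:
        counter["error"] += 1
    else:
        counter["success"] += 1

print(counter)  # {'success': 1, 'error': 1}
```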
5 changes: 3 additions & 2 deletions services/confluence-importer/confluence_importer/confluence.py
@@ -50,13 +50,14 @@ def get_pages_for_space(space_key: str) -> Generator[ConfluencePage]:
        A generator yielding ConfluencePage dataclasses containing the page information and content as HTML
    """
    crawling_done = False
    batch_size = 100
    batch_size = 100  # Don't change. See comment regarding `get_all_pages_from_space_as_generator()` below.
    offset = 0

    while not crawling_done:
        logger.debug("Fetch Pages for Confluence Space", space_key=space_key, offset=offset, limit=batch_size)

        # It seems that limit is broken in `atlassian-python-api`. It always defaults to 100? TODO figure out whats up.
        # It seems that the `limit` parameter is broken and is always 100.
        # This is fine as long as we keep our `batch_size` at 100.
        result = confluence_api.get_all_pages_from_space_as_generator(
            space_key,
            start=offset,
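The new comments encode an invariant: offsets advance by `batch_size`, so `batch_size` must equal the page size the server actually returns (apparently fixed at 100), or the crawl would skip or re-fetch pages. A hedged sketch of one way to make that explicit, with a hypothetical `fetch_batch` standing in for `confluence_api.get_all_pages_from_space_as_generator()`; advancing the offset by `len(batch)` is the more defensive variant:

```python
EFFECTIVE_PAGE_SIZE = 100  # what the server returns regardless of `limit` (assumption from the comment above)


def crawl_space(fetch_batch, batch_size=EFFECTIVE_PAGE_SIZE):
    """Yield all pages of a space via offset-based pagination."""
    assert batch_size == EFFECTIVE_PAGE_SIZE, "offsets advance by batch_size, so it must match the real page size"

    offset = 0
    while True:
        batch = list(fetch_batch(start=offset, limit=batch_size))  # hypothetical helper
        if not batch:
            return
        yield from batch
        # Advance by what was actually received rather than by batch_size,
        # so a short final page (or a changed server page size) stays safe.
        offset += len(batch)
```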
63 changes: 50 additions & 13 deletions services/confluence-importer/main.py
@@ -9,35 +9,72 @@
page_ids = config.confluence_page_ids_to_import


def main():
    logger.info("Starting synchronization Confluence to c4")
    clear_previous_ingests()

def process_confluence_spaces(page_import_counter):
    logger.info("Starting import of Confluence Spaces", num_spaces=len(space_keys))

    for space_key in space_keys:
        logger.info("Starting import of Confluence Space", space_key=space_key)
        pages = confluence.get_pages_for_space(space_key)

        for index, page in enumerate(pages, start=1):
            page_markdown = html_to_markdown(page)
            import_confluence_page(page.id, page_markdown)
            logger.info("Import Confluence page", space_key=space_key, page_id=page.id, page_count=f"{index}")
            try:
                page_markdown = html_to_markdown(page)
                import_confluence_page(page.id, page_markdown)
                page_import_counter["success"] += 1
                logger.info("Import Confluence page", space_key=space_key, page_id=page.id, page_count=f"{index}")
            except Exception as e:
                page_import_counter["error"] += 1
                logger.error(
                    "Error importing Confluence page",
                    error=str(e),
                    space_key=space_key,
                    page_id=page.id,
                    page_count=f"{index}",
                )

        logger.info("Import of Confluence Space completed", space_key=space_key)
    logger.info("Import of all Confluence Spaces completed")


def process_individual_pages(page_import_counter):
    num_pages = len(page_ids)
    logger.info("Starting import of individual Confluence pages", num_pages=num_pages)

    for index, page_id in enumerate(page_ids):
        page = confluence.get_page(page_id)
        page_markdown = html_to_markdown(page)
        import_confluence_page(page_id, page_markdown)
        print(f"Ingested individual Confluence page {index + 1}/{num_pages}.")
        logger.info("Importing Confluence page", page_id=page_id, progress=f"{index + 1}/{num_pages}")
        try:
            page = confluence.get_page(page_id)
            page_markdown = html_to_markdown(page)
            import_confluence_page(page_id, page_markdown)
            page_import_counter["success"] += 1
            logger.info("Import Confluence page", page_id=page_id, progress=f"{index + 1}/{num_pages}")
        except Exception as e:
            page_import_counter["error"] += 1
            logger.error(
                "Error importing Confluence page", error=str(e), page_id=page_id, progress=f"{index + 1}/{num_pages}"
            )

    logger.info("Import of individual Confluence pages completed")

    logger.info("Synchronization Confluence to c4 completed")

def log_final_results(page_import_counter):
    if page_import_counter["error"] > 0:
        logger.error(
            "Synchronization Confluence to c4 completed with errors! See log for more information.",
            page_import_counter=page_import_counter,
        )
    else:
        logger.info("Synchronization Confluence to c4 completed.", page_import_counter=page_import_counter)


def main():
    logger.info("Starting synchronization Confluence to c4")

    clear_previous_ingests()

    page_import_counter = {"error": 0, "success": 0}
    process_confluence_spaces(page_import_counter)
    process_individual_pages(page_import_counter)
    log_final_results(page_import_counter)


main()
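main.py runs the sync at import time via the bare `main()` call. A hedged sketch of the common alternative, assuming the module might ever be imported (for tests, say) rather than only executed as a script:

```python
# Run the sync only when executed directly, not when imported.
if __name__ == "__main__":
    main()
```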