onyx-dot-app
diff --git a/‎backend/onyx/background/indexing/checkpointing_utils.py
+2-2 b/‎backend/onyx/background/indexing/checkpointing_utils.py
+2-2
diff --git a/‎backend/onyx/connectors/confluence/connector.py
+142-81 b/‎backend/onyx/connectors/confluence/connector.py
+142-81
diff --git a/‎backend/onyx/connectors/connector_runner.py
+3-3 b/‎backend/onyx/connectors/connector_runner.py
+3-3
diff --git a/‎backend/onyx/connectors/factory.py
+2-2 b/‎backend/onyx/connectors/factory.py
+2-2
diff --git a/‎backend/onyx/connectors/github/connector.py
+2-2 b/‎backend/onyx/connectors/github/connector.py
+2-2
@@ -7,7 +7,7 @@
 
 from onyx.configs.constants import FileOrigin
 from onyx.connectors.interfaces import BaseConnector
-from onyx.connectors.interfaces import CheckpointConnector
+from onyx.connectors.interfaces import CheckpointedConnector
 from onyx.connectors.models import ConnectorCheckpoint
 from onyx.db.engine import get_db_current_time
 from onyx.db.index_attempt import get_index_attempt
@@ -61,7 +61,7 @@ def load_checkpoint(
     try:
         checkpoint_io = file_store.read_file(checkpoint_pointer, mode="rb")
         checkpoint_data = checkpoint_io.read().decode("utf-8")
-        if isinstance(connector, CheckpointConnector):
+        if isinstance(connector, CheckpointedConnector):
             return connector.validate_checkpoint_json(checkpoint_data)
         return ConnectorCheckpoint.model_validate_json(checkpoint_data)
     except RuntimeError:
 
@@ -1,10 +1,12 @@
+import copy
 from datetime import datetime
 from datetime import timedelta
 from datetime import timezone
 from typing import Any
 from urllib.parse import quote
 
 from requests.exceptions import HTTPError
+from typing_extensions import override
 
 from onyx.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
 from onyx.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
@@ -22,17 +24,19 @@
 from onyx.connectors.exceptions import CredentialExpiredError
 from onyx.connectors.exceptions import InsufficientPermissionsError
 from onyx.connectors.exceptions import UnexpectedValidationError
+from onyx.connectors.interfaces import CheckpointedConnector
+from onyx.connectors.interfaces import CheckpointOutput
+from onyx.connectors.interfaces import ConnectorCheckpoint
+from onyx.connectors.interfaces import ConnectorFailure
 from onyx.connectors.interfaces import CredentialsConnector
 from onyx.connectors.interfaces import CredentialsProviderInterface
-from onyx.connectors.interfaces import GenerateDocumentsOutput
 from onyx.connectors.interfaces import GenerateSlimDocumentOutput
-from onyx.connectors.interfaces import LoadConnector
-from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.interfaces import SlimConnector
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
+from onyx.connectors.models import DocumentFailure
 from onyx.connectors.models import ImageSection
 from onyx.connectors.models import SlimDocument
 from onyx.connectors.models import TextSection
@@ -68,9 +72,16 @@
 ONE_HOUR = 3600
 
 
+def _should_propagate_error(e: Exception) -> bool:
+    return "field 'updated' is invalid" in str(e)
+
+
+class ConfluenceCheckpoint(ConnectorCheckpoint):
+    last_updated: SecondsSinceUnixEpoch
+
+
 class ConfluenceConnector(
-    LoadConnector,
-    PollConnector,
+    CheckpointedConnector[ConfluenceCheckpoint],
     SlimConnector,
     CredentialsConnector,
 ):
@@ -211,6 +222,8 @@ def _construct_page_query(
                 "%Y-%m-%d %H:%M"
             )
             page_query += f" and lastmodified <= '{formatted_end_time}'"
+
+        page_query += " order by lastmodified asc"
         return page_query
 
     def _construct_attachment_query(self, confluence_page_id: str) -> str:
@@ -236,11 +249,14 @@ def _get_comment_string_for_page_id(self, page_id: str) -> str:
             )
         return comment_string
 
-    def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
+    def _convert_page_to_document(
+        self, page: dict[str, Any]
+    ) -> Document | ConnectorFailure:
         """
         Converts a Confluence page to a Document object.
         Includes the page content, comments, and attachments.
         """
+        page_id = page_url = ""
         try:
             # Extract basic page information
             page_id = page["id"]
@@ -336,25 +352,103 @@ def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
             )
         except Exception as e:
             logger.error(f"Error converting page {page.get('id', 'unknown')}: {e}")
-            if not self.continue_on_failure:
+            if _should_propagate_error(e):
                 raise
-            return None
+            return ConnectorFailure(
+                failed_document=DocumentFailure(
+                    document_id=page_id,
+                    document_link=page_url,
+                ),
+                failure_message=f"Error converting page {page.get('id', 'unknown')}: {e}",
+                exception=e,
+            )
+
+    def _fetch_page_attachments(
+        self, page: dict[str, Any], doc: Document
+    ) -> Document | ConnectorFailure:
+        attachment_query = self._construct_attachment_query(page["id"])
+
+        for attachment in self.confluence_client.paginated_cql_retrieval(
+            cql=attachment_query,
+            expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
+        ):
+            attachment["metadata"].get("mediaType", "")
+            if not validate_attachment_filetype(
+                attachment,
+            ):
+                logger.info(f"Skipping attachment: {attachment['title']}")
+                continue
+
+            logger.info(f"Processing attachment: {attachment['title']}")
+
+            # Attempt to get textual content or image summarization:
+            object_url = build_confluence_document_id(
+                self.wiki_base, attachment["_links"]["webui"], self.is_cloud
+            )
+            try:
+                response = convert_attachment_to_content(
+                    confluence_client=self.confluence_client,
+                    attachment=attachment,
+                    page_id=page["id"],
+                    allow_images=self.allow_images,
+                )
+                if response is None:
+                    continue
+
+                content_text, file_storage_name = response
+
+                if content_text:
+                    doc.sections.append(
+                        TextSection(
+                            text=content_text,
+                            link=object_url,
+                        )
+                    )
+                elif file_storage_name:
+                    doc.sections.append(
+                        ImageSection(
+                            link=object_url,
+                            image_file_name=file_storage_name,
+                        )
+                    )
+            except Exception as e:
+                logger.error(
+                    f"Failed to extract/summarize attachment {attachment['title']}",
+                    exc_info=e,
+                )
+                if not self.continue_on_failure:
+                    if _should_propagate_error(e):
+                        raise
+                    # TODO: should we remove continue_on_failure entirely now that we have checkpointing?
+                    return ConnectorFailure(
+                        failed_document=DocumentFailure(
+                            document_id=doc.id,
+                            document_link=object_url,
+                        ),
+                        failure_message=f"Failed to extract/summarize attachment {attachment['title']} for doc {doc.id}",
+                        exception=e,
+                    )
+        return doc
 
     def _fetch_document_batches(
         self,
+        checkpoint: ConfluenceCheckpoint,
         start: SecondsSinceUnixEpoch | None = None,
         end: SecondsSinceUnixEpoch | None = None,
-    ) -> GenerateDocumentsOutput:
+    ) -> CheckpointOutput[ConfluenceCheckpoint]:
         """
         Yields batches of Documents. For each page:
          - Create a Document with 1 Section for the page text/comments
          - Then fetch attachments. For each attachment:
              - Attempt to convert it with convert_attachment_to_content(...)
              - If successful, create a new Section with the extracted text or summary.
         """
-        doc_batch: list[Document] = []
+        doc_count = 0
 
-        page_query = self._construct_page_query(start, end)
+        checkpoint = copy.deepcopy(checkpoint)
+
+        # use "start" when last_updated is 0
+        page_query = self._construct_page_query(checkpoint.last_updated or start, end)
         logger.debug(f"page_query: {page_query}")
 
         for page in self.confluence_client.paginated_cql_retrieval(
@@ -363,94 +457,61 @@ def _fetch_document_batches(
             limit=self.batch_size,
         ):
             # Build doc from page
-            doc = self._convert_page_to_document(page)
-            if not doc:
-                continue
-
-            # Now get attachments for that page:
-            attachment_query = self._construct_attachment_query(page["id"])
-            # We'll use the page's XML to provide context if we summarize an image
-            page.get("body", {}).get("storage", {}).get("value", "")
-
-            for attachment in self.confluence_client.paginated_cql_retrieval(
-                cql=attachment_query,
-                expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
-            ):
-                attachment["metadata"].get("mediaType", "")
-                if not validate_attachment_filetype(
-                    attachment,
-                ):
-                    logger.info(f"Skipping attachment: {attachment['title']}")
-                    continue
+            doc_or_failure = self._convert_page_to_document(page)
 
-                logger.info(f"Processing attachment: {attachment['title']}")
+            if isinstance(doc_or_failure, ConnectorFailure):
+                yield doc_or_failure
+                continue
 
-                # Attempt to get textual content or image summarization:
-                try:
-                    response = convert_attachment_to_content(
-                        confluence_client=self.confluence_client,
-                        attachment=attachment,
-                        page_id=page["id"],
-                        allow_images=self.allow_images,
-                    )
-                    if response is None:
-                        continue
+            checkpoint.last_updated = datetime_from_string(
+                page["version"]["when"]
+            ).timestamp()
 
-                    content_text, file_storage_name = response
-                    object_url = build_confluence_document_id(
-                        self.wiki_base, attachment["_links"]["webui"], self.is_cloud
-                    )
-                    if content_text:
-                        doc.sections.append(
-                            TextSection(
-                                text=content_text,
-                                link=object_url,
-                            )
-                        )
-                    elif file_storage_name:
-                        doc.sections.append(
-                            ImageSection(
-                                link=object_url,
-                                image_file_name=file_storage_name,
-                            )
-                        )
-                except Exception as e:
-                    logger.error(
-                        f"Failed to extract/summarize attachment {attachment['title']}",
-                        exc_info=e,
-                    )
-                    if not self.continue_on_failure:
-                        raise
+            # Now get attachments for that page:
+            doc_or_failure = self._fetch_page_attachments(page, doc_or_failure)
 
-            doc_batch.append(doc)
+            if isinstance(doc_or_failure, ConnectorFailure):
+                yield doc_or_failure
+                continue
 
-            if len(doc_batch) >= self.batch_size:
-                yield doc_batch
-                doc_batch = []
+            # yield completed document
+            doc_count += 1
+            yield doc_or_failure
 
-        if doc_batch:
-            yield doc_batch
+            # create checkpoint after enough documents have been processed
+            if doc_count >= self.batch_size:
+                return checkpoint
 
-    def load_from_state(self) -> GenerateDocumentsOutput:
-        return self._fetch_document_batches()
+        checkpoint.has_more = False
+        return checkpoint
 
-    def poll_source(
+    @override
+    def load_from_checkpoint(
         self,
-        start: SecondsSinceUnixEpoch | None = None,
-        end: SecondsSinceUnixEpoch | None = None,
-    ) -> GenerateDocumentsOutput:
+        start: SecondsSinceUnixEpoch,
+        end: SecondsSinceUnixEpoch,
+        checkpoint: ConfluenceCheckpoint,
+    ) -> CheckpointOutput[ConfluenceCheckpoint]:
         try:
-            return self._fetch_document_batches(start, end)
+            return self._fetch_document_batches(checkpoint, start, end)
         except Exception as e:
-            if "field 'updated' is invalid" in str(e) and start is not None:
+            if _should_propagate_error(e) and start is not None:
                 logger.warning(
                     "Confluence says we provided an invalid 'updated' field. This may indicate"
                     "a real issue, but can also appear during edge cases like daylight"
                     f"savings time changes. Retrying with a 1 hour offset. Error: {e}"
                 )
-                return self._fetch_document_batches(start - ONE_HOUR, end)
+                return self._fetch_document_batches(checkpoint, start - ONE_HOUR, end)
             raise
 
+    @override
+    def build_dummy_checkpoint(self) -> ConfluenceCheckpoint:
+        return ConfluenceCheckpoint(last_updated=0, has_more=True)
+
+    @override
+    def validate_checkpoint_json(self, checkpoint_json: str) -> ConfluenceCheckpoint:
+        return ConfluenceCheckpoint.model_validate_json(checkpoint_json)
+
     def retrieve_all_slim_documents(
         self,
         start: SecondsSinceUnixEpoch | None = None,
 
@@ -6,7 +6,7 @@
 from typing import TypeVar
 
 from onyx.connectors.interfaces import BaseConnector
-from onyx.connectors.interfaces import CheckpointConnector
+from onyx.connectors.interfaces import CheckpointedConnector
 from onyx.connectors.interfaces import CheckpointOutput
 from onyx.connectors.interfaces import LoadConnector
 from onyx.connectors.interfaces import PollConnector
@@ -95,9 +95,9 @@ def run(self, checkpoint: CT) -> Generator[
     ]:
         """Adds additional exception logging to the connector."""
         try:
-            if isinstance(self.connector, CheckpointConnector):
+            if isinstance(self.connector, CheckpointedConnector):
                 if self.time_range is None:
-                    raise ValueError("time_range is required for CheckpointConnector")
+                    raise ValueError("time_range is required for CheckpointedConnector")
 
                 start = time.monotonic()
                 checkpoint_connector_generator = self.connector.load_from_checkpoint(
 
@@ -34,7 +34,7 @@
 from onyx.connectors.highspot.connector import HighspotConnector
 from onyx.connectors.hubspot.connector import HubSpotConnector
 from onyx.connectors.interfaces import BaseConnector
-from onyx.connectors.interfaces import CheckpointConnector
+from onyx.connectors.interfaces import CheckpointedConnector
 from onyx.connectors.interfaces import CredentialsConnector
 from onyx.connectors.interfaces import EventConnector
 from onyx.connectors.interfaces import LoadConnector
@@ -148,7 +148,7 @@ def identify_connector_class(
                 # all connectors should be checkpoint connectors
                 and (
                     not issubclass(connector, PollConnector)
-                    and not issubclass(connector, CheckpointConnector)
+                    and not issubclass(connector, CheckpointedConnector)
                 )
             ),
             (
 
@@ -25,7 +25,7 @@
 from onyx.connectors.exceptions import CredentialExpiredError
 from onyx.connectors.exceptions import InsufficientPermissionsError
 from onyx.connectors.exceptions import UnexpectedValidationError
-from onyx.connectors.interfaces import CheckpointConnector
+from onyx.connectors.interfaces import CheckpointedConnector
 from onyx.connectors.interfaces import CheckpointOutput
 from onyx.connectors.interfaces import ConnectorCheckpoint
 from onyx.connectors.interfaces import ConnectorFailure
@@ -143,7 +143,7 @@ class GithubConnectorCheckpoint(ConnectorCheckpoint):
     cached_repo: SerializedRepository | None = None
 
 
-class GithubConnector(CheckpointConnector[GithubConnectorCheckpoint]):
+class GithubConnector(CheckpointedConnector[GithubConnectorCheckpoint]):
     def __init__(
         self,
         repo_owner: str,