1
+ import copy
1
2
from datetime import datetime
2
3
from datetime import timedelta
3
4
from datetime import timezone
4
5
from typing import Any
5
6
from urllib .parse import quote
6
7
7
8
from requests .exceptions import HTTPError
9
+ from typing_extensions import override
8
10
9
11
from onyx .configs .app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
10
12
from onyx .configs .app_configs import CONFLUENCE_TIMEZONE_OFFSET
22
24
from onyx .connectors .exceptions import CredentialExpiredError
23
25
from onyx .connectors .exceptions import InsufficientPermissionsError
24
26
from onyx .connectors .exceptions import UnexpectedValidationError
27
+ from onyx .connectors .interfaces import CheckpointedConnector
28
+ from onyx .connectors .interfaces import CheckpointOutput
29
+ from onyx .connectors .interfaces import ConnectorCheckpoint
30
+ from onyx .connectors .interfaces import ConnectorFailure
25
31
from onyx .connectors .interfaces import CredentialsConnector
26
32
from onyx .connectors .interfaces import CredentialsProviderInterface
27
- from onyx .connectors .interfaces import GenerateDocumentsOutput
28
33
from onyx .connectors .interfaces import GenerateSlimDocumentOutput
29
- from onyx .connectors .interfaces import LoadConnector
30
- from onyx .connectors .interfaces import PollConnector
31
34
from onyx .connectors .interfaces import SecondsSinceUnixEpoch
32
35
from onyx .connectors .interfaces import SlimConnector
33
36
from onyx .connectors .models import BasicExpertInfo
34
37
from onyx .connectors .models import ConnectorMissingCredentialError
35
38
from onyx .connectors .models import Document
39
+ from onyx .connectors .models import DocumentFailure
36
40
from onyx .connectors .models import ImageSection
37
41
from onyx .connectors .models import SlimDocument
38
42
from onyx .connectors .models import TextSection
68
72
ONE_HOUR = 3600
69
73
70
74
75
+ def _should_propagate_error (e : Exception ) -> bool :
76
+ return "field 'updated' is invalid" in str (e )
77
+
78
+
79
class ConfluenceCheckpoint(ConnectorCheckpoint):
    """
    Checkpoint state carried between Confluence indexing runs.
    """

    # Timestamp (seconds since the Unix epoch) of the most recent page that was
    # converted successfully. A value of 0 is treated as "not set": the page
    # query then uses the caller-supplied start time instead.
    last_updated: SecondsSinceUnixEpoch
81
+
82
+
71
83
class ConfluenceConnector (
72
- LoadConnector ,
73
- PollConnector ,
84
+ CheckpointedConnector [ConfluenceCheckpoint ],
74
85
SlimConnector ,
75
86
CredentialsConnector ,
76
87
):
@@ -211,6 +222,8 @@ def _construct_page_query(
211
222
"%Y-%m-%d %H:%M"
212
223
)
213
224
page_query += f" and lastmodified <= '{ formatted_end_time } '"
225
+
226
+ page_query += " order by lastmodified asc"
214
227
return page_query
215
228
216
229
def _construct_attachment_query (self , confluence_page_id : str ) -> str :
@@ -236,11 +249,14 @@ def _get_comment_string_for_page_id(self, page_id: str) -> str:
236
249
)
237
250
return comment_string
238
251
239
- def _convert_page_to_document (self , page : dict [str , Any ]) -> Document | None :
252
+ def _convert_page_to_document (
253
+ self , page : dict [str , Any ]
254
+ ) -> Document | ConnectorFailure :
240
255
"""
241
256
Converts a Confluence page to a Document object.
242
257
Includes the page content, comments, and attachments.
243
258
"""
259
+ page_id = page_url = ""
244
260
try :
245
261
# Extract basic page information
246
262
page_id = page ["id" ]
@@ -336,25 +352,103 @@ def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
336
352
)
337
353
except Exception as e :
338
354
logger .error (f"Error converting page { page .get ('id' , 'unknown' )} : { e } " )
339
- if not self . continue_on_failure :
355
+ if _should_propagate_error ( e ) :
340
356
raise
341
- return None
357
+ return ConnectorFailure (
358
+ failed_document = DocumentFailure (
359
+ document_id = page_id ,
360
+ document_link = page_url ,
361
+ ),
362
+ failure_message = f"Error converting page { page .get ('id' , 'unknown' )} : { e } " ,
363
+ exception = e ,
364
+ )
365
+
366
def _fetch_page_attachments(
    self, page: dict[str, Any], doc: Document
) -> Document | ConnectorFailure:
    """
    Adds one section to `doc` for each usable attachment of `page`.

    Attachments are retrieved with the CQL built by
    `_construct_attachment_query`. Every attachment accepted by
    `validate_attachment_filetype` is passed to `convert_attachment_to_content`:
    extracted text is appended as a TextSection, otherwise a stored image is
    appended as an ImageSection. Both link to the attachment's URL.

    Returns:
        `doc` (mutated in place) once all attachments have been visited, or a
        ConnectorFailure for the first attachment whose conversion raised
        while `continue_on_failure` is disabled.

    Raises:
        The original conversion exception when `continue_on_failure` is
        disabled and `_should_propagate_error` accepts it, so that the caller
        can handle it.
    """
    page_id = page["id"]
    attachment_query = self._construct_attachment_query(page_id)

    for attachment in self.confluence_client.paginated_cql_retrieval(
        cql=attachment_query,
        expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
    ):
        if not validate_attachment_filetype(attachment):
            logger.info(f"Skipping attachment: {attachment['title']}")
            continue

        logger.info(f"Processing attachment: {attachment['title']}")

        # Built before the try block so the failure report below can link to
        # the attachment as well.
        object_url = build_confluence_document_id(
            self.wiki_base, attachment["_links"]["webui"], self.is_cloud
        )
        try:
            # Attempt to get textual content or image summarization:
            response = convert_attachment_to_content(
                confluence_client=self.confluence_client,
                attachment=attachment,
                page_id=page_id,
                allow_images=self.allow_images,
            )
            if response is None:
                continue

            content_text, file_storage_name = response

            if content_text:
                doc.sections.append(
                    TextSection(
                        text=content_text,
                        link=object_url,
                    )
                )
            elif file_storage_name:
                doc.sections.append(
                    ImageSection(
                        link=object_url,
                        image_file_name=file_storage_name,
                    )
                )
        except Exception as e:
            logger.error(
                f"Failed to extract/summarize attachment {attachment['title']}",
                exc_info=e,
            )
            # NOTE(review): with continue_on_failure enabled the attachment is
            # only logged and the loop moves on. Nesting reconstructed from a
            # flattened diff; confirm against the repository.
            if not self.continue_on_failure:
                if _should_propagate_error(e):
                    raise
                # TODO: should we remove continue_on_failure entirely now that we have checkpointing?
                return ConnectorFailure(
                    failed_document=DocumentFailure(
                        document_id=doc.id,
                        document_link=object_url,
                    ),
                    failure_message=f"Failed to extract/summarize attachment {attachment['title']} for doc {doc.id}",
                    exception=e,
                )
    return doc
342
432
343
433
def _fetch_document_batches (
344
434
self ,
435
+ checkpoint : ConfluenceCheckpoint ,
345
436
start : SecondsSinceUnixEpoch | None = None ,
346
437
end : SecondsSinceUnixEpoch | None = None ,
347
- ) -> GenerateDocumentsOutput :
438
+ ) -> CheckpointOutput [ ConfluenceCheckpoint ] :
348
439
"""
349
440
Yields batches of Documents. For each page:
350
441
- Create a Document with 1 Section for the page text/comments
351
442
- Then fetch attachments. For each attachment:
352
443
- Attempt to convert it with convert_attachment_to_content(...)
353
444
- If successful, create a new Section with the extracted text or summary.
354
445
"""
355
- doc_batch : list [ Document ] = []
446
+ doc_count = 0
356
447
357
- page_query = self ._construct_page_query (start , end )
448
+ checkpoint = copy .deepcopy (checkpoint )
449
+
450
+ # use "start" when last_updated is 0
451
+ page_query = self ._construct_page_query (checkpoint .last_updated or start , end )
358
452
logger .debug (f"page_query: { page_query } " )
359
453
360
454
for page in self .confluence_client .paginated_cql_retrieval (
@@ -363,94 +457,61 @@ def _fetch_document_batches(
363
457
limit = self .batch_size ,
364
458
):
365
459
# Build doc from page
366
- doc = self ._convert_page_to_document (page )
367
- if not doc :
368
- continue
369
-
370
- # Now get attachments for that page:
371
- attachment_query = self ._construct_attachment_query (page ["id" ])
372
- # We'll use the page's XML to provide context if we summarize an image
373
- page .get ("body" , {}).get ("storage" , {}).get ("value" , "" )
374
-
375
- for attachment in self .confluence_client .paginated_cql_retrieval (
376
- cql = attachment_query ,
377
- expand = "," .join (_ATTACHMENT_EXPANSION_FIELDS ),
378
- ):
379
- attachment ["metadata" ].get ("mediaType" , "" )
380
- if not validate_attachment_filetype (
381
- attachment ,
382
- ):
383
- logger .info (f"Skipping attachment: { attachment ['title' ]} " )
384
- continue
460
+ doc_or_failure = self ._convert_page_to_document (page )
385
461
386
- logger .info (f"Processing attachment: { attachment ['title' ]} " )
462
+ if isinstance (doc_or_failure , ConnectorFailure ):
463
+ yield doc_or_failure
464
+ continue
387
465
388
- # Attempt to get textual content or image summarization:
389
- try :
390
- response = convert_attachment_to_content (
391
- confluence_client = self .confluence_client ,
392
- attachment = attachment ,
393
- page_id = page ["id" ],
394
- allow_images = self .allow_images ,
395
- )
396
- if response is None :
397
- continue
466
+ checkpoint .last_updated = datetime_from_string (
467
+ page ["version" ]["when" ]
468
+ ).timestamp ()
398
469
399
- content_text , file_storage_name = response
400
- object_url = build_confluence_document_id (
401
- self .wiki_base , attachment ["_links" ]["webui" ], self .is_cloud
402
- )
403
- if content_text :
404
- doc .sections .append (
405
- TextSection (
406
- text = content_text ,
407
- link = object_url ,
408
- )
409
- )
410
- elif file_storage_name :
411
- doc .sections .append (
412
- ImageSection (
413
- link = object_url ,
414
- image_file_name = file_storage_name ,
415
- )
416
- )
417
- except Exception as e :
418
- logger .error (
419
- f"Failed to extract/summarize attachment { attachment ['title' ]} " ,
420
- exc_info = e ,
421
- )
422
- if not self .continue_on_failure :
423
- raise
470
+ # Now get attachments for that page:
471
+ doc_or_failure = self ._fetch_page_attachments (page , doc_or_failure )
424
472
425
- doc_batch .append (doc )
473
+ if isinstance (doc_or_failure , ConnectorFailure ):
474
+ yield doc_or_failure
475
+ continue
426
476
427
- if len ( doc_batch ) >= self . batch_size :
428
- yield doc_batch
429
- doc_batch = []
477
+ # yield completed document
478
+ doc_count += 1
479
+ yield doc_or_failure
430
480
431
- if doc_batch :
432
- yield doc_batch
481
+ # create checkpoint after enough documents have been processed
482
+ if doc_count >= self .batch_size :
483
+ return checkpoint
433
484
434
- def load_from_state ( self ) -> GenerateDocumentsOutput :
435
- return self . _fetch_document_batches ()
485
+ checkpoint . has_more = False
486
+ return checkpoint
436
487
437
@override
def load_from_checkpoint(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: ConfluenceCheckpoint,
) -> CheckpointOutput[ConfluenceCheckpoint]:
    """
    Yields documents and failures for the given time window, resuming from
    `checkpoint`, and returns the updated checkpoint as the generator's
    return value.

    If Confluence rejects the query with "field 'updated' is invalid", the
    fetch is retried once with `start` moved back by one hour.

    Raises:
        Any exception from the fetch that is not the "invalid 'updated'"
        error, and any exception raised by the retry itself.
    """
    try:
        # _fetch_document_batches is a generator function, so calling it runs
        # none of its body. Delegating with `yield from` keeps the iteration
        # inside this try block; returning the bare generator would let every
        # error surface in the caller, beyond the reach of the handler below.
        return (yield from self._fetch_document_batches(checkpoint, start, end))
    except Exception as e:
        if _should_propagate_error(e) and start is not None:
            logger.warning(
                "Confluence says we provided an invalid 'updated' field. This may indicate "
                "a real issue, but can also appear during edge cases like daylight "
                f"savings time changes. Retrying with a 1 hour offset. Error: {e}"
            )
            # The retry is handed the same, unmodified checkpoint
            # (_fetch_document_batches works on a deep copy), so anything
            # yielded before the error would be yielded again.
            return (
                yield from self._fetch_document_batches(
                    checkpoint, start - ONE_HOUR, end
                )
            )
        raise
453
506
507
@override
def build_dummy_checkpoint(self) -> ConfluenceCheckpoint:
    """
    Builds a starting checkpoint: `last_updated` of 0 with `has_more` set.
    """
    # A last_updated of 0 is falsy, so the page query falls back to the
    # caller-supplied start time.
    initial_checkpoint = ConfluenceCheckpoint(has_more=True, last_updated=0)
    return initial_checkpoint
510
+
511
@override
def validate_checkpoint_json(self, checkpoint_json: str) -> ConfluenceCheckpoint:
    """
    Parses a JSON string into a ConfluenceCheckpoint using
    `ConfluenceCheckpoint.model_validate_json`.
    """
    parsed_checkpoint = ConfluenceCheckpoint.model_validate_json(checkpoint_json)
    return parsed_checkpoint
514
+
454
515
def retrieve_all_slim_documents (
455
516
self ,
456
517
start : SecondsSinceUnixEpoch | None = None ,
0 commit comments