@@ -420,7 +420,7 @@ def _impersonate_user_for_retrieval(
420
420
is_slim : bool ,
421
421
checkpoint : GoogleDriveCheckpoint ,
422
422
concurrent_drive_itr : Callable [[str ], Iterator [str ]],
423
- filtered_folder_ids : set [str ],
423
+ sorted_filtered_folder_ids : list [str ],
424
424
start : SecondsSinceUnixEpoch | None = None ,
425
425
end : SecondsSinceUnixEpoch | None = None ,
426
426
) -> Iterator [RetrievedDriveFile ]:
@@ -509,6 +509,7 @@ def _yield_from_drive(
509
509
yield from _yield_from_drive (drive_id , start )
510
510
curr_stage .stage = DriveRetrievalStage .FOLDER_FILES
511
511
resuming = False # we are starting the next stage for the first time
512
+
512
513
if curr_stage .stage == DriveRetrievalStage .FOLDER_FILES :
513
514
514
515
def _yield_from_folder_crawl (
@@ -526,16 +527,28 @@ def _yield_from_folder_crawl(
526
527
)
527
528
528
529
# resume from a checkpoint
530
+ last_processed_folder = None
529
531
if resuming :
530
532
folder_id = curr_stage .completed_until_parent_id
531
533
assert folder_id is not None , "folder id not set in checkpoint"
532
534
resume_start = curr_stage .completed_until
533
535
yield from _yield_from_folder_crawl (folder_id , resume_start )
536
+ last_processed_folder = folder_id
537
+
538
+ skipping_seen_folders = last_processed_folder is not None
539
+ for folder_id in sorted_filtered_folder_ids :
540
+ if skipping_seen_folders :
541
+ skipping_seen_folders = folder_id != last_processed_folder
542
+ continue
534
543
535
- remaining_folders = filtered_folder_ids - self ._retrieved_ids
536
- for folder_id in remaining_folders :
544
+ if folder_id in self ._retrieved_ids :
545
+ continue
546
+
547
+ curr_stage .completed_until = 0
548
+ curr_stage .completed_until_parent_id = folder_id
537
549
logger .info (f"Getting files in folder '{ folder_id } ' as '{ user_email } '" )
538
550
yield from _yield_from_folder_crawl (folder_id , start )
551
+
539
552
curr_stage .stage = DriveRetrievalStage .DONE
540
553
541
554
def _manage_service_account_retrieval (
@@ -584,11 +597,13 @@ def _manage_service_account_retrieval(
584
597
drive_ids_to_retrieve , checkpoint
585
598
)
586
599
600
+ sorted_filtered_folder_ids = sorted (folder_ids_to_retrieve )
601
+
587
602
# only process emails that we haven't already completed retrieval for
588
603
non_completed_org_emails = [
589
604
user_email
590
- for user_email , stage in checkpoint .completion_map .items ()
591
- if stage != DriveRetrievalStage .DONE
605
+ for user_email , stage_completion in checkpoint .completion_map .items ()
606
+ if stage_completion . stage != DriveRetrievalStage .DONE
592
607
]
593
608
594
609
# don't process too many emails before returning a checkpoint. This is
@@ -609,7 +624,7 @@ def _manage_service_account_retrieval(
609
624
is_slim ,
610
625
checkpoint ,
611
626
drive_id_iterator ,
612
- folder_ids_to_retrieve ,
627
+ sorted_filtered_folder_ids ,
613
628
start ,
614
629
end ,
615
630
)
0 commit comments