@@ -708,11 +708,29 @@ class WikidataRelation(Base, TimestampMixin, SoftDeleteMixin, UpsertMixin):
     )


+class DownloadAlreadyCompleteError(Exception):
+    """Raised when attempting to download a dump that's already been downloaded."""
+
+    pass
+
+
+class DownloadInProgressError(Exception):
+    """Raised when another download is already in progress for this dump."""
+
+    def __init__(self, message: str, hours_elapsed: float):
+        super().__init__(message)
+        self.hours_elapsed = hours_elapsed
+
+
 class WikidataDump(Base, TimestampMixin):
     """WikidataDump entity for tracking dump download and processing stages."""

     __tablename__ = "wikidata_dumps"

+    # Default stale threshold: downloads taking longer than 24 hours are considered failed
+    # (typical download time is ~10 hours)
+    STALE_THRESHOLD_HOURS = 24
+
     id = Column(
         UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()")
     )
@@ -734,6 +752,101 @@ class WikidataDump(Base, TimestampMixin):
         DateTime, nullable=True
     )  # When politicians import completed

+    @classmethod
+    def prepare_for_download(
+        cls,
+        session: Session,
+        url: str,
+        last_modified: datetime,
+        force: bool = False,
+    ) -> "WikidataDump":
+        """Prepare a WikidataDump record for downloading.
+
+        Handles checking for existing downloads (completed or in-progress),
+        stale download detection, and record cleanup.
+
+        Args:
+            session: Database session
+            url: URL of the dump file
+            last_modified: Last-Modified timestamp from the server
+            force: If True, bypass existing download checks
+
+        Returns:
+            WikidataDump record ready for download
+
+        Raises:
+            DownloadAlreadyCompleteError: If dump was already downloaded (and not force)
+            DownloadInProgressError: If another download is in progress (and not force/stale)
+        """
+        from datetime import timedelta, timezone
+
+        existing_dump = (
+            session.query(cls)
+            .filter(cls.url == url)
+            .filter(cls.last_modified == last_modified)
+            .first()
+        )
+
+        if existing_dump and not force:
+            if existing_dump.downloaded_at:
+                raise DownloadAlreadyCompleteError(
+                    f"Dump from {last_modified.strftime('%Y-%m-%d %H:%M:%S')} UTC "
+                    "already downloaded"
+                )
+            else:
+                # Check if the download is stale
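+                # created_at is assumed to be stored as a naive UTC timestamp,
+                # so attach tzinfo before comparing against an aware datetime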
+                created_at_utc = existing_dump.created_at.replace(tzinfo=timezone.utc)
+                age = datetime.now(timezone.utc) - created_at_utc
+                hours_elapsed = age.total_seconds() / 3600
+
+                if age > timedelta(hours=cls.STALE_THRESHOLD_HOURS):
+                    # Stale download - clean up and allow retry
+                    session.delete(existing_dump)
+                    session.flush()
+                    existing_dump = None
+                else:
+                    raise DownloadInProgressError(
+                        f"Download for dump from {last_modified.strftime('%Y-%m-%d %H:%M:%S')} UTC "
+                        "already in progress",
+                        hours_elapsed=hours_elapsed,
+                    )
+        elif existing_dump and force:
+            # Force mode - delete existing record
+            session.delete(existing_dump)
+            session.flush()
+            existing_dump = None
+
+        # Create new dump record
+        new_dump = cls(url=url, last_modified=last_modified)
+        session.add(new_dump)
+        session.flush()
+
+        return new_dump
+
+    def mark_downloaded(self, session: Session) -> None:
+        """Mark this dump as successfully downloaded.
+
+        Args:
+            session: Database session
+        """
+        from datetime import timezone
+
+        self.downloaded_at = datetime.now(timezone.utc)
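+        # merge() copies this instance's state into the session, covering the
+        # case where the instance is detached from the given session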
+        session.merge(self)
+        session.flush()
+
+    def cleanup_failed_download(self, session: Session) -> None:
+        """Clean up this dump record after a failed download.
+
+        Removes the record to allow future retry attempts.
+
+        Args:
+            session: Database session
+        """
+        session.merge(self)
+        session.delete(self)
+        session.flush()
+

 class CurrentImportEntity(Base):
     """Temporary tracking table for entities seen during current import."""
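
A minimal usage sketch of the new download-lifecycle helpers (not part of this diff); the session handling, dump_url, last_modified, and download_file names are hypothetical placeholders:

def download_dump(session, dump_url, last_modified):
    try:
        dump = WikidataDump.prepare_for_download(
            session, url=dump_url, last_modified=last_modified
        )
    except DownloadAlreadyCompleteError:
        return  # nothing to do for this dump
    except DownloadInProgressError as exc:
        print(f"Download already running for {exc.hours_elapsed:.1f} hours, skipping")
        return

    try:
        download_file(dump.url)  # hypothetical helper that streams the dump to disk
        dump.mark_downloaded(session)
        session.commit()
    except Exception:
        dump.cleanup_failed_download(session)
        session.commit()
        raise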