11from pathlib import Path , PurePosixPath , PureWindowsPath
22from collections .abc import Mapping
33from tqdm import tqdm
4+ import logging
45from .settings import config
56from .errors import DataJointError , MissingExternalFile
67from .hash import uuid_from_buffer , uuid_from_file
1011from . import s3
1112from .utils import safe_write , safe_copy
1213
# Package-level logger: key off the root package name (text before the first
# dot of __name__) so every submodule shares one logger hierarchy root.
logger = logging.getLogger(__name__.partition(".")[0])
15+
1316CACHE_SUBFOLDING = (
1417 2 ,
1518 2 ,
@@ -72,9 +75,7 @@ def definition(self):
7275
@property
def table_name(self):
    """Name of this external tracking table: the shared root prefix
    joined to this store's name with an underscore."""
    return "{}_{}".format(EXTERNAL_TABLE_ROOT, self.store)
7879
7980 @property
8081 def s3 (self ):
@@ -276,9 +277,7 @@ def upload_filepath(self, local_filepath):
276277 # the tracking entry exists, check that it's the same file as before
277278 if contents_hash != check_hash [0 ]:
278279 raise DataJointError (
279- "A different version of '{file}' has already been placed." .format (
280- file = relative_filepath
281- )
280+ f"A different version of '{ relative_filepath } ' has already been placed."
282281 )
283282 else :
284283 # upload the file and create its tracking entry
@@ -304,27 +303,43 @@ def download_filepath(self, filepath_hash):
304303 :param filepath_hash: The hash (UUID) of the relative_path
305304 :return: hash (UUID) of the contents of the downloaded file or Nones
306305 """
306+
def _need_checksum(local_filepath, expected_size):
    """Decide whether the downloaded file's contents should be checksummed.

    First verifies that the on-disk size matches the size recorded in the
    tracking table, then applies the optional ``filepath_checksum_size_limit``
    config setting: files at or above the limit skip the (expensive) checksum.

    :param local_filepath: path of the file on the local stage
    :param expected_size: file size recorded in the tracking table
    :return: True if the contents checksum should be computed, False to skip
    :raises DataJointError: if the actual size differs from expected_size
    """
    actual_size = Path(local_filepath).stat().st_size
    if actual_size != expected_size:
        # this should never happen without outside interference
        raise DataJointError(
            f"'{local_filepath}' downloaded but size did not match."
        )
    limit = config.get("filepath_checksum_size_limit")
    return limit is None or actual_size < limit
316+
307317 if filepath_hash is not None :
308- relative_filepath , contents_hash = ( self & { "hash" : filepath_hash }). fetch1 (
309- "filepath" , "contents_hash"
310- )
318+ relative_filepath , contents_hash , size = (
319+ self & { "hash" : filepath_hash }
320+ ). fetch1 ( "filepath" , "contents_hash" , "size" )
311321 external_path = self ._make_external_filepath (relative_filepath )
312322 local_filepath = Path (self .spec ["stage" ]).absolute () / relative_filepath
313- file_exists = (
314- Path (local_filepath ).is_file ()
315- and uuid_from_file (local_filepath ) == contents_hash
323+
324+ file_exists = Path (local_filepath ).is_file () and (
325+ not _need_checksum (local_filepath , size )
326+ or uuid_from_file (local_filepath ) == contents_hash
316327 )
328+
317329 if not file_exists :
318330 self ._download_file (external_path , local_filepath )
319- checksum = uuid_from_file (local_filepath )
320331 if (
321- checksum != contents_hash
322- ): # this should never happen without outside interference
332+ _need_checksum (local_filepath , size )
333+ and uuid_from_file (local_filepath ) != contents_hash
334+ ):
335+ # this should never happen without outside interference
323336 raise DataJointError (
324- "'{file}' downloaded but did not pass checksum'" .format (
325- file = local_filepath
326- )
337+ f"'{ local_filepath } ' downloaded but did not pass checksum."
327338 )
339+ if not _need_checksum (local_filepath , size ):
340+ logger .warning (
341+ f"Skipped checksum for file with hash: { contents_hash } , and path: { local_filepath } "
342+ )
328343 return str (local_filepath ), contents_hash
329344
330345 # --- UTILITIES ---
@@ -402,7 +417,7 @@ def delete(
402417 delete_external_files = None ,
403418 limit = None ,
404419 display_progress = True ,
405- errors_as_string = True
420+ errors_as_string = True ,
406421 ):
407422 """
408423
0 commit comments