11import os
2- from tqdm import tqdm
2+ import itertools
33from .settings import config
44from .errors import DataJointError
55from .hash import long_hash
6- from .blob import pack , unpack
76from .table import Table
87from .declare import STORE_HASH_LENGTH , HASH_DATA_TYPE
9- from .s3 import Folder as S3Folder
8+ from . import s3
109from .utils import safe_write
1110
1211
def subfold(name, folds):
    """
    Split the leading characters of `name` into lowercase subfolder names.

    Each entry of `folds` gives the number of characters consumed for the next subfolder,
    e.g. subfold('abcdefg', (2, 3)) --> ('ab', 'cde')

    :param name: string to split, typically a blob hash
    :param folds: sequence of segment lengths; a falsy value yields an empty tuple
    :return: tuple of lowercase segments, one per entry in `folds`
    """
    segments = []
    remainder = name
    for width in folds or ():
        segments.append(remainder[:width].lower())
        remainder = remainder[width:]
    return tuple(segments)
17+
18+
1319class ExternalTable (Table ):
1420 """
1521 The table tracking externally stored objects.
@@ -42,15 +48,15 @@ def definition(self):
def table_name(self):
    """
    :return: the fixed name of the table tracking externally stored objects
    """
    name = '~external'
    return name
4450
45- def put (self , store , obj ):
51+ def put (self , store , blob ):
4652 """
4753 put an object in external store
4854 """
49- spec = self . _get_store_spec (store )
50- blob = pack ( obj )
51- blob_hash = long_hash (blob ) + store [ len ( 'external-' ):]
55+ store = '' . join (store . split ( '-' )[ 1 :] )
56+ spec = config . get_store_spec ( store )
57+ blob_hash = long_hash (blob ) + store
5258 if spec ['protocol' ] == 'file' :
53- folder = os .path .join (spec ['location' ], self .database )
59+ folder = os .path .join (spec ['location' ], self .database , * subfold ( blob_hash , spec [ 'subfolding' ]) )
5460 full_path = os .path .join (folder , blob_hash )
5561 if not os .path .isfile (full_path ):
5662 try :
@@ -59,9 +65,10 @@ def put(self, store, obj):
5965 os .makedirs (folder )
6066 safe_write (full_path , blob )
6167 elif spec ['protocol' ] == 's3' :
62- S3Folder (database = self .database , ** spec ).put (blob_hash , blob )
68+ folder = '/' .join (subfold (blob_hash , spec ['subfolding' ]))
69+ s3 .Folder (database = self .database , ** spec ).put ('/' .join ((folder , blob_hash )), blob )
6370 else :
64- raise DataJointError ('Unknown external storage protocol {protocol} for {store}' .format (
71+ raise DataJointError ('Unknown external storage protocol {protocol} in store "- {store}" ' .format (
6572 store = store , protocol = spec ['protocol' ]))
6673
6774 # insert tracking info
@@ -80,31 +87,33 @@ def get(self, blob_hash):
8087 """
8188 if blob_hash is None :
8289 return None
83- store = blob_hash [STORE_HASH_LENGTH :]
84- store = 'external' + ('-' if store else '' ) + store
85-
86- cache_folder = config .get ('cache' , None )
8790
91+ # attempt to get object from cache
8892 blob = None
93+ cache_folder = config .get ('cache' , None )
8994 if cache_folder :
9095 try :
9196 with open (os .path .join (cache_folder , blob_hash ), 'rb' ) as f :
9297 blob = f .read ()
9398 except FileNotFoundError :
9499 pass
95100
101+ # attempt to get object from store
96102 if blob is None :
97- spec = self ._get_store_spec (store )
103+ store = blob_hash [STORE_HASH_LENGTH :]
104+ spec = config .get_store_spec (store )
98105 if spec ['protocol' ] == 'file' :
99- full_path = os .path .join (spec ['location' ], self .database , blob_hash )
106+ subfolders = os .path .join (* subfold (blob_hash , spec ['subfolding' ]))
107+ full_path = os .path .join (spec ['location' ], self .database , subfolders , blob_hash )
100108 try :
101109 with open (full_path , 'rb' ) as f :
102110 blob = f .read ()
103111 except FileNotFoundError :
104112 raise DataJointError ('Lost access to external blob %s.' % full_path ) from None
105113 elif spec ['protocol' ] == 's3' :
106114 try :
107- blob = S3Folder (database = self .database , ** spec ).get (blob_hash )
115+ subfolder = '/' .join (subfold (blob_hash , spec ['subfolding' ]))
116+ blob = s3 .Folder (database = self .database , ** spec ).get ('/' .join ((subfolder , blob_hash )))
108117 except TypeError :
109118 raise DataJointError ('External store {store} configuration is incomplete.' .format (store = store ))
110119 else :
@@ -115,7 +124,7 @@ def get(self, blob_hash):
115124 os .makedirs (cache_folder )
116125 safe_write (os .path .join (cache_folder , blob_hash ), blob )
117126
118- return unpack ( blob )
127+ return blob
119128
120129 @property
121130 def references (self ):
@@ -156,34 +165,35 @@ def delete_garbage(self):
156165 for ref in self .references ) or "TRUE" )
157166 print ('Deleted %d items' % self .connection .query ("SELECT ROW_COUNT()" ).fetchone ()[0 ])
158167
def clean_store(self, store, verbose=True):
    """
    Remove objects from an external storage repository that are no longer tracked in this table.
    This must be performed after delete_garbage during low-usage periods to reduce risks of data loss.

    :param store: store name; its spec is obtained from config.get_store_spec and the tracked
        hashes are those whose `hash` value ends with this name
    :param verbose: if True, print the path of every file deleted from a 'file' store;
        for an 's3' store the flag is passed through to s3.Folder.clean
    :raises DataJointError: if a folder of a 'file' repository contains both files and subfolders,
        or if a TypeError is raised while building or cleaning the s3 folder
    """
    spec = config.get_store_spec(store)
    # hashes still referenced by the tracking table for this store
    # NOTE(review): '%%' is presumably an escaped '%' for downstream query formatting -- confirm
    in_use = set((self & '`hash` LIKE "%%{store}"'.format(store=store)).fetch('hash'))
    if spec['protocol'] == 'file':
        deleted_count = 0
        print('Deleting...')
        deleted_folders = set()
        root = os.path.join(spec['location'], self.database)
        # bottom-up walk: subfolders are visited (and possibly removed) before their parents
        for folder, dirs, files in os.walk(root, topdown=False):
            if dirs and files:
                raise DataJointError('Invalid repository with files in non-terminal folder %s' % folder)
            # os.walk listed `dirs` before descending, so discount those removed in this pass
            remaining_dirs = [d for d in dirs if os.path.join(folder, d) not in deleted_folders]
            if remaining_dirs:
                continue
            files_not_in_use = [f for f in files if f not in in_use]
            for f in files_not_in_use:
                filename = os.path.join(folder, f)
                if verbose:
                    print(filename)
                os.remove(filename)
                deleted_count += 1
            # the folder is now empty only if none of its files were kept
            if len(files_not_in_use) == len(files):
                os.rmdir(folder)
                deleted_folders.add(folder)
        print('Deleted %d objects' % deleted_count)
    elif spec['protocol'] == 's3':
        try:
            # NOTE(review): the return value of clean() is not used here -- confirm whether
            # it reports failed deletions that should be surfaced to the caller
            s3.Folder(database=self.database, **spec).clean(in_use, verbose=verbose)
        except TypeError:
            raise DataJointError('External store {store} configuration is incomplete.'.format(store=store))
177-
178- @staticmethod
179- def _get_store_spec (store ):
180- try :
181- spec = config [store ]
182- except KeyError :
183- raise DataJointError ('Storage {store} is requested but not configured' .format (store = store )) from None
184- if 'protocol' not in spec :
185- raise DataJointError ('Storage {store} config is missing the protocol field' .format (store = store ))
186- if spec ['protocol' ] not in {'file' , 's3' }:
187- raise DataJointError (
188- 'Unknown external storage protocol "{protocol}" in "{store}"' .format (store = store , ** spec ))
189- return spec
0 commit comments