Skip to content

Commit 7cbf35e

Browse files
Merge pull request #628 from dimitri-yatsenko/blob-migrate
blobs can be read from the explicit filepath in the external tables.
2 parents 70d12c2 + 0aa444f commit 7cbf35e

File tree

4 files changed

+50
-13
lines changed

4 files changed

+50
-13
lines changed

datajoint/errors.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,9 @@ class DuplicateError(DataJointError):
3535
Error caused by a violation of a unique constraint when inserting data
3636
"""
3737
pass
38+
39+
40+
class MissingExternalFile(DataJointError):
41+
"""
42+
Error raised when an external file managed by DataJoint is no longer accessible
43+
"""

datajoint/external.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22
import itertools
33
from collections import Mapping
44
from .settings import config
5-
from .errors import DataJointError
5+
from .errors import DataJointError, MissingExternalFile
66
from .hash import uuid_from_buffer, uuid_from_file
77
from .table import Table
88
from .declare import EXTERNAL_TABLE_ROOT
99
from . import s3
1010
from .utils import safe_write, safe_copy
1111

1212
CACHE_SUBFOLDING = (2, 2) # (2, 2) means "0123456789abcd" will be saved as "01/23/0123456789abcd"
13+
SUPPORT_MIGRATED_BLOBS = True # support blobs migrated from datajoint 0.11.*
1314

1415

1516
def subfold(name, folds):
@@ -124,11 +125,21 @@ def fput(self, local_filepath):
124125
def peek(self, blob_hash, bytes_to_peek=120):
125126
return self.get(blob_hash, size=bytes_to_peek)
126127

127-
def get(self, blob_hash, size=-1):
128+
def get(self, blob_hash, *, size=-1):
128129
"""
129130
get an object from external store.
130131
:param size: max number of bytes to retrieve. If size<0, retrieve entire blob
132+
:param explicit_path: if given, then use it as relative path rather than the path derived from the blob hash
131133
"""
134+
135+
def read_file(filepath, size):
136+
try:
137+
with open(filepath, 'rb') as f:
138+
blob = f.read(size)
139+
except FileNotFoundError:
140+
raise MissingExternalFile('Lost access to external blob %s.' % full_path) from None
141+
return blob
142+
132143
if blob_hash is None:
133144
return None
134145

@@ -154,18 +165,33 @@ def get(self, blob_hash, size=-1):
154165
subfolders = os.path.join(*subfold(blob_hash.hex, self.spec['subfolding']))
155166
full_path = os.path.join(self.spec['location'], self.database, subfolders, blob_hash.hex)
156167
try:
157-
with open(full_path, 'rb') as f:
158-
blob = f.read(size)
159-
except FileNotFoundError:
160-
raise DataJointError('Lost access to external blob %s.' % full_path) from None
168+
blob = read_file(full_path, size)
169+
except MissingExternalFile:
170+
if not SUPPORT_MIGRATED_BLOBS:
171+
raise
172+
# migrated blobs from 0.11
173+
relative_filepath, contents_hash = (self & {'hash': blob_hash}).fetch1(
174+
'filepath', 'contents_hash')
175+
if relative_filepath is None:
176+
raise
177+
blob = read_file(os.path.join(self.spec['location'], relative_filepath))
161178
else:
162179
if size > 0:
163180
blob_size = os.path.getsize(full_path)
164181
elif self.spec['protocol'] == 's3':
165182
full_path = '/'.join(
166183
(self.database,) + subfold(blob_hash.hex, self.spec['subfolding']) + (blob_hash.hex,))
167184
if size < 0:
168-
blob = self.s3.get(full_path)
185+
try:
186+
blob = self.s3.get(full_path)
187+
except MissingExternalFile:
188+
if not SUPPORT_MIGRATED_BLOBS:
189+
raise
190+
relative_filepath, contents_hash = (self & {'hash': blob_hash}).fetch1(
191+
'filepath', 'contents_hash')
192+
if relative_filepath is None:
193+
raise
194+
blob = self.s3.get(relative_filepath)
169195
else:
170196
blob = self.s3.partial_get(full_path, 0, size)
171197
blob_size = self.s3.get_size(full_path)

datajoint/heading.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import re
55
import logging
66
from .errors import DataJointError
7-
from .declare import UUID_DATA_TYPE, CUSTOM_TYPES, TYPE_PATTERN, EXTERNAL_TYPES, SERIALIZED_TYPES
7+
from .declare import UUID_DATA_TYPE, CUSTOM_TYPES, TYPE_PATTERN, EXTERNAL_TYPES
88
from .utils import OrderedDict
99

1010

@@ -225,11 +225,13 @@ def init_from_database(self, conn, database, table_name):
225225
try:
226226
category = next(c for c in CUSTOM_TYPES if TYPE_PATTERN[c].match(attr['type']))
227227
except StopIteration:
228+
if attr['type'].startswith('external'):
229+
raise DataJointError('Legacy datatype `{type}`.'.format(**attr)) from None
228230
raise DataJointError('Unknown attribute type `{type}`'.format(**attr)) from None
229231
attr.update(
230232
is_attachment=category in ('INTERNAL_ATTACH', 'EXTERNAL_ATTACH'),
231233
is_filepath=category == 'FILEPATH',
232-
is_blob=category in ('INTERNAL_BLOB', 'EXTERNAL_BLOB'), # INTERNAL_BLOB is not a custom type but is included for completeness
234+
is_blob=category in ('INTERNAL_BLOB', 'EXTERNAL_BLOB'), # INTERNAL_BLOB is not a custom type but is included for completeness
233235
uuid=category == 'UUID',
234236
is_external=category in EXTERNAL_TYPES,
235237
store=attr['type'].split('@')[1] if category in EXTERNAL_TYPES else None)

datajoint/s3.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import warnings
77
import uuid
88
import os
9-
9+
from . import errors
1010

1111
class Folder:
1212
"""
@@ -29,7 +29,10 @@ def fput(self, relative_name, local_file, **meta):
2929
self.bucket, '/'.join((self.remote_path, relative_name)), local_file, metadata=meta or None)
3030

3131
def get(self, relative_name):
32-
return self.client.get_object(self.bucket, '/'.join((self.remote_path, relative_name))).data
32+
try:
33+
return self.client.get_object(self.bucket, '/'.join((self.remote_path, relative_name))).data
34+
except minio.error.NoSuchKey:
35+
raise errors.MissingExternalFile from None
3336

3437
def fget(self, relative_name, local_filepath):
3538
"""get file from object name to local filepath"""
@@ -48,13 +51,13 @@ def partial_get(self, relative_name, offset, size):
4851
return self.client.get_partial_object(
4952
self.bucket, '/'.join((self.remote_path, relative_name)), offset, size).data
5053
except minio.error.NoSuchKey:
51-
return None
54+
raise errors.MissingExternalFile from None
5255

5356
def get_size(self, relative_name):
5457
try:
5558
return self.client.stat_object(self.bucket, '/'.join((self.remote_path, relative_name))).size
5659
except minio.error.NoSuchKey:
57-
return None
60+
raise errors.MissingExternalFile from None
5861

5962
def list_objects(self, folder=''):
6063
return self.client.list_objects(self.bucket, '/'.join((self.remote_path, folder, '')), recursive=True)

0 commit comments

Comments
 (0)