import uuid
from io import open
from tempfile import mkdtemp
- from typing import Any, Dict, Optional
+ from typing import Any, Dict
from urllib.parse import urlparse
+ from warnings import warn

import boto3
import botocore

CREATOR_ID = 20

+ class CloudUrlAccessWarning(Warning):
+     """Warning when a cloud URL could not be accessed for any reason"""
+
+ class CloudUrlAccessForbidden(CloudUrlAccessWarning):
+     """Warning when a cloud URL could not be accessed due to authorization issues"""
+
+ class CloudUrlNotFound(CloudUrlAccessWarning):
+     """Warning when a cloud URL was not found"""


class FileURLError(Exception):
    """Thrown when a file cannot be accessed by the given URL"""


+ class InconsistentFileSizeValues(Exception):
+     """Thrown when the input file size does not match the actual file size of a file being loaded by reference"""
+
+
+ class MissingInputFileSize(Exception):
+     """Thrown when the input file size is not available for a data file being loaded by reference"""
+
+
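Because a missing or unreadable cloud object is now reported through this warning hierarchy rather than by raising FileURLError, a strict caller can promote these warnings to errors with the standard library filter. A minimal sketch, assuming the warning classes are imported from this module:

    import warnings

    # Treat any cloud URL access problem (not found, forbidden, or otherwise)
    # as a hard failure instead of a logged warning.
    warnings.simplefilter("error", category=CloudUrlAccessWarning)

The size-related exceptions above remain ordinary exceptions and are raised, not warned, when the input size is missing or disagrees with the cloud object.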
class UnexpectedResponseError(Exception):
    """Thrown when DSS gives an unexpected response"""

@@ -91,6 +108,7 @@ def upload_cloud_file_by_reference(self,
        filename: str,
        file_uuid: str,
        file_cloud_urls: set,
+       size: int,
        guid: str,
        file_version: str = None) -> tuple:
    """
@@ -111,13 +129,18 @@ def upload_cloud_file_by_reference(self,
    :param file_uuid: An RFC4122-compliant UUID to be used to identify the file
    :param file_cloud_urls: A set of 'gs://' and 's3://' bucket links.
           e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
+   :param size: size of the file in bytes, as provided by the input data to be loaded.
+          An attempt will be made to access the `file_cloud_urls` to obtain the
+          basic file metadata, and if successful, the size is verified to be consistent.
    :param guid: An optional additional/alternate data identifier/alias to associate with the file
           e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
    :param file_version: a RFC3339 compliant datetime string
    :return: file_uuid: str, file_version: str, filename: str, already_present: bool
+   :raises MissingInputFileSize: If no input file size is available for a file to be loaded by reference
+   :raises InconsistentFileSizeValues: If file sizes are inconsistent for a file to be loaded by reference
    """

-   def _create_file_reference(file_cloud_urls: set, guid: str) -> dict:
+   def _create_file_reference(file_cloud_urls: set, size: int, guid: str) -> dict:
        """
        Format a file's metadata into a dictionary for uploading as a json to support the approach
        described here:
@@ -127,22 +150,26 @@ def _create_file_reference(file_cloud_urls: set, guid: str) -> dict:
               e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
        :param guid: An optional additional/alternate data identifier/alias to associate with the file
               e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
-       :param file_version: RFC3339 formatted timestamp.
+       :param size: file size in bytes from input data
        :return: A dictionary of metadata values.
        """
-       s3_metadata = None
-       gs_metadata = None
+
+       input_metadata = dict(size=size)
+       s3_metadata: Dict[str, Any] = dict()
+       gs_metadata: Dict[str, Any] = dict()
        for cloud_url in file_cloud_urls:
            url = urlparse(cloud_url)
            bucket = url.netloc
            key = url.path[1:]
+           if not (bucket and key):
+               raise FileURLError(f'Invalid URL {cloud_url}')
            if url.scheme == "s3":
                s3_metadata = _get_s3_file_metadata(bucket, key)
            elif url.scheme == "gs":
                gs_metadata = _get_gs_file_metadata(bucket, key)
            else:
                raise FileURLError(f"Unsupported cloud URL scheme: {cloud_url}")
-       return _consolidate_metadata(file_cloud_urls, s3_metadata, gs_metadata, guid)
+       return _consolidate_metadata(file_cloud_urls, input_metadata, s3_metadata, gs_metadata, guid)
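For reference, urlparse splits a cloud URL into the pieces used above for the scheme dispatch and the bucket/key check; a URL with no key (for example 's3://bucket-only') yields an empty path and is rejected:

    from urllib.parse import urlparse

    url = urlparse('s3://ucsc-topmed-datasets/a.bam')
    # url.scheme == 's3', url.netloc == 'ucsc-topmed-datasets', url.path == '/a.bam',
    # so bucket == 'ucsc-topmed-datasets' and key == url.path[1:] == 'a.bam'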

    def _get_s3_file_metadata(bucket: str, key: str) -> dict:
        """
@@ -155,11 +182,24 @@ def _get_s3_file_metadata(bucket: str, key: str) -> dict:
        metadata = dict()
        try:
            response = self.s3_client.head_object(Bucket=bucket, Key=key, RequestPayer="requester")
-           metadata['content-type'] = response['ContentType']
-           metadata['s3_etag'] = response['ETag']
-           metadata['size'] = response['ContentLength']
-       except Exception as e:
-           raise FileURLError(f"Error accessing s3://{bucket}/{key}") from e
+       except botocore.exceptions.ClientError as e:
+           if e.response['Error']['Code'] == str(requests.codes.not_found):
+               warn(f'Could not find "s3://{bucket}/{key}" Error: {e}'
+                    ' The S3 file metadata for this file reference will be missing.',
+                    CloudUrlNotFound)
+           else:
+               warn(f"Failed to access \"s3://{bucket}/{key}\" Error: {e}"
+                    " The S3 file metadata for this file reference will be missing.",
+                    CloudUrlAccessWarning)
+       else:
+           try:
+               metadata['size'] = response['ContentLength']
+               metadata['content-type'] = response['ContentType']
+               metadata['s3_etag'] = response['ETag']
+           except KeyError as e:
+               # These standard metadata fields should always be present.
+               logging.error(f'Failed to access "s3://{bucket}/{key}" file metadata field. Error: {e}'
+                             ' The S3 file metadata for this file will be incomplete.')
        return metadata
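A caller that needs to know which references ended up without cloud metadata can record these warnings instead of letting them propagate; a sketch using only the standard library, with the actual loading call elided:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # ... perform the load-by-reference calls here ...
    not_found = [w.message for w in caught if issubclass(w.category, CloudUrlNotFound)]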

    def _get_gs_file_metadata(bucket: str, key: str) -> dict:
@@ -170,25 +210,30 @@ def _get_gs_file_metadata(bucket: str, key: str) -> dict:
        :param key: GS file to upload. e.g. 'output.txt' or 'data/output.txt'
        :return: A dictionary of metadata values.
        """
-       metadata = dict()
-       try:
-           gs_bucket = self.gs_client.bucket(bucket, self.google_project_id)
-           blob_obj = gs_bucket.get_blob(key)
+       gs_bucket = self.gs_client.bucket(bucket, self.google_project_id)
+       blob_obj = gs_bucket.get_blob(key)
+       if blob_obj is not None:
+           metadata = dict()
+           metadata['size'] = blob_obj.size
            metadata['content-type'] = blob_obj.content_type
            metadata['crc32c'] = binascii.hexlify(base64.b64decode(blob_obj.crc32c)).decode("utf-8").lower()
-           metadata['size'] = blob_obj.size
-       except Exception as e:
-           raise FileURLError(f"Error accessing gs://{bucket}/{key}") from e
-       return metadata
+           return metadata
+       else:
+           warn(f'Could not find "gs://{bucket}/{key}"'
+                ' The GS file metadata for this file reference will be missing.',
+                CloudUrlNotFound)
+           return dict()

    def _consolidate_metadata(file_cloud_urls: set,
-                             s3_metadata: Optional[Dict[str, Any]],
-                             gs_metadata: Optional[Dict[str, Any]],
+                             input_metadata: Dict[str, Any],
+                             s3_metadata: Dict[str, Any],
+                             gs_metadata: Dict[str, Any],
                              guid: str) -> dict:
        """
        Consolidates cloud file metadata to create the JSON used to load by reference
        into the DSS.

+       :param input_metadata: Dictionary of file metadata (currently just the 'size') taken from the input data
        :param file_cloud_urls: A set of 'gs://' and 's3://' bucket URLs.
               e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
        :param s3_metadata: Dictionary of metadata produced by _get_s3_file_metadata().
@@ -197,19 +242,38 @@ def _consolidate_metadata(file_cloud_urls: set,
               e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
        :return: A dictionary of cloud file metadata values
        """
-       consolidated_metadata = dict()
-       if s3_metadata:
-           consolidated_metadata.update(s3_metadata)
-       if gs_metadata:
-           consolidated_metadata.update(gs_metadata)
+
+       def _check_file_size_consistency(input_metadata, s3_metadata, gs_metadata):
+           input_size = input_metadata.get('size', None)
+           if input_size is not None:
+               input_size = int(input_size)
+           else:
+               raise MissingInputFileSize('No input file size is available for file being loaded by reference.')
+           s3_size = s3_metadata.get('size', None)
+           gs_size = gs_metadata.get('size', None)
+           if s3_size and input_size != s3_size:
+               raise InconsistentFileSizeValues(
+                   f'Input file size does not match actual S3 file size: '
+                   f'input size: {input_size}, S3 actual size: {s3_size}')
+           if gs_size and input_size != gs_size:
+               raise InconsistentFileSizeValues(
+                   f'Input file size does not match actual GS file size: '
+                   f'input size: {input_size}, GS actual size: {gs_size}')
+           return input_size
+
+       consolidated_metadata: Dict[str, Any] = dict()
+       consolidated_metadata.update(input_metadata)
+       consolidated_metadata.update(s3_metadata)
+       consolidated_metadata.update(gs_metadata)
+       consolidated_metadata['size'] = _check_file_size_consistency(input_metadata, s3_metadata, gs_metadata)
        consolidated_metadata['url'] = list(file_cloud_urls)
        consolidated_metadata['aliases'] = [str(guid)]
        return consolidated_metadata
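When both cloud copies are reachable and the sizes agree, the consolidated file reference is a flat dictionary along these lines (the values shown are illustrative):

    {
        "size": 1024,
        "content-type": "application/octet-stream",
        "s3_etag": "2d149dbfd45bbfa07ba4c2cdce188a86",
        "crc32c": "e2c4ba92",
        "url": ["gs://broad-public-datasets/g.bam", "s3://ucsc-topmed-datasets/a.bam"],
        "aliases": ["dg.4503/887388d7-a974-4259-86af-f5305172363d"]
    }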

    if self.dry_run:
        logger.info(f"DRY RUN: upload_cloud_file_by_reference: {filename} {str(file_cloud_urls)} {guid}")

-   file_reference = _create_file_reference(file_cloud_urls, guid)
+   file_reference = _create_file_reference(file_cloud_urls, size, guid)
    return self.upload_dict_as_file(file_reference,
                                    filename,
                                    file_uuid,