This repository was archived by the owner on Nov 15, 2019. It is now read-only.

Commit fb6b919

Author: Michael Baumann (committed)
When loading data into the DSS, the best results are obtained when all of the cloud URLs loaded by reference are accessible, since that provides the most complete set of file metadata for those files. There are times, however, mainly during development and testing, when that is not the case. Currently, as we load the GTEx data, we have access to the NIH files in Google but not in AWS. The most important file information is already included in the loader input data. This change enables the loader to run and do a generally adequate job even when not all of the cloud URLs are accessible.

The specific changes are:

- Make use of the data file size from the input data. File size was already a required field in the input schema, but it was not being used. The input file size is now compared to the actual file size of each cloud URL that is accessible, and a discrepancy results in an error loading the bundle. If no cloud URL is accessible for a given data file, the input file size is used.
- Improved warnings when cloud URLs are not accessible, using the Python warnings library.
- Updated and enhanced unit tests for the above.
1 parent 38c26f6 commit fb6b919
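For illustration, here is a minimal sketch of a single file entry in the loader input data, using the field names from the test fixtures in this commit; the values are hypothetical, and 'size' is the already-required field that the loader now actually uses:

    file_info = {
        'name': 'sample.bam',                                   # data file name (hypothetical)
        'id': 'dg.4503/887388d7-a974-4259-86af-f5305172363d',   # GUID/alias associated with the file
        'created': '2018-09-20T00:00:00Z',                      # RFC3339 timestamp used as the file version
        'urls': [{'url': 's3://example-bucket/sample.bam'},     # cloud URLs to be loaded by reference
                 {'url': 'gs://example-bucket/sample.bam'}],
        'size': 4096,                                           # file size in bytes from the input data
    }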

File tree

4 files changed: +134 −49 lines

  loader/base_loader.py
  loader/standard_loader.py
  scripts/cgp_data_loader.py
  tests/test_standard_loader.py

loader/base_loader.py

Lines changed: 91 additions & 27 deletions
@@ -22,8 +22,9 @@
 import uuid
 from io import open
 from tempfile import mkdtemp
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 from urllib.parse import urlparse
+from warnings import warn
 
 import boto3
 import botocore
@@ -43,11 +44,27 @@
 
 CREATOR_ID = 20
 
+class CloudUrlAccessWarning(Warning):
+    """Warning when a cloud URL could not be accessed for any reason"""
+
+class CloudUrlAccessForbidden(CloudUrlAccessWarning):
+    """Warning when a cloud URL could not be accessed due to authorization issues"""
+
+class CloudUrlNotFound(CloudUrlAccessWarning):
+    """Warning when a cloud URL was not found"""
 
 class FileURLError(Exception):
     """Thrown when a file cannot be accessed by the given URl"""
 
 
+class InconsistentFileSizeValues(Exception):
+    """Thrown when the input file size does not match the actual file size of a file being loaded by reference"""
+
+
+class MissingInputFileSize(Exception):
+    """Thrown when the input file size is not available for a data file being loaded by reference"""
+
+
 class UnexpectedResponseError(Exception):
     """Thrown when DSS gives an unexpected response"""
 
@@ -91,6 +108,7 @@ def upload_cloud_file_by_reference(self,
                                        filename: str,
                                        file_uuid: str,
                                        file_cloud_urls: set,
+                                       size: int,
                                        guid: str,
                                        file_version: str=None) -> tuple:
         """
@@ -111,13 +129,18 @@ def upload_cloud_file_by_reference(self,
         :param file_uuid: An RFC4122-compliant UUID to be used to identify the file
         :param file_cloud_urls: A set of 'gs://' and 's3://' bucket links.
                e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
+        :param size: size of the file in bytes, as provided by the input data to be loaded.
+               An attempt will be made to access the `file_cloud_urls` to obtain the
+               basic file metadata, and if successful, the size is verified to be consistent.
         :param guid: An optional additional/alternate data identifier/alias to associate with the file
                e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
         :param file_version: a RFC3339 compliant datetime string
         :return: file_uuid: str, file_version: str, filename: str, already_present: bool
+        :raises MissingInputFileSize: If no input file size is available for file to be loaded by reference
+        :raises InconsistentFileSizeValues: If file sizes are inconsistent for file to be loaded by reference
         """
 
-        def _create_file_reference(file_cloud_urls: set, guid: str) -> dict:
+        def _create_file_reference(file_cloud_urls: set, size: int, guid: str) -> dict:
             """
             Format a file's metadata into a dictionary for uploading as a json to support the approach
             described here:
@@ -127,22 +150,26 @@ def _create_file_reference(file_cloud_urls: set, guid: str) -> dict:
                    e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
             :param guid: An optional additional/alternate data identifier/alias to associate with the file
                    e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
-            :param file_version: RFC3339 formatted timestamp.
+            :param size: file size in bytes from input data
             :return: A dictionary of metadata values.
             """
-            s3_metadata = None
-            gs_metadata = None
+
+            input_metadata = dict(size=size)
+            s3_metadata: Dict[str, Any] = dict()
+            gs_metadata: Dict[str, Any] = dict()
             for cloud_url in file_cloud_urls:
                 url = urlparse(cloud_url)
                 bucket = url.netloc
                 key = url.path[1:]
+                if not (bucket and key):
+                    raise FileURLError(f'Invalid URL {cloud_url}')
                 if url.scheme == "s3":
                     s3_metadata = _get_s3_file_metadata(bucket, key)
                 elif url.scheme == "gs":
                     gs_metadata = _get_gs_file_metadata(bucket, key)
                 else:
                     raise FileURLError("Unsupported cloud URL scheme: {cloud_url}")
-            return _consolidate_metadata(file_cloud_urls, s3_metadata, gs_metadata, guid)
+            return _consolidate_metadata(file_cloud_urls, input_metadata, s3_metadata, gs_metadata, guid)
 
         def _get_s3_file_metadata(bucket: str, key: str) -> dict:
             """
@@ -155,11 +182,24 @@ def _get_s3_file_metadata(bucket: str, key: str) -> dict:
             metadata = dict()
             try:
                 response = self.s3_client.head_object(Bucket=bucket, Key=key, RequestPayer="requester")
-                metadata['content-type'] = response['ContentType']
-                metadata['s3_etag'] = response['ETag']
-                metadata['size'] = response['ContentLength']
-            except Exception as e:
-                raise FileURLError(f"Error accessing s3://{bucket}/{key}") from e
+            except botocore.exceptions.ClientError as e:
+                if e.response['Error']['Code'] == str(requests.codes.not_found):
+                    warn(f'Could not find \"s3://{bucket}/{key}\" Error: {e}'
+                         ' The S3 file metadata for this file reference will be missing.',
+                         CloudUrlNotFound)
+                else:
+                    warn(f"Failed to access \"s3://{bucket}/{key}\" Error: {e}"
+                         " The S3 file metadata for this file reference will be missing.",
+                         CloudUrlAccessWarning)
+            else:
+                try:
+                    metadata['size'] = response['ContentLength']
+                    metadata['content-type'] = response['ContentType']
+                    metadata['s3_etag'] = response['ETag']
+                except KeyError as e:
+                    # These standard metadata should always be present.
+                    logging.error(f'Failed to access "s3://{bucket}/{key}" file metadata field. Error: {e}'
+                                  ' The S3 file metadata for this file will be incomplete.')
             return metadata
 
         def _get_gs_file_metadata(bucket: str, key: str) -> dict:
@@ -170,25 +210,30 @@ def _get_gs_file_metadata(bucket: str, key: str) -> dict:
             :param key: GS file to upload. e.g. 'output.txt' or 'data/output.txt'
             :return: A dictionary of metadata values.
             """
-            metadata = dict()
-            try:
-                gs_bucket = self.gs_client.bucket(bucket, self.google_project_id)
-                blob_obj = gs_bucket.get_blob(key)
+            gs_bucket = self.gs_client.bucket(bucket, self.google_project_id)
+            blob_obj = gs_bucket.get_blob(key)
+            if blob_obj is not None:
+                metadata = dict()
+                metadata['size'] = blob_obj.size
                 metadata['content-type'] = blob_obj.content_type
                 metadata['crc32c'] = binascii.hexlify(base64.b64decode(blob_obj.crc32c)).decode("utf-8").lower()
-                metadata['size'] = blob_obj.size
-            except Exception as e:
-                raise FileURLError(f"Error accessing gs://{bucket}/{key}") from e
-            return metadata
+                return metadata
+            else:
+                warn(f'Could not find "gs://{bucket}/{key}"'
+                     ' The GS file metadata for this file reference will be missing.',
+                     CloudUrlNotFound)
+                return dict()
 
         def _consolidate_metadata(file_cloud_urls: set,
-                                  s3_metadata: Optional[Dict[str, Any]],
-                                  gs_metadata: Optional[Dict[str, Any]],
+                                  input_metadata: Dict[str, Any],
+                                  s3_metadata: Dict[str, Any],
+                                  gs_metadata: Dict[str, Any],
                                   guid: str) -> dict:
             """
             Consolidates cloud file metadata to create the JSON used to load by reference
             into the DSS.
 
+            :param input_metadata: Dictionary of file metadata (e.g. size) from the loader input data.
             :param file_cloud_urls: A set of 'gs://' and 's3://' bucket URLs.
                    e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
             :param s3_metadata: Dictionary of meta data produced by _get_s3_file_metadata().
@@ -197,19 +242,38 @@ def _consolidate_metadata(file_cloud_urls: set,
                    e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
             :return: A dictionary of cloud file metadata values
             """
-            consolidated_metadata = dict()
-            if s3_metadata:
-                consolidated_metadata.update(s3_metadata)
-            if gs_metadata:
-                consolidated_metadata.update(gs_metadata)
+
+            def _check_file_size_consistency(input_metadata, s3_metadata, gs_metadata):
+                input_size = input_metadata.get('size', None)
+                if input_size is not None:
+                    input_size = int(input_size)
+                else:
+                    raise MissingInputFileSize('No input file size is available for file being loaded by reference.')
+                s3_size = s3_metadata.get('size', None)
+                gs_size = gs_metadata.get('size', None)
+                if s3_size and input_size != s3_size:
+                    raise InconsistentFileSizeValues(
+                        f'Input file size does not match actual S3 file size: '
+                        f'input size: {input_size}, S3 actual size: {s3_size}')
+                if gs_size and input_size != gs_size:
+                    raise InconsistentFileSizeValues(
+                        f'Input file size does not match actual GS file size: '
+                        f'input size: {input_size}, GS actual size: {gs_size}')
+                return input_size
+
+            consolidated_metadata: Dict[str, Any] = dict()
+            consolidated_metadata.update(input_metadata)
+            consolidated_metadata.update(s3_metadata)
+            consolidated_metadata.update(gs_metadata)
+            consolidated_metadata['size'] = _check_file_size_consistency(input_metadata, s3_metadata, gs_metadata)
             consolidated_metadata['url'] = list(file_cloud_urls)
             consolidated_metadata['aliases'] = [str(guid)]
             return consolidated_metadata
 
         if self.dry_run:
             logger.info(f"DRY RUN: upload_cloud_file_by_reference: {filename} {str(file_cloud_urls)} {guid}")
 
-        file_reference = _create_file_reference(file_cloud_urls, guid)
+        file_reference = _create_file_reference(file_cloud_urls, size, guid)
         return self.upload_dict_as_file(file_reference,
                                         filename,
                                         file_uuid,
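To make the consolidation order and the size check concrete, here is a small sketch with hypothetical values (not part of the commit): the input size seeds the record, metadata from whichever cloud URLs were reachable overlays it, and _check_file_size_consistency() rejects any disagreement.

    input_metadata = {'size': 4096}                  # from the loader input data
    s3_metadata = {'size': 4096,                     # from a reachable s3:// URL
                   'content-type': 'application/octet-stream',
                   's3_etag': '"abc123"'}
    gs_metadata = {}                                 # e.g. the gs:// URL was not reachable; only a warning was issued

    consolidated = dict()
    consolidated.update(input_metadata)
    consolidated.update(s3_metadata)
    consolidated.update(gs_metadata)
    # _check_file_size_consistency(input_metadata, s3_metadata, gs_metadata) returns 4096 here;
    # with s3_metadata['size'] == 8192 it would instead raise InconsistentFileSizeValues and the
    # bundle would fail to load, as described in the commit message.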

loader/standard_loader.py

Lines changed: 13 additions & 2 deletions
@@ -22,6 +22,7 @@ class ParsedDataFile(typing.NamedTuple):
     filename: str
     file_uuid: str
     cloud_urls: typing.List[str]  # list of urls
+    size: int
     file_guid: str
    file_version: str  # rfc3339
 
@@ -95,6 +96,14 @@ def _get_cloud_urls(file_info: dict):
             raise ParseError(f"Expected 'url' as key for urls in file_info: \n{file_info}")
         return [url_dict['url'] for url_dict in urls]
 
+    @staticmethod
+    def _get_file_size(file_info: dict):
+        if 'size' not in file_info:
+            raise ParseError(f'Size field not present in file_info: \n{file_info}')
+        if not int(file_info['size']) >= 0:
+            raise ParseError(f'Invalid value for size in file_info: \n{file_info}')
+        return file_info['size']
+
     @classmethod
     def _parse_bundle(cls, bundle: dict) -> ParsedBundle:
         try:
@@ -116,7 +125,8 @@ def _parse_bundle(cls, bundle: dict) -> ParsedBundle:
             file_uuid = cls._get_file_uuid(file_guid)
             file_version = cls._get_file_version(file_info)
             cloud_urls = cls._get_cloud_urls(file_info)
-            parsed_file = ParsedDataFile(filename, file_uuid, cloud_urls, file_guid, file_version)
+            file_size = cls._get_file_size(file_info)
+            parsed_file = ParsedDataFile(filename, file_uuid, cloud_urls, file_size, file_guid, file_version)
             parsed_files.append(parsed_file)
 
         return ParsedBundle(bundle_uuid, metadata_dict, parsed_files)
@@ -139,13 +149,14 @@ def _load_bundle(self, bundle_uuid, metadata_dict, data_files, bundle_num):
                                                   name=metadata_filename, indexed=True))
 
         for data_file in data_files:
-            filename, file_uuid, cloud_urls, file_guid, file_version, = data_file
+            filename, file_uuid, cloud_urls, file_size, file_guid, file_version = data_file
             logger.debug(f'Bundle {bundle_num}: Attempting to upload data file: {filename} '
                          f'with uuid:version {file_uuid}:{file_version}...')
             file_uuid, file_version, filename, already_present = \
                 self.dss_uploader.upload_cloud_file_by_reference(filename,
                                                                  file_uuid,
                                                                  cloud_urls,
+                                                                 file_size,
                                                                  file_guid,
                                                                  file_version=file_version)
             if already_present:
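Because ParsedDataFile is a NamedTuple, inserting size changes the positional field order. A short sketch with hypothetical values (not part of the commit) of how callers now construct and unpack it:

    from loader.standard_loader import ParsedDataFile

    parsed = ParsedDataFile('sample.bam',                                    # filename (hypothetical)
                            '7f54c6d0-0b7e-4a58-b7a3-1f1e6f2b9a77',          # file_uuid (hypothetical)
                            ['s3://example-bucket/sample.bam'],              # cloud_urls
                            4096,                                            # size, the newly inserted field
                            'dg.4503/887388d7-a974-4259-86af-f5305172363d',  # file_guid
                            '2018-09-20T00:00:00Z')                          # file_version

    # Positional unpacking now includes size between cloud_urls and file_guid, as _load_bundle does above:
    filename, file_uuid, cloud_urls, size, file_guid, file_version = parsed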

scripts/cgp_data_loader.py

Lines changed: 7 additions & 0 deletions
@@ -56,6 +56,13 @@ def main(argv=sys.argv[1:]):
     logging.getLogger(__name__)
     suppress_verbose_logging()
 
+    if not sys.warnoptions:
+        import warnings
+        # Log each unique cloud URL access warning once by default.
+        # This can be overridden using the "PYTHONWARNINGS" environment variable.
+        # See: https://docs.python.org/3/library/warnings.html
+        warnings.simplefilter('default', 'CloudUrlAccessWarning', append=True)
+
     bundle_uploader = StandardFormatBundleUploader(dss_uploader, metadata_file_uploader)
     logging.info(f'Uploading {"serially" if options.serial else "concurrently"}')
     return bundle_uploader.load_all_bundles(load_json_from_file(options.input_json), not options.serial)
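As the added comment notes, the default filter, which logs each unique cloud URL access warning once, can be overridden with the PYTHONWARNINGS environment variable; it can also be adjusted in code. A small sketch (not part of the commit) that silences the generic access warnings while turning a missing cloud object into a hard failure:

    import warnings
    from loader.base_loader import CloudUrlAccessWarning, CloudUrlNotFound

    # Filters added later are matched first, so the narrower rule goes last.
    warnings.simplefilter('ignore', CloudUrlAccessWarning)  # drop generic access warnings
    warnings.simplefilter('error', CloudUrlNotFound)        # raise if a referenced cloud object is missing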

tests/test_standard_loader.py

Lines changed: 23 additions & 20 deletions
@@ -2,13 +2,14 @@
 import logging
 import typing
 import uuid
+import warnings
 from pathlib import Path
 
 import boto3
 import iso8601
 
 from loader import base_loader
-from loader.base_loader import FileURLError
+from loader.base_loader import FileURLError, CloudUrlNotFound
 from loader.standard_loader import StandardFormatBundleUploader, ParsedBundle, ParseError, ParsedDataFile
 from scripts.cgp_data_loader import GOOGLE_PROJECT_ID
 from tests import ignore_resource_warnings
@@ -121,7 +122,8 @@ def test_parse_bundle(self):
         minimal_file_info = {'name': 'buried_treasure_map',
                              'created': tz_utc_now(),
                              'urls': [{'url': 's3://desert/island/under/palm'},
-                                      {'url': 'gs://captains/quarters/bottom/drawer'}]}
+                                      {'url': 'gs://captains/quarters/bottom/drawer'}],
+                             'size': 0}
         bundle = {}
         self.assertRaises(ParseError, self.loader._parse_bundle, bundle)
         data_bundle = {}
@@ -192,18 +194,19 @@ def _make_minimal_bundle(self, parsed=True, files=1):
             file.Acl().put(ACL='public-read')
 
             data_objects[file_guid] = ParsedDataFile(filename, file_uuid, cloud_urls,
-                                                     file_guid, file_version)
+                                                     file.content_length, file_guid, file_version)
 
         if parsed:
             return ParsedBundle(bundle_uuid, metadata_dict, list(data_objects.values()))
         else:
             dict_objects = {}
-            for filename, file_uuid, cloud_urls, file_guid, file_version in data_objects.values():
+            for filename, file_uuid, cloud_urls, file_size, file_guid, file_version in data_objects.values():
                 dict_objects[file_guid] = {
                     'name': filename,
                     'created': file_version,
                     'id': file_guid,
-                    'urls': [{'url': url} for url in cloud_urls]
+                    'urls': [{'url': url} for url in cloud_urls],
+                    'size': file_size
                 }
         minimal = {
             'data_bundle': {
@@ -330,22 +333,14 @@ def test_duplicate_file_upload(self):
         """
         _, _, data_files = self._make_minimal_bundle()
         data_file = data_files[0]
-        filename, file_uuid, cloud_urls, file_guid, file_version, = data_file
+        filename, file_uuid, cloud_urls, file_size, file_guid, file_version = data_file
 
         _, _, _, already_present = \
-            self.dss_uploader.upload_cloud_file_by_reference(filename,
-                                                             file_uuid,
-                                                             cloud_urls,
-                                                             file_guid,
-                                                             file_version=file_version)
+            self.dss_uploader.upload_cloud_file_by_reference(filename, file_uuid, cloud_urls, file_size, file_guid, file_version)
         # make sure the file hasn't already been uploaded
         self.assertFalse(already_present)
         _, _, _, already_present = \
-            self.dss_uploader.upload_cloud_file_by_reference(filename,
-                                                             file_uuid,
-                                                             cloud_urls,
-                                                             file_guid,
-                                                             file_version=file_version)
+            self.dss_uploader.upload_cloud_file_by_reference(filename, file_uuid, cloud_urls, file_size, file_guid, file_version)
         # make sure the file HAS already been uploaded
         self.assertTrue(already_present)
 
@@ -355,7 +350,15 @@ def test_bad_URL(self):
         bundle = self._make_minimal_bundle(parsed=True)
         bundle.data_files[0].cloud_urls[0] = 'https://example.com'
         self.assertRaises(FileURLError, self.loader._load_bundle, *bundle, 0)
-        bundle.data_files[0].cloud_urls[0] = 's3://definatelynotavalidbucketorfile'
-        self.assertRaises(FileURLError, self.loader._load_bundle, *bundle, 1)
-        bundle.data_files[0].cloud_urls[0] = 'gs://definatelynotavalidbucketorfile'
-        self.assertRaises(FileURLError, self.loader._load_bundle, *bundle, 2)
+
+        bundle = self._make_minimal_bundle(parsed=True)
+        bundle.data_files[0].cloud_urls[0] = 's3://definatelynotavalidbucketor/file'
+        with self.assertWarnsRegex(CloudUrlNotFound, 'Could not find "s3://definatelynotavalidbucketor/file"'):
+            warnings.simplefilter('always', 'CloudUrlAccessWarning', append=True)
+            self.loader._load_bundle(*bundle, 1)
+
+        bundle = self._make_minimal_bundle(parsed=True)
+        bundle.data_files[0].cloud_urls[0] = 'gs://definatelynotavalidbucketor/file'
+        with self.assertWarnsRegex(CloudUrlNotFound, 'Could not find "gs://definatelynotavalidbucketor/file"'):
+            warnings.simplefilter('always', 'CloudUrlAccessWarning', append=True)
+            self.loader._load_bundle(*bundle, 2)
