Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions tests/providers/dataverse/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def settings():
'name': 'A look at wizards',
}


@pytest.fixture
def native_file_metadata():
with open(os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json'), 'r') as fp:
Expand Down Expand Up @@ -65,12 +66,20 @@ def dataset_metadata_object():
'Dataset Test Version'
)


@pytest.fixture
def file_metadata_object():
    """Build a DataverseFileMetadata for the plain-text fixture file."""
    fixture_path = os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json')
    with open(fixture_path, 'r') as fp:
        fixtures = json.load(fp)
    return DataverseFileMetadata(fixtures['native_file_metadata']['datafile'], 'latest')


@pytest.fixture
def csv_file_metadata_object():
    """Build a DataverseFileMetadata for the ingested (.tab-from-.csv) fixture file."""
    fixture_path = os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json')
    with open(fixture_path, 'r') as fp:
        fixtures = json.load(fp)
    return DataverseFileMetadata(fixtures['csv_native_file_metadata']['datafile'], 'latest')


@pytest.fixture
def revision_metadata_object():
    """Build a DataverseRevision with a fixed revision string.

    NOTE(review): 'Verision' looks like a typo, but tests assert against this
    exact string, so it is reproduced verbatim.
    """
    revision_name = 'Test Dataset Verision'
    return DataverseRevision(revision_name)
16 changes: 16 additions & 0 deletions tests/providers/dataverse/fixtures/root_provider.json
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,22 @@
"label":"thefile.txt",
"version":1
},
"csv_native_file_metadata":{
"datafile":{
"contentType":"text/tab-separated-values",
"description":"",
"filename":"%2Fusr%2Flocal%2Fglassfish4%2Fglassfish%2Fdomains%2Fdomain1%2Ffiles%2F10.5072%2FFK2%2F232XYH%2F14c7a73d734-8383551cc713",
"id":20,
"md5":"6b50249f91258397fc5cb7d5a4127e15",
"name":"thefile.tab",
"originalFormatLabel":"Comma Separated Values",
"originalFileFormat": "text/csv"
},
"datasetVersionId":5,
"description":"",
"label":"thefile.tab",
"version":1
},
"checksum_mismatch_dataset_metadata":{
"data":{
"createTime":"2015-04-02T13:21:59Z",
Expand Down
29 changes: 29 additions & 0 deletions tests/providers/dataverse/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from tests.providers.dataverse.fixtures import (
dataset_metadata_object,
revision_metadata_object,
csv_file_metadata_object,
file_metadata_object
)


class TestDatasetMetadata:

def test_dataset_metadata(self, dataset_metadata_object):
Expand Down Expand Up @@ -45,6 +47,7 @@ def test_file_metadata(self, file_metadata_object):
assert not file_metadata_object.created_utc
assert file_metadata_object.content_type == 'text/plain; charset=US-ASCII'
assert file_metadata_object.etag == 'latest::20'
assert file_metadata_object.original_names == ['thefile.txt']
assert file_metadata_object.extra == {
'fileId': '20',
'datasetVersion': 'latest',
Expand All @@ -53,3 +56,29 @@ def test_file_metadata(self, file_metadata_object):
'md5': '6b50249f91258397fc5cb7d5a4127e15',
},
}

def test_csv_file_metadata(self, csv_file_metadata_object):
assert csv_file_metadata_object.is_file
assert not csv_file_metadata_object.is_folder
assert csv_file_metadata_object.provider == 'dataverse'
assert csv_file_metadata_object.kind == 'file'
assert csv_file_metadata_object.file_id == '20'
assert csv_file_metadata_object.name == 'thefile.tab'
assert csv_file_metadata_object.path == '/20'
assert csv_file_metadata_object.materialized_path == '/thefile.tab'
assert not csv_file_metadata_object.size
assert not csv_file_metadata_object.modified
assert not csv_file_metadata_object.created_utc
assert csv_file_metadata_object.content_type == 'text/tab-separated-values'
assert csv_file_metadata_object.etag == 'latest::20'
names = csv_file_metadata_object.original_names
assert 'thefile.csv' in names
assert 'thefile.CSV' in names
assert csv_file_metadata_object.extra == {
'fileId': '20',
'datasetVersion': 'latest',
'hasPublishedVersion': False,
'hashes': {
'md5': '6b50249f91258397fc5cb7d5a4127e15',
},
}
27 changes: 27 additions & 0 deletions tests/providers/dataverse/test_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from waterbutler.core.path import WaterButlerPath
from waterbutler.providers.dataverse import settings as dvs
from waterbutler.providers.dataverse import DataverseProvider
from waterbutler.providers.dataverse.exceptions import DataverseIngestionLockError
from waterbutler.providers.dataverse.metadata import DataverseFileMetadata, DataverseRevision

from tests.providers.dataverse.fixtures import (
Expand Down Expand Up @@ -235,6 +236,32 @@ async def test_upload_create(self, provider, file_stream, native_file_metadata,
assert aiohttpretty.has_call(method='GET', uri=latest_url)
assert aiohttpretty.has_call(method='GET', uri=latest_published_url)

@pytest.mark.asyncio
@pytest.mark.aiohttpretty
async def test_upload_ingestion_exception(self, provider, file_stream, native_file_metadata,
empty_native_dataset_metadata, native_dataset_metadata):
path = WaterButlerPath('/thefile.txt')
url = provider.build_url(dvs.EDIT_MEDIA_BASE_URL, 'study', provider.doi)
aiohttpretty.register_uri('POST', url, status=400, body=b'something dataset lock: Ingest')

with pytest.raises(DataverseIngestionLockError):
await provider.upload(file_stream, path)

assert aiohttpretty.has_call(method='POST', uri=url)

@pytest.mark.asyncio
@pytest.mark.aiohttpretty
async def test_upload_random_exception(self, provider, file_stream, native_file_metadata,
empty_native_dataset_metadata, native_dataset_metadata):
path = WaterButlerPath('/thefile.txt')
url = provider.build_url(dvs.EDIT_MEDIA_BASE_URL, 'study', provider.doi)
aiohttpretty.register_uri('POST', url, status=400, body=b'something something error')

with pytest.raises(exceptions.UploadError):
await provider.upload(file_stream, path)

assert aiohttpretty.has_call(method='POST', uri=url)

@pytest.mark.asyncio
@pytest.mark.aiohttpretty
async def test_upload_updates(self, provider,
Expand Down
51 changes: 51 additions & 0 deletions tests/providers/dataverse/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest

from waterbutler.providers.dataverse import utils as dv_utils


@pytest.fixture
def format_dict():
    """Raw-metadata samples for each ingest format, keyed by original extension.

    Each value mimics the keys of a Dataverse ``datafile`` record that
    ``original_ext_from_raw_metadata`` inspects.
    """
    def raw_metadata(original_format, original_label):
        # Ingested files are always served back as tab-separated values.
        return {
            'originalFileFormat': original_format,
            'originalFormatLabel': original_label,
            'contentType': 'text/tab-separated-values',
        }

    return {
        'xlsx': raw_metadata(
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'MS Excel (XLSX)',
        ),
        'RData': raw_metadata('application/x-rlang-transport', 'R Data'),
        'sav': raw_metadata('application/x-spss-sav', 'SPSS SAV'),
        'dta': raw_metadata('application/x-stata', 'Stata Binary'),
        'por': raw_metadata('application/x-spss-por', 'SPSS Portable'),
        'csv': raw_metadata('text/csv', 'Comma Separated Values'),
    }


class TestUtils:
    """Tests for recovering original file extensions from ingest metadata."""

    def test_original_ext_from_raw_metadata(self, format_dict):
        # Iterate key/value pairs directly instead of re-subscripting the
        # dict on every pass (reviewer-requested idiom fix).
        for ext, raw in format_dict.items():
            # Each known format must report its own extension among the
            # possible original extensions.
            assert ext in dv_utils.original_ext_from_raw_metadata(raw)

    def test_original_ext_from_raw_metadata_none_case(self, format_dict):
        for raw in format_dict.values():
            # An unrecognized label means the original extension cannot be
            # determined, so the helper must return None.
            raw['originalFormatLabel'] = 'blarg'
            assert dv_utils.original_ext_from_raw_metadata(raw) is None
15 changes: 15 additions & 0 deletions waterbutler/providers/dataverse/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from http import HTTPStatus

from waterbutler.core.exceptions import UploadError


class DataverseIngestionLockError(UploadError):
    # Raised when a Dataverse upload fails with a 400 whose body indicates the
    # dataset is locked for ingestion of a previous upload (see the provider's
    # check for 'dataset lock: Ingest' in the response body).
    def __init__(self, message, code=HTTPStatus.BAD_REQUEST):
        """``message`` is accepted (and currently unused) because children of
        ``WaterButlerError`` must be instantiable with a single argument.  The
        user-facing text is fixed: the lock is temporary, so the user is told
        to retry.  See :class:`waterbutler.core.exceptions.WaterButlerError`
        for details.

        :param message: the offending response payload; kept for the
            single-argument construction contract, not displayed
        :param code: HTTP status to report, defaults to 400 (Bad Request)
        """
        super().__init__(
            'Some uploads to Dataverse will lock uploading for a time. Please wait'
            ' a few seconds and try again.',
            code=code)
18 changes: 18 additions & 0 deletions waterbutler/providers/dataverse/metadata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from waterbutler.core import metadata
from waterbutler.providers.dataverse import utils as dv_utils


class BaseDataverseMetadata(metadata.BaseMetadata):
Expand Down Expand Up @@ -26,6 +27,23 @@ def file_id(self):
def name(self):
return self.raw.get('name', None) or self.raw.get('filename', None)

@property
def original_names(self):
""" Dataverse 'ingests' some files types. This changes their extension.
This property will look through the metadata to try to determine possible
original names of the file.
"""

extensions = dv_utils.original_ext_from_raw_metadata(self.raw)
if extensions is None:
return [self.name]
else:
names = []
for ext in extensions:
name = self.name[:self.name.rfind('.')]
names.append(name + '.{}'.format(ext))
return names

@property
def path(self):
return self.build_path(self.file_id)
Expand Down
16 changes: 14 additions & 2 deletions waterbutler/providers/dataverse/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from waterbutler.providers.dataverse import settings
from waterbutler.providers.dataverse.metadata import DataverseRevision
from waterbutler.providers.dataverse.metadata import DataverseDatasetMetadata
from waterbutler.providers.dataverse.exceptions import DataverseIngestionLockError


class DataverseProvider(provider.BaseProvider):
Expand Down Expand Up @@ -170,15 +171,26 @@ async def upload(self, stream, path, **kwargs):
headers=dv_headers,
auth=(self.token, ),
data=file_stream,
expects=(201, ),
expects=(201, 400,),
throws=exceptions.UploadError
)

if resp.status == 400:
data = await resp.read()
data = data.decode('utf-8')

if 'dataset lock: Ingest' in data:
raise DataverseIngestionLockError({'response': data})
else:
raise (await exceptions.exception_from_response(resp,
error=exceptions.UploadError))
await resp.release()

# Find appropriate version of file
metadata = await self._get_data('latest')
files = metadata if isinstance(metadata, list) else []
file_metadata = next(file for file in files if file.name == path.name)
file_metadata = next(file for file in files if (file.name == path.name or
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we not just use a list comprehension here?

path.name in file.original_names))

if stream.writers['md5'].hexdigest != file_metadata.extra['hashes']['md5']:
raise exceptions.UploadChecksumMismatchError()
Expand Down
58 changes: 58 additions & 0 deletions waterbutler/providers/dataverse/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Known Dataverse ingest formats.  Each entry maps a canonical original
# extension to the raw-metadata triple Dataverse reports for the ingested
# file, plus every extension-case variant the original upload may have had.
ORIGINAL_FORMATS = {
    'RData': {
        'original_format': 'application/x-rlang-transport',
        'original_label': 'R Data',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['rdata', 'Rdata', 'RData'],
    },
    'sav': {
        'original_format': 'application/x-spss-sav',
        'original_label': 'SPSS SAV',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['sav'],
    },
    'dta': {
        'original_format': 'application/x-stata',
        'original_label': 'Stata Binary',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['dta'],
    },
    'por': {
        'original_format': 'application/x-spss-por',
        'original_label': 'SPSS Portable',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['por'],
    },
    'csv': {
        'original_format': 'text/csv',
        'original_label': 'Comma Separated Values',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['csv', 'CSV'],
    },
    'xlsx': {
        'original_format': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'original_label': 'MS Excel (XLSX)',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['xlsx'],
    },
}


def original_ext_from_raw_metadata(data):
    """Use the raw metadata to figure out possible original extensions.

    Matches the metadata's label, original format, and content type against
    the known ingest formats in ``ORIGINAL_FORMATS``.

    :param dict data: raw ``datafile`` metadata from Dataverse
    :return: list of possible original extensions, or ``None`` when any of
        the three keys is missing/empty or no known format matches
    """
    label = data.get('originalFormatLabel', None)
    file_format = data.get('originalFileFormat', None)
    content_type = data.get('contentType', None)

    # All three fields are required to identify an ingested format.
    if not label or not file_format or not content_type:
        return None

    # Iterate values directly rather than subscripting the dict three times
    # per key (reviewer-requested idiom fix).
    for fmt in ORIGINAL_FORMATS.values():
        if (label == fmt['original_label'] and
                file_format == fmt['original_format'] and
                content_type == fmt['content_type']):
            return fmt['all_extensions']

    return None