"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license.
"""
import logging
import os
import uuid
from collections import Counter
import pandas as pd
try:
import ujson as json
except: # noqa: E722
import json
from django.conf import settings
from django.db import models
from django.utils.functional import cached_property
from rest_framework.exceptions import ValidationError
logger = logging.getLogger(__name__)


def upload_name_generator(instance, filename):
    project = str(instance.project_id)
    project_dir = os.path.join(settings.MEDIA_ROOT, settings.UPLOAD_DIR, project)
    os.makedirs(project_dir, exist_ok=True)
    path = settings.UPLOAD_DIR + '/' + project + '/' + str(uuid.uuid4())[0:8] + '-' + filename
    return path
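
# Illustrative sketch (not part of the original module, values are assumed): with a
# hypothetical settings.UPLOAD_DIR = 'upload' and instance.project_id = 7, this generator
# would return a relative path of roughly the form
#     'upload/7/1a2b3c4d-tasks.csv'
# where the 8-character uuid prefix keeps repeated uploads of the same filename distinct.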


class FileUpload(models.Model):
    user = models.ForeignKey('users.User', related_name='file_uploads', on_delete=models.CASCADE)
    project = models.ForeignKey('projects.Project', related_name='file_uploads', on_delete=models.CASCADE)
    file = models.FileField(upload_to=upload_name_generator)

    def has_permission(self, user):
        user.project = self.project  # link for activity log
        return self.project.has_permission(user)

    @cached_property
    def filepath(self):
        return self.file.name

    @property
    def url(self):
        if settings.FORCE_SCRIPT_NAME and not (settings.HOSTNAME and settings.CLOUD_FILE_STORAGE_ENABLED):
            return settings.FORCE_SCRIPT_NAME + '/' + self.file.url.lstrip('/')
        else:
            return self.file.url

    @property
    def format(self):
        file_format = None
        try:
            file_format = os.path.splitext(self.filepath)[-1]
        except:  # noqa: E722
            pass
        finally:
            logger.debug('Get file format ' + str(file_format))
        return file_format

    @property
    def content(self):
        # cache file body
        if hasattr(self, '_file_body'):
            body = getattr(self, '_file_body')
        else:
            body = self.file.read().decode('utf-8')
            setattr(self, '_file_body', body)
        return body
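
    # Descriptive note (not part of the original module): for a file stored as
    # 'upload/7/1a2b3c4d-tasks.csv', `format` evaluates to '.csv' (the extension returned
    # by os.path.splitext), and `content` reads the file once, decodes it as UTF-8 and
    # memoizes the result on the instance as `_file_body` for subsequent accesses.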

    def read_tasks_list_from_csv(self, sep=','):
        logger.debug('Read tasks list from CSV file {}'.format(self.filepath))
        tasks = pd.read_csv(self.file.open(), sep=sep).fillna('').to_dict('records')
        tasks = [{'data': task} for task in tasks]
        return tasks

    def read_tasks_list_from_tsv(self):
        return self.read_tasks_list_from_csv('\t')

    def read_tasks_list_from_txt(self):
        logger.debug('Read tasks list from text file {}'.format(self.filepath))
        lines = self.content.splitlines()
        tasks = [{'data': {settings.DATA_UNDEFINED_NAME: line}} for line in lines]
        return tasks
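
    # Illustrative sketch (hypothetical column names, not part of the original module):
    # a CSV with header "text,label" and one row "hello,pos" is converted to
    #     [{'data': {'text': 'hello', 'label': 'pos'}}]
    # while a plain-text file yields one task per line keyed by settings.DATA_UNDEFINED_NAME,
    # e.g. [{'data': {'$undefined$': 'first line'}}, ...] if that setting is '$undefined$'.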

    def read_tasks_list_from_json(self):
        logger.debug('Read tasks list from JSON file {}'.format(self.filepath))
        raw_data = self.content
        # Python 3.5 compatibility fix https://docs.python.org/3/whatsnew/3.6.html#json
        try:
            tasks = json.loads(raw_data)
        except TypeError:
            tasks = json.loads(raw_data.decode('utf8'))

        if isinstance(tasks, dict):
            tasks = [tasks]
        tasks_formatted = []
        for i, task in enumerate(tasks):
            if not task.get('data'):
                task = {'data': task}
            if not isinstance(task['data'], dict):
                raise ValidationError('Task item should be dict')
            tasks_formatted.append(task)
        return tasks_formatted
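
    # Illustrative sketch (hypothetical payloads, not part of the original module):
    # this method accepts a single object, a list of plain objects, or a list of objects
    # already wrapped in a 'data' key; all three normalize to the same shape, e.g.
    #     {"text": "hi"}              -> [{'data': {'text': 'hi'}}]
    #     [{"text": "hi"}]            -> [{'data': {'text': 'hi'}}]
    #     [{"data": {"text": "hi"}}]  -> [{'data': {'text': 'hi'}}]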

    def read_task_from_hypertext_body(self):
        logger.debug('Read 1 task from hypertext file {}'.format(self.filepath))
        body = self.content
        tasks = [{'data': {settings.DATA_UNDEFINED_NAME: body}}]
        return tasks

    def read_task_from_uploaded_file(self):
        logger.debug('Read 1 task from uploaded file {}'.format(self.filepath))
        if settings.CLOUD_FILE_STORAGE_ENABLED:
            tasks = [{'data': {settings.DATA_UNDEFINED_NAME: self.file.storage.url(self.file.name)}}]
        else:
            tasks = [{'data': {settings.DATA_UNDEFINED_NAME: self.url}}]
        return tasks
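
    # Illustrative sketch (assumed settings and paths, not part of the original module):
    # for a binary asset such as 'photo.jpg', read_task_from_uploaded_file produces one
    # task whose undefined data key points at the file location, for example
    #     [{'data': {'$undefined$': '/data/upload/7/1a2b3c4d-photo.jpg'}}]
    # when cloud storage is disabled, or a storage-generated URL otherwise.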

    @property
    def format_could_be_tasks_list(self):
        return self.format in ('.csv', '.tsv', '.txt')

    def read_tasks(self, file_as_tasks_list=True):
        file_format = self.format
        try:
            # file as tasks list
            if file_format == '.csv' and file_as_tasks_list:
                tasks = self.read_tasks_list_from_csv()
            elif file_format == '.tsv' and file_as_tasks_list:
                tasks = self.read_tasks_list_from_tsv()
            elif file_format == '.txt' and file_as_tasks_list:
                tasks = self.read_tasks_list_from_txt()
            elif file_format == '.json':
                tasks = self.read_tasks_list_from_json()

            # otherwise - only one object tag should be presented in label config
            elif not self.project.one_object_in_label_config:
                raise ValidationError(
                    'Your label config has more than one data key and direct file upload supports only '
                    'one data key. To import data with multiple data keys, use a JSON or CSV file.'
                )

            # file as a single asset
            elif file_format in ('.html', '.htm', '.xml'):
                tasks = self.read_task_from_hypertext_body()
            else:
                tasks = self.read_task_from_uploaded_file()

        except Exception as exc:
            raise ValidationError('Failed to parse input file ' + self.filepath + ': ' + str(exc))
        return tasks
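
    # Dispatch summary (descriptive comment, not part of the original module):
    #   .csv / .tsv / .txt  -> parsed as a list of tasks when file_as_tasks_list is True
    #   .json               -> always parsed as a list of tasks
    #   .html / .htm / .xml -> the whole document becomes the body of a single task
    #   anything else       -> treated as a single binary asset referenced by URL
    # Any parsing error is re-raised as a DRF ValidationError so the import API can
    # surface it to the client.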

    @classmethod
    def load_tasks_from_uploaded_files(
        cls, project, file_upload_ids=None, formats=None, files_as_tasks_list=True, trim_size=None
    ):
        tasks = []
        fileformats = []
        common_data_fields = set()

        # scan all files
        file_uploads = FileUpload.objects.filter(project=project)
        if file_upload_ids:
            file_uploads = file_uploads.filter(id__in=file_upload_ids)
        for file_upload in file_uploads:
            file_format = file_upload.format
            if formats and file_format not in formats:
                continue
            new_tasks = file_upload.read_tasks(files_as_tasks_list)
            for task in new_tasks:
                task['file_upload_id'] = file_upload.id

            new_data_fields = set(iter(new_tasks[0]['data'].keys())) if len(new_tasks) > 0 else set()
            if not common_data_fields:
                common_data_fields = new_data_fields
            elif not common_data_fields.intersection(new_data_fields):
                raise ValidationError(
                    _old_vs_new_data_keys_inconsistency_message(
                        new_data_fields, common_data_fields, file_upload.file.name
                    )
                )
            else:
                common_data_fields &= new_data_fields

            tasks += new_tasks
            fileformats.append(file_format)

            if trim_size is not None:
                if len(tasks) > trim_size:
                    break

        return tasks, dict(Counter(fileformats)), common_data_fields
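
    # Minimal usage sketch (hypothetical caller and values, not part of the original module):
    #     tasks, formats, common_fields = FileUpload.load_tasks_from_uploaded_files(
    #         project,                      # a projects.Project instance
    #         file_upload_ids=[10, 11],     # limit the scan to specific uploads
    #         formats=['.csv', '.json'],    # skip everything else
    #     )
    #     # tasks         -> list of {'data': ..., 'file_upload_id': ...} dicts
    #     # formats       -> e.g. {'.csv': 1, '.json': 1}
    #     # common_fields -> set of data keys shared by every scanned file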


def _old_vs_new_data_keys_inconsistency_message(new_data_keys, old_data_keys, current_file):
    new_data_keys_list = ','.join(new_data_keys)
    old_data_keys_list = ','.join(old_data_keys)
    common_prefix = "You're trying to import inconsistent data:\n"
    if new_data_keys_list == old_data_keys_list:
        return ''
    elif new_data_keys_list == settings.DATA_UNDEFINED_NAME:
        return (
            common_prefix + 'uploading a single file {0} '
            'clashes with data key(s) found from other files:\n"{1}"'.format(current_file, old_data_keys_list)
        )
    elif old_data_keys_list == settings.DATA_UNDEFINED_NAME:
        return (
            common_prefix + 'uploading tabular data from {0} with data key(s) {1}, '
            'clashes with other raw binary files (images, audios, etc.)'.format(current_file, new_data_keys_list)
        )
    else:
        return (
            common_prefix + 'uploading tabular data from "{0}" with data key(s) "{1}", '
            'clashes with data key(s) found from other files:\n"{2}"'.format(
                current_file, new_data_keys_list, old_data_keys_list
            )
        )
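
# Illustrative sketch (hypothetical keys and filename, not part of the original module):
# _old_vs_new_data_keys_inconsistency_message({'image'}, {'text'}, 'cats.csv') would return
# a message of the form:
#     You're trying to import inconsistent data:
#     uploading tabular data from "cats.csv" with data key(s) "image", clashes with data key(s) found from other files:
#     "text"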