Skip to content

Commit 9dddc9d

Browse files
authored
Merge pull request #1557 from Scifabric/improve-csv-exporter
Improve csv exporter
2 parents b9f766a + df7f2e8 commit 9dddc9d

File tree

3 files changed

+34 additions, -105 deletions (lines changed)

pybossa/exporter/csv_export.py

Lines changed: 10 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -28,96 +28,28 @@
2828
from pybossa.util import UnicodeWriter
2929
from werkzeug.datastructures import FileStorage
3030
from werkzeug.utils import secure_filename
31+
from flatten_json import flatten
32+
import pandas as pd
3133

3234

3335
class CsvExporter(Exporter):
3436

35-
def _format_csv_row(self, row, ty):
36-
tmp = row.keys()
37-
task_keys = []
38-
for k in tmp:
39-
k = "%s__%s" % (ty, k)
40-
task_keys.append(k)
41-
if (type(row['info']) == dict):
42-
task_info_keys = []
43-
tmp = row['info'].keys()
44-
for k in tmp:
45-
k = "%sinfo__%s" % (ty, k)
46-
task_info_keys.append(k)
47-
else:
48-
task_info_keys = []
49-
50-
keys = sorted(task_keys + task_info_keys)
51-
values = []
52-
_prefix = "%sinfo" % ty
53-
for k in keys:
54-
prefix, k = k.split("__")
55-
if prefix == _prefix:
56-
if row['info'].get(k) is not None:
57-
values.append(row['info'][k])
58-
else:
59-
values.append(None)
60-
else:
61-
if row.get(k) is not None:
62-
values.append(row[k])
63-
else:
64-
values.append(None)
65-
66-
return values
67-
68-
def _handle_row(self, writer, t, ty):
69-
normal_ty = filter(lambda char: char.isalpha(), ty)
70-
writer.writerow(self._format_csv_row(t.dictize(), ty=normal_ty))
71-
72-
def _get_csv(self, out, writer, table, id):
73-
for tr in getattr(task_repo, 'filter_%ss_by' % table)(project_id=id,
74-
yielded=True):
75-
self._handle_row(writer, tr, table)
76-
out.seek(0)
77-
yield out.read()
78-
79-
def _format_headers(self, t, ty):
80-
tmp = t.dictize().keys()
81-
task_keys = []
82-
for k in tmp:
83-
k = "%s__%s" % (ty, k)
84-
task_keys.append(k)
85-
if (type(t.info) == dict):
86-
task_info_keys = []
87-
tmp = t.info.keys()
88-
for k in tmp:
89-
k = "%sinfo__%s" % (ty, k)
90-
task_info_keys.append(k)
91-
else:
92-
task_info_keys = []
93-
keys = task_keys + task_info_keys
94-
return sorted(keys)
95-
9637
def _respond_csv(self, ty, id):
9738
out = tempfile.TemporaryFile()
9839
writer = UnicodeWriter(out)
99-
t = getattr(task_repo, 'get_%s_by' % ty)(project_id=id)
100-
if t is not None:
101-
headers = self._format_headers(t, ty)
102-
writer.writerow(headers)
103-
104-
return self._get_csv(out, writer, ty, id)
105-
else:
106-
def empty_csv(out):
107-
yield out.read()
108-
return empty_csv(out)
109-
40+
data = getattr(task_repo, 'filter_%ss_by' % ty)(project_id=id)
41+
flat_data = [flatten(datum.dictize()) for datum in data]
42+
return pd.DataFrame(flat_data)
43+
11044
def _make_zip(self, project, ty):
11145
name = self._project_name_latin_encoded(project)
112-
csv_task_generator = self._respond_csv(ty, project.id)
113-
if csv_task_generator is not None:
114-
# TODO: use temp file from csv generation directly
46+
dataframe = self._respond_csv(ty, project.id)
47+
if dataframe is not None:
11548
datafile = tempfile.NamedTemporaryFile()
11649
try:
117-
for line in csv_task_generator:
118-
datafile.write(str(line))
50+
dataframe.to_csv(datafile, index=False,
51+
encoding='utf-8')
11952
datafile.flush()
120-
csv_task_generator.close() # delete temp csv file
12153
zipped_datafile = tempfile.NamedTemporaryFile()
12254
try:
12355
_zip = self._zip_factory(zipped_datafile.name)

setup.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -58,11 +58,13 @@
5858
"webassets>=0.12.1, <0.12.2",
5959
"readability-lxml>=0.6.2, <1.0",
6060
"pybossa-onesignal",
61+
"pandas>=0.20.2, <0.20.3",
62+
"flatten-json>=0.1.5, <0.1.6"
6163
]
6264

6365
setup(
6466
name = 'pybossa',
65-
version = '2.4.2',
67+
version = '2.4.3',
6668
packages = find_packages(),
6769
install_requires = requirements,
6870
# only needed when installing directly from setup.py (PyPi, eggs?) and pointing to e.g. a git repo.

test/test_web.py

Lines changed: 21 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@
4343
from unidecode import unidecode
4444
from werkzeug.utils import secure_filename
4545
from nose.tools import assert_raises
46+
from flatten_json import flatten
4647

4748

4849
class TestWeb(web.Helper):
@@ -4267,29 +4268,30 @@ def test_export_task_csv(self):
42674268
assert len(exported_tasks) == len(project.tasks), err_msg
42684269
for t in project.tasks:
42694270
err_msg = "All the task column names should be included"
4270-
for tk in t.dictize().keys():
4271-
expected_key = "task__%s" % tk
4271+
for tk in flatten(t.dictize()).keys():
4272+
expected_key = "%s" % tk
42724273
assert expected_key in keys, err_msg
42734274
err_msg = "All the task.info column names should be included"
42744275
for tk in t.info.keys():
4275-
expected_key = "taskinfo__%s" % tk
4276+
expected_key = "info_%s" % tk
42764277
assert expected_key in keys, err_msg
42774278

42784279
for et in exported_tasks:
4279-
task_id = et[keys.index('task__id')]
4280+
task_id = et[keys.index('id')]
42804281
task = db.session.query(Task).get(task_id)
4282+
task_dict_flat = flatten(task.dictize())
42814283
task_dict = task.dictize()
4282-
for k in task_dict:
4283-
slug = 'task__%s' % k
4284-
err_msg = "%s != %s" % (task_dict[k], et[keys.index(slug)])
4285-
if k != 'info':
4286-
assert unicode(task_dict[k]) == et[keys.index(slug)], err_msg
4284+
for k in task_dict_flat.keys():
4285+
slug = '%s' % k
4286+
err_msg = "%s != %s" % (task_dict_flat[k], et[keys.index(slug)])
4287+
if task_dict_flat[k] is not None:
4288+
assert unicode(task_dict_flat[k]) == et[keys.index(slug)], err_msg
42874289
else:
4288-
assert json.dumps(task_dict[k]) == et[keys.index(slug)], err_msg
4290+
assert u'' == et[keys.index(slug)], err_msg
42894291
for k in task_dict['info'].keys():
4290-
slug = 'taskinfo__%s' % k
4292+
slug = 'info_%s' % k
42914293
err_msg = "%s != %s" % (task_dict['info'][k], et[keys.index(slug)])
4292-
assert unicode(task_dict['info'][k]) == et[keys.index(slug)], err_msg
4294+
assert unicode(task_dict_flat[slug]) == et[keys.index(slug)], err_msg
42934295
# Tasks are exported as an attached file
42944296
content_disposition = 'attachment; filename=%d_project1_task_csv.zip' % project.id
42954297
assert res.headers.get('Content-Disposition') == content_disposition, res.headers
@@ -4363,28 +4365,21 @@ def test_53_export_task_runs_csv(self):
43634365
assert len(exported_task_runs) == len(project.task_runs), err_msg
43644366

43654367
for t in project.tasks[0].task_runs:
4366-
for tk in t.dictize().keys():
4367-
expected_key = "task_run__%s" % tk
4368-
assert expected_key in keys, expected_key
4369-
for tk in t.info.keys():
4370-
expected_key = "task_runinfo__%s" % tk
4368+
for tk in flatten(t.dictize()).keys():
4369+
expected_key = "%s" % tk
43714370
assert expected_key in keys, expected_key
43724371

43734372
for et in exported_task_runs:
4374-
task_run_id = et[keys.index('task_run__id')]
4373+
task_run_id = et[keys.index('id')]
43754374
task_run = db.session.query(TaskRun).get(task_run_id)
4376-
task_run_dict = task_run.dictize()
4375+
task_run_dict = flatten(task_run.dictize())
43774376
for k in task_run_dict:
4378-
slug = 'task_run__%s' % k
4377+
slug = '%s' % k
43794378
err_msg = "%s != %s" % (task_run_dict[k], et[keys.index(slug)])
4380-
if k != 'info':
4379+
if task_run_dict[k] is not None:
43814380
assert unicode(task_run_dict[k]) == et[keys.index(slug)], err_msg
43824381
else:
4383-
assert json.dumps(task_run_dict[k]) == et[keys.index(slug)], err_msg
4384-
for k in task_run_dict['info'].keys():
4385-
slug = 'task_runinfo__%s' % k
4386-
err_msg = "%s != %s" % (task_run_dict['info'][k], et[keys.index(slug)])
4387-
assert unicode(task_run_dict['info'][k]) == et[keys.index(slug)], err_msg
4382+
assert u'' == et[keys.index(slug)], err_msg
43884383
# Task runs are exported as an attached file
43894384
content_disposition = 'attachment; filename=%d_project1_task_run_csv.zip' % project.id
43904385
assert res.headers.get('Content-Disposition') == content_disposition, res.headers

0 commit comments

Comments
 (0)