
Commit bf91a6e

Merge branch 'dev'
* dev:
  minor changes/package settings update
  accept absolute path in DDF()
  add default value to search path
  add diff subcommand
  add function to compare 2 datasets
2 parents cc82cff + 03a48f3 commit bf91a6e

8 files changed (+212 −11 lines)


ddf_utils/cli.py

Lines changed: 60 additions & 0 deletions
@@ -212,5 +212,65 @@ def merge_translation(path, split_path, lang_path, dtype, overwrite):
     click.echo('Done.')


+# for QA
+@ddf.command()
+@click.argument('dataset1')
+@click.argument('dataset2')
+@click.option('--git', '-g', is_flag=True)
+@click.option('--checkout-path', type=click.Path(), default='./etl/diff')
+@click.option('--diff-only', is_flag=True)
+def diff(dataset1, dataset2, git, checkout_path, diff_only):
+    """give a report on the statistical differences for datapoints between 2 datasets."""
+    import ddf_utils.ddf_reader as dr
+    from ddf_utils.qa import compare_with_func
+    import tabulate
+    from os.path import join
+
+    if git:
+        from subprocess import check_output
+        assert dr.is_dataset('./')
+
+        c1 = check_output(['git', 'rev-parse', dataset1])
+        p1 = c1.strip().decode('utf8')
+
+        c2 = check_output(['git', 'rev-parse', dataset2])
+        p2 = c2.strip().decode('utf8')
+
+        try:
+            os.makedirs(join(checkout_path, p1))
+            logging.info('checkout git rev {} into {}'.format(dataset1, join(checkout_path, p1)))
+            os.system('git --work-tree={} checkout {} -- .'.format(join(checkout_path, p1), p1))
+        except FileExistsError:
+            pass
+
+        try:
+            os.makedirs(join(checkout_path, p2))
+            logging.info('checkout git rev {} into {}'.format(dataset2, join(checkout_path, p2)))
+            os.system('git --work-tree={} checkout {} -- .'.format(join(checkout_path, p2), p2))
+        except FileExistsError:
+            pass
+
+        dr.config.DDF_SEARCH_PATH = checkout_path
+
+        d1 = dr.DDF(p1)
+        d2 = dr.DDF(p2)
+
+    else:
+        d1 = dr.DDF(dataset1)
+        d2 = dr.DDF(dataset2)
+
+    result = compare_with_func(d1, d2)
+    if diff_only:
+        result = result[result.rval != 1]
+
+    cols = result.columns
+
+    # sort it
+    result = result.sort_values(by='rval', ascending=False).set_index('indicator')
+
+    click.echo(tabulate.tabulate(result,
+                                 headers=cols, tablefmt='psql'))
+
+
 if __name__ == '__main__':
     ddf()
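For reference, the new diff subcommand can be exercised through click's test runner. This is only a sketch: the revision names 'v1' and 'v2' and the dataset layout are assumptions, not part of the commit.

from click.testing import CliRunner
from ddf_utils.cli import ddf

runner = CliRunner()
# compare two git revisions of the dataset in the current directory;
# checkouts go under ./etl/diff (the default --checkout-path)
result = runner.invoke(ddf, ['diff', '--git', '--diff-only', 'v1', 'v2'])
print(result.output)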

ddf_utils/config.py

Lines changed: 4 additions & 1 deletion
@@ -5,7 +5,10 @@
 import os
 
 DDF_SEARCH_PATH = os.getenv('DDF_SEARCH_PATH')
-DICT_PATH = None
 
+if not DDF_SEARCH_PATH:
+    DDF_SEARCH_PATH = './'
+
+DICT_PATH = None
 DEBUG_ALL = False
 DEBUG_OUTPUT_PATH = None
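With this change DDF_SEARCH_PATH falls back to the current directory when the environment variable is unset. A minimal sketch of reading and overriding it at runtime (the override path is hypothetical):

import ddf_utils.ddf_reader as dr

# defaults to './' when the DDF_SEARCH_PATH environment variable is not set
print(dr.config.DDF_SEARCH_PATH)

# callers can still point the reader at another dataset directory
dr.config.DDF_SEARCH_PATH = '/data/ddf-datasets'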

ddf_utils/ddf_reader.py

Lines changed: 19 additions & 8 deletions
@@ -12,15 +12,20 @@
 class DDF():
     """DDF reader class
 
-    The reader instance accepts an dataset id on init and search the dataset in path
-    set by the `DDF_SEARCH_PATH` global variable.
+    The reader instance accepts a dataset id or an absolute path on init. If an absolute path
+    is given, it will load the dataset from that path. Otherwise it will search for the dataset
+    id in the path set by the `DDF_SEARCH_PATH` global variable.
     """
     def __init__(self, ddf_id, no_check_valid=False):
-        dataset_path = os.path.join(config.DDF_SEARCH_PATH, ddf_id)
+        if os.path.isabs(ddf_id):
+            self.dataset_path = ddf_id
+            self.ddf_id = os.path.dirname(ddf_id)
+        else:
+            self.dataset_path = os.path.join(config.DDF_SEARCH_PATH, ddf_id)
+            self.ddf_id = ddf_id
         if not no_check_valid:
-            assert is_dataset(dataset_path), "path is not ddf dataset: {}".format(dataset_path)
-        self.dataset_path = dataset_path
-        self.ddf_id = ddf_id
+            assert is_dataset(self.dataset_path), \
+                "path is not ddf dataset: {}".format(self.dataset_path)
         self._datapackage = None
         self._concepts = None
 
@@ -53,6 +58,12 @@ def dtypes(self):
 
         return res
 
+    @property
+    def indicator_dict(self):
+        """return all indicators"""
+        return dict([name, list(item.keys())]
+                    for name, item in self.get_datapoint_files().items())
+
     def get_all_files(self):
         """return a list of all files in this dataset"""
         resources = self.datapackage['resources']

@@ -220,12 +231,12 @@ def get_datapoint_df(self, measure, primaryKey=None):
             keys = list(datapoint_files[measure].keys())[0]
             if primaryKey:
                 if not set(keys) == set(primaryKey):
-                    raise ValueError('key not found for the measure!')
+                    raise KeyError('no such key for the measure: ', primaryKey)
             df = self.get_datapoints(measure, keys)
             return df[measure][keys]
         else:
             if not primaryKey:
-                raise ValueError("please specify a primaryKey for measures with multiple"
+                raise ValueError("please specify a primaryKey for measures with multiple "
                                  "primaryKeys")
             for keys in datapoint_files[measure].keys():
                 if set(keys) == set(primaryKey):
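A short sketch of the two ways the reader can now be constructed, plus the new indicator_dict property; the absolute path below is hypothetical and the dataset id is the one used in the tests:

from ddf_utils import DDF
import ddf_utils.ddf_reader as dr

# search by dataset id under DDF_SEARCH_PATH
dr.config.DDF_SEARCH_PATH = './'
d1 = DDF('ddf--bp--energy')

# or pass an absolute path directly (hypothetical path)
d2 = DDF('/data/ddf--bp--energy')

# indicator_dict maps each indicator to the primary key combinations it is defined on
print(d1.indicator_dict)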

ddf_utils/index.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def get_datapackage(path, use_existing=True, to_disk=False):
         datapackage_new = create_datapackage(path)
     else:
         if use_existing:
-            print("wARNING: no existing datapackage.json")
+            print("WARNING: no existing datapackage.json")
         datapackage_new = create_datapackage(path)
 
     if to_disk:

ddf_utils/qa.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+"""QA functions.
+"""
+
+import sys
+import logging
+import pandas as pd
+import numpy as np
+
+logger = logging.getLogger('QA')
+this = sys.modules[__name__]
+
+
+def _gen_indicator_key_list(d):
+    for k, v in d.items():
+        for i in v:
+            yield (k, i)
+
+
+def compare_with_func(dataset1, dataset2, fns=['rval', 'avg_pct_chg'],
+                      indicators=None, key=None):
+    """compare 2 datasets with functions"""
+
+    indicators1 = [(k, v) for k, v in _gen_indicator_key_list(dataset1.indicator_dict)]
+    indicators2 = [(k, v) for k, v in _gen_indicator_key_list(dataset2.indicator_dict)]
+
+    # check availability for indicators
+    s1 = set(indicators1)
+    s2 = set(indicators2)
+
+    diff12 = s1 - s2
+    diff21 = s2 - s1
+
+    if len(diff12) > 0:
+        msg = ["below indicators are only available in {}".format(dataset1.ddf_id)]
+        for item in diff12:
+            msg.append("- {} by {}".format(item[0], ', '.join(item[1])))
+        msg.append('')
+        logger.warning('\n'.join(msg))
+    if len(diff21) > 0:
+        msg = ["below indicators are only available in {}".format(dataset2.ddf_id)]
+        for item in diff21:
+            msg.append("- {} by {}".format(item[0], ', '.join(item[1])))
+        msg.append('')
+        logger.warning('\n'.join(msg))
+
+    # construct a dataframe, including all indicators in both datasets.
+    result = pd.DataFrame(list(s1.union(s2)), columns=['indicator', 'primary_key'])
+
+    def get_comp_df(indicator, k):
+        '''get dataframes from old and new datasets, and combine them into one dataframe'''
+        # FIXME: support multiple indicators in one file
+        # like the indicators in ddf--sodertorn--stockholm_lan_basomrade
+        try:
+            i1 = dataset1.get_datapoint_df(indicator, k)
+        except KeyError:
+            raise
+        try:
+            i2 = dataset2.get_datapoint_df(indicator, k)
+        except KeyError:
+            raise
+        i1 = i1.rename(columns={indicator: 'old'})
+        i2 = i2.rename(columns={indicator: 'new'})
+        comp = pd.concat([i1, i2], axis=1)
+
+        return comp
+
+    def do_compare(fns, indicator, k):
+        try:
+            comp_df = get_comp_df(indicator, k)
+        except KeyError:
+            return [np.nan] * len(fns)
+
+        return [f(comp_df) if callable(f) else getattr(this, f)(comp_df)
+                for f in fns]
+
+    # only keep indicators we want to compare
+    if indicators:
+        result = result[result.indicator.isin(indicators)]
+    if key:
+        result = result[result.primary_key.isin(key)]
+
+    # append new columns before we do calculation
+    for f in fns:
+        result[f] = np.nan
+
+    result = result.set_index(['indicator', 'primary_key'])
+
+    for i in result.index:
+        result.ix[i, fns] = do_compare(fns, i[0], i[1])
+
+    return result.reset_index()
+
+
+def rval(comp_df):
+    return comp_df.corr().ix['old', 'new']
+
+
+def avg_pct_chg(comp_df):
+    res = (comp_df['new'] - comp_df['old']) / comp_df['old'] * 100
+    return res.replace([np.inf, -np.inf], np.nan).mean()
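compare_with_func accepts either names of functions defined in ddf_utils.qa or arbitrary callables in fns. A sketch with a custom metric (max_abs_diff is hypothetical; the dataset id comes from tests/test_qa.py):

from ddf_utils import DDF
from ddf_utils.qa import compare_with_func


def max_abs_diff(comp_df):
    # comp_df holds the 'old' and 'new' columns built by get_comp_df
    return (comp_df['new'] - comp_df['old']).abs().max()


d1 = DDF('ddf--bp--energy')
d2 = DDF('ddf--bp--energy')

# string entries are looked up in ddf_utils.qa; callables are applied directly
report = compare_with_func(d1, d2, fns=['rval', 'avg_pct_chg', max_abs_diff])
print(report.head())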

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@ sphinx_rtd_theme
 sphinxcontrib-napoleon
 recommonmark
 hypothesis
+tabulate

setup.py

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@
     'cookiecutter',
     'jsonschema',
     'Click',
-    'daff'
+    'daff',
+    'tabulate'
 ]
 
 setup(

tests/test_qa.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+import common
+from ddf_utils import DDF
+import numpy as np
+from numpy.testing import assert_array_equal
+
+
+def test_compare_func():
+    from ddf_utils.qa import compare_with_func
+    d1 = DDF('ddf--bp--energy')
+    d2 = DDF('ddf--cme')
+
+    res1 = compare_with_func(d1, d1)
+    res2 = compare_with_func(d1, d2)
+
+    assert_array_equal(res1.columns,
+                       ['indicator', 'primary_key', 'rval', 'avg_pct_chg'])
+    assert_array_equal(res1.rval.unique(), np.array([1.]))
+
+    assert_array_equal(res2.columns,
+                       ['indicator', 'primary_key', 'rval', 'avg_pct_chg'])
+    assert_array_equal(res2.rval.unique(), np.array([np.nan]))
