
Commit bf91a6e

Merge branch 'dev'
* dev:
  minor changes/package settings update
  accept absolute path in DDF()
  add default value to search path
  add diff subcommand
  add function to compare 2 datasets
2 parents cc82cff + 03a48f3 commit bf91a6e

8 files changed (+212 −11 lines)


ddf_utils/cli.py

Lines changed: 60 additions & 0 deletions
@@ -212,5 +212,65 @@ def merge_translation(path, split_path, lang_path, dtype, overwrite):
     click.echo('Done.')


+# for QA
+@ddf.command()
+@click.argument('dataset1')
+@click.argument('dataset2')
+@click.option('--git', '-g', is_flag=True)
+@click.option('--checkout-path', type=click.Path(), default='./etl/diff')
+@click.option('--diff-only', is_flag=True)
+def diff(dataset1, dataset2, git, checkout_path, diff_only):
+    """give a report on the statistical differences for datapoints between 2 datasets."""
+    import ddf_utils.ddf_reader as dr
+    from ddf_utils.qa import compare_with_func
+    import tabulate
+    from os.path import join
+
+    if git:
+        from subprocess import check_output
+        assert dr.is_dataset('./')
+
+        c1 = check_output(['git', 'rev-parse', dataset1])
+        p1 = c1.strip().decode('utf8')
+
+        c2 = check_output(['git', 'rev-parse', dataset2])
+        p2 = c2.strip().decode('utf8')
+
+        try:
+            os.makedirs(join(checkout_path, p1))
+            logging.info('checkout git rev {} into {}'.format(dataset1, join(checkout_path, p1)))
+            os.system('git --work-tree={} checkout {} -- .'.format(join(checkout_path, p1), p1))
+        except FileExistsError:
+            pass
+
+        try:
+            os.makedirs(join(checkout_path, p2))
+            logging.info('checkout git rev {} into {}'.format(dataset2, join(checkout_path, p2)))
+            os.system('git --work-tree={} checkout {} -- .'.format(join(checkout_path, p2), p2))
+        except FileExistsError:
+            pass
+
+        dr.config.DDF_SEARCH_PATH = checkout_path
+
+        d1 = dr.DDF(p1)
+        d2 = dr.DDF(p2)
+
+    else:
+        d1 = dr.DDF(dataset1)
+        d2 = dr.DDF(dataset2)
+
+    result = compare_with_func(d1, d2)
+    if diff_only:
+        result = result[result.rval != 1]
+
+    cols = result.columns
+
+    # sort it
+    result = result.sort_values(by='rval', ascending=False).set_index('indicator')
+
+    click.echo(tabulate.tabulate(result,
+                                 headers=cols, tablefmt='psql'))
+
+
 if __name__ == '__main__':
     ddf()
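For reference, the new diff subcommand can be exercised through click's test runner. This is only a sketch: the revision names 'v1' and 'v2' and the dataset layout are assumptions, not part of the commit.

from click.testing import CliRunner
from ddf_utils.cli import ddf

runner = CliRunner()
# compare two git revisions of the dataset in the current directory;
# checkouts go under ./etl/diff (the default --checkout-path)
result = runner.invoke(ddf, ['diff', '--git', '--diff-only', 'v1', 'v2'])
print(result.output)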

ddf_utils/config.py

Lines changed: 4 additions & 1 deletion
@@ -5,7 +5,10 @@
 import os
 
 DDF_SEARCH_PATH = os.getenv('DDF_SEARCH_PATH')
-DICT_PATH = None
 
+if not DDF_SEARCH_PATH:
+    DDF_SEARCH_PATH = './'
+
+DICT_PATH = None
 DEBUG_ALL = False
 DEBUG_OUTPUT_PATH = None
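With this change DDF_SEARCH_PATH falls back to the current directory when the environment variable is unset. A minimal sketch of reading and overriding it at runtime (the override path is hypothetical):

import ddf_utils.ddf_reader as dr

# defaults to './' when the DDF_SEARCH_PATH environment variable is not set
print(dr.config.DDF_SEARCH_PATH)

# callers can still point the reader at another dataset directory
dr.config.DDF_SEARCH_PATH = '/data/ddf-datasets'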

ddf_utils/ddf_reader.py

Lines changed: 19 additions & 8 deletions
@@ -12,15 +12,20 @@
 class DDF():
     """DDF reader class
 
-    The reader instance accepts an dataset id on init and search the dataset in path
-    set by the `DDF_SEARCH_PATH` global variable.
+    The reader instance accepts a dataset id or an absolute path on init. If an absolute path
+    is given, it will load the dataset from that path. Otherwise it will search for the dataset
+    id in the path set by the `DDF_SEARCH_PATH` global variable.
     """
     def __init__(self, ddf_id, no_check_valid=False):
-        dataset_path = os.path.join(config.DDF_SEARCH_PATH, ddf_id)
+        if os.path.isabs(ddf_id):
+            self.dataset_path = ddf_id
+            self.ddf_id = os.path.dirname(ddf_id)
+        else:
+            self.dataset_path = os.path.join(config.DDF_SEARCH_PATH, ddf_id)
+            self.ddf_id = ddf_id
         if not no_check_valid:
-            assert is_dataset(dataset_path), "path is not ddf dataset: {}".format(dataset_path)
-        self.dataset_path = dataset_path
-        self.ddf_id = ddf_id
+            assert is_dataset(self.dataset_path), \
+                "path is not ddf dataset: {}".format(self.dataset_path)
         self._datapackage = None
         self._concepts = None
 
@@ -53,6 +58,12 @@ def dtypes(self):
 
         return res
 
+    @property
+    def indicator_dict(self):
+        """return all indicators"""
+        return dict([name, list(item.keys())]
+                    for name, item in self.get_datapoint_files().items())
+
     def get_all_files(self):
         """return a list of all files in this dataset"""
         resources = self.datapackage['resources']

@@ -220,12 +231,12 @@ def get_datapoint_df(self, measure, primaryKey=None):
             keys = list(datapoint_files[measure].keys())[0]
             if primaryKey:
                 if not set(keys) == set(primaryKey):
-                    raise ValueError('key not found for the measure!')
+                    raise KeyError('no such key for the measure: ', primaryKey)
             df = self.get_datapoints(measure, keys)
             return df[measure][keys]
         else:
             if not primaryKey:
-                raise ValueError("please specify a primaryKey for measures with multiple"
+                raise ValueError("please specify a primaryKey for measures with multiple "
                                  "primaryKeys")
             for keys in datapoint_files[measure].keys():
                 if set(keys) == set(primaryKey):
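A short sketch of the two ways the reader can now be constructed, plus the new indicator_dict property; the absolute path below is hypothetical and the dataset id is the one used in the tests:

from ddf_utils import DDF
import ddf_utils.ddf_reader as dr

# search by dataset id under DDF_SEARCH_PATH
dr.config.DDF_SEARCH_PATH = './'
d1 = DDF('ddf--bp--energy')

# or pass an absolute path directly (hypothetical path)
d2 = DDF('/data/ddf--bp--energy')

# indicator_dict maps each indicator to the primary key combinations it is defined on
print(d1.indicator_dict)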

ddf_utils/index.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def get_datapackage(path, use_existing=True, to_disk=False):
         datapackage_new = create_datapackage(path)
     else:
         if use_existing:
-            print("wARNING: no existing datapackage.json")
+            print("WARNING: no existing datapackage.json")
         datapackage_new = create_datapackage(path)
 
     if to_disk:

ddf_utils/qa.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+"""QA functions.
+"""
+
+import sys
+import logging
+import pandas as pd
+import numpy as np
+
+logger = logging.getLogger('QA')
+this = sys.modules[__name__]
+
+
+def _gen_indicator_key_list(d):
+    for k, v in d.items():
+        for i in v:
+            yield (k, i)
+
+
+def compare_with_func(dataset1, dataset2, fns=['rval', 'avg_pct_chg'],
+                      indicators=None, key=None):
+    """compare 2 datasets with functions"""
+
+    indicators1 = [(k, v) for k, v in _gen_indicator_key_list(dataset1.indicator_dict)]
+    indicators2 = [(k, v) for k, v in _gen_indicator_key_list(dataset2.indicator_dict)]
+
+    # check availability for indicators
+    s1 = set(indicators1)
+    s2 = set(indicators2)
+
+    diff12 = s1 - s2
+    diff21 = s2 - s1
+
+    if len(diff12) > 0:
+        msg = ["below indicators are only available in {}".format(dataset1.ddf_id)]
+        for item in diff12:
+            msg.append("- {} by {}".format(item[0], ', '.join(item[1])))
+        msg.append('')
+        logger.warning('\n'.join(msg))
+    if len(diff21) > 0:
+        msg = ["below indicators are only available in {}".format(dataset2.ddf_id)]
+        for item in diff21:
+            msg.append("- {} by {}".format(item[0], ', '.join(item[1])))
+        msg.append('')
+        logger.warning('\n'.join(msg))
+
+    # construct a dataframe, including all indicators in both datasets.
+    result = pd.DataFrame(list(s1.union(s2)), columns=['indicator', 'primary_key'])
+
+    def get_comp_df(indicator, k):
+        '''get dataframes from old and new datasets, and combine them into one dataframe'''
+        # FIXME: support multiple indicators in one file
+        # like the indicators in ddf--sodertorn--stockholm_lan_basomrade
+        try:
+            i1 = dataset1.get_datapoint_df(indicator, k)
+        except KeyError:
+            raise
+        try:
+            i2 = dataset2.get_datapoint_df(indicator, k)
+        except KeyError:
+            raise
+        i1 = i1.rename(columns={indicator: 'old'})
+        i2 = i2.rename(columns={indicator: 'new'})
+        comp = pd.concat([i1, i2], axis=1)
+
+        return comp
+
+    def do_compare(fns, indicator, k):
+        try:
+            comp_df = get_comp_df(indicator, k)
+        except KeyError:
+            return [np.nan] * len(fns)
+
+        return [f(comp_df) if callable(f) else getattr(this, f)(comp_df)
+                for f in fns]
+
+    # only keep indicators we want to compare
+    if indicators:
+        result = result[result.indicator.isin(indicators)]
+    if key:
+        result = result[result.primary_key.isin(key)]
+
+    # append new columns before we do calculation
+    for f in fns:
+        result[f] = np.nan
+
+    result = result.set_index(['indicator', 'primary_key'])
+
+    for i in result.index:
+        result.ix[i, fns] = do_compare(fns, i[0], i[1])
+
+    return result.reset_index()
+
+
+def rval(comp_df):
+    return comp_df.corr().ix['old', 'new']
+
+
+def avg_pct_chg(comp_df):
+    res = (comp_df['new'] - comp_df['old']) / comp_df['old'] * 100
+    return res.replace([np.inf, -np.inf], np.nan).mean()
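compare_with_func accepts either names of functions defined in ddf_utils.qa or arbitrary callables in fns. A sketch with a custom metric (max_abs_diff is hypothetical; the dataset id comes from tests/test_qa.py):

from ddf_utils import DDF
from ddf_utils.qa import compare_with_func


def max_abs_diff(comp_df):
    # comp_df holds the 'old' and 'new' columns built by get_comp_df
    return (comp_df['new'] - comp_df['old']).abs().max()


d1 = DDF('ddf--bp--energy')
d2 = DDF('ddf--bp--energy')

# string entries are looked up in ddf_utils.qa; callables are applied directly
report = compare_with_func(d1, d2, fns=['rval', 'avg_pct_chg', max_abs_diff])
print(report.head())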

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@ sphinx_rtd_theme
 sphinxcontrib-napoleon
 recommonmark
 hypothesis
+tabulate

setup.py

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@
     'cookiecutter',
     'jsonschema',
     'Click',
-    'daff'
+    'daff',
+    'tabulate'
 ]
 
 setup(

tests/test_qa.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+import common
+from ddf_utils import DDF
+import numpy as np
+from numpy.testing import assert_array_equal
+
+
+def test_compare_func():
+    from ddf_utils.qa import compare_with_func
+    d1 = DDF('ddf--bp--energy')
+    d2 = DDF('ddf--cme')
+
+    res1 = compare_with_func(d1, d1)
+    res2 = compare_with_func(d1, d2)
+
+    assert_array_equal(res1.columns,
+                       ['indicator', 'primary_key', 'rval', 'avg_pct_chg'])
+    assert_array_equal(res1.rval.unique(), np.array([1.]))
+
+    assert_array_equal(res2.columns,
+                       ['indicator', 'primary_key', 'rval', 'avg_pct_chg'])
+    assert_array_equal(res2.rval.unique(), np.array([np.nan]))
