
Commit 352f3a9: Merge branch 'dev' (2 parents: 04334cc + 473c30b)


76 files changed: +4550 additions, -12785 deletions

CHANGELOG.md (18 additions, 6 deletions)

```diff
@@ -1,3 +1,21 @@
+## version 0.2.9 2017-01-03
+
+- added tests for all procedures
+- removed outdated procedures (align, accumulate)
+- more API docs, and all docs are now available on Read the Docs
+- new `--ddf_dir` option for `ddf run_recipe` #45
+- added options for `serve` procedures and the `serving` section. You should now
+  provide a list of dictionaries in the `serving` section, instead of a list of
+  ids as in the previous version
+- improvements and bug fixes
+
+## version 0.2.8 2016-12-13
+
+- new procedures: `window` (#25)
+- updated `groupby` procedure (#25)
+- updated `translate_column` procedure to include the function in `align` (#3)
+- minor improvements
+
 ## version 0.2.7 2016-12-06
 
 - use DAG to model the recipe. changes are:
@@ -9,9 +27,3 @@
 - added support for serve section
 - renamed procedure `add_concepts` to `extract_concepts` #40
 
-## version 0.28. 2016-12-13
-
-- new procedures: `window` (#25)
-- updated `groupby` procedure (#25)
-- updated `translate_column` procedure to include the function in `align` (#3)
-- minor improvements
```

README.md (16 additions, 49 deletions)

````diff
@@ -1,5 +1,18 @@
 # ddf_utils
 
+ddf_utils is a Python library and command line tool for people working with
+[Tabular Data Package][1] in the [DDF model][2]. It provides various functions for [ETL tasks][3],
+including string formatting, data transformation, generating datapackage.json,
+reading data from DDF datasets, running [recipes][4] (a declarative
+DSL designed to manipulate datasets to generate new datasets), and other
+functions we find useful in daily work at [Gapminder][5].
+
+[1]: http://specs.frictionlessdata.io/tabular-data-package
+[2]: https://github.com/open-numbers/wiki/wiki/Introduction-to-DDF
+[3]: https://en.wikipedia.org/wiki/Extract,_transform,_load
+[4]: https://ddf-utils.readthedocs.io/en/latest/recipe.html
+[5]: https://www.gapminder.org/
+
 ## Installation
 
 We are using python3 only features such as type signature in this repo.
@@ -16,53 +29,7 @@ try updating setuptools to latest version:
 
 and then reinstall ddf_utils should fix the problem.
 
-## Commandline helper
-
-we provide a commandline utility `ddf` for etl tasks. For now supported commands are:
-
-```
-$ ddf --help
-Usage: ddf [OPTIONS] COMMAND [ARGS]...
-
-Options:
-  --help  Show this message and exit.
-
-Commands:
-  cleanup             clean up ddf files or translation files
-  create_datapackage  create datapackage.json
-  merge_translation   merge all translation files from crowdin
-  new                 create a new ddf project
-  run_recipe          generate new ddf dataset with recipe
-  split_translation   split ddf files for crowdin translation
-```
-
-for each subcommand, you can run `ddf <subcommand> --help` to get help
-for that subcommand
-
-### Recipe
-
-document for recipe: [link](https://github.com/semio/ddf--gapminder--systema_globalis/blob/feature/autogenerated/etl/recipes/README.md)
-
-to run a recipe, simply run the following:
-
-```
-$ ddf run_recipe -i path_to_recipe -o outdir
-```
-
-to run a recipe without saving the result to disk, run
-
-```
-$ ddf run_recipe -i path_to_recipe -d
-```
-
-note that you should set `ddf_dir`/`recipes_dir`/`dictionary_dir` correctly so that
-the chef can find the right files. if there are includes in the recipe, only the top
-level `ddf_dir` will be used (so the ddf_dir setting in sub-recipes will be ignored).
-
-### useful API for etl tasks
-
-You can check the api documents at [readthedocs][1] or clone this repo and read them in
-docs/_html. Note that the chef module document is not complete on readthedocs due to a
-bug in their system.
+## Usage
 
-[1]: https://ddf-utils.readthedocs.io/en/latest/py-modindex.html
+Check the [documents](https://ddf-utils.readthedocs.io/en/latest/intro.html) for
+how to use ddf_utils.
````

ddf_utils/chef/cook.py (67 additions, 50 deletions)

```diff
@@ -8,9 +8,9 @@
 
 from . ingredient import *
 from . dag import DAG, IngredientNode, ProcedureNode
+from . helpers import read_opt
 from .. import config
 from . procedure import *
-from .. str import format_float_digits
 
 import logging
 
@@ -26,9 +26,30 @@ def _loadfile(f):
 
 
 # functions for reading/running recipe
-def build_recipe(recipe_file, to_disk=False):
-    """build a complete recipe file if there are includes in
-    recipe file, if no includes found then return the file as is.
+def build_recipe(recipe_file, to_disk=False, **kwargs):
+    """build a complete recipe object.
+
+    This function will check each part of the recipe, converting strings (the
+    ingredient ids, dictionary file names) into actual objects.
+
+    If there are includes in the recipe file, this function will run recursively.
+    If no includes are found, it returns the parsed object as is.
+
+    Parameters
+    ----------
+    recipe_file : `str`
+        path to recipe file
+
+    Keyword Args
+    ------------
+    to_disk : bool
+        if true, save the parsed result to a yaml file in the working dir
+
+    Other Parameters
+    ----------------
+    ddf_dir : `str`
+        path to search for DDF datasets; overwrites the config in the recipe
+
     """
     recipe = _loadfile(recipe_file)
 
@@ -64,6 +85,12 @@ def build_recipe(recipe_file, to_disk=False):
 
                 recipe['cooking'][p][i]['options']['dictionary'] = _loadfile(path)
 
+    # set the ddf search path if the option is provided
+    if 'ddf_dir' in kwargs.keys():
+        if 'config' not in recipe.keys():
+            recipe['config'] = AttrDict()
+        recipe.config.ddf_dir = kwargs['ddf_dir']
+
     if 'include' not in recipe.keys():
         return recipe
     else:  # append sub-recipe entities into main recipe
```
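The `ddf_dir` override added in the hunk above can be illustrated with a small standalone sketch. `apply_ddf_dir` is a hypothetical helper written for this example (the real code lives inline in `build_recipe` and uses an `AttrDict` rather than a plain dict):

```python
def apply_ddf_dir(recipe, **kwargs):
    # mimic build_recipe's behaviour: a ddf_dir keyword argument
    # takes precedence over the recipe's own config section
    if 'ddf_dir' in kwargs:
        recipe.setdefault('config', {})
        recipe['config']['ddf_dir'] = kwargs['ddf_dir']
    return recipe

recipe = {'ingredients': [], 'config': {'ddf_dir': '/old/path'}}
apply_ddf_dir(recipe, ddf_dir='/datasets/ddf')
print(recipe['config']['ddf_dir'])  # /datasets/ddf
```

This mirrors the new `--ddf_dir` option of `ddf run_recipe` mentioned in the changelog: the CLI value flows into `build_recipe` as a keyword argument and overwrites the recipe's config.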
```diff
@@ -156,6 +183,10 @@ def check_dataset_availability(recipe):
 
 
 def build_dag(recipe):
+    """build a DAG model for the recipe.
+
+    For more detail on the DAG model, see :py:mod:`ddf_utils.chef.dag`.
+    """
 
     def add_dependency(dag, upstream_id, downstream):
         if not dag.has_task(upstream_id):
@@ -203,9 +234,11 @@ def add_dependency(dag, upstream_id, downstream):
         if not dag.has_task(i):
             raise ValueError('Ingredient not found: ' + i)
     if 'serving' in recipe.keys():
+        if len(serving) > 0:
+            raise ValueError('can not have serve procedure and serving section at same time!')
         for i in recipe['serving']:
-            if not dag.has_task(i):
-                raise ValueError('Ingredient not found: ' + i)
+            if not dag.has_task(i['id']):
+                raise ValueError('Ingredient not found: ' + i['id'])
     # display the tree
     # dag.tree_view()
     return dag
```
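The stricter check above reflects the new `serving` format from the changelog: a list of dictionaries carrying an `id` (and optional `options`) rather than bare id strings. A minimal sketch of that validation, using only plain dicts (`validate_serving` is a hypothetical helper, not part of ddf_utils):

```python
def validate_serving(serving, known_ids):
    # each entry must be a dict carrying the ingredient id;
    # a bare id string is the old, no longer supported format
    for entry in serving:
        if not isinstance(entry, dict) or 'id' not in entry:
            raise ValueError('serving entries must be dicts with an id key')
        if entry['id'] not in known_ids:
            raise ValueError('Ingredient not found: ' + entry['id'])

serving = [{'id': 'population-datapoints', 'options': {'digits': 5}}]
validate_serving(serving, known_ids={'population-datapoints'})  # passes
```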
```diff
@@ -215,7 +248,25 @@ def run_recipe(recipe):
     """run the recipe.
 
     returns a dictionary. keys are `concepts`, `entities` and `datapoints`,
-    and values are ingredients returned by the procedures
+    and values are the ingredients defined in the `serve` procedures or `serving` section.
+    for example:
+
+    .. code-block:: python
+
+        {
+            "concepts": [{"ingredient": DataFrame1, "options": None}],
+            "datapoints": [
+                {
+                    "ingredient": DataFrame2,
+                    "options": {"digits": 5}
+                },
+                {
+                    "ingredient": DataFrame3,
+                    "options": {"digits": 1}
+                },
+            ]
+        }
+
     """
     try:
         config.DDF_SEARCH_PATH = recipe['config']['ddf_dir']
```
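The return shape documented in the new docstring can be reproduced in a few lines. This sketch uses placeholder strings where the real code carries DataFrame-backed ingredients, and `collect_dishes` is a hypothetical helper invented for illustration:

```python
from collections import defaultdict

def collect_dishes(served):
    # group (collection, ingredient, options) triples into the
    # {collection: [{'ingredient': ..., 'options': ...}, ...]} shape
    # that run_recipe returns
    dishes = defaultdict(list)
    for collection, ingredient, options in served:
        dishes[collection].append({'ingredient': ingredient, 'options': options})
    return dict(dishes)

dishes = collect_dishes([
    ('concepts', 'DataFrame1', None),
    ('datapoints', 'DataFrame2', {'digits': 5}),
    ('datapoints', 'DataFrame3', {'digits': 1}),
])
print(dishes['datapoints'][1]['options'])  # {'digits': 1}
```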
```diff
@@ -242,62 +293,28 @@ def run_recipe(recipe):
         func = p['procedure']
         if func == 'serve':
             ingredients = [dag.get_task(x).evaluate() for x in p['ingredients']]
-            [dishes[k].append(i) for i in ingredients]
+            opts = read_opt(p, 'options', default=dict())
+            [dishes[k].append({'ingredient': i, 'options': opts}) for i in ingredients]
             continue
         out = dag.get_task(p['result']).evaluate()
         # if there is no serving procedure/section, use the last output Ingredient object as the final result.
         if len(dishes[k]) == 0 and 'serving' not in recipe.keys():
             logger.warning('serving last procedure output for {}: {}'.format(k, out.ingred_id))
-            dishes[k].append(out)
+            dishes[k].append({'ingredient': out, 'options': dict()})
     # update dishes when there is a serving section
     if 'serving' in recipe.keys():
         for i in recipe['serving']:
-            ing = dag.get_task(i).evaluate()
+            opts = read_opt(i, 'options', default=dict())
+            ing = dag.get_task(i['id']).evaluate()
             if ing.dtype in dishes.keys():
-                dishes[ing.dtype].append(ing)
+                dishes[ing.dtype].append({'ingredient': ing, 'options': opts})
             else:
-                dishes[ing.dtype] = [ing]
+                dishes[ing.dtype] = [{'ingredient': ing, 'options': opts}]
     return dishes
 
 
 def dish_to_csv(dishes, outpath):
+    """save the recipe output to disk"""
     for t, ds in dishes.items():
         for dish in ds:
-            all_data = dish.get_data()
-            if isinstance(all_data, dict):
-                for k, df in all_data.items():
-                    # change boolean into string
-                    for i, v in df.dtypes.iteritems():
-                        if v == 'bool':
-                            df[i] = df[i].map(lambda x: str(x).upper())
-                    if t == 'datapoints':
-                        by = dish.key_to_list()
-                        path = os.path.join(outpath, 'ddf--{}--{}--by--{}.csv'.format(t, k, '--'.join(by)))
-                    elif t == 'concepts':
-                        path = os.path.join(outpath, 'ddf--{}.csv'.format(t))
-                    elif t == 'entities':
-                        domain = dish.key[0]
-                        if k == domain:
-                            path = os.path.join(outpath, 'ddf--{}--{}.csv'.format(t, k))
-                        else:
-                            path = os.path.join(outpath, 'ddf--{}--{}--{}.csv'.format(t, domain, k))
-                    else:
-                        raise ValueError('Not a correct collection: ' + t)
-
-                    if t == 'datapoints':
-                        df = df.set_index(by)
-                        if not np.issubdtype(df[k].dtype, np.number):
-                            try:
-                                df[k] = df[k].astype(float)
-                                # TODO: make floating precision an option
-                                df[k] = df[k].map(lambda x: format_float_digits(x, 5))
-                            except ValueError:
-                                logging.warning("data not numeric: " + k)
-                        else:
-                            df[k] = df[k].map(lambda x: format_float_digits(x, 5))
-                        df[[k]].to_csv(path, encoding='utf8')
-                    else:
-                        df.to_csv(path, index=False, encoding='utf8')
-            else:
-                path = os.path.join(outpath, 'ddf--{}.csv'.format(t))
-                all_data.to_csv(path, index=False, encoding='utf8')
+            dish['ingredient'].serve(outpath, **dish['options'])
```
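After this refactor, `dish_to_csv` delegates all file naming and number formatting to each ingredient's `serve()` method, passing the per-dish options through as keyword arguments. The toy class below shows the contract this relies on; it is a sketch under stated assumptions, not the library's actual `Ingredient` class:

```python
import os
import tempfile

class FakeIngredient:
    # stand-in for chef's Ingredient: serve() writes the data under
    # outpath and honours per-dish options such as digits
    def __init__(self, ingred_id, rows):
        self.ingred_id = ingred_id
        self.rows = rows

    def serve(self, outpath, digits=5, **options):
        path = os.path.join(outpath, 'ddf--{}.csv'.format(self.ingred_id))
        with open(path, 'w') as f:
            for key, value in self.rows:
                f.write('{},{}\n'.format(key, round(value, digits)))
        return path

outdir = tempfile.mkdtemp()
dishes = {'datapoints': [{'ingredient': FakeIngredient('pop', [('swe', 9.87654)]),
                          'options': {'digits': 2}}]}
# the new dish_to_csv body, applied to the toy class
for t, ds in dishes.items():
    for dish in ds:
        dish['ingredient'].serve(outdir, **dish['options'])
```

The design win is visible in the diff itself: roughly forty lines of type-specific path and precision logic collapse into a single dispatch, and new serving options only need changes in one place.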

ddf_utils/chef/dag.py (26 additions, 1 deletion)

```diff
@@ -1,12 +1,18 @@
 # -*- coding: utf-8 -*-
 
-"""the DAG module of chef"""
+"""the DAG model of chef
+
+The DAG consists of 2 types of nodes: IngredientNode and ProcedureNode.
+Each node has an `evaluate()` function, which returns an ingredient
+on eval.
+"""
 
 import pandas as pd
 from . import procedure as pc
 
 
 class BaseNode():
+    """The base node which IngredientNode and ProcedureNode inherit from"""
     def __init__(self, node_id, dag):
         self.node_id = node_id
         self.dag = dag
@@ -61,6 +67,10 @@ def detect_downstream_cycle(self, task=None):
 
 
 class IngredientNode(BaseNode):
+    """Node for storing dataset ingredients.
+
+    The evaluate() function of this type of node returns the ingredient as is.
+    """
     def __init__(self, node_id, ingredient, dag):
         super(IngredientNode, self).__init__(node_id, dag)
         self.ingredient = ingredient
@@ -70,6 +80,11 @@ def evaluate(self):
 
 
 class ProcedureNode(BaseNode):
+    """The node for storing procedure results.
+
+    The evaluate() function runs a procedure according to `self.procedure`, using
+    other nodes' data. Other nodes will be evaluated if needed.
+    """
     def __init__(self, node_id, procedure, dag):
         super(ProcedureNode, self).__init__(node_id, dag)
         self.procedure = procedure
@@ -110,6 +125,14 @@ def evaluate(self):
 
 
 class DAG():
+    """The DAG model.
+
+    .. note::
+
+        "task" in these functions means the same as "node". We will switch to
+        one name later.
+
+    """
     def __init__(self, task_dict=None):
         if not task_dict:
             self._task_dict = dict()
@@ -118,6 +141,7 @@ def __init__(self, task_dict=None):
 
     @property
     def roots(self):
+        """return the roots of the DAG"""
        return [t for t in self.tasks if not t.downstream_list]
 
     @property
@@ -133,6 +157,7 @@ def task_dict(self, task):
         raise AttributeError('can not set task_dict manually')
 
     def add_task(self, task):
+        """add a node to DAG"""
         if task.node_id in self.task_dict.keys():
             # only overwrite case is when procedure in ProcedureNode is None.
             if (isinstance(task, ProcedureNode) and
```
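The `evaluate()` contract described in these docstrings can be sketched without the library: an IngredientNode returns its data directly, while a ProcedureNode evaluates its upstream nodes first and runs its procedure on the results. This is a deliberately simplified model (the real classes also carry node ids, a shared DAG registry, and cycle detection):

```python
class IngredientNode:
    # leaf node: evaluate() returns the stored ingredient as is
    def __init__(self, ingredient):
        self.ingredient = ingredient

    def evaluate(self):
        return self.ingredient


class ProcedureNode:
    # inner node: evaluate() runs a procedure over the evaluated
    # upstream nodes and returns the resulting ingredient
    def __init__(self, procedure, upstream):
        self.procedure = procedure
        self.upstream = upstream

    def evaluate(self):
        return self.procedure(*[node.evaluate() for node in self.upstream])


# a two-node chain: double the values of a raw ingredient
raw = IngredientNode([1, 2, 3])
doubled = ProcedureNode(lambda xs: [x * 2 for x in xs], [raw])
print(doubled.evaluate())  # [2, 4, 6]
```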
