Merge pull request #534 from dimitri-yatsenko/master

eywalker · web-flow · commit aa5473449754 · 2018-12-08T15:20:44.000-06:00
Add pandas support and support order_by "KEY" (issues #459, #537, #538, #541)
diff --git a/datajoint/declare.py b/datajoint/declare.py
@@ -272,7 +272,7 @@ def compile_attribute(line, in_key, foreign_key_sql):
     match = {k: v.strip() for k, v in match.items()}
     match['nullable'] = match['default'].lower() == 'null'
     acceptable_datatype_pattern = r'^(time|date|year|enum|(var)?char|float|double|decimal|' \
-                                  r'(tiny|small|medium|big)?int|' \
+                                  r'(tiny|small|medium|big)?int|bool(ean)?|' \
                                   r'(tiny|small|medium|long)?blob|external|attach)'
     if re.match(acceptable_datatype_pattern, match['type']) is None:
         raise DataJointError('DataJoint does not support datatype "{type}"'.format(**match))
diff --git a/datajoint/errors.py b/datajoint/errors.py
@@ -14,6 +14,7 @@
     'lost connection': 2013,
 }
 
+
 def is_connection_error(e):
     """
     Checks if error e pertains to a connection issue
@@ -22,7 +23,6 @@ def is_connection_error(e):
         (isinstance(e, err.OperationalError) and e.args[0] in operation_error_codes.values())
 
 
-
 class DataJointError(Exception):
     """
     Base class for errors specific to DataJoint internal operation.
diff --git a/datajoint/expression.py b/datajoint/expression.py
@@ -6,6 +6,7 @@
 import re
 import datetime
 import decimal
+import pandas
 from .settings import config
 from .errors import DataJointError
 from .fetch import Fetch, Fetch1
@@ -117,7 +118,7 @@ def _make_condition(self, arg):
         """
         Translate the input arg into the equivalent SQL condition (a string)
         :param arg: any valid restriction object.
-        :return: an SQL condition string.  It may also be a boolean that is intended to be treated as a string.
+        :return: an SQL condition string or a boolean value.
         """
         def prep_value(v):
             return str(v) if isinstance(v, (datetime.date, datetime.datetime, datetime.time, decimal.Decimal)) else v
@@ -176,6 +177,10 @@ def prep_value(v):
                     not_="not " if negate else "",
                     subquery=arg.make_sql(common_attributes)))
 
+        # restrict by pandas.DataFrames
+        if isinstance(arg, pandas.DataFrame):
+            arg = arg.to_records()   # convert to np.recarray
+
         # if iterable (but not a string, a QueryExpression, or an AndList), treat as an OrList
         try:
             or_list = [self._make_condition(q) for q in arg]
@@ -289,8 +294,7 @@ def restrict(self, restriction):
         rel.restrict(restriction)  is equivalent to  rel = rel & restriction  or  rel &= restriction
         rel.restrict(Not(restriction))  is equivalent to  rel = rel - restriction  or  rel -= restriction
         The primary key of the result is unaffected.
-        Successive restrictions are combined using the logical AND.
-        The AndList class is provided to play the role of successive restrictions.
+        Successive restrictions are combined as logical AND:   r & a & b  is equivalent to r & AndList((a, b))
         Any QueryExpression, collection, or sequence other than an AndList are treated as OrLists
         (logical disjunction of conditions)
         Inverse restriction is accomplished by either using the subtraction operator or the Not class.
@@ -342,6 +346,26 @@ def fetch1(self):
     def fetch(self):
         return Fetch(self)
 
+    def head(self, limit=25, **fetch_kwargs):
+        """
+        shortcut to fetch the first few entries from query expression.
+        Equivalent to fetch(order_by="KEY", limit=25)
+        :param limit:  number of entries
+        :param fetch_kwargs: kwargs for fetch
+        :return: query result
+        """
+        return self.fetch(order_by="KEY", limit=limit, **fetch_kwargs)
+
+    def tail(self, limit=25, **fetch_kwargs):
+        """
+        shortcut to fetch the last few entries from query expression.
+        Equivalent to fetch(order_by="KEY DESC", limit=25)[::-1]
+        :param limit:  number of entries
+        :param fetch_kwargs: kwargs for fetch
+        :return: query result
+        """
+        return self.fetch(order_by="KEY DESC", limit=limit, **fetch_kwargs)[::-1]
+
     def attributes_in_restriction(self):
         """
         :return: list of attributes that are probably used in the restriction.
@@ -365,7 +389,7 @@ def preview(self, limit=None, width=None):
             limit = config['display.limit']
         if width is None:
             width = config['display.width']
-        tuples = rel.fetch(limit=limit+1)
+        tuples = rel.fetch(limit=limit+1, format="array")
         has_more = len(tuples) > limit
         tuples = tuples[:limit]
         columns = heading.names
@@ -378,13 +402,13 @@ def preview(self, limit=None, width=None):
             '\n'.join(' '.join(templates[f] % (tup[f] if f in tup.dtype.names else '=BLOB=')
                 for f in columns) for tup in tuples) +
             ('\n   ...\n' if has_more else '\n') +
-            (' (%d tuples)\n' % len(rel) if config['display.show_tuple_count'] else ''))
+            (' (Total: %d)\n' % len(rel) if config['display.show_tuple_count'] else ''))
 
     def _repr_html_(self):
         heading = self.heading
         rel = self.proj(*heading.non_blobs)
         info = heading.table_info
-        tuples = rel.fetch(limit=config['display.limit']+1)
+        tuples = rel.fetch(limit=config['display.limit']+1, format='array')
         has_more = len(tuples) > config['display.limit']
         tuples = tuples[0:config['display.limit']]
 
@@ -464,7 +488,7 @@ def _repr_html_(self):
                 ['\n'.join(['<td>%s</td>' % (tup[name] if name in tup.dtype.names else '=BLOB=')
                     for name in heading.names])
                  for tup in tuples]),
-            count=('<p>%d tuples</p>' % len(rel)) if config['display.show_tuple_count'] else '')
+            count=('<p>Total: %d</p>' % len(rel)) if config['display.show_tuple_count'] else '')
 
     def make_sql(self, select_fields=None):
         return 'SELECT {fields} FROM {from_}{where}'.format(
diff --git a/datajoint/fetch.py b/datajoint/fetch.py
@@ -1,9 +1,12 @@
 from collections import OrderedDict
 from functools import partial
+import warnings
+import pandas
+import re
 import numpy as np
 from .blob import unpack
 from .errors import DataJointError
-import warnings
+from .settings import config
 
 
 class key:
@@ -24,6 +27,16 @@ def to_dicts(recarray):
         yield dict(zip(recarray.dtype.names, rec.tolist()))
 
 
+def _flatten_attribute_list(primary_key, attr):
+    for a in attr:
+        if re.match(r'^\s*KEY\s*(ASC\s*)?$', a):
+            yield from primary_key
+        elif re.match(r'^\s*KEY\s*DESC\s*$', a):
+            yield from (q + ' DESC' for q in primary_key)
+        else:
+            yield a
+
+
 class Fetch:
     """
     A fetch object that handles retrieving elements from the table expression.
@@ -33,36 +46,59 @@ class Fetch:
     def __init__(self, expression):
         self._expression = expression
 
-    def __call__(self, *attrs, offset=None, limit=None, order_by=None, as_dict=False, squeeze=False):
+    def __call__(self, *attrs, offset=None, limit=None, order_by=None, format=None, as_dict=False, squeeze=False):
         """
         Fetches the expression results from the database into an np.array or list of dictionaries and unpacks blob attributes.
 
         :param attrs: zero or more attributes to fetch. If not provided, the call will return
         all attributes of this relation. If provided, returns tuples with an entry for each attribute.
         :param offset: the number of tuples to skip in the returned result
         :param limit: the maximum number of tuples to return
-        :param order_by: the list of attributes to order the results. No ordering should be assumed if order_by=None.
+        :param order_by: a single attribute or the list of attributes to order the results.
+                No ordering should be assumed if order_by=None.
+                To reverse the order, add DESC to the attribute name or names: e.g. ("age DESC", "frequency")
+                To order by primary key, use "KEY" or "KEY DESC"
+        :param format: output format; may only be specified when as_dict=False and attrs is empty
+                None: default from config['fetch_format'] ('array' unless configured otherwise)
+                "array": output a structured numpy array
+                "frame": output a pandas.DataFrame indexed by the primary key
         :param as_dict: returns a list of dictionaries instead of a record array
         :param squeeze:  if True, remove extra dimensions from arrays
         :return: the contents of the relation in the form of a structured numpy.array or a dict list
         """
 
-        # if 'order_by' passed in a string, make into list
-        if isinstance(order_by, str):
-            order_by = [order_by]
+        if order_by is not None:
+            # if 'order_by' passed in a string, make into list
+            if isinstance(order_by, str):
+                order_by = [order_by]
+            # expand "KEY" or "KEY DESC"
+            order_by = list(_flatten_attribute_list(self._expression.primary_key, order_by))
 
         # if attrs are specified then as_dict cannot be true
         if attrs and as_dict:
             raise DataJointError('Cannot specify attributes to return when as_dict=True. '
-                                 'Use proj() to select attributes or set as_dict=False')
+                                 'Use '
+                                 'proj() to select attributes or set as_dict=False')
+        # format should not be specified with attrs or as_dict=True
+        if format is not None and (as_dict or attrs):
+            raise DataJointError('Cannot specify output format when as_dict=True or '
+                                 'when attributes are selected to be fetched separately.')
+
+        if format not in {None, "array", "frame"}:
+            raise DataJointError('Fetch output format must be in {{"array", "frame"}} but "{}" was given'.format(format))
+
+        if not (attrs or as_dict) and format is None:
+            format = config['fetch_format']  # default to array
+            if format not in {"array", "frame"}:
+                raise DataJointError('Invalid entry "{}" in datajoint.config["fetch_format"]: use "array" or "frame"'.format(format))
 
         if limit is None and offset is not None:
             warnings.warn('Offset set, but no limit. Setting limit to a large number. '
                           'Consider setting a limit explicitly.')
             limit = 2 * len(self._expression)
 
         if not attrs:
-            # fetch all attributes
+            # fetch all attributes as a structured numpy array or pandas.DataFrame
             cur = self._expression.cursor(as_dict=as_dict, limit=limit, offset=offset, order_by=order_by)
             heading = self._expression.heading
             if as_dict:
@@ -78,6 +114,8 @@ def __call__(self, *attrs, offset=None, limit=None, order_by=None, as_dict=False
                         ret[name] = list(map(external_table.get, ret[name]))
                     elif heading[name].is_blob:
                         ret[name] = list(map(partial(unpack, squeeze=squeeze), ret[name]))
+                if format == "frame":
+                    ret = pandas.DataFrame(ret).set_index(heading.primary_key)
         else:  # if list of attributes provided
             attributes = [a for a in attrs if not is_key(a)]
             result = self._expression.proj(*attributes).fetch(
diff --git a/datajoint/jobs.py b/datajoint/jobs.py
@@ -1,4 +1,3 @@
-from decimal import Decimal
 from .hash import key_hash
 import os
 import platform
diff --git a/datajoint/settings.py b/datajoint/settings.py
@@ -37,6 +37,7 @@
     'connection.charset': '',   # pymysql uses '' as default
     'loglevel': 'INFO',
     'safemode': True,
+    'fetch_format': 'array',
     'display.limit': 12,
     'display.width': 14,
     'display.show_tuple_count': True
diff --git a/datajoint/table.py b/datajoint/table.py
@@ -3,6 +3,7 @@
 import inspect
 import platform
 import numpy as np
+import pandas
 import pymysql
 import logging
 import warnings
@@ -146,13 +147,12 @@ def insert1(self, row, **kwargs):
         """
         self.insert((row,), **kwargs)
 
-    def insert(self, rows, replace=False, skip_duplicates=False, ignore_extra_fields=False, ignore_errors=False,
-               allow_direct_insert=None):
+    def insert(self, rows, replace=False, skip_duplicates=False, ignore_extra_fields=False, allow_direct_insert=None):
         """
         Insert a collection of rows.
 
-        :param rows: An iterable where an element is a numpy record, a dict-like object, or an ordered sequence.
-            rows may also be another relation with the same heading.
+        :param rows: An iterable where an element is a numpy record, a dict-like object, or a sequence.
+            rows may also be a pandas.DataFrame or a query expression with the same heading as table self.
         :param replace: If True, replaces the existing tuple.
         :param skip_duplicates: If True, silently skip duplicate inserts.
         :param ignore_extra_fields: If False, fields that are not in the heading raise error.
@@ -164,9 +164,8 @@ def insert(self, rows, replace=False, skip_duplicates=False, ignore_extra_fields
         >>>     dict(subject_id=8, species="mouse", date_of_birth="2014-09-02")])
         """
 
-        if ignore_errors:
-            warnings.warn('Use of `ignore_errors` in `insert` and `insert1` is deprecated. Use try...except... '
-                          'to explicitly handle any errors', stacklevel=2)
+        if isinstance(rows, pandas.DataFrame):
+            rows = rows.to_records()
 
         # prohibit direct inserts into auto-populated tables
         if not (allow_direct_insert or getattr(self, '_allow_insert', True)):  # _allow_insert is only present in AutoPopulate
diff --git a/datajoint/user_tables.py b/datajoint/user_tables.py
@@ -12,7 +12,8 @@
 
 # attributes that trigger instantiation of user classes
 supported_class_attrs = {
-    'key_source', 'describe', 'populate', 'progress', 'primary_key', 'proj', 'aggr', 'heading', 'fetch', 'fetch1',
+    'key_source', 'describe', 'heading', 'populate', 'progress', 'primary_key', 'proj', 'aggr',
+    'fetch', 'fetch1','head', 'tail',
     'insert', 'insert1', 'drop', 'drop_quick', 'delete', 'delete_quick'}
 
 
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,7 @@ numpy
 pymysql>=0.7.2
 pyparsing
 ipython
+pandas
 tqdm
 networkx
 pydot
diff --git a/tests/schema_simple.py b/tests/schema_simple.py
@@ -58,7 +58,7 @@ class C(dj.Part):
         value :float  # normally distributed variables according to parameters in B
         """
 
-    def _make_tuples(self, key):
+    def make(self, key):
         random.seed(str(key))
         sub = B.C()
         for i in range(4):
@@ -113,7 +113,7 @@ class F(dj.Part):
         -> B.C
         """
 
-    def _make_tuples(self, key):
+    def make(self, key):
         random.seed(str(key))
         self.insert1(dict(key, **random.choice(list(L().fetch('KEY')))))
         sub = E.F()
diff --git a/tests/test_fetch.py b/tests/test_fetch.py
@@ -1,8 +1,9 @@
-from nose.tools import assert_true, raises, assert_equal, assert_dict_equal
+from nose.tools import assert_true, raises, assert_equal, assert_dict_equal, assert_list_equal
 from operator import itemgetter
 import itertools
 import numpy as np
 import decimal
+import pandas
 import warnings
 from . import schema
 import datajoint as dj
@@ -75,6 +76,23 @@ def test_order_by_limit(self):
         for c, l in list(zip(cur, languages))[:4]:
             assert_true(np.all([cc == ll for cc, ll in zip(c, l)]), 'Sorting order is different')
 
+    @staticmethod
+    def test_head_tail():
+        query = schema.User * schema.Language
+        n = 5
+        frame = query.head(n, format='frame')
+        array = query.head(n, format='array')
+        assert_equal(array.size, n)
+        assert_equal(len(frame), n)
+        assert_list_equal(query.primary_key, frame.index.names)
+
+        n = 4
+        frame = query.tail(n, format='frame')
+        array = query.tail(n, format='array')
+        assert_equal(array.size, n)
+        assert_equal(len(frame), n)
+        assert_list_equal(query.primary_key, frame.index.names)
+
     def test_limit_offset(self):
         """Test the limit and offset kwargs together"""
         languages = schema.Language.contents
diff --git a/tests/test_relation.py b/tests/test_relation.py
diff --git a/tests/test_relational_operand.py b/tests/test_relational_operand.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-from decimal import Decimal`
`2`	`1`	`from .hash import key_hash`
`3`	`2`	`import os`
`4`	`3`	`import platform`