Merge pull request #416 from itamarst/410.large-numpy-arrays

itamarst · web-flow · commit 5a9dabc71ec6 · 2019-05-19T19:52:45.000-04:00
Better support for large numpy arrays
diff --git a/docs/source/news.rst b/docs/source/news.rst
@@ -11,6 +11,10 @@ Features:
   when the program runs. Fixes #403.
 * PyPy3 is now officially supported.
 
+Changes:
+
+* If you log a NumPy array whose size > 10000, only a subset will be logged. This is to ensure logging giant arrays by mistake doesn't impact your software's performance. If you want to customize logging of large arrays, see :ref:`large_numpy_arrays`. Fixes #410.
+
 1.8.0
 ^^^^^
 
diff --git a/docs/source/scientific-computing.rst b/docs/source/scientific-computing.rst
@@ -13,6 +13,24 @@ Eliot is an ideal logging library for these cases:
 
 At PyCon 2019 Itamar Turner-Trauring gave talk about logging for scientific computing, in part using Eliot—you can `watch the video <https://pyvideo.org/pycon-us-2019/logging-for-scientific-computing-reproducibility-debugging-optimization.html>`_ or `read a prose version <https://pythonspeed.com/articles/logging-for-scientific-computing/>`_.
 
+.. _large_numpy_arrays:
+
+Logging large arrays
+--------------------
+
+Logging large arrays is a problem: it will take a lot of CPU, and it's no fun discovering that your batch process was slow because you mistakenly logged an array with 30 million integers every time you called a core function.
+
+So how do you deal with logging large arrays?
+
+1. **Log a summary (default behavior):** By default, if you log an array with size > 10,000, Eliot will only log the first 10,000 values, along with the shape.
+2. **Omit the array:** You can also just choose not to log the array at all.
+   With ``log_call`` you can use the ``include_args`` parameter to ensure the array isn't logged (see :ref:`log_call decorator`).
+   With ``start_action`` you can just not pass it in.
+3. **Manual transformation:** If you're using ``start_action`` you can also manually modify the array yourself before passing it in.
+   For example, you could write it to some sort of temporary storage, and then log the path to that file.
+   Or you could summarize it some other way than the default.
+
+
 .. _dask_usage:
 
 Using Dask
diff --git a/eliot/json.py b/eliot/json.py
@@ -22,7 +22,14 @@ def default(self, o):
             if isinstance(o, (numpy.bool, numpy.bool_)):
                 return bool(o)
             if isinstance(o, numpy.ndarray):
-                return o.tolist()
+                if o.size > 10000:
+                    # Too big to want to log as-is, log a summary:
+                    return {
+                        "array_start": o.flat[:10000].tolist(),
+                        "original_shape": o.shape,
+                    }
+                else:
+                    return o.tolist()
         return json.JSONEncoder.default(self, o)
 
 
diff --git a/eliot/tests/test_json.py b/eliot/tests/test_json.py
@@ -6,6 +6,7 @@
 
 from unittest import TestCase, skipUnless, skipIf
 from json import loads, dumps
+from math import isnan
 
 try:
     import numpy as np
@@ -18,6 +19,13 @@
 class EliotJSONEncoderTests(TestCase):
     """Tests for L{EliotJSONEncoder}."""
 
+    def test_nan_inf(self):
+        """NaN, inf and -inf are round-tripped."""
+        l = [float("nan"), float("inf"), float("-inf")]
+        roundtripped = loads(dumps(l, cls=EliotJSONEncoder))
+        self.assertEqual(l[1:], roundtripped[1:])
+        self.assertTrue(isnan(roundtripped[0]))
+
     @skipUnless(np, "NumPy not installed.")
     def test_numpy(self):
         """NumPy objects get serialized to readable JSON."""
@@ -62,3 +70,20 @@ def test_numpy_not_imported(self):
         with self.assertRaises(TypeError):
             dumps([object()], cls=EliotJSONEncoder)
         self.assertEqual(dumps(12, cls=EliotJSONEncoder), "12")
+
+    @skipUnless(np, "NumPy is not installed.")
+    def test_large_numpy_array(self):
+        """
+        Large NumPy arrays are not serialized completely, since this is (A) a
+        performance hit (B) probably a mistake on the user's part.
+        """
+        a1000 = np.array([0] * 10000)
+        self.assertEqual(loads(dumps(a1000, cls=EliotJSONEncoder)), a1000.tolist())
+        a1002 = np.zeros((2, 5001))
+        a1002[0][0] = 12
+        a1002[0][1] = 13
+        a1002[1][1] = 500
+        self.assertEqual(
+            loads(dumps(a1002, cls=EliotJSONEncoder)),
+            {"array_start": a1002.flat[:10000].tolist(), "original_shape": [2, 5001]},
+        )