Downsample on rows (resp. columns) when the table has many rows (resp. columns) (#85)

mwouts · web-flow · commit ec388a71f574 · 2022-07-02T23:14:33.000+02:00
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,6 +1,13 @@
 ITables ChangeLog
 =================
 
+1.1.2 (2022-06-30)
+------------------
+
+**Changed**
+- Tables with many rows are preferentially downsampled on rows, while tables with many columns are preferentially downsampled on columns ([#84](https://github.com/mwouts/itables/issues/84))
+
+
 1.1.1 (2022-06-23)
 ------------------
 
diff --git a/itables/downsample.py b/itables/downsample.py
@@ -1,4 +1,5 @@
 import logging
+import math
 
 import pandas as pd
 
@@ -35,7 +36,35 @@ def downsample(df, max_rows=0, max_columns=0, max_bytes=0):
     return df
 
 
-def _downsample(df, max_rows=0, max_columns=0, max_bytes=0):
+def shrink_towards_target_aspect_ratio(
+    rows, columns, shrink_factor, target_aspect_ratio
+):
+    # current aspect ratio (target_aspect_ratio is passed in by the caller)
+    aspect_ratio = rows / columns
+
+    # Optimization problem:
+    # row_shrink_factor * column_shrink_factor = shrink_factor
+    # row_shrink_factor / column_shrink_factor * aspect_ratio = target_aspect_ratio (or as close to it as possible)
+    # with 0 < row_shrink_factor, column_shrink_factor <= 1
+
+    # row and column natural shrink factors
+    row_shrink_factor = min(1, max(target_aspect_ratio / aspect_ratio, shrink_factor))
+    column_shrink_factor = min(
+        1, max(aspect_ratio / target_aspect_ratio, shrink_factor)
+    )
+
+    # and in case the above is not enough, we shrink in both directions
+    common_shrink_factor = math.sqrt(
+        shrink_factor / (row_shrink_factor * column_shrink_factor)
+    )
+
+    row_shrink_factor *= common_shrink_factor
+    column_shrink_factor *= common_shrink_factor
+
+    return int(rows * row_shrink_factor), int(columns * column_shrink_factor)
+
+
+def _downsample(df, max_rows=0, max_columns=0, max_bytes=0, target_aspect_ratio=None):
     """Implementation of downsample - may be called recursively"""
     if len(df.index) > max_rows > 0:
         second_half = max_rows // 2
@@ -54,22 +83,25 @@ def _downsample(df, max_rows=0, max_columns=0, max_bytes=0):
             df = df.iloc[:, :first_half]
 
     if df.values.nbytes > max_bytes > 0:
-        max_rows = len(df.index)
-        max_columns = len(df.columns)
-
-        # we want to decrease max_rows * max_columns by df.values.nbytes / max_bytes
-        max_product = max_rows * max_columns / (float(df.values.nbytes) / max_bytes)
-
-        while max_product >= 1:
-            max_rows = max(max_rows // 2, 1)
-            if max_rows * max_columns <= max_product:
-                return _downsample(df, max_rows, max_columns, max_bytes)
+        if target_aspect_ratio is None:
+            if max_rows > 0 and max_columns > 0:
+                target_aspect_ratio = max_rows / max_columns
+            else:
+                target_aspect_ratio = 1.0
+
+        max_rows, max_columns = shrink_towards_target_aspect_ratio(
+            len(df.index),
+            len(df.columns),
+            shrink_factor=max_bytes / df.values.nbytes,
+            target_aspect_ratio=target_aspect_ratio,
+        )
 
-            max_columns = max(max_columns // 2, 1)
-            if max_rows * max_columns <= max_product:
-                return _downsample(df, max_rows, max_columns, max_bytes)
+        if max_rows > 0 and max_columns > 0:
+            return _downsample(
+                df, max_rows, max_columns, max_bytes, target_aspect_ratio
+            )
 
-        # max_product < 1.0:
+        # max_bytes is smaller than the average size of one cell
         df = df.iloc[:1, :1]
         df.iloc[0, 0] = "..."
         return df
diff --git a/itables/javascript.py b/itables/javascript.py
@@ -116,7 +116,7 @@ def _formatted_values(df):
         formatted_df[col] = np.array(fmt.format_array(x.values, None))
         if x.dtype.kind == "f":
             try:
-                formatted_df[col] = formatted_df[col].astype(np.float)
+                formatted_df[col] = formatted_df[col].astype(float)
             except ValueError:
                 pass
 
diff --git a/itables/version.py b/itables/version.py
@@ -1,3 +1,3 @@
 """ITables' version number"""
 
-__version__ = "1.1.1"
+__version__ = "1.1.2"
diff --git a/tests/test_downsample.py b/tests/test_downsample.py
@@ -1,38 +1,37 @@
 """Test that the code in all the test notebooks work, including README.md"""
 
-import itertools
-
 import pandas as pd
 import pytest
 
-from itables.downsample import downsample
+from itables.downsample import downsample, shrink_towards_target_aspect_ratio
 
 
-def large_tables(N=1000):
+def large_tables(N=1000, M=1000):
     return [
-        pd.DataFrame(5, columns=range(N), index=range(N)),
-        pd.DataFrame(3.14159, columns=range(N), index=range(N)),
-        pd.DataFrame("abcdefg", columns=range(N), index=range(N)),
+        pd.DataFrame(5, columns=range(M), index=range(N)),
+        pd.DataFrame(3.14159, columns=range(M), index=range(N)),
+        pd.DataFrame("abcdefg", columns=range(M), index=range(N)),
     ]
 
 
-@pytest.mark.parametrize("df,max_rows", itertools.product(large_tables(), [99, 100]))
+@pytest.mark.parametrize("df", large_tables())
+@pytest.mark.parametrize("max_rows", [99, 100])
 def test_max_rows(df, max_rows):
     dn = downsample(df, max_rows=max_rows)
     assert len(dn.index) == max_rows
     pd.testing.assert_index_equal(dn.columns, df.columns)
 
 
-@pytest.mark.parametrize("df,max_columns", itertools.product(large_tables(), [99, 100]))
+@pytest.mark.parametrize("df", large_tables())
+@pytest.mark.parametrize("max_columns", [99, 100])
 def test_max_columns(df, max_columns):
     dn = downsample(df, max_columns=max_columns)
     pd.testing.assert_index_equal(dn.index, df.index)
     assert len(dn.columns) == max_columns
 
 
-@pytest.mark.parametrize(
-    "df,max_bytes", itertools.product(large_tables(), [10, 1e2, 1e3, 1e4, 1e5])
-)
+@pytest.mark.parametrize("df", large_tables())
+@pytest.mark.parametrize("max_bytes", [10, 1e2, 1e3, 1e4, 1e5])
 def test_max_bytes(df, max_bytes):
     dn = downsample(df, max_bytes=max_bytes)
     assert dn.values.nbytes <= max_bytes
@@ -44,3 +43,43 @@ def test_max_one_byte(df, max_bytes=1):
     dn = downsample(df, max_bytes=max_bytes)
     assert len(dn.columns) == len(dn.index) == 1
     assert dn.iloc[0, 0] == "..."
+
+
+def test_shrink_towards_target_aspect_ratio():
+    # Shrink on rows only
+    assert shrink_towards_target_aspect_ratio(100, 10, 0.1, 1.0) == (10, 10)
+    assert shrink_towards_target_aspect_ratio(200, 10, 0.1, 1.0) == (20, 10)
+
+    # Shrink on columns only
+    assert shrink_towards_target_aspect_ratio(10, 100, 0.1, 1.0) == (10, 10)
+    assert shrink_towards_target_aspect_ratio(10, 200, 0.1, 1.0) == (10, 20)
+
+    # Shrink on rows and columns and achieve target aspect ratio
+    assert shrink_towards_target_aspect_ratio(100, 10, 0.1 / 4, 1.0) == (5, 5)
+    assert shrink_towards_target_aspect_ratio(200, 10, 0.1 / 8, 1.0) == (5, 5)
+
+    # Aspect ratio not one
+    assert shrink_towards_target_aspect_ratio(100, 10, 0.1 / 2, 2.0) == (10, 5)
+    assert shrink_towards_target_aspect_ratio(200, 10, 0.1 / 4, 2.0) == (10, 5)
+
+
+@pytest.mark.parametrize("df", large_tables(N=10000, M=100))
+@pytest.mark.parametrize("max_bytes", [1e3, 1e4, 1e5])
+def test_df_with_many_rows_is_downsampled_preferentially_on_rows(df, max_bytes):
+    dn = downsample(df, max_bytes=max_bytes)
+    if max_bytes == 1e5:
+        assert len(dn.index) < len(df.index) and len(dn.columns) == len(df.columns)
+    else:
+        # aspect ratio is close to 1
+        assert 0.5 < len(dn.index) / len(dn.columns) < 2
+
+
+@pytest.mark.parametrize("df", large_tables(N=100, M=10000))
+@pytest.mark.parametrize("max_bytes", [1e3, 1e4, 1e5])
+def test_df_with_many_columns_is_downsampled_preferentially_on_columns(df, max_bytes):
+    dn = downsample(df, max_bytes=max_bytes)
+    if max_bytes == 1e5:
+        assert len(dn.index) == len(df.index) and len(dn.columns) < len(df.columns)
+    else:
+        # aspect ratio is close to 1
+        assert 0.5 < len(dn.index) / len(dn.columns) < 2

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`"""ITables' version number"""`
`2`	`2`
`3`		`-__version__ = "1.1.1"`
	`3`	`+__version__ = "1.1.2"`