Skip to content

Commit ec388a7

Browse files
authored
Downsample on rows (resp. columns) when the table has many rows (resp. columns) (#85)
1 parent 47336dd commit ec388a7

File tree

5 files changed

+107
-29
lines changed

5 files changed

+107
-29
lines changed

docs/changelog.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
ITables ChangeLog
22
=================
33

4+
1.1.2 (2022-06-30)
5+
------------------
6+
7+
**Changed**
8+
- Tables with many rows are preferentially downsampled on rows, while tables with many columns are preferentially downsampled on columns ([#84](https://github.com/mwouts/itables/issues/84))
9+
10+
411
1.1.1 (2022-06-23)
512
------------------
613

itables/downsample.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import math
23

34
import pandas as pd
45

@@ -35,7 +36,35 @@ def downsample(df, max_rows=0, max_columns=0, max_bytes=0):
3536
return df
3637

3738

38-
def _downsample(df, max_rows=0, max_columns=0, max_bytes=0):
39+
def shrink_towards_target_aspect_ratio(
    rows, columns, shrink_factor, target_aspect_ratio
):
    """Return the (rows, columns) shape of a table shrunk by ``shrink_factor``.

    The number of cells ``rows * columns`` is multiplied by ``shrink_factor``.
    The dimension that is too long with respect to ``target_aspect_ratio`` is
    shrunk first; if that alone is not enough, both dimensions are shrunk
    further by the same factor.

    Parameters
    ----------
    rows, columns:
        Current number of rows and columns of the table.
    shrink_factor:
        Desired ratio between the new and the current number of cells.
        A value of 1 or more means that no shrinking is required.
    target_aspect_ratio:
        Desired ``rows / columns`` ratio; must be positive.

    Returns
    -------
    A tuple ``(rows, columns)``. When shrinking occurs, both values are
    truncated to integers, so either can be 0 when ``shrink_factor`` is
    very small.
    """
    # Nothing to shrink: the table is empty (the aspect ratio would be
    # undefined), or the requested factor would not reduce the table.
    if rows <= 0 or columns <= 0 or shrink_factor >= 1:
        return rows, columns

    # current aspect ratio, to be compared with the target aspect ratio
    aspect_ratio = rows / columns

    # Optimization problem:
    # row_shrink_factor * column_shrink_factor = shrink_factor
    # row_shrink_factor / column_shrink_factor * aspect_ratio
    #     is equal to, or as close as possible to, target_aspect_ratio
    # with 0 < row_shrink_factor, column_shrink_factor <= 1

    # row and column natural shrink factors: only the dimension that is too
    # long gets a factor below 1, and never below the overall shrink factor
    row_shrink_factor = min(1, max(target_aspect_ratio / aspect_ratio, shrink_factor))
    column_shrink_factor = min(
        1, max(aspect_ratio / target_aspect_ratio, shrink_factor)
    )

    # and in case the above is not enough, we shrink in both directions
    common_shrink_factor = math.sqrt(
        shrink_factor / (row_shrink_factor * column_shrink_factor)
    )

    row_shrink_factor *= common_shrink_factor
    column_shrink_factor *= common_shrink_factor

    return int(rows * row_shrink_factor), int(columns * column_shrink_factor)
65+
66+
67+
def _downsample(df, max_rows=0, max_columns=0, max_bytes=0, target_aspect_ratio=None):
3968
"""Implementation of downsample - may be called recursively"""
4069
if len(df.index) > max_rows > 0:
4170
second_half = max_rows // 2
@@ -54,22 +83,25 @@ def _downsample(df, max_rows=0, max_columns=0, max_bytes=0):
5483
df = df.iloc[:, :first_half]
5584

5685
if df.values.nbytes > max_bytes > 0:
57-
max_rows = len(df.index)
58-
max_columns = len(df.columns)
59-
60-
# we want to decrease max_rows * max_columns by df.values.nbytes / max_bytes
61-
max_product = max_rows * max_columns / (float(df.values.nbytes) / max_bytes)
62-
63-
while max_product >= 1:
64-
max_rows = max(max_rows // 2, 1)
65-
if max_rows * max_columns <= max_product:
66-
return _downsample(df, max_rows, max_columns, max_bytes)
86+
if target_aspect_ratio is None:
87+
if max_rows > 0 and max_columns > 0:
88+
target_aspect_ratio = max_rows / max_columns
89+
else:
90+
target_aspect_ratio = 1.0
91+
92+
max_rows, max_columns = shrink_towards_target_aspect_ratio(
93+
len(df.index),
94+
len(df.columns),
95+
shrink_factor=max_bytes / df.values.nbytes,
96+
target_aspect_ratio=target_aspect_ratio,
97+
)
6798

68-
max_columns = max(max_columns // 2, 1)
69-
if max_rows * max_columns <= max_product:
70-
return _downsample(df, max_rows, max_columns, max_bytes)
99+
if max_rows > 0 and max_columns > 0:
100+
return _downsample(
101+
df, max_rows, max_columns, max_bytes, target_aspect_ratio
102+
)
71103

72-
# max_product < 1.0:
104+
# max_bytes is smaller than the average size of one cell
73105
df = df.iloc[:1, :1]
74106
df.iloc[0, 0] = "..."
75107
return df

itables/javascript.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def _formatted_values(df):
116116
formatted_df[col] = np.array(fmt.format_array(x.values, None))
117117
if x.dtype.kind == "f":
118118
try:
119-
formatted_df[col] = formatted_df[col].astype(np.float)
119+
formatted_df[col] = formatted_df[col].astype(float)
120120
except ValueError:
121121
pass
122122

itables/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""ITables' version number"""
22

3-
__version__ = "1.1.1"
3+
__version__ = "1.1.2"

tests/test_downsample.py

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,37 @@
11
"""Test the downsampling of large tables"""
22

3-
import itertools
4-
53
import pandas as pd
64
import pytest
75

8-
from itables.downsample import downsample
6+
from itables.downsample import downsample, shrink_towards_target_aspect_ratio
97

108

11-
def large_tables(N=1000, M=1000):
    """Return three constant tables with N rows and M columns.

    The tables are filled with an integer, a float and a string respectively.
    """
    return [
        pd.DataFrame(value, columns=range(M), index=range(N))
        for value in (5, 3.14159, "abcdefg")
    ]
1715

1816

19-
@pytest.mark.parametrize("df,max_rows", itertools.product(large_tables(), [99, 100]))
17+
@pytest.mark.parametrize("df", large_tables())
@pytest.mark.parametrize("max_rows", [99, 100])
def test_max_rows(df, max_rows):
    """With only max_rows set, the row count is capped and the columns are kept."""
    downsampled = downsample(df, max_rows=max_rows)
    row_count = len(downsampled.index)
    assert row_count == max_rows
    pd.testing.assert_index_equal(downsampled.columns, df.columns)
2423

2524

26-
@pytest.mark.parametrize("df,max_columns", itertools.product(large_tables(), [99, 100]))
25+
@pytest.mark.parametrize("df", large_tables())
@pytest.mark.parametrize("max_columns", [99, 100])
def test_max_columns(df, max_columns):
    """With only max_columns set, the column count is capped and the rows are kept."""
    downsampled = downsample(df, max_columns=max_columns)
    pd.testing.assert_index_equal(downsampled.index, df.index)
    column_count = len(downsampled.columns)
    assert column_count == max_columns
3131

3232

33-
@pytest.mark.parametrize(
34-
"df,max_bytes", itertools.product(large_tables(), [10, 1e2, 1e3, 1e4, 1e5])
35-
)
33+
@pytest.mark.parametrize("df", large_tables())
34+
@pytest.mark.parametrize("max_bytes", [10, 1e2, 1e3, 1e4, 1e5])
3635
def test_max_bytes(df, max_bytes):
3736
dn = downsample(df, max_bytes=max_bytes)
3837
assert dn.values.nbytes <= max_bytes
@@ -44,3 +43,43 @@ def test_max_one_byte(df, max_bytes=1):
4443
dn = downsample(df, max_bytes=max_bytes)
4544
assert len(dn.columns) == len(dn.index) == 1
4645
assert dn.iloc[0, 0] == "..."
46+
47+
48+
def test_shrink_towards_target_aspect_ratio():
    """Check the shape returned by shrink_towards_target_aspect_ratio on known cases."""
    # Each case is ((rows, columns, shrink_factor, target_aspect_ratio), expected shape)
    cases = [
        # Shrink on rows only
        ((100, 10, 0.1, 1.0), (10, 10)),
        ((200, 10, 0.1, 1.0), (20, 10)),
        # Shrink on columns only
        ((10, 100, 0.1, 1.0), (10, 10)),
        ((10, 200, 0.1, 1.0), (10, 20)),
        # Shrink on rows and columns and achieve target aspect ratio
        ((100, 10, 0.1 / 4, 1.0), (5, 5)),
        ((200, 10, 0.1 / 8, 1.0), (5, 5)),
        # Aspect ratio not one
        ((100, 10, 0.1 / 2, 2.0), (10, 5)),
        ((200, 10, 0.1 / 4, 2.0), (10, 5)),
    ]
    for arguments, expected_shape in cases:
        assert shrink_towards_target_aspect_ratio(*arguments) == expected_shape
64+
65+
66+
@pytest.mark.parametrize("df", large_tables(N=10000, M=100))
@pytest.mark.parametrize("max_bytes", [1e3, 1e4, 1e5])
def test_df_with_many_rows_is_downsampled_preferentially_on_rows(df, max_bytes):
    """A tall table loses rows only at 1e5 bytes, and is roughly square below that."""
    downsampled = downsample(df, max_bytes=max_bytes)
    row_count = len(downsampled.index)
    column_count = len(downsampled.columns)

    if max_bytes != 1e5:
        # aspect ratio is close to 1
        assert 0.5 < row_count / column_count < 2
        return

    assert row_count < len(df.index) and column_count == len(df.columns)
75+
76+
77+
@pytest.mark.parametrize("df", large_tables(N=100, M=10000))
@pytest.mark.parametrize("max_bytes", [1e3, 1e4, 1e5])
def test_df_with_many_columns_is_downsampled_preferentially_on_columns(df, max_bytes):
    """A wide table loses columns only at 1e5 bytes, and is roughly square below that."""
    downsampled = downsample(df, max_bytes=max_bytes)
    row_count = len(downsampled.index)
    column_count = len(downsampled.columns)

    if max_bytes != 1e5:
        # aspect ratio is close to 1
        assert 0.5 < row_count / column_count < 2
        return

    assert row_count == len(df.index) and column_count < len(df.columns)

0 commit comments

Comments
 (0)