forked from pola-rs/polars-book
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_data.py
More file actions
60 lines (45 loc) · 1.58 KB
/
generate_data.py
File metadata and controls
60 lines (45 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""Small `Python` snippet to generate mock datasets (in CSV format) to be used by the
code examples included in the User Guide."""
import numpy as np
import pandas as pd
import polars as pl
np.random.seed(1)
DATA_DIR = "data"
# GroupBy benchmark data
print("GroupBy data:")
groups = np.arange(10)
str_groups = np.array(list("0123456789"))
for size in np.array([1e4, 1e5, 1e6, 1e7], dtype=int):
g = np.random.choice(groups, size)
sg = np.random.choice(str_groups, size)
v = np.random.randn(size)
df = pl.DataFrame({"groups": g, "values": v, "str": sg})
df.to_csv(f"{DATA_DIR}/{size}.csv")
print(f" {int(size)} rows of GroupBy data created")
# Join benchmark data
# https://wesmckinney.com/blog/high-performance-database-joins-with-pandas-dataframe-more-benchmarks/
# https://github.com/wesm/pandas/blob/23669822819808bbaeb6ea36a6b2ef98026884db/bench/bench_merge_sqlite.py
print("Join data:")
N = 10000
indices = np.array([pd._testing.rands(10) for _ in range(N)], dtype=str)
indices2 = np.array([pd._testing.rands(10) for _ in range(N)], dtype=str)
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)
left = pl.DataFrame(
{
"key": key,
"key2": key2,
"value": np.random.randn(80000),
}
)
right = pl.DataFrame(
{
"key": indices[2000:],
"key2": indices2[2000:],
"value2": np.random.randn(8000),
}
)
left.to_csv(f"{DATA_DIR}/join_left_80000.csv")
right.to_csv(f"{DATA_DIR}/join_right_80000.csv")
print(f" Left data created (shape: {left.shape})")
print(f" Right data created (shape: {right.shape})")