-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
119 lines (90 loc) · 3.75 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pennylane.numpy as np
import pandas as pd
rng = np.random.default_rng()
# Generate random numbers in the given range and with the given shape for the initial weights of the model.
def generate_theta(min_theta, max_theta, shape):
return rng.uniform(min_theta, max_theta, shape, requires_grad=True)
def get_wine_dataset(n_data: int = None, multiply_items_rate: int = 0):
df_raw = pd.read_csv("datasets/red_wine.csv")
if n_data is not None:
df_raw = df_raw.sample(n=n_data)
# Normalize output between 0 and 1
# df_raw["quality"] -= df_raw["quality"].min()
# df_raw["quality"] /= df_raw["quality"].max()
for i in range(multiply_items_rate):
for col in df_raw.columns:
df_raw[col + str(i)] = df_raw[col]
# To fragmentize dataframe again
df_raw = df_raw.copy()
columns = df_raw.columns.tolist()
input_cols = columns[:-1]
output_cols = columns[-1]
inputs_numpy = df_raw[input_cols].to_numpy()
outputs_numpy = df_raw[output_cols].to_numpy()
x = np.array(inputs_numpy)
y = np.array(outputs_numpy)
return x, y
def get_iris_dataset(n_data: int = None, drop_bigger_than: int = None, multiply_items_rate: int = 0):
df_raw = pd.read_csv("datasets/iris.csv")
# Map categorical data to integers
df_raw["variety"] = pd.factorize(df_raw["variety"])[0]
if drop_bigger_than is not None:
df_raw = df_raw.drop(df_raw[df_raw.variety > drop_bigger_than].index)
if n_data is not None:
df_raw = df_raw.sample(n=n_data)
# Normalize output between 0 and 1
# df_raw["variety"] -= df_raw["quality"].min()
# df_raw["variety"] /= df_raw["variety"].max()
output = df_raw.pop("variety")
inputs = df_raw.copy()
for i in range(multiply_items_rate):
for col in inputs.columns:
df_raw[col + str(i)] = inputs[col]
x = df_raw.to_numpy()
y = output.to_numpy()
return x, y
# Creates a dataset where both x (data) and y (label) are random
# x has the size of (n_data, n_qubits) whose values repeats every `data_dim` times (only `data_dim` features)
# y has the size of (n_data)
def create_random_dataset(n_data, n_qubits, data_dim, min_x, max_x, min_y, max_y):
x = np.zeros((n_data, n_qubits), requires_grad=False)
data = rng.uniform(min_x, max_x, (n_data, data_dim), requires_grad=False)
for i in range(n_data):
for j in range(n_qubits):
x[i, j] = data[i, j % data_dim]
y = rng.uniform(min_y, max_y, n_data, requires_grad=False)
return x, y
# Same as create_random_dataset but
# the data has only one dimension
# the x vector built from that data is just the same number repeated multiple times, each time scaled by its `qubit index + 1`.
def create_redundant_x_dataset(n_data, n_qubits, min_x, max_x, min_y, max_y):
x = np.zeros((n_data, n_qubits), requires_grad=False)
data = rng.uniform(min_x, max_x, (n_data), requires_grad=False)
for i in range(n_data):
for j in range(n_qubits):
x[i, j] = (j + 1) * data[i]
y = rng.uniform(min_y, max_y, n_data, requires_grad=False)
return x, y
# Creates a teacher-student dataset where
# x is random
# y is the output of the teacher model
# Basically it's creating a (x, f(x)) dataset where the f is the model given in the parameters.
def create_artifical_y_dataset(
n_data,
n_qubits,
data_dim,
model,
n_layers,
min_x,
max_x,
min_y,
max_y,
min_theta,
max_theta,
):
x, _ = create_random_dataset(n_data, n_qubits, data_dim, min_x, max_x, min_y, max_y)
theta = generate_theta(min_theta, max_theta, (n_layers, n_qubits))
y = np.zeros((n_data), requires_grad=False)
for i in range(n_data):
y[i] = model(x[i], theta, n_qubits, n_layers)
return x, y