-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdataset.h
More file actions
412 lines (376 loc) · 14.1 KB
/
dataset.h
File metadata and controls
412 lines (376 loc) · 14.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
// dataset.h
// Dataset and dataloader helpers
#ifndef TINYTENSOR_DATASET_H_
#define TINYTENSOR_DATASET_H_
#include <tt/concepts.h>
#include <tt/exception.h>
#include <tt/export.h>
#include <tt/random.h>
#include <tt/tensor.h>
#include <tt/util.h>
#include <algorithm>
#include <concepts>
#include <cstddef>
#include <cstdint>
#include <format>
#include <memory>
#include <numeric>
#include <ranges>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>
namespace tinytensor::data {
/**
 * Concept each dataset must satisfy:
 *  - it exposes a nested type `DataType` which is a specialization of std::tuple
 *  - `size()` is callable on a const object and returns exactly `int`
 *  - `get(int)` is callable on a const object and returns exactly `DataType`
 */
template <typename T>
concept IsDataset = requires(const T dataset, int index) {
    typename T::DataType;
    requires IsSpecialization<typename T::DataType, std::tuple>;
    { dataset.size() } -> std::same_as<int>;
    { dataset.get(index) } -> std::same_as<typename T::DataType>;
};
/**
 * IsDataset compliant dataset of tensors.
 * Item `idx` of the dataset is the tuple obtained by indexing every stored tensor with `idx`,
 * so all tensors must agree on the size of their outer-most dimension (dim 0).
 */
template <typename... Ts>
    requires IsAllOf<Tensor, Ts...>
class TINYTENSOR_EXPORT TensorDataset {
public:
    static_assert(sizeof...(Ts) > 0);
    using DataType = std::tuple<Ts...>;

    /**
     * Store the given tensors and verify that they all share the same outer-most dimension size
     * @note Reports through TT_EXCEPTION if the sizes of dim 0 differ
     * @param args The tensors making up the dataset
     */
    TensorDataset(Ts... args)
        // args are already by-value copies, so move them into storage instead of copying again
        : tensors(std::move(args)...), N(std::get<0>(tensors).size(0)) {
        // Check outer most dim are the same; tensors are inspected in place (no per-tensor copy)
        std::apply(
            [&](auto &...tensor) {
                if (((tensor.size(0) != N) || ...)) {
                    TT_EXCEPTION("Given tensors do not have the same size of the outer-most dim.");
                }
            },
            tensors
        );
    }

    /**
     * Number of items in the dataset
     * @return Size of the outer-most dimension of the stored tensors
     */
    [[nodiscard]] auto size() const -> int {
        return N;
    }

    /**
     * Get a single item of the dataset
     * @param idx Index applied to each stored tensor
     * @return Tuple holding the result of indexing each stored tensor with idx
     */
    [[nodiscard]] auto get(int idx) const -> DataType {
        return std::apply([idx](const auto &...tensor) { return std::make_tuple(tensor[idx]...); }, tensors);
    }

private:
    std::tuple<Ts...> tensors;
    int N;    // Size of dim 0 of the first tensor (validated to match for all tensors)
};
/**
 * View over a dataset, with indices to represent which subset to view over.
 * DatasetView needs to either take ownership of the underlying dataset, or share ownership
 * (usually through multiple views created from train/validation/test splits).
 */
template <IsDataset T>
class TINYTENSOR_EXPORT DatasetView {
public:
    /**
     * Create DatasetView through taking ownership of entire dataset
     * @note The view covers every index in [0, dataset.size())
     * @param dataset The dataset
     */
    DatasetView(T &&dataset)
        : dataset_(std::make_shared<T>(std::move(dataset))) {
        indices_ = std::views::iota(0, dataset_->size()) | tinytensor::to<decltype(indices_)>();
        check();
    }

    /**
     * Create DatasetView from a shared dataset and indices, usually from random_split
     * @param dataset The dataset
     * @param indices The indices over the dataset the view represents
     */
    DatasetView(std::shared_ptr<T> dataset, const std::vector<int> &indices)
        : dataset_(std::move(dataset)), indices_(indices) {
        check();
    }

    /**
     * Shuffle the underlying dataset implicitly through the indices the view represents
     * @param gen The source of randomness
     */
    auto shuffle(Generator &gen) {
        tinytensor::shuffle(indices_, gen);
    }

    /**
     * The size the view represents
     * @return Size of the dataset view
     */
    auto size() const -> int {
        return static_cast<int>(indices_.size());
    }

    /**
     * Get the dataset item at the given position of the view
     * @note Supports negative indexing: -1 is the last item of the view, -size() the first
     * @note Reports through TT_EXCEPTION if idx is outside [-size(), size() - 1]
     * @param idx Position within the view
     * @return The dataset item the position maps to
     */
    auto operator[](int idx) const -> typename T::DataType {
        const int N = static_cast<int>(indices_.size());
        if (idx >= N || idx < -N) {
            TT_EXCEPTION(std::format("Invalid idx, expected to be in range[{}, {}]", -N, N - 1));
        }
        // Wrap negative positions into [0, N) before the lookup; casting a negative int straight to
        // size_t would produce a huge value and make at() throw
        const int position = idx < 0 ? idx + N : idx;
        return dataset_->get(indices_.at(static_cast<std::size_t>(position)));
    }

private:
    // Validate the invariants of the view; called at the end of every constructor
    void check() const {
        // Guard the dereferences below against a null shared dataset
        if (dataset_ == nullptr) {
            TT_EXCEPTION("Dataset must not be null");
        }
        if (static_cast<int>(indices_.size()) > dataset_->size()) {
            TT_EXCEPTION(
                std::format(
                    "Given indices list of size {:d}, but dataset is only of size {:d}",
                    indices_.size(),
                    dataset_->size()
                )
            );
        }
        if (indices_.empty()) {
            TT_EXCEPTION("Given empty indices list");
        }
        if (dataset_->size() <= 0) {
            TT_EXCEPTION("Dataset must contain at least one element");
        }
    }

    std::shared_ptr<T> dataset_;    // Underlying dataset, possibly shared with other views
    std::vector<int> indices_;      // Dataset indices this view exposes, in view order
};
/**
 * Create multiple dataset views over a given dataset
 * @note The collection of the views will share ownership of the dataset, thus requires an rvalue dataset
 * @note The sum of the splits must equal the size of the dataset
 * @note Every split size must be greater than 0
 * @note Violations of the above are reported through TT_EXCEPTION before the dataset is moved from
 * @param dataset The dataset
 * @param gen Source of randomness for the split
 * @param splits Sequence of split sizes for each view
 * @return One view per split, in the order of splits; the views partition the shuffled dataset indices
 */
template <IsDataset T>
TINYTENSOR_EXPORT auto random_split(T &&dataset, Generator &gen, const std::vector<int> &splits)
    -> std::vector<DatasetView<T>> {
    // Accumulate in 64 bits so that large split sizes cannot wrap around and match by accident
    const std::int64_t split_sum = std::reduce(splits.begin(), splits.end(), std::int64_t{0});
    if (split_sum != dataset.size()) {
        TT_EXCEPTION(
            std::format(
                "Number of elements the split represents ({:d}) does not match the size of the dataset ({:d})",
                split_sum,
                dataset.size()
            )
        );
    }

    // Validate every split up front: a non-positive size hidden behind an oversized one would otherwise
    // surface as an out-of-range lookup instead of this error
    for (const int split_size : splits) {
        if (split_size <= 0) {
            TT_EXCEPTION(std::format("Split sizes must be greater than 0, given split size of {:d}", split_size));
        }
    }

    auto shared_dataset = std::make_shared<T>(std::forward<T>(dataset));
    // split_sum equals the (int) dataset size at this point, so the narrowing is lossless
    std::vector<int> indices =
        std::views::iota(0, static_cast<int>(split_sum)) | tinytensor::to<std::vector<int>>();
    tinytensor::shuffle(indices, gen);

    std::vector<DatasetView<T>> dataset_views;
    dataset_views.reserve(splits.size());
    // Hand out consecutive chunks of the shuffled indices; the checks above guarantee the chunks
    // exactly tile [indices.begin(), indices.end())
    auto chunk_begin = indices.cbegin();
    for (const int split_size : splits) {
        const auto chunk_end = chunk_begin + split_size;
        dataset_views.emplace_back(shared_dataset, std::vector<int>(chunk_begin, chunk_end));
        chunk_begin = chunk_end;
    }
    return dataset_views;
}
/**
 * Create multiple dataset views over a given dataset
 * @note The collection of the views will share ownership of the dataset, thus requires an rvalue dataset
 * @note The sum of the splits must equal the size of the dataset
 * @note Every split size must be greater than 0
 * @note Violations of the above are reported through TT_EXCEPTION before the dataset is moved from
 * @param dataset The dataset
 * @param gen Source of randomness for the split
 * @param splits Tuple of split sizes for each view
 * @return Tuple with one view per split, in the order of splits; the i-th view receives the i-th
 *         consecutive chunk of the shuffled dataset indices
 */
template <IsDataset T, typename... Ts>
    requires(!std::is_lvalue_reference_v<T> && IsAllOf<int, Ts...>)
TINYTENSOR_EXPORT auto random_split(T &&dataset, Generator &gen, const std::tuple<Ts...> &splits) {
    // Accumulate in 64 bits so that large split sizes cannot wrap around and match by accident
    const std::int64_t split_sum =
        std::apply([](auto... v) { return (std::int64_t{0} + ... + static_cast<std::int64_t>(v)); }, splits);
    if (split_sum != dataset.size()) {
        TT_EXCEPTION(
            std::format(
                "The sum of splits ({:d}) does not match the size of the dataset ({:d})",
                split_sum,
                dataset.size()
            )
        );
    }

    // Validate every split up front: a non-positive size hidden behind an oversized one would otherwise
    // surface as an out-of-range lookup instead of this error
    std::apply(
        [](auto... split_size) {
            auto require_positive = [](int size) {
                if (size <= 0) {
                    TT_EXCEPTION(
                        std::format("Split sizes must be greater than 0, given split size of {:d}", size)
                    );
                }
            };
            (require_positive(split_size), ...);
        },
        splits
    );

    auto shared_dataset = std::make_shared<T>(std::forward<T>(dataset));
    // split_sum equals the (int) dataset size at this point, so the narrowing is lossless
    std::vector<int> indices =
        std::views::iota(0, static_cast<int>(split_sum)) | tinytensor::to<std::vector<int>>();
    tinytensor::shuffle(indices, gen);

    auto chunk_begin = indices.cbegin();
    return std::apply(
        [&](auto... split_size) {
            // Stateful: each call consumes the next chunk of the shuffled indices
            auto make_view = [&](int size) -> DatasetView<T> {
                const auto chunk_end = chunk_begin + size;
                std::vector<int> view_indices(chunk_begin, chunk_end);
                chunk_begin = chunk_end;
                return {shared_dataset, view_indices};
            };
            // make_view must run left to right so the i-th view gets the i-th chunk. Function-call
            // arguments (e.g. std::make_tuple(...)) are evaluated in unspecified order, whereas the
            // elements of a braced-init-list are guaranteed to be evaluated in order.
            return std::tuple<decltype(make_view(split_size))...>{make_view(split_size)...};
        },
        splits
    );
}
/**
 * Create multiple dataset views over a given dataset, one per split size given as trailing arguments
 * @note The collection of the views will share ownership of the dataset, thus requires an rvalue dataset
 * @note The sum of the splits must equal the size of the dataset
 * @note Convenience wrapper: seeds a fresh Generator and forwards to the tuple-based random_split
 * @param dataset The dataset
 * @param seed Seed for the split
 * @param splits Split sizes for each view
 * @return Whatever the tuple-based random_split returns for the packed split sizes
 */
template <IsDataset T, typename... Ts>
    requires(!std::is_lvalue_reference_v<T> && IsAllOf<int, Ts...>)
TINYTENSOR_EXPORT auto random_split(T &&dataset, uint64_t seed, Ts... splits) {
    Generator rng(seed);
    const auto split_sizes = std::make_tuple(splits...);
    return random_split(std::forward<T>(dataset), rng, split_sizes);
}
/**
 * Create multiple dataset views over a given dataset
 * @note The collection of the views will share ownership of the dataset, thus requires an rvalue dataset
 * @note The sum of the splits must equal the size of the dataset
 * @note Convenience wrapper: seeds a fresh Generator and forwards to the Generator-based random_split
 * @param dataset The dataset
 * @param seed Seed for the split
 * @param splits Sequence of split sizes for each view
 * @return One view per entry of splits
 */
template <IsDataset T>
    requires(!std::is_lvalue_reference_v<T>)
TINYTENSOR_EXPORT auto random_split(T &&dataset, uint64_t seed, const std::vector<int> &splits)
    -> std::vector<DatasetView<T>> {
    Generator rng(seed);
    auto views = random_split(std::forward<T>(dataset), rng, splits);
    return views;
}
// A Dataloader class similar to torch
// This facilitates batching and shuffling of a dataset
// DataLoaders take views over datasets, which can be created using random_split
template <IsDataset T>
class TINYTENSOR_EXPORT DataLoader {
public:
// Iterator support for range loops
class Iterator {
public:
Iterator(std::shared_ptr<DatasetView<T>> dataview, int idx, int batch_size)
: dataview_(std::move(dataview)), idx_(idx), batch_size_(batch_size) {}
auto operator!=(const Iterator &other) const -> bool {
return idx_ != other.idx_;
}
auto operator++() -> Iterator {
if (idx_ * batch_size_ > dataview_->size()) {
TT_ERROR(
"Internal iterator is already passed the end. Dataloader::Iterator should only be used implicitly "
"in range-based for loops"
);
}
++idx_;
return *this;
}
auto operator*() const {
if (idx_ * batch_size_ > dataview_->size()) {
TT_ERROR(
"Internal iterator is already passed the end. Dataloader::Iterator should only be used implicitly "
"in range-based for loops"
);
}
int idx_start = idx_ * batch_size_;
int idx_end = std::min((idx_ + 1) * batch_size_, dataview_->size());
// Grab each tuple item from first batch element
auto batched_values = std::apply(
[](auto &&...arg) {
return std::make_tuple(CheckedVec<std::remove_reference_t<decltype(arg)>>{arg}...);
},
(*dataview_)[idx_start]
);
// Grab each tuple item for rest of batch
for (int i : std::views::iota(idx_start + 1, idx_end)) {
std::apply(
[&](auto &&...batched_value) {
std::apply(
[&](auto &&...cur_value) { (batched_value.push_back(cur_value), ...); },
(*dataview_)[i]
);
},
batched_values
);
}
// Convert the batched vector<Tensor> values into a concated tensor
auto val = std::apply(
[](auto &&...batched_value) {
auto make_result = [](auto &&arg) {
if constexpr (std::is_same_v<std::remove_cvref_t<decltype(arg)>, TensorList>) {
return stack(arg, 0);
} else {
return arg;
}
};
return std::make_tuple(make_result(batched_value)...);
},
batched_values
);
return val;
}
private:
std::shared_ptr<DatasetView<T>> dataview_;
int idx_;
int batch_size_;
};
/**
* Create Dataloader by coping a dataview
* @param dataview The Dataview
* @param batch_size The size of the batches to create when iterating
* @param shuffle Flag to shuffle the data for each pass over
* @param seed The seed to use when shuffling
*/
DataLoader(const DatasetView<T> &dataview, int batch_size, bool shuffle = true, uint64_t seed = 0)
: dataview_(std::make_shared<DatasetView<T>>(dataview)),
batch_size_(batch_size),
shuffle_(shuffle),
gen_(seed) {
check();
}
/**
* Create Dataloader by taking ownership of a dataview
* @param dataview The Dataview
* @param batch_size The size of the batches to create when iterating
* @param shuffle Flag to shuffle the data for each pass over
* @param seed The seed to use when shuffling
*/
DataLoader(DatasetView<T> &&dataview, int batch_size, bool shuffle = true, uint64_t seed = 0)
: dataview_(std::make_shared<DatasetView<T>>(std::move(dataview))),
batch_size_(batch_size),
shuffle_(shuffle),
gen_(seed) {
check();
}
/**
* Get the number of batches the dataloader generates
* @return Number of batches
*/
auto size() -> int {
return ceil_div(dataview_->size(), batch_size_);
}
auto begin() -> Iterator {
if (shuffle_) {
dataview_->shuffle(gen_);
}
return {dataview_, 0, batch_size_};
}
auto end() -> Iterator {
return {dataview_, ceil_div(dataview_->size(), batch_size_), batch_size_};
}
private:
void check() {
if (dataview_->size() <= 0) {
TT_EXCEPTION("Dataview must contain at least one element");
}
if (batch_size_ <= 0) {
TT_EXCEPTION("batch_size must be positive");
}
}
std::shared_ptr<DatasetView<T>> dataview_;
int batch_size_;
bool shuffle_;
Generator gen_;
};
} // namespace tinytensor::data
#endif // TINYTENSOR_DATASET_H_