loader.py
# Copyright 2021 UC Davis Plant AI and Biophysics Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import fnmatch
import glob
import json
import os
from collections.abc import Sequence
from decimal import Decimal, getcontext
from typing import Union
import numpy as np
from agml.backend.config import SUPER_BASE_DIR, data_save_path, synthetic_data_save_path
from agml.backend.experimental import AgMLExperimentalFeatureWrapper
from agml.backend.tftorch import (
StrictBackendError,
_add_dataset_to_mro, # noqa
get_backend,
set_backend,
user_changed_backend,
)
from agml.data.builder import DataBuilder
from agml.data.exporters.yolo import export_yolo
from agml.data.manager import DataManager
from agml.data.metadata import DatasetMetadata, make_metadata
from agml.framework import AgMLSerializable
from agml.utils.data import load_public_sources
from agml.utils.general import NoArgument, resolve_list_value
from agml.utils.io import get_dir_list, get_file_list
from agml.utils.logging import log
from agml.utils.random import inject_random_state
from agml.viz.general import show_sample
class AgMLDataLoaderMeta(type):
def __instancecheck__(self, instance):
# This override allows for objects of type `AgMLMultiDatasetLoader`
# to be treated as an `AgMLDataLoader` when the following command
# is run: `isinstance(a, AgMLDataLoader)` (hacky fix, essentially).
if isinstance(instance, self.__class__):
return True
from agml.data.multi_loader import AgMLMultiDatasetLoader
if isinstance(instance, AgMLMultiDatasetLoader):
return True
return False
class AgMLDataLoader(AgMLSerializable, metaclass=AgMLDataLoaderMeta):
"""Loads and provides a processing interface for a dataset.
The `AgMLDataLoader` is the main interface to AgML's public dataset
interface, and exposes an API which enables the downloading and
subsequent local loading of a public dataset, as well as various
preprocessing functions and hooks to integrate into existing pipelines.
Methods provided include splitting the dataset into train/val/test sets,
batching the data, applying transforms, and more. All of the processing
code is contained internally, so all you need to do is instantiate the
loader and call the relevant methods to apply the preprocessing you need.
`AgMLDataLoader` supports both TensorFlow and PyTorch as backends, and
can automatically perform tensor conversion and batching to enable
seamless usage in training or inference pipelines. Data can also be
exported into native TensorFlow and PyTorch objects.
There is also support for using custom datasets outside of the AgML
public data repository. To do this, you need to pass an extra argument
containing metadata for the dataset, after which point the loader
will work as normal (and all interfaces, except for the info parameters
which are not provided, will also be available for standard use).
Parameters
----------
dataset : str
The name of the public dataset you want to load. See the helper
method `agml.data.public_data_sources()` for a list of datasets.
If using a custom dataset, this can be any valid string.
kwargs : dict, optional
dataset_path : str, optional
A custom path to download and load the dataset from.
overwrite : bool, optional
Whether to rewrite and re-install the dataset.
meta : dict, optional
A dictionary consisting of metadata properties, if you want
to create a custom loader. At minimum, this needs to contain
two parameters: `task`, indicating the type of machine learning
task that the dataset is for, and `classes`, a list of the
classes that the dataset contains.
Notes
-----
See the methods for examples on how to use an `AgMLDataLoader` effectively.
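A minimal usage sketch (the dataset name below is a placeholder; use
`agml.data.public_data_sources()` to list the real dataset names):
> loader = AgMLDataLoader('some_public_dataset')
> print(loader.num_images, loader.classes)
> sample = loader[0]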
"""
IS_MULTI_DATASET: bool = False
serializable = frozenset(
(
"info",
"builder",
"manager",
"train_data",
"train_content",
"val_data",
"val_content",
"test_data",
"test_content",
"is_split",
"meta_properties",
)
)
def __new__(cls, dataset, **kwargs):
# If a single dataset is passed, then we use the base `AgMLDataLoader`.
# However, if an iterable of datasets is passed, then we need to
# dispatch to the subclass `AgMLMultiDatasetLoader` for them.
if isinstance(dataset, (str, DatasetMetadata)):
if "*" in dataset: # enables wildcard search for datasets
valid_datasets = fnmatch.filter(load_public_sources().keys(), dataset)
if len(valid_datasets) == 0:
raise ValueError(f"Wildcard search for dataset '{dataset}' yielded no results.")
if len(valid_datasets) == 1:
log(
f"Wildcard search for dataset '{dataset}' yielded only "
f"one result. Returning a regular, single-element data loader."
)
return super(AgMLDataLoader, cls).__new__(cls)
from agml.data.multi_loader import AgMLMultiDatasetLoader
return AgMLMultiDatasetLoader(valid_datasets, **kwargs)
return super(AgMLDataLoader, cls).__new__(cls)
elif isinstance(dataset, Sequence):
if len(dataset) == 1:
log(
"Received a sequence with only one element when "
"instantiating an `AgMLDataLoader`. Returning "
"a regular, single-element data loader."
)
return super(AgMLDataLoader, cls).__new__(cls)
from agml.data.multi_loader import AgMLMultiDatasetLoader
return AgMLMultiDatasetLoader(dataset, **kwargs)
raise TypeError(
f"Expected either a single dataset name (or metadata), or"
f"a list of dataset names/metadata when instantiating an "
f"`AgMLDataLoader`. Got {dataset} of type {type(dataset)}."
)
def __getnewargs__(self):
return (self._info.name,)
def __init__(self, dataset, **kwargs):
"""Instantiates an `AgMLDataLoader` with the dataset."""
# Set up the dataset and its associated metadata.
self._info = make_metadata(dataset, kwargs.get("meta", None))
# The data for the class is constructed in two stages. First, the
# internal contents are constructed using a `DataBuilder`, which
# finds and wraps the local data in a proper format.
self._builder = DataBuilder(
info=self._info,
dataset_path=kwargs.get("dataset_path", None),
overwrite=kwargs.get("overwrite", False),
)
# These contents are then passed to a `DataManager`, which conducts
# the actual loading and processing of the data when called.
self._manager = DataManager(
builder=self._builder,
task=self._info.tasks.ml,
name=self._info.name,
root=self._builder.dataset_root,
)
# If the dataset is split, then the `AgMLDataLoader`s with the
# split and reduced data are stored as accessible class properties.
self._train_data = None
self._train_content = None
self._val_data = None
self._val_content = None
self._test_data = None
self._test_content = None
self._is_split = False
# Set the direct access metadata properties like `num_images` and
# `classes`, since these can be modified depending on the state of
# the loader, whilst the `info` parameter attributes cannot.
self._meta_properties = {
"num_images": self._info.num_images,
"classes": self._info.classes,
"num_classes": self._info.num_classes,
"num_to_class": self._info.num_to_class,
"class_to_num": self._info.class_to_num,
"data_distributions": {self.name: self._info.num_images},
}
@classmethod
def custom(cls, name, dataset_path=None, classes=None, **kwargs):
"""Creates an `AgMLDataLoader` with a set of custom data.
If you have a custom dataset that you want to use in an `AgMLDataLoader`,
this method constructs the loader using similar semantics to the regular
loader instantiation. It is a wrapper around using the `meta` argument to
provide dataset properties that provides additional convenience for some
circumstances, as summarized below.
Functionally, this method is equivalent to instantiating `AgMLDataLoader`
with an extra argument `meta` that contains metadata for the dataset, with
the `task` and `classes` keys required and the others not necessary. This
would look as follows:
> loader = AgMLDataLoader('name', meta = {'task': task, 'classes': classes})
This method replaces the meta dictionary with keyword arguments to allow
for a more Pythonic construction of the custom loader. This method, however,
includes additional optimizations which make it more convenient to
instantiate the loader:
1. It automatically infers the task from the structure which the data is
in, so you don't need to provide the task at all to this method.
2. For image classification and object detection tasks, this method will
attempt to automatically infer the classes in the loader (by searching
the image directories for image classification tasks, and searching
in the COCO JSON file for object detection). Semantic segmentation tasks,
however, still require the list of classes to be passed.
This makes it so that in a variety of cases, the loader can be instantiated
without even requiring any metadata, as most of it can be inferred directly
by this method and thus streamlines the procedure for using custom data.
If you want to cache the metadata, rather than constantly passing it as
arguments, then create a file `.meta.json` at the path `/root/.meta.json`
with the parameters that you want.
Parameters
----------
name : str
A name for the custom dataset (this can be any valid string). This
can also be a path to the dataset (in which case the name will be
the base directory inferred from the path).
dataset_path : str, optional
A custom path to load the dataset from. If this is not passed,
we will assume that the dataset is at the traditional path:
`~/.agml/datasets/<name>` (or the changed default data path).
Otherwise, the dataset can be passed as a path such as `/root/name`,
or `/root`; in the latter case the method will check for `/root/name`.
classes : list, tuple
A list of string-labels for the classes of the dataset, in order.
This is not required for image classification/object detection.
kwargs : dict
Any other metadata for the dataset, this is not required.
Returns
-------
An `AgMLDataLoader` outfitted with the custom dataset.
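As a minimal sketch, assuming a local image classification dataset laid
out as `/root/my_crops/<class_name>/<image>` (the name and path here are
hypothetical), the task and classes are inferred automatically:
> loader = AgMLDataLoader.custom('my_crops', dataset_path='/root/my_crops')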
"""
# Check the name and ensure that no dataset with that name exists.
if name in load_public_sources().keys() or not isinstance(name, str):
raise ValueError(
f"Invalid name '{name}', the name should be "
f"a string that is not an existing dataset in "
f"the AgML public data source repository."
)
# Check if the `name` is itself the path to the dataset.
if os.path.exists(name):
dataset_path = name
name = os.path.basename(name)
# Locate the path to the dataset.
if dataset_path is None:
dataset_path = os.path.abspath(os.path.join(data_save_path(), name))
if not os.path.exists(dataset_path):
raise NotADirectoryError(
f"Existing directory '{dataset_path}' for dataset of name "
f"{name} not found, pass a custom path if you want to use "
f"a custom dataset path for the dataset."
)
else:
dataset_path = os.path.abspath(os.path.expanduser(dataset_path))
if not os.path.exists(dataset_path):
raise NotADirectoryError(
f"Could not find a directory for dataset '{name}' at the "
f"provided dataset path: {dataset_path}."
)
if not dataset_path.endswith(name):
dataset_path = os.path.join(dataset_path, name)
if not os.path.exists(dataset_path):
raise NotADirectoryError(
f"Could not find a directory for dataset '{name}' at the "
f"provided dataset path: {dataset_path}."
)
# Infer the task based on the provided dataset path.
if os.path.exists(os.path.join(dataset_path, "annotations.json")):
task = "object_detection"
elif os.path.exists(os.path.join(dataset_path, "images")) and os.path.exists(
os.path.join(dataset_path, "annotations")
):
task = "semantic_segmentation"
elif len(get_file_list(dataset_path)) == 0 and len(get_dir_list(dataset_path)) != 0:
task = "image_classification"
else:
raise TypeError("Unrecognized dataset annotation format.")
# Check if there is a metadata file.
kwargs["classes"] = classes
if os.path.exists(os.path.join(dataset_path, ".meta.json")):
with open(os.path.join(dataset_path, ".meta.json"), "r") as f:
kwargs.update(json.load(f))
# Infer the classes for image classification/object detection.
classes = kwargs.pop("classes")
if classes is None:
if task == "semantic_segmentation":
raise ValueError("Classes are required for a semantic segmentation task.")
elif task == "image_classification":
classes = get_dir_list(dataset_path)
else: # object detection
with open(os.path.join(dataset_path, "annotations.json"), "r") as f:
classes = [c["name"] for c in json.load(f)["categories"]]
# Construct and return the `AgMLDataLoader`.
return cls(
name,
dataset_path=dataset_path,
meta={"task": task, "classes": classes, **kwargs},
)
@classmethod
def helios(cls, name, dataset_path=None):
"""Creates an `AgMLDataLoader` from a Helios-generated dataset.
Given the path to a Helios-generated (and converted) dataset, this method
will generate an `AgMLDataLoader` which is constructed using similar
semantics to the regular instantiation. This method is largely similar to
`AgMLDataLoader.custom()`, but also takes into account the extra
information provided in the `.metadata` directory of the Helios-generated
dataset, allowing the loader to carry additional metadata.
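A minimal sketch (the dataset name is hypothetical, and the dataset is
assumed to live under the synthetic data save path with a
`.metadata/agml_info.json` file):
> loader = AgMLDataLoader.helios('my_helios_dataset')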
"""
# Instantiate from a list of datasets.
if isinstance(name, (list, tuple)):
if dataset_path is None:
dataset_path = [None] * len(name)
elif isinstance(dataset_path, str):
dataset_path = [dataset_path] * len(name)
else:
if not len(dataset_path) == len(name):
raise ValueError("The number of dataset paths must be " "the same as the number of dataset names.")
datasets = [cls.helios(n, dataset_path=dp) for n, dp in zip(name, dataset_path)]
return cls.merge(*datasets)
# Instantiate from a wildcard pattern.
if isinstance(name, str) and "*" in name:
if dataset_path is None:
dataset_path = os.path.abspath(synthetic_data_save_path())
elif not os.path.exists(dataset_path):
raise NotADirectoryError(
f"Existing directory '{dataset_path}' for dataset of name "
f"{name} not found, pass a custom path if you want to use "
f"a custom dataset path for the dataset."
)
# Get the list of datasets.
possible_datasets = glob.glob(os.path.join(dataset_path, name))
if len(possible_datasets) == 0:
raise ValueError(f"No datasets found for pattern: {name}.")
datasets = [cls.helios(os.path.basename(p), dataset_path=dataset_path) for p in sorted(possible_datasets)]
return cls.merge(*datasets)
# Locate the path to the dataset, using synthetic semantics.
if dataset_path is None:
dataset_path = os.path.abspath(os.path.join(synthetic_data_save_path(), name))
if not os.path.exists(dataset_path):
raise NotADirectoryError(
f"Existing directory '{dataset_path}' for dataset of name "
f"{name} not found, pass a custom path if you want to use "
f"a custom dataset path for the dataset."
)
else:
dataset_path = os.path.abspath(os.path.expanduser(dataset_path))
if not os.path.exists(dataset_path):
raise NotADirectoryError(
f"Could not find a directory for Helios dataset '{name}' "
f"at the provided dataset path: {dataset_path}."
)
# just in case there is a locally defined folder with the same name
# as a dataset in the `~/.agml/synthetic` directory, warn in advance:
if os.path.exists(os.path.join(os.path.abspath(synthetic_data_save_path()), name)):
log(
f"Found a dataset folder '{name}' in the synthetic data "
f"directory, which may conflict with the Helios dataset."
)
if not dataset_path.endswith(name):
dataset_path = os.path.join(dataset_path, name)
if not os.path.exists(dataset_path):
raise NotADirectoryError(
f"Could not find a directory for Helios dataset '{name}' "
f"at the provided dataset path: {dataset_path}."
)
# Load the information file.
info_file = os.path.join(dataset_path, ".metadata", "agml_info.json")
if not os.path.exists(info_file):
raise FileNotFoundError(
f"The information file at '{info_file}' for the " f"Helios dataset {name} could not be found."
)
with open(info_file, "r") as f:
meta = json.load(f)
# Construct the loader.
return cls.custom(name, dataset_path, **meta)
@staticmethod
def merge(*loaders, classes=None):
"""Merges a set of `AgMLDataLoader`s into a single loader.
Given a set of input `AgMLDataLoader`s, this method will return a single
`AgMLDataLoader` which is capable of returning data from any and every one
of the input loaders. The resultant loader is functionally equivalent to
the `AgMLDataLoader` returned by instantiating an `AgMLDataLoader` from a
sequence of AgML public data sources, except that in this case, the input
loaders may have been subject to a number of modifications before merging.
This also allows the usage of both an AgML public data source and a custom
dataset together in a single multi-dataset loader. As such, this method
should be used with caution: since the input loaders may carry arbitrary
modifications, certain methods may not function as expected. For instance,
if one of the passed loaders has already been split, then the overall new
multi-loader cannot be split as a whole. Similarly, if also using a custom
dataset, then any properties of the `info` parameter which are not passed
to the dataset cannot be used, even if the other datasets have them.
Parameters
----------
loaders : Tuple[AgMLDataLoader]
A collection of `AgMLDataLoader`s (but not any `AgMLDataLoader`s
which are already holding a collection of datasets).
classes : list
A list of classes in the new loader. This argument can be used to
construct a custom ordering (non-alphabetical) of classes in the loader.
Returns
-------
A new `AgMLDataLoader` wrapping the input datasets.
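A minimal sketch (the dataset names are placeholders):
> loader_a = AgMLDataLoader('dataset_a')
> loader_b = AgMLDataLoader('dataset_b')
> merged = AgMLDataLoader.merge(loader_a, loader_b)
The expression `loader_a + loader_b` is equivalent, via the `__add__` operator.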
"""
# Validate the input loaders.
from agml.data.multi_loader import AgMLMultiDatasetLoader
if len(loaders) == 1:
raise ValueError("There should be at least two inputs to the `merge` method.")
for loader in loaders:
if isinstance(loader, AgMLMultiDatasetLoader):
raise TypeError("Cannot merge datasets which already hold a " "collection of multiple datasets.")
# Instantiate the `AgMLMultiDatasetLoader`.
return AgMLMultiDatasetLoader._instantiate_from_collection(*loaders, classes=classes)
def __add__(self, other):
if not isinstance(other, AgMLDataLoader):
return NotImplemented
return AgMLDataLoader.merge(self, other)
def __len__(self):
return self._manager.data_length()
def __getitem__(self, indexes: Union[list, int, slice]):
if isinstance(indexes, slice):
data = np.arange(self._manager.data_length())
indexes = data[indexes].tolist()
if isinstance(indexes, int):
indexes = [indexes]
if np.isscalar(indexes):
indexes = [indexes.item()] # noqa
for idx in indexes:
if idx not in range(len(self)):
if idx not in [-i for i in range(1, len(self) + 1, 1)]:
raise IndexError(f"Index {idx} out of range of " f"AgMLDataLoader length: {len(self)}.")
return self._manager.get(resolve_list_value(indexes))
def __iter__(self):
for indx in range(len(self)):
yield self[indx]
def __repr__(self):
out = f"<AgMLDataLoader: (dataset={self.name}"
out += f", task={self.task}"
out += f", images={self.num_images}"
out += f") at {hex(id(self))}>"
return out
def __str__(self):
return repr(self)
def __copy__(self):
"""Copies the loader and updates its state."""
cp = super(AgMLDataLoader, self).__copy__()
cp.copy_state(self)
return cp
def copy(self):
"""Returns a deep copy of the data loader's contents."""
return self.__copy__()
def copy_state(self, loader):
"""Copies the state of another `AgMLDataLoader` into this loader.
This method copies the state of another `AgMLDataLoader` into this
loader, including its transforms, resizing, and training state. Other
general parameters such as batch size and shuffling are left intact.
Parameters
----------
loader : AgMLDataLoader
The data loader from which the state should be copied.
Returns
-------
This `AgMLDataLoader`.
"""
# Re-construct the training manager.
new_train_manager = loader._manager._train_manager.__copy__()
self._manager._train_manager = new_train_manager
# Re-construct the transform manager.
new_transform_manager = loader._manager._transform_manager.__copy__()
self._manager._transform_manager = new_transform_manager
self._manager._train_manager._transform_manager = new_transform_manager
# Re-construct the resizing manager.
new_resize_manager = loader._manager._resize_manager.__copy__()
self._manager._resize_manager = new_resize_manager
self._manager._train_manager._resize_manager = new_resize_manager
@property
def name(self):
"""Returns the name of the dataset in the loader."""
return self._info.name
@property
def dataset_root(self):
"""Returns the local path to the dataset being used."""
return self._builder.dataset_root
@property
def info(self):
"""Returns a `DatasetMetadata` object containing dataset info.
The contents returned in the `DatasetMetadata` object can be used
to inspect dataset metadata, such as the location the data was
captured, the data formats, and the license/copyright information.
See the `DatasetMetadata` class for more information.
"""
return self._info
@property
def task(self):
"""Returns the ML task that this dataset is constructed for."""
return self._info.tasks.ml
@property
def num_images(self):
"""Returns the number of images in the dataset."""
return self._meta_properties.get("num_images")
@property
def classes(self):
"""Returns the classes that the dataset is predicting."""
return self._meta_properties.get("classes")
@property
def num_classes(self):
"""Returns the number of classes in the dataset."""
return self._meta_properties.get("num_classes")
@property
def num_to_class(self):
"""Returns a mapping from a number to a class label."""
return self._meta_properties.get("num_to_class")
@property
def class_to_num(self):
"""Returns a mapping from a class label to a number."""
return self._meta_properties.get("class_to_num")
@property
def data_distributions(self):
"""Displays the distribution of images from each source."""
return self._meta_properties.get("data_distributions")
@property
def image_size(self):
"""Returns the determined image size for the loader.
This is primarily useful when using automatic shape inference, to
access what the final result ends up being. Otherwise, it may
just return `None` or the shape that the user has set.
"""
return self._manager._resize_manager.size
def _generate_split_loader(self, contents, split, meta_properties=None, **kwargs):
"""Generates a split `AgMLDataLoader`."""
# Check if the data split exists.
if contents is None:
raise ValueError(f"Attempted to access '{split}' split when " f"the data has not been split for '{split}'.")
# Load a new `DataManager` and update its internal managers
# using the state of the existing loader's `DataManager`.
builder = DataBuilder.from_data(
contents=[contents, kwargs.get("labels_for_image", None)],
info=self.info,
root=self.dataset_root,
builder=self._builder,
)
current_manager = copy.deepcopy(self._manager.__getstate__())
current_manager.pop("builder")
current_manager["builder"] = builder
# Build the new accessors and construct the `DataManager`.
accessors = np.arange(len(builder.get_contents()))
if self._manager._shuffle:
np.random.shuffle(accessors)
current_manager["accessors"] = accessors
batch_size = current_manager.pop("batch_size")
current_manager["batch_size"] = None
new_manager = DataManager.__new__(DataManager)
new_manager.__setstate__(current_manager)
# After the builder and accessors have been generated, we need
# to generate a new list of `DataObject`s.
new_manager._create_objects(new_manager._builder, self.task)
# Update the `TransformManager` and `ResizeManager` of the
# `TrainManager` in the `DataManager` (they need to be synchronized).
new_manager._train_manager._transform_manager = new_manager._transform_manager
new_manager._train_manager._resize_manager = new_manager._resize_manager
# Batching data needs to be done independently.
if batch_size is not None:
new_manager.batch_data(batch_size=batch_size)
# Update the metadata parameters.
if meta_properties is None:
meta_properties = self._meta_properties.copy()
meta_properties["num_images"] = len(contents)
meta_properties["data_distributions"] = {self.name: len(contents)}
# Instantiate a new `AgMLDataLoader` from the contents.
loader_state = self.copy().__getstate__()
loader_state["builder"] = builder
loader_state["manager"] = new_manager
loader_state["meta_properties"] = meta_properties
cls = super(AgMLDataLoader, self).__new__(AgMLDataLoader)
cls.__setstate__(loader_state)
for attr in ["train", "val", "test"]:
setattr(cls, f"_{attr}_data", None)
cls._is_split = True
return cls
@property
def train_data(self):
"""Stores the `train` split of the data in the loader."""
if isinstance(self._train_data, AgMLDataLoader):
return self._train_data
self._train_data = self._generate_split_loader(self._train_content, split="train")
return self._train_data
@property
def val_data(self):
"""Stores the `val` split of the data in the loader."""
if isinstance(self._val_data, AgMLDataLoader):
return self._val_data
self._val_data = self._generate_split_loader(self._val_content, split="val")
self._val_data.eval()
return self._val_data
@property
def test_data(self):
"""Stores the `test` split of the data in the loader."""
if isinstance(self._test_data, AgMLDataLoader):
return self._test_data
self._test_data = self._generate_split_loader(self._test_content, split="test")
self._test_data.eval()
return self._test_data
def eval(self) -> "AgMLDataLoader":
"""Sets the `AgMLDataLoader` in evaluation mode.
Evaluation mode disables transforms, and only keeps the loader applying
resizing to the contents. If the loader was previously set into TensorFlow
or PyTorch mode, however, it will also keep applying tensor conversion and
potential batch addition (see `as_keras_sequence()` and `as_torch_dataset()`
methods for more information on the exact operations).
This method does not completely disable preprocessing; to do that,
use `loader.disable_preprocessing()`. Additionally,
if you want to keep only the resizing but not the implicit tensor
conversions based on the backend, then run:
> loader.disable_preprocessing() # or loader.reset_preprocessing()
> loader.eval()
This will refresh the backend conversions and return it to `eval` mode.
Returns
-------
The `AgMLDataLoader` object.
"""
self._manager.update_train_state("eval")
return self
def disable_preprocessing(self) -> "AgMLDataLoader":
"""Disables all preprocessing on the `AgMLDataLoader`.
This sets the loader in a no-preprocessing mode (represented internally as
`False`), where only the raw data is returned: no transforms, resizing, or
any conversion to any type of backend. This can be used to test or inspect
the original data contents of the loader before processing.
The loader can be set into any mode from here, for instance see `eval()`,
`as_keras_sequence()`, and `as_torch_dataset()` for specific examples on
the different potential training and evaluation states. If you just want
to reset the loader to its default state, which applies only transforms
and resizing, then use `loader.reset_preprocessing()`.
Returns
-------
The `AgMLDataLoader` object.
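For instance, a minimal sketch of inspecting raw contents and then
restoring the default preprocessing:
> loader.disable_preprocessing()
> raw_sample = loader[0] # raw contents: no transforms or resizing
> loader.reset_preprocessing()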
"""
self._manager.update_train_state(False)
return self
def reset_preprocessing(self) -> "AgMLDataLoader":
"""Re-enables preprocessing on the `AgMLDataLoader`.
This resets the loader back to its default train state, namely where it
applies just the given transforms and content resizing. This is a consistent
method, meaning that regardless of the prior train state of the loader
before running this method, it will hard reset it to its original state
(similar to `disable_preprocessing()`, but it keeps some preprocessing).
Returns
-------
The `AgMLDataLoader` object.
"""
self._manager.update_train_state(None)
return self
def on_epoch_end(self):
"""Shuffles the dataset on the end of an epoch for a Keras sequence.
If `as_keras_sequence()` is called and the `AgMLDataLoader` inherits
from `tf.keras.utils.Sequence`, then this method will shuffle the
dataset on the end of each epoch to improve training.
"""
self._manager._maybe_shuffle()
def as_keras_sequence(self) -> "AgMLDataLoader":
"""Sets the `DataLoader` in TensorFlow mode.
This TensorFlow extension converts the loader into a TensorFlow mode,
adding inheritance from the superclass `keras.utils.Sequence` to enable
it to be used directly in a Keras pipeline, and adding extra preprocessing
to the images and annotations to make them compatible with TensorFlow.
The main features added on enabling this include:
1. Conversion of output images and annotations to `tf.Tensor`s.
2. Adding an implicit batch size dimension to images even when the
data is not batched (for compatibility in `Model.fit()`).
3. Adding inheritance from `keras.utils.Sequence` so that any
`AgMLDataLoader` object can be used directly in `Model.fit()`.
4. Setting the data loader to use a constant image shape, namely
`auto` (which will default to (512, 512) if none is found).
This can be overridden by manually setting the image shape
parameter back after running this method. Note that this may
result in errors when attempting implicit tensor conversion.
Returns
-------
The `AgMLDataLoader` object.
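A minimal sketch of the intended usage (the dataset name is a placeholder
and `model` stands in for any compiled Keras model):
> loader = AgMLDataLoader('some_dataset').as_keras_sequence()
> model.fit(loader, epochs=10)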
"""
_add_dataset_to_mro(self, "tf")
self._manager.update_train_state("tf")
return self
def as_torch_dataset(self) -> "AgMLDataLoader":
"""Sets the `DataLoader` in PyTorch mode.
This PyTorch extension converts the loader into a PyTorch mode, adding
inheritance from the superclass `torch.utils.data.Dataset` to enable it to
be used directly in a PyTorch pipeline, and adding extra preprocessing to
the images and annotations to make them compatible with PyTorch.
The main features added on enabling this include:
1. Conversion of output images and annotations to `torch.Tensor`s.
2. Converting the channel format of the input images from the default,
channels_last, into channels_first (NHWC -> NCHW).
3. Adding inheritance from `torch.utils.data.Dataset` so that any
`AgMLDataLoader` object can be used with a `torch.utils.data.DataLoader`.
4. Setting the data loader to use a constant image shape, namely
`auto` (which will default to (512, 512) if none is found).
This can be overridden by manually setting the image shape
parameter back after running this method. Note that this may
result in errors when attempting implicit tensor conversion.
Returns
-------
The `AgMLDataLoader` object.
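A minimal sketch of the intended usage (the dataset name and batch size
are illustrative):
> import torch
> loader = AgMLDataLoader('some_dataset').as_torch_dataset()
> torch_loader = torch.utils.data.DataLoader(loader, batch_size=8)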
"""
_add_dataset_to_mro(self, "torch")
self._manager.update_train_state("torch")
return self
@property
def shuffle_data(self):
"""Returns whether the loader is set to shuffle data or not.
By default, if no value is passed in initialization, this is set to
`True`. It can be manually toggled to `False` using this property.
"""
return self._manager._shuffle
@shuffle_data.setter
def shuffle_data(self, value):
"""Set whether the loader should shuffle data or not.
This can be used to enable/disable shuffling, by passing
either `True` or `False`, respectively.
"""
if not isinstance(value, bool):
raise TypeError("Expected either `True` or `False` for 'shuffle_data'.")
self._manager._shuffle = value
def shuffle(self, seed=None):
"""Potentially shuffles the contents of the loader.
If shuffling is enabled on this loader (`shuffle = False` has
not been passed to the instantiation), then this method will
shuffle the order of contents in it. A seed can be provided to
shuffle the dataset to an expected order.
If the data is already batched, then the batch contents will be
shuffled. For instance, if we have data batches [[1, 2], [3, 4]],
then the shuffling result will be [[3, 4], [1, 2]]. If you want
all of the contents to be shuffled, call `shuffle` before batching.
Note that the data is automatically shuffled upon instantiation,
unless the `shuffle = False` parameter is passed at instantiation.
However, this disables automatic shuffling for the class
permanently, and this method must be called to shuffle the data.
Parameters
----------
seed : int, optional
A pre-determined seed for shuffling.
Returns
-------
The `AgMLDataLoader` object.
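For example (the seed value is arbitrary):
> loader.shuffle(seed=42) # reproducible shuffle
> loader.shuffle() # random shuffle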
"""
self._manager.shuffle(seed=seed)
return self
def take_images(self):
"""Returns a mini-loader over all of the images in the dataset.
This method returns a mini-loader over all of the images in the dataset,
without any annotations. This is useful for running inference over just
the images in a dataset, or in general any operations in which you just
want the raw image data from a loader, without any corresponding labels.
Returns
-------
An `agml.data.ImageLoader` with the dataset images.
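For example (assuming the returned `ImageLoader` is iterated in the same
way as the parent loader):
> image_loader = loader.take_images()
> for image in image_loader: ... # run inference on each image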
"""
from agml.data.image_loader import ImageLoader
return ImageLoader(self)
def take_dataset(self, name) -> "AgMLDataLoader":
"""Takes one of the datasets in a multi-dataset loader.
This method selects one of the datasets (as denoted by `name`)
in this multi-dataset collection and returns an `AgMLDataLoader`
with its contents. These contents will be subject to any transforms
and modifications as applied by the main loader, but the returned
loader will be a copy, such that any new changes made to the main
multi-dataset loader will not affect the new loader.
Note that this method only works for multi-dataset collections.
Parameters
----------
name : str
The name of one of the sub-datasets of the loader.
Returns
-------
An `AgMLDataLoader`.
"""
raise ValueError("The `loader.take_dataset` method only works for multi-dataset loaders.")
def take_class(self, classes, reindex=True) -> "AgMLDataLoader":
"""Reduces the dataset to a subset of class labels.
This method, given a set of either integer or string class labels,
will return a new `AgMLDataLoader` containing a subset of the
original dataset, where the only classes in the dataset are those
specified in the `classes` argument.
The new loader will have info parameters like `num_classes` and
`class_to_num` updated for the new set of classes; however, the
original `info` metadata will remain unchanged.
Note that if the dataset contains images which have bounding boxes
corresponding to multiple classes, this method will not work.
Parameters
----------
classes : list, int, str
Either a single integer/string for a single class, or a list
of integers or strings for multiple classes. Integers should
be one-indexed for object detection.
reindex : bool
Re-indexes all of the new classes starting from 1, in ascending
order based on their number in the original dataset.
Notes
-----
This method only works for object detection datasets.
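A minimal sketch, assuming an object detection loader whose classes
include 'apple' and 'leaf' (these class names are placeholders):
> apples_only = loader.take_class('apple')
> subset = loader.take_class(['apple', 'leaf'], reindex=True)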
"""
if self._info.tasks.ml != "object_detection":
raise RuntimeError("The `take_class` method can only be " "used for object detection datasets.")
# Parse the provided classes and determine their numerical labels.
if isinstance(classes, str):
if classes not in self.classes:
raise ValueError(
f"Received a class '{classes}' for `loader.take_class`, "
f"which is not in the classes for {self.name}: {self.classes}"
)
classes = [self.class_to_num[classes]]
elif isinstance(classes, int):
try:
self.num_to_class[classes]
except IndexError:
raise ValueError(
f"The provided class number {classes} is out of "
f"range for {self.num_classes} classes. Make sure "
f"you are using zero-indexing."
)
classes = [classes]
else:
parsed_classes = []
if isinstance(classes[0], str):
for cls in classes:
if cls not in self.classes:
raise ValueError(
f"Received a class '{cls}' for `loader.take_class`, which "
f"is not in the classes for {self.name}: {self.classes}"
)
parsed_classes.append(self.class_to_num[cls])
elif isinstance(classes[0], int):
for cls in classes:
try:
self.num_to_class[cls]
except IndexError:
raise ValueError(
f"The provided class number {cls} is out of "
f"range for {self.num_classes} classes. Make "
f"sure you are using zero-indexing."
)
parsed_classes.append(cls)
classes = parsed_classes.copy()
# Ensure that there are no images with multi-category boxes.
categories = self._builder._labels_for_image
if not all(len(np.unique(c)) == 1 for c in categories.values()):
raise ValueError(
f"Dataset {self.name} has images with multiple categories for "
f"bounding boxes, cannot take an individual set of classes."