Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion runtime/onert/api/python/package/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# Define the public API of the onert package
__all__ = ["infer", "tensorinfo"]
__all__ = ["infer", "tensorinfo", "experimental"]

# Import and expose the infer module's functionalities
from . import infer

# Import and expose tensorinfo
from .common import tensorinfo as tensorinfo

# Import and expose the experimental module's functionalities
from . import experimental
3 changes: 3 additions & 0 deletions runtime/onert/api/python/package/experimental/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public API of the experimental package: currently only the training subpackage.
__all__ = ["train"]

# Import and expose the train subpackage (mirrors the style of the parent package).
from . import train
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Re-export TrainSession under the lowercase name `session`.
# NOTE(review): aliasing a class to a lowercase name is unconventional; it appears
# deliberate (it matches the name declared in __all__ below) — confirm with the API owners.
from .session import TrainSession as session
# Native pybind binding for training configuration, re-exported for convenience.
from onert.native.libnnfw_api_pybind import traininfo
from .dataloader import DataLoader
from . import optimizer
from . import losses
from . import metrics

# Public API of the train package.
__all__ = ["session", "traininfo", "DataLoader", "optimizer", "losses", "metrics"]
231 changes: 231 additions & 0 deletions runtime/onert/api/python/package/experimental/train/dataloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
import os
import numpy as np


class DataLoader:
    """
    A flexible DataLoader to manage training and validation data.

    Automatically detects whether inputs are file paths or NumPy arrays,
    and precomputes fixed-size batches for iteration. The first dimension
    of every array is treated as the sample (batch) dimension.
    """
    def __init__(self,
                 input_dataset,
                 expected_dataset,
                 batch_size,
                 input_shape=None,
                 expected_shape=None,
                 dtype=np.float32):
        """
        Initialize the DataLoader.

        Args:
            input_dataset (list of np.ndarray, np.ndarray, or str): Input data,
                either as arrays (first dimension = samples) or a file path.
            expected_dataset (list of np.ndarray, np.ndarray, or str): Expected
                (target) data in the same forms as ``input_dataset``.
            batch_size (int): Number of samples per batch.
            input_shape (tuple, optional): Shape of the input data if raw format is used.
            expected_shape (tuple, optional): Shape of the expected data if raw format is used.
            dtype (type, optional): Data type of the raw file (default: np.float32).

        Raises:
            ValueError: If the datasets are empty, malformed, or disagree on
                the number of samples.
        """
        self.batch_size = batch_size
        self.inputs = self._process_dataset(input_dataset, input_shape, dtype)
        self.expecteds = self._process_dataset(expected_dataset, expected_shape, dtype)

        # Guard against empty dataset lists, which would otherwise surface as
        # a bare IndexError below.
        if not self.inputs or not self.expecteds:
            raise ValueError("Input data and expected data must not be empty.")

        # Verify data consistency: input and expected sides must agree on the
        # number of samples (first dimension of the first array on each side).
        self.num_samples = self.inputs[0].shape[0]
        if self.num_samples != self.expecteds[0].shape[0]:
            raise ValueError(
                "Input data and expected data must have the same number of samples.")

        # Precompute batches once; iteration then just indexes into these lists.
        self.batched_inputs, self.batched_expecteds = self._create_batches()

        # Iterator cursor; reset by __iter__ and advanced by __next__.
        # Initialized here so __next__ is safe even without a prior __iter__ call.
        self.index = 0

    def _process_dataset(self, data, shape, dtype=np.float32):
        """
        Normalize a dataset argument into a list of NumPy arrays.

        Args:
            data (list of np.ndarray, np.ndarray, or str): Arrays, a single
                array, or a path to a ``.npy``/raw file.
            shape (tuple, optional): Shape of the data if raw format is used.
            dtype (type, optional): Data type for raw files.

        Returns:
            list of np.ndarray: Loaded or passed data as NumPy arrays.

        Raises:
            ValueError: If ``data`` is not one of the supported forms.
        """
        if isinstance(data, list):
            # A list must contain only NumPy arrays; it is passed through as-is.
            if all(isinstance(item, np.ndarray) for item in data):
                return data
            raise ValueError("All elements in the list must be NumPy arrays.")
        if isinstance(data, np.ndarray):
            # A single multi-dimensional array is split along its first axis
            # into a list of per-sample arrays; a 1-D array is wrapped whole.
            if len(data.shape) > 1:
                return [data[i] for i in range(data.shape[0])]
            else:
                return [data]
        elif isinstance(data, str):
            # A string is assumed to be a file path (.npy or raw binary).
            return [self._load_data(data, shape, dtype)]
        else:
            raise ValueError("Data must be a NumPy array or a valid file path.")

    def _load_data(self, file_path, shape, dtype=np.float32):
        """
        Load data from a file, supporting both .npy and raw formats.

        Args:
            file_path (str): Path to the file to load.
            shape (tuple, optional): Shape of the data if raw format is used.
            dtype (type, optional): Data type of the raw file (default: np.float32).

        Returns:
            np.ndarray: Loaded data as a NumPy array.

        Raises:
            ValueError: If the extension is unsupported, or a raw file is
                given without a shape.
        """
        _, ext = os.path.splitext(file_path)

        if ext == ".npy":
            return np.load(file_path)
        elif ext in [".bin", ".raw"]:
            # Raw binaries carry no shape metadata, so the caller must supply it.
            if shape is None:
                raise ValueError(f"Shape must be provided for raw file: {file_path}")
            return self._load_raw(file_path, shape, dtype)
        else:
            raise ValueError(f"Unsupported file format: {ext}")

    def _load_raw(self, file_path, shape, dtype):
        """
        Load raw binary data and reshape it.

        Args:
            file_path (str): Path to the raw binary file.
            shape (tuple): Shape of the data to reshape into.
            dtype (type): Data type of the binary file.

        Returns:
            np.ndarray: Loaded data as a NumPy array.

        Raises:
            ValueError: If the file size does not match ``shape`` and ``dtype``.
        """
        # Validate the file size against the expected element count before reading,
        # so a mismatched shape/dtype fails with a clear message.
        expected_elements = np.prod(shape)
        expected_size = expected_elements * np.dtype(dtype).itemsize
        actual_size = os.path.getsize(file_path)

        if actual_size != expected_size:
            raise ValueError(
                f"Raw file size ({actual_size} bytes) does not match the expected size "
                f"({expected_size} bytes) based on the provided shape {shape} and dtype {dtype}."
            )

        with open(file_path, "rb") as f:
            data = f.read()
        array = np.frombuffer(data, dtype=dtype)
        if array.size != expected_elements:
            raise ValueError(
                f"Raw data size does not match the expected shape: {shape}. "
                f"Expected {expected_elements} elements, got {array.size} elements.")
        return array.reshape(shape)

    def _slice_batch(self, arrays, batch_start, batch_end):
        """
        Slice each array to one batch, padding a short final batch.

        Args:
            arrays (list of np.ndarray): Per-tensor sample arrays.
            batch_start (int): First sample index of the batch (inclusive).
            batch_end (int): Last sample index of the batch (exclusive).

        Returns:
            list of np.ndarray: One slice per array, each with exactly
            ``batch_size`` samples.
        """
        batch = [array[batch_start:batch_end] for array in arrays]
        if batch_end - batch_start < self.batch_size:
            # np.resize repeats the data cyclically so the last batch still
            # has exactly batch_size samples.
            batch = [
                np.resize(part, (self.batch_size, *part.shape[1:])) for part in batch
            ]
        return batch

    def _create_batches(self):
        """
        Precompute batches for inputs and expected outputs.

        Returns:
            tuple: Lists of batched inputs and batched expecteds.
        """
        batched_inputs = []
        batched_expecteds = []

        for batch_start in range(0, self.num_samples, self.batch_size):
            batch_end = min(batch_start + self.batch_size, self.num_samples)
            # The same slicing/padding rule applies to both sides of the dataset.
            batched_inputs.append(self._slice_batch(self.inputs, batch_start, batch_end))
            batched_expecteds.append(
                self._slice_batch(self.expecteds, batch_start, batch_end))

        return batched_inputs, batched_expecteds

    def __iter__(self):
        """
        Make the DataLoader iterable.

        Returns:
            self
        """
        self.index = 0
        return self

    def __next__(self):
        """
        Return the next batch of data.

        Returns:
            tuple: (inputs, expecteds) for the next batch.

        Raises:
            StopIteration: When all precomputed batches have been consumed.
        """
        if self.index >= len(self.batched_inputs):
            raise StopIteration

        input_batch = self.batched_inputs[self.index]
        expected_batch = self.batched_expecteds[self.index]

        self.index += 1
        return input_batch, expected_batch

    def split(self, validation_split):
        """
        Split the data into training and validation sets.

        Args:
            validation_split (float): Ratio of validation data. Must be between 0.0 and 1.0.

        Returns:
            tuple: Two DataLoader instances, one for training and one for validation.

        Raises:
            ValueError: If ``validation_split`` is outside [0.0, 1.0].
        """
        if not (0.0 <= validation_split <= 1.0):
            raise ValueError("Validation split must be between 0.0 and 1.0.")

        split_index = int(len(self.inputs[0]) * (1.0 - validation_split))

        train_inputs = [input_array[:split_index] for input_array in self.inputs]
        val_inputs = [input_array[split_index:] for input_array in self.inputs]
        train_expecteds = [
            expected_array[:split_index] for expected_array in self.expecteds
        ]
        val_expecteds = [
            expected_array[split_index:] for expected_array in self.expecteds
        ]

        train_loader = DataLoader(train_inputs, train_expecteds, self.batch_size)
        val_loader = DataLoader(val_inputs, val_expecteds, self.batch_size)

        return train_loader, val_loader
2 changes: 1 addition & 1 deletion runtime/onert/core/src/compiler/train/TrainingCompiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ std::shared_ptr<CompilerArtifact> TrainingCompiler::compile(void)
auto &input = trainable_subg->operands().at(ind);
auto new_shape = input.info().shape();
// TODO Consider batch size index
if (new_shape.dim(0) != 1)
if (new_shape.dim(0) != 1 && new_shape.dim(0) != ir::Shape::kUnspecifiedDim)
throw std::runtime_error("the first dim is not 1. It is not supported yet.");
new_shape.dim(0) = _training_info.batchSize();
input.info().shape(new_shape);
Expand Down