Add Batch Creation using Zero Copy in Memory #4

Open · wants to merge 6 commits into base: main
46 changes: 46 additions & 0 deletions prototype_experimental/batch_generator.py
@@ -0,0 +1,46 @@
import ROOT
import numpy as np


class Generator:
    def __init__(self, batch_size, x_rdf, nevt):
        self.batch_size = batch_size
        self.x_rdf = x_rdf
        self.nevt = nevt
        self.generator = ROOT.Generator_t()

    def generator_functor(self):
        '''
        Calls the C++ generator and returns a NumPy array.
        RDF -> RTensor -> C++ array -> NumPy array
        '''
        batch = self.generator(self.batch_size, self.x_rdf, self.nevt)
        batch_shape = list(batch.GetShape())  # RTensor shape as cppyy.gbl.std.vector<unsigned long>
        batch_data = batch.GetData()          # RTensor data as cppyy.LowLevelView
        # Flatten the LowLevelView so NumPy sees the full buffer
        batch_data.reshape((int(batch_shape[0] * batch_shape[1]),))
        # np.asarray shares the RTensor buffer (no copy); reshape restores the batch shape
        reshaped_batch_data = np.asarray(batch_data).reshape(batch_shape)
        return reshaped_batch_data

x_rdf = ROOT.RDataFrame("sig_tree", "http://root.cern.ch/files/Higgs_data.root", ["jet1_phi", "jet1_eta", "jet1_pt", "jet2_phi", "jet2_pt"])

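# JIT-compile the C++ generator functor with cling so that ROOT.Generator_t
# becomes available to Python through PyROOT/cppyy.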
print("compiling Generator functor....")
ROOT.gInterpreter.ProcessLine('#include "batch_generator_functor.h"')

batch_size = 4
batch_num = 0
nevt = 16

generator_class = Generator(batch_size, x_rdf, nevt)

while True:
    batch = generator_class.generator_functor()
    if len(batch) == 0:
        break
    print("Batch No.", batch_num)
    print("Generator: ", batch)
    batch_num += 1
137 changes: 137 additions & 0 deletions prototype_experimental/batch_generator_dataloader.C
@@ -0,0 +1,137 @@
#include <iostream>
#include <tuple>
#include <vector>
#include <algorithm>
#include <numeric>   // std::iota
#include <random>    // std::mt19937, std::shuffle

#include "TMVA/RTensor.hxx"
#include "ROOT/RDataFrame.hxx"

// Primary template for the DataLoader class.
// Required so that the partial specialization below can be declared.
template <typename F, typename U>
class DataLoader;

// DataLoader class used to load the content of an RDataFrame into an RTensor.
template <typename T, std::size_t... N>
class DataLoader<T, std::index_sequence<N...>>
{
    // Helper alias that makes std::make_index_sequence usable here:
    // expanding AlwaysT<N>... yields one T per index.
    // Code is based on the SofieFunctorHelper.
    template <std::size_t Idx>
    using AlwaysT = T;

    std::vector<std::vector<T>> fInput;

private:
    size_t num_rows, num_columns, current_row = 0;
    bool random_order;

    std::vector<size_t> row_order;
    TMVA::Experimental::RTensor<float>& x_tensor;

public:
    DataLoader(TMVA::Experimental::RTensor<float>& x_tensor, const size_t num_columns, const size_t num_rows,
               bool random_order = true)
        : num_rows(num_rows), num_columns(num_columns), random_order(random_order), x_tensor(x_tensor)
    {
        // Create a vector with the row indices 0 ... num_rows - 1
        row_order = std::vector<size_t>(num_rows);
        std::iota(row_order.begin(), row_order.end(), 0);

        // Randomize the order (std::shuffle replaces std::random_shuffle, which was removed in C++17)
        if (random_order) {
            std::mt19937 rng{std::random_device{}()};
            std::shuffle(row_order.begin(), row_order.end(), rng);
        }
    }

    // Assign the values of a given row to the TMVA::Experimental::RTensor
    template <typename First_T>
    void assign_to_tensor(size_t offset, size_t i, First_T first)
    {
        x_tensor.GetData()[offset + i] = first;
    }
    template <typename First_T, typename... Rest_T>
    void assign_to_tensor(size_t offset, size_t i, First_T first, Rest_T... rest)
    {
        x_tensor.GetData()[offset + i] = first;
        assign_to_tensor(offset, ++i, std::forward<Rest_T>(rest)...);
    }

    // Load the values of one event into a (possibly shuffled) row of the tensor
    void operator()(AlwaysT<N>... values)
    {
        if (current_row >= num_rows)
            return;

        assign_to_tensor(row_order[current_row] * num_columns, 0, std::forward<AlwaysT<N>>(values)...);

        current_row++;
    }
};
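// Example: DataLoader<float, std::make_index_sequence<4>> expands operator() to
// operator()(float, float, float, float), which matches an RDataFrame::Foreach
// call over four columns.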

class Generator_t
{
private:
    size_t current_row = 0, batch_size, num_rows, num_columns;
    TMVA::Experimental::RTensor<float>& x_tensor;
    bool drop_last;

public:
    Generator_t(TMVA::Experimental::RTensor<float>& x_tensor, const size_t batch_size, const size_t num_rows,
                const size_t num_columns, bool drop_last = true)
        : batch_size(batch_size), num_rows(num_rows), num_columns(num_columns), x_tensor(x_tensor), drop_last(drop_last) {}

    // Return a batch from the data
    TMVA::Experimental::RTensor<float> operator()()
    {
        if (current_row + batch_size <= num_rows)
        {
            unsigned long offset = current_row * num_columns;
            TMVA::Experimental::RTensor<float> x_batch(x_tensor.GetData() + offset, {batch_size, num_columns});

            current_row += batch_size;
            return x_batch;
        }
        else
        {
            // TODO: Implement drop_last
            return x_tensor.Slice({{0, 0}, {0, 0}});
        }
    }

    bool HasData() { return (current_row + batch_size <= num_rows); }
};
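// Note: the RTensor returned by operator() is constructed from a pointer into
// x_tensor's buffer, so each batch is a zero-copy view of the already-loaded
// data rather than a new allocation.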


void batch_generator_dataloader()
{
    // Define variables
    std::vector<std::string> cols = {"m_jj", "m_jjj", "m_jlv", "m_lv"};
    size_t batch_size = 2, start_row = 5, num_rows = 5, num_columns = cols.size();
    bool random_order = false, drop_last = false;

    // Load the RDataFrame and create a new tensor
    ROOT::RDataFrame x_rdf = ROOT::RDataFrame("sig_tree", "http://root.cern.ch/files/Higgs_data.root");
    TMVA::Experimental::RTensor<float> x_tensor({num_rows, num_columns});

    // Fill the RTensor with the data from the RDataFrame
    DataLoader<float, std::make_index_sequence<4>>
        func(x_tensor, num_columns, num_rows, random_order);

    x_rdf.Range(start_row, start_row + num_rows).Foreach(func, cols);

    // Define the generator
    Generator_t generator(x_tensor, batch_size, num_rows, num_columns, drop_last);

    // Generate new batches until all data has been returned
    while (generator.HasData()) {
        auto batch = generator();

        std::cout << batch << std::endl;
    }
}

int main()
{
    batch_generator_dataloader();
    return 0;
}
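// The file can also be run as a ROOT macro (e.g. root -l -q batch_generator_dataloader.C),
// in which case the function with the same name as the file is executed automatically;
// main() additionally allows building it as a standalone program.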
94 changes: 94 additions & 0 deletions prototype_experimental/batch_generator_functor.h
@@ -0,0 +1,94 @@
#include <cstddef>
#include <memory>
#include <string>
#include <vector>

#include "ROOT/RDataFrame.hxx"
#include "TMVA/RTensor.hxx"

using namespace ROOT;
using namespace TMVA::Experimental;

template <size_t NCols, typename First, typename... Rest>
class HelperFunc
{
    static_assert(1 + sizeof...(Rest) == NCols, "number of argument types must match NCols");

private:
    // For N = 0, 1, ..., NCols - 1
    template <size_t N>
    struct AssignToTensor
    {
        template <typename FirstArgs, typename... RestArgs>
        static void Call(TMVA::Experimental::RTensor<float> &x,
                         const size_t offset,
                         FirstArgs &&first, RestArgs &&...rest)
        {
            // Assign x[offset + N] = first
            x.GetData()[offset + N] = first;
            // Recurse: assign x[offset + N + 1] = first element of rest...
            AssignToTensor<N + 1>::Call(x, offset, std::forward<RestArgs>(rest)...);
        }
    };
    // Stop at N = NCols, do nothing
    template <>
    struct AssignToTensor<NCols>
    {
        template <typename... Args>
        static void Call(TMVA::Experimental::RTensor<float> &, const size_t offset,
                         Args...) {}
    };

private:
    size_t offset = 0;
    TMVA::Experimental::RTensor<float> &fTensor;

public:
    HelperFunc(TMVA::Experimental::RTensor<float> &x_tensor)
        : fTensor(x_tensor) {}

    void operator()(First first, Rest... rest)
    {
        // Stop writing once the tensor is full: Foreach visits every event in the
        // RDataFrame, but the tensor only has room for nevt rows.
        if (offset + NCols > fTensor.GetSize())
            return;
        AssignToTensor<0>::Call(fTensor, offset, std::forward<First>(first), std::forward<Rest>(rest)...);
        offset += NCols;
    }
};
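// Example instantiation (matching the five jet columns used below):
//   HelperFunc<5, float &, float &, float &, float &, float &> func(x_tensor);
//   x_rdf.Foreach(func, {"jet1_phi", "jet1_eta", "jet1_pt", "jet2_phi", "jet2_pt"});
// fills one tensor row of five floats per processed event.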

class Generator_t
{
private:
    size_t i = 0;
    // The tensor is kept in the generator object so that the zero-copy batch
    // views returned by operator() remain valid after the call returns.
    std::unique_ptr<TMVA::Experimental::RTensor<float>> x_tensor;

public:
    RTensor<float> operator()(const size_t batch_size, RDataFrame &x_rdf,
                              const size_t nevt)
    {
        // TODO: make the column input dynamic with std::make_index_sequence
        std::vector<std::string> cols = {"jet1_phi", "jet1_eta", "jet1_pt", "jet2_phi", "jet2_pt"};

        // Fill the tensor from the RDataFrame once, on the first call
        if (!x_tensor)
        {
            x_tensor = std::make_unique<TMVA::Experimental::RTensor<float>>(
                std::vector<std::size_t>{nevt, 5});
            HelperFunc<5, float &, float &, float &, float &, float &> func(*x_tensor);
            x_rdf.Foreach(func, cols);
        }

        auto data_len = x_tensor->GetShape()[0];
        auto num_column = x_tensor->GetShape()[1];

        if (i + batch_size <= data_len)
        {
            unsigned long offset = i * num_column;
            RTensor<float> x_batch(x_tensor->GetData() + offset,
                                   {batch_size, num_column});

            i += batch_size;

            return x_batch;
        }
        else
        {
            // No full batch left: return an empty slice to signal the caller to stop
            return x_tensor->Slice({{0, 0}, {0, 0}});
        }
    }
};
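// From Python (via PyROOT/cppyy), after JIT-compiling this header the generator
// is used as in batch_generator.py:
//   gen = ROOT.Generator_t()
//   batch = gen(batch_size, x_rdf, nevt)   # returns an RTensor view of the next batch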
71 changes: 71 additions & 0 deletions prototype_experimental/bg_keras_exp.py
@@ -0,0 +1,71 @@
import ROOT
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Flatten
import tensorflow as tf

class Generator:
    def __init__(self, batch_size, x_rdf, y_rdf, nevt):
        self.batch_size = batch_size
        self.x_rdf = x_rdf
        self.y_rdf = y_rdf
        self.nevt = nevt
        self.x_generator = ROOT.Generator_t()
        self.y_generator = ROOT.Generator_t()

    def generator_batches(self):
        '''
        Calls the C++ generators and returns a pair of NumPy arrays.
        RDF -> RTensor -> C++ array -> NumPy array
        '''

        # x_batch creation
        x_batch = self.x_generator(self.batch_size, self.x_rdf, self.nevt)
        x_batch_shape = list(x_batch.GetShape())  # RTensor shape as cppyy.gbl.std.vector<unsigned long>
        x_batch_data = x_batch.GetData()          # RTensor data as cppyy.LowLevelView
        # Flatten the LowLevelView so NumPy sees the full buffer
        x_batch_data.reshape((int(x_batch_shape[0] * x_batch_shape[1]),))
        # np.asarray shares the RTensor buffer (no copy); reshape restores the batch shape
        x_reshaped_batch_data = np.asarray(x_batch_data).reshape(x_batch_shape)

        # y_batch creation, following the same steps
        y_batch = self.y_generator(self.batch_size, self.y_rdf, self.nevt)
        y_batch_shape = list(y_batch.GetShape())
        y_batch_data = y_batch.GetData()
        y_batch_data.reshape((int(y_batch_shape[0] * y_batch_shape[1]),))
        y_reshaped_batch_data = np.asarray(y_batch_data).reshape(y_batch_shape)

        return x_reshaped_batch_data, y_reshaped_batch_data

x_rdf = ROOT.RDataFrame("bkg_tree", "./Higgs_data.root", ["jet1_phi", "jet1_eta", "jet1_pt", "jet2_phi", "jet2_pt"])
y_rdf = ROOT.RDataFrame("bkg_tree", "./Higgs_data.root", ["jet1_phi", "jet1_eta", "jet1_pt", "jet2_phi", "jet2_pt"])

print("compiling Generator functor....")
ROOT.gInterpreter.ProcessLine('#include "batch_generator_functor.h"')

batch_size = 4
nevt = 16

generator_class = Generator(batch_size, x_rdf, y_rdf, nevt)

model = Sequential()
model.add(Dense(12, input_dim=5, activation="relu"))  # 5 input features: the five jet columns above
model.add(Dense(12, activation="relu"))
model.add(Dense(1, activation="sigmoid"))  # expects labels of shape (batch_size, 1)
model.summary()
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


def batch_stream():
    # model.fit expects an iterable of (x, y) batches, so the per-batch call is
    # wrapped in a Python generator. Note that the C++ generator is not reset,
    # so this prototype only provides a single pass over the data.
    while True:
        x_batch, y_batch = generator_class.generator_batches()
        if len(x_batch) == 0:
            return
        yield x_batch, y_batch


model.fit(batch_stream(), steps_per_epoch=nevt // batch_size, epochs=20)