Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -228,12 +228,8 @@ add_dependencies(TinyLayerTest ChAI)
target_link_options(TinyLayerTest
PRIVATE
--main-module layer_test.chpl
-M ${PROJECT_ROOT_DIR}/lib
${BRIDGE_DIR}/include/bridge.h
${BRIDGE_OBJECT_FILES}
-L ${LIBTORCH_DIR}/lib
${LIBTORCH_LIBS_LINKER_ARGS}
--ldflags "-Wl,-rpath,${LIBTORCH_DIR}/lib"
# -M ${PROJECT_ROOT_DIR}/lib
${CHAI_LINKER_ARGS}
)
# chpl test/tiny/layer_test.chpl -M lib bridge/include/bridge.h build/CMakeFiles/bridge.dir/bridge/lib/bridge.cpp.o -L libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps --ldflags "-Wl,-rpath,libtorch/lib"

Expand Down
Binary file modified bridge/.DS_Store
Binary file not shown.
6 changes: 4 additions & 2 deletions bridge/include/bridge.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;

void debug_cpu_only_mode(bool_t mode);

typedef struct bridge_tensor_t {
float* data;
int* sizes;
Expand Down Expand Up @@ -51,9 +53,9 @@ bridge_tensor_t load_run_model(const uint8_t* model_path, bridge_tensor_t input)

bridge_pt_model_t load_model(const uint8_t* model_path);

bridge_tensor_t model_forward(bridge_pt_model_t model, bridge_tensor_t input);

bool_t accelerator_available(void);

bridge_tensor_t model_forward(bridge_pt_model_t model, bridge_tensor_t input);
bridge_tensor_t model_forward_style_transfer(bridge_pt_model_t model, bridge_tensor_t input);

bridge_tensor_t resize(bridge_tensor_t input,int height,int width);
Expand Down
122 changes: 83 additions & 39 deletions bridge/lib/bridge.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <bridge.h>

#include <torch/torch.h>
#include <Aten/ATen.h>

#include <torch/script.h>

// #include <torch/script.h>
Expand All @@ -27,6 +29,58 @@



// Globals


// Forward declarations: the globals below are initialised by calling these
// functions, which are defined further down in this file.
torch::Device get_best_device();
torch::ScalarType get_best_dtype();

// Device and dtype chosen once during static initialisation.
// best_device is reassigned whenever debug_cpu_only_mode() is called.
auto best_device = get_best_device();
auto best_dtype = get_best_dtype();

// File-scope guard objects that live for the whole process and are meant to
// keep autograd switched off.
// NOTE(review): grad mode is presumably thread-local in libtorch, so these may
// only affect the thread that runs static initialisation — confirm before
// relying on them for calls made from other threads.
torch::NoGradGuard no_grad;
torch::AutoGradMode enable_grad(false);

// When true, get_best_device() always returns the CPU device.
// Written by debug_cpu_only_mode().
bool debug_cpu_only = false;



/// Selects the device used for models and tensors.
///
/// CPU is returned when `debug_cpu_only` is set (the accelerator probes are
/// skipped entirely); otherwise MPS is preferred, then CUDA, then CPU.
torch::Device get_best_device() {
    auto kind = torch::kCPU;
    if (!debug_cpu_only) {
        if (torch::hasMPS()) {
            kind = torch::kMPS;
        } else if (torch::hasCUDA()) {
            kind = torch::kCUDA;
        }
    }
    return torch::Device(kind);
}

/// Turns the CPU-only debugging override on or off.
///
/// Stores `mode` in `debug_cpu_only` and re-derives both cached globals so the
/// device and dtype stay consistent with each other.
///
/// @param mode  true forces CPU execution; false restores automatic selection.
extern "C" void debug_cpu_only_mode(bool_t mode) {
    debug_cpu_only = mode;

    // get_best_device() already checks debug_cpu_only, so a single call covers
    // both the forced-CPU and the automatic case.
    best_device = get_best_device();

    // Keep the dtype in step with the device: half precision is only chosen for
    // accelerators, so a forced CPU run must fall back to float32.
    best_dtype = debug_cpu_only ? torch::kFloat32 : get_best_dtype();
}

/// Reports whether the currently selected `best_device` is an accelerator,
/// i.e. compares equal to the default CUDA device or the default MPS device.
extern "C" bool_t accelerator_available() {
    if (best_device == torch::Device(torch::kCUDA)) {
        return true;
    }
    return best_device == torch::Device(torch::kMPS);
}

/// Selects the scalar type that goes with the device picked by get_best_device().
///
/// float16 is returned when an accelerator (MPS or CUDA) is present and the
/// CPU-only debug override is off; float32 is returned otherwise.
torch::ScalarType get_best_dtype() {
    // Mirror get_best_device(): a forced CPU run uses the CPU dtype even when
    // an accelerator is present.
    if (debug_cpu_only) {
        return torch::kFloat32;
    }

    // Both accelerator back ends use half precision.
    if (torch::hasMPS() || torch::hasCUDA()) {
        return torch::kFloat16;
    }
    return torch::kFloat32;
}

int bridge_tensor_elements(bridge_tensor_t &bt) {
int size = 1;
for (int i = 0; i < bt.dim; ++i) {
Expand All @@ -39,14 +93,14 @@ size_t bridge_tensor_size(bridge_tensor_t &bt) {
return sizeof(float32_t) * bridge_tensor_elements(bt);
}

void store_tensor(torch::Tensor &input, float32_t* dest) {
void store_tensor(at::Tensor &input, float32_t* dest) {
float32_t * data = input.data_ptr<float32_t>();
size_t bytes_size = sizeof(float32_t) * input.numel();
// std::memmove(dest,data,bytes_size);
std::memcpy(dest,data,bytes_size);
}

bridge_tensor_t torch_to_bridge(torch::Tensor &tensor) {
bridge_tensor_t torch_to_bridge(at::Tensor &tensor) {
bridge_tensor_t result;
result.created_by_c = true;
result.dim = tensor.dim();
Expand All @@ -59,13 +113,13 @@ bridge_tensor_t torch_to_bridge(torch::Tensor &tensor) {
return result;
}

torch::Tensor bridge_to_torch(bridge_tensor_t &bt) {
at::Tensor bridge_to_torch(bridge_tensor_t &bt) {
std::vector<int64_t> sizes_vec(bt.sizes, bt.sizes + bt.dim);
auto shape = torch::IntArrayRef(sizes_vec);
return torch::from_blob(bt.data, shape, torch::kFloat);
}

torch::Tensor bridge_to_torch(bridge_tensor_t &bt,torch::Device device, bool copy,torch::ScalarType dtype = torch::kFloat32) {
at::Tensor bridge_to_torch(bridge_tensor_t &bt,torch::Device device, bool copy,torch::ScalarType dtype = torch::kFloat32) {
std::vector<int64_t> sizes_vec(bt.sizes, bt.sizes + bt.dim);
auto shape = torch::IntArrayRef(sizes_vec);
auto t = torch::from_blob(bt.data, shape, torch::kFloat);
Expand Down Expand Up @@ -144,6 +198,8 @@ extern "C" bridge_tensor_t load_run_model(const uint8_t* model_path, bridge_tens
}




extern "C" bridge_pt_model_t load_model(const uint8_t* model_path) {

std::cout << "Begin loading model from path: " << model_path << std::endl;
Expand All @@ -153,21 +209,12 @@ extern "C" bridge_pt_model_t load_model(const uint8_t* model_path) {
std::cout.flush();

try {

auto* module = new torch::jit::Module(torch::jit::load(path));
module->to(torch::kMPS,torch::kFloat16,false);
module->to(best_device,best_dtype,false);
module->eval();
std::cout << "Model loaded successfully!" << std::endl;
std::cout.flush();
return { static_cast<void*>(module) };

// torch::jit::Module tmp = torch::jit::load(path);
// std::cout << "Model loaded successfully!" << std::endl;
// std::cout.flush();
// auto* module = new torch::jit::Module(std::move(tmp));
// std::cout << "Model moved successfully!" << std::endl;
// std::cout.flush();
// return { static_cast<void*>(module) };
} catch (const c10::Error& e) {
std::cerr << "error loading the model\n" << e.msg();
std::cout << "error loading the model\n" << e.msg();
Expand All @@ -178,49 +225,30 @@ extern "C" bridge_pt_model_t load_model(const uint8_t* model_path) {
std::cout.flush();

return { nullptr };



// bridge_pt_model_t model_wrapper;
// torch::jit::Module* pt_module = new torch::jit::Module(); // = (torch::jit::Module*) model_wrapper.pt_module;
// try {
// *pt_module = torch::jit::load(mp);
// std::cout << "Model loaded successfully!" << std::endl;
// std::cout.flush();
// model_wrapper.pt_module = pt_module;
// } catch (const c10::Error& e) {
// std::cerr << "error loading the model\n" << e.msg();
// std::cout << "error loading the model\n" << e.msg();
// std::cout.flush();
// std::cerr.flush();
// }

// std::cout << pt_module->dump_to_str(false,false,false) << std::endl;
// std::cout.flush();

// return model_wrapper;
}



bridge_tensor_t model_forward(bridge_pt_model_t model, bridge_tensor_t input, bool is_vgg_based_model) {

auto tn_mps = bridge_to_torch(input,torch::kMPS,true,torch::kFloat16);
// auto tn_mps = tn.to(torch::kMPS,false,true);
auto tn_mps = bridge_to_torch(input,best_device,true,best_dtype);
// tn_mps = tn_mps.permute({2, 0, 1}).contiguous();
// tn_mps.unsqueeze_(0);//.contiguous();
auto tn = tn_mps.permute({2, 0, 1}).unsqueeze(0).contiguous();

std::vector<torch::jit::IValue> ins;
ins.push_back(tn);

auto* module = static_cast<torch::jit::Module*>(model.pt_module);
auto o = module->forward(ins).toTensor();
// auto tn_out = o.squeeze(0).permute({1, 2, 0}).contiguous();
auto tn_out = o.squeeze(0).contiguous().permute({1, 2, 0}).contiguous();

if (is_vgg_based_model) {
tn_out = tn_out / 255.0;
tn_out.div_(255.0);
}

auto tn_out_cpu = tn_out.to(torch::kCPU,torch::kFloat32,false,true);

return torch_to_bridge(tn_out_cpu);

}
Expand All @@ -233,6 +261,22 @@ extern "C" bridge_tensor_t model_forward_style_transfer(bridge_pt_model_t model,
return model_forward(model, input, true);
}

// std::tuple<uint64_t, uint64_t> get_cpu_frame_size(uint64_t width, uint64_t height, float32_t scale_factor) {
// // if (best_device == torch::kMPS || best_device == torch::kCUDA)
// if (accelerator_available())
// return std::make_tuple(width, height);
// uint64_t new_width = static_cast<uint64_t>(width * scale_factor);
// uint64_t new_height = static_cast<uint64_t>(height * scale_factor);
// return std::make_tuple(new_width, new_height);
// }

// extern "C" uint64_t get_cpu_frame_width(uint64_t width,float32_t scale_factor) {
// return std::get<0>(get_cpu_frame_size(width, 0, scale_factor));
// }
// extern "C" uint64_t get_cpu_frame_height(uint64_t height,float32_t scale_factor) {
// return std::get<1>(get_cpu_frame_size(0, height, scale_factor));
// }


extern "C" void hello_world(void) {
std::cout << "Hello from C++!" << std::endl;
Expand Down
1 change: 1 addition & 0 deletions demos/models/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This folder contains the model architectures used in the demos.
103 changes: 103 additions & 0 deletions demos/models/transformer_net.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import torch


class TransformerNet(torch.nn.Module):
    """Feed-forward image-to-image network.

    Layout: three convolutions (the last two with stride 2), five residual
    blocks at 128 channels, two upsampling convolutions and a final 9x9
    convolution back to 3 channels. Every convolution except the final one is
    followed by an affine instance norm and a ReLU.
    """

    def __init__(self):
        super().__init__()

        # Encoder: 3 -> 32 -> 64 -> 128 channels.
        self.conv1 = ConvLayer(3, 32, kernel_size=9, stride=1)
        self.in1 = torch.nn.InstanceNorm2d(32, affine=True)
        self.conv2 = ConvLayer(32, 64, kernel_size=3, stride=2)
        self.in2 = torch.nn.InstanceNorm2d(64, affine=True)
        self.conv3 = ConvLayer(64, 128, kernel_size=3, stride=2)
        self.in3 = torch.nn.InstanceNorm2d(128, affine=True)

        # Residual trunk.
        self.res1 = ResidualBlock(128)
        self.res2 = ResidualBlock(128)
        self.res3 = ResidualBlock(128)
        self.res4 = ResidualBlock(128)
        self.res5 = ResidualBlock(128)

        # Decoder: 128 -> 64 -> 32 -> 3 channels.
        self.deconv1 = UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2)
        self.in4 = torch.nn.InstanceNorm2d(64, affine=True)
        self.deconv2 = UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2)
        self.in5 = torch.nn.InstanceNorm2d(32, affine=True)
        self.deconv3 = ConvLayer(32, 3, kernel_size=9, stride=1)

        # Shared activation.
        self.relu = torch.nn.ReLU()

    def forward(self, X):
        """Run ``X`` through encoder, residual trunk and decoder; return the result."""
        # Encoder
        feat = self.conv1(X)
        feat = self.relu(self.in1(feat))
        feat = self.conv2(feat)
        feat = self.relu(self.in2(feat))
        feat = self.conv3(feat)
        feat = self.relu(self.in3(feat))

        # Residual trunk
        feat = self.res1(feat)
        feat = self.res2(feat)
        feat = self.res3(feat)
        feat = self.res4(feat)
        feat = self.res5(feat)

        # Decoder (no norm or activation after the last convolution)
        feat = self.deconv1(feat)
        feat = self.relu(self.in4(feat))
        feat = self.deconv2(feat)
        feat = self.relu(self.in5(feat))
        return self.deconv3(feat)


class ConvLayer(torch.nn.Module):
    """2-D convolution preceded by reflection padding.

    The input is reflection-padded by ``kernel_size // 2`` on every side and
    then passed through a ``Conv2d`` with the given kernel size and stride.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super().__init__()
        pad = kernel_size // 2
        self.reflection_pad = torch.nn.ReflectionPad2d(pad)
        self.conv2d = torch.nn.Conv2d(
            in_channels, out_channels, kernel_size=kernel_size, stride=stride
        )

    def forward(self, x):
        """Pad ``x`` by reflection, convolve it, and return the result."""
        return self.conv2d(self.reflection_pad(x))


class ResidualBlock(torch.nn.Module):
    """Residual block: conv -> instance norm -> ReLU -> conv -> instance norm, plus skip.

    introduced in: https://arxiv.org/abs/1512.03385
    recommended architecture: http://torch.ch/blog/2016/02/04/resnets.html
    """

    def __init__(self, channels):
        super().__init__()
        self.conv1 = ConvLayer(channels, channels, kernel_size=3, stride=1)
        self.in1 = torch.nn.InstanceNorm2d(channels, affine=True)
        self.conv2 = ConvLayer(channels, channels, kernel_size=3, stride=1)
        self.in2 = torch.nn.InstanceNorm2d(channels, affine=True)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Return the residual branch applied to ``x`` added to ``x`` itself."""
        branch = self.conv1(x)
        branch = self.relu(self.in1(branch))
        branch = self.conv2(branch)
        branch = self.in2(branch)
        # No activation after the addition.
        return branch + x


class UpsampleConvLayer(torch.nn.Module):
    """UpsampleConvLayer
    Upsamples the input and then does a convolution. This method gives better results
    compared to ConvTranspose2d.
    ref: http://distill.pub/2016/deconv-checkerboard/

    Args:
        in_channels: number of channels of the input.
        out_channels: number of channels produced by the convolution.
        kernel_size: convolution kernel size; the input is reflection-padded
            by ``kernel_size // 2`` on every side before convolving.
        stride: convolution stride.
        upsample: nearest-neighbour scale factor applied before padding.
            A falsy value (``None`` or ``0``) disables upsampling.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, upsample=None):
        super().__init__()
        # Honour the requested factor instead of a hard-coded 2. Both variants are
        # parameter-free modules, so the state_dict layout is unaffected.
        if upsample:
            self.upsample = torch.nn.Upsample(scale_factor=upsample, mode='nearest')
        else:
            self.upsample = torch.nn.Identity()
        reflection_padding = kernel_size // 2
        self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)

    def forward(self, x):
        """Upsample ``x`` (if enabled), reflection-pad it, convolve, and return the result."""
        out = self.upsample(x)
        out = self.reflection_pad(out)
        out = self.conv2d(out)
        return out
2 changes: 1 addition & 1 deletion demos/video/chapel-webcam/lib/Makefile.smol
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ CHPL_THIRD_PARTY = /opt/homebrew/Cellar/chapel/2.4.0_1/libexec/third-party

CHPL_HOME = /opt/homebrew/Cellar/chapel/2.4.0_1/libexec

CHPL_CFLAGS = -Ilib -Wno-unused -Wno-uninitialized -Wno-pointer-sign -Wno-incompatible-pointer-types -Wno-tautological-compare -I/opt/homebrew/Cellar/chapel/2.4.0_1/libexec/modules/internal -I/opt/homebrew/Cellar/chapel/2.4.0_1/libexec/modules/packages -I$(CHPL_RUNTIME_INCL)/localeModels/flat -I$(CHPL_RUNTIME_INCL)/localeModels -I$(CHPL_RUNTIME_INCL)/comm/none -I$(CHPL_RUNTIME_INCL)/comm -I$(CHPL_RUNTIME_INCL)/tasks/qthreads -I$(CHPL_RUNTIME_INCL)/. -I$(CHPL_RUNTIME_INCL)/./qio -I$(CHPL_RUNTIME_INCL)/./atomics/cstdlib -I$(CHPL_RUNTIME_INCL)/./mem/jemalloc -I$(CHPL_THIRD_PARTY)/utf8-decoder -I$(CHPL_THIRD_PARTY)/qthread/install/darwin-arm64-native-llvm-none-flat-jemalloc-system/include -Wno-error=unused-variable -I$(CHPL_THIRD_PARTY)/re2/install/darwin-arm64-native-llvm-none/include -I. -I/opt/homebrew/Cellar/gmp/6.3.0/include -I/opt/homebrew/Cellar/hwloc/2.12.0/include -I/opt/homebrew/Cellar/jemalloc/5.3.0/include -I/opt/homebrew/include
CHPL_CFLAGS = -Ilib -Wno-unused -Wno-uninitialized -Wno-pointer-sign -Wno-incompatible-pointer-types -Wno-tautological-compare -I/opt/homebrew/Cellar/chapel/2.4.0_1/libexec/modules/internal -I/opt/homebrew/Cellar/chapel/2.4.0_1/libexec/modules/packages -I../../../lib -I$(CHPL_RUNTIME_INCL)/localeModels/flat -I$(CHPL_RUNTIME_INCL)/localeModels -I$(CHPL_RUNTIME_INCL)/comm/none -I$(CHPL_RUNTIME_INCL)/comm -I$(CHPL_RUNTIME_INCL)/tasks/qthreads -I$(CHPL_RUNTIME_INCL)/. -I$(CHPL_RUNTIME_INCL)/./qio -I$(CHPL_RUNTIME_INCL)/./atomics/cstdlib -I$(CHPL_RUNTIME_INCL)/./mem/jemalloc -I$(CHPL_THIRD_PARTY)/utf8-decoder -I$(CHPL_THIRD_PARTY)/qthread/install/darwin-arm64-native-llvm-none-flat-jemalloc-system/include -Wno-error=unused-variable -I$(CHPL_THIRD_PARTY)/re2/install/darwin-arm64-native-llvm-none/include -I. -I/opt/homebrew/Cellar/gmp/6.3.0/include -I/opt/homebrew/Cellar/hwloc/2.12.0/include -I/opt/homebrew/Cellar/jemalloc/5.3.0/include -I/opt/homebrew/include

CHPL_LDFLAGS = -Llib -lsmol -ltorch -ltorch_cpu -lc10 -ltorch_global_deps -lbridge_objs -L$(CHPL_RUNTIME_LIB)/darwin/llvm/arm64/cpu-native/loc-flat/comm-none/tasks-qthreads/tmr-generic/unwind-none/mem-jemalloc/atomics-cstdlib/hwloc-system/re2-bundled/fs-none/lib_pic-none/san-none -lchpl -L$(CHPL_THIRD_PARTY)/qthread/install/darwin-arm64-native-llvm-none-flat-jemalloc-system/lib -Wl,-rpath,$(CHPL_THIRD_PARTY)/qthread/install/darwin-arm64-native-llvm-none-flat-jemalloc-system/lib -lqthread -L/opt/homebrew/Cellar/hwloc/2.12.0/lib -L$(CHPL_THIRD_PARTY)/re2/install/darwin-arm64-native-llvm-none/lib -lre2 -Wl,-rpath,$(CHPL_THIRD_PARTY)/re2/install/darwin-arm64-native-llvm-none/lib -lm -lpthread -L/opt/homebrew/Cellar/gmp/6.3.0/lib -lgmp -L/opt/homebrew/Cellar/hwloc/2.12.0/lib -Wl,-rpath,/opt/homebrew/Cellar/hwloc/2.12.0/lib -lhwloc -L/opt/homebrew/Cellar/jemalloc/5.3.0/lib -Wl,-rpath,/opt/homebrew/Cellar/jemalloc/5.3.0/lib -ljemalloc -L/opt/homebrew/lib

Expand Down
2 changes: 1 addition & 1 deletion demos/video/chapel-webcam/lib/smol.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ set(CHPL_THIRD_PARTY /opt/homebrew/Cellar/chapel/2.4.0_1/libexec/third-party)

set(CHPL_HOME /opt/homebrew/Cellar/chapel/2.4.0_1/libexec)

set(smol_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR} /opt/homebrew/Cellar/chapel/2.4.0_1/libexec/modules/internal /opt/homebrew/Cellar/chapel/2.4.0_1/libexec/modules/packages ${CHPL_RUNTIME_INCL}/localeModels/flat ${CHPL_RUNTIME_INCL}/localeModels ${CHPL_RUNTIME_INCL}/comm/none ${CHPL_RUNTIME_INCL}/comm ${CHPL_RUNTIME_INCL}/tasks/qthreads ${CHPL_RUNTIME_INCL}/. ${CHPL_RUNTIME_INCL}/./qio ${CHPL_RUNTIME_INCL}/./atomics/cstdlib ${CHPL_RUNTIME_INCL}/./mem/jemalloc ${CHPL_THIRD_PARTY}/utf8-decoder ${CHPL_THIRD_PARTY}/qthread/install/darwin-arm64-native-llvm-none-flat-jemalloc-system/include -Wno-error=unused-variable ${CHPL_THIRD_PARTY}/re2/install/darwin-arm64-native-llvm-none/include . /opt/homebrew/Cellar/gmp/6.3.0/include /opt/homebrew/Cellar/hwloc/2.12.0/include /opt/homebrew/Cellar/jemalloc/5.3.0/include /opt/homebrew/include)
set(smol_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR} /opt/homebrew/Cellar/chapel/2.4.0_1/libexec/modules/internal /opt/homebrew/Cellar/chapel/2.4.0_1/libexec/modules/packages ../../../lib ${CHPL_RUNTIME_INCL}/localeModels/flat ${CHPL_RUNTIME_INCL}/localeModels ${CHPL_RUNTIME_INCL}/comm/none ${CHPL_RUNTIME_INCL}/comm ${CHPL_RUNTIME_INCL}/tasks/qthreads ${CHPL_RUNTIME_INCL}/. ${CHPL_RUNTIME_INCL}/./qio ${CHPL_RUNTIME_INCL}/./atomics/cstdlib ${CHPL_RUNTIME_INCL}/./mem/jemalloc ${CHPL_THIRD_PARTY}/utf8-decoder ${CHPL_THIRD_PARTY}/qthread/install/darwin-arm64-native-llvm-none-flat-jemalloc-system/include -Wno-error=unused-variable ${CHPL_THIRD_PARTY}/re2/install/darwin-arm64-native-llvm-none/include . /opt/homebrew/Cellar/gmp/6.3.0/include /opt/homebrew/Cellar/hwloc/2.12.0/include /opt/homebrew/Cellar/jemalloc/5.3.0/include /opt/homebrew/include)

set(smol_LINK_LIBS -L${CMAKE_CURRENT_LIST_DIR} -lsmol -ltorch -ltorch_cpu -lc10 -ltorch_global_deps -lbridge_objs -L${CHPL_RUNTIME_LIB}/darwin/llvm/arm64/cpu-native/loc-flat/comm-none/tasks-qthreads/tmr-generic/unwind-none/mem-jemalloc/atomics-cstdlib/hwloc-system/re2-bundled/fs-none/lib_pic-none/san-none -lchpl -L${CHPL_THIRD_PARTY}/qthread/install/darwin-arm64-native-llvm-none-flat-jemalloc-system/lib -Wl,-rpath,${CHPL_THIRD_PARTY}/qthread/install/darwin-arm64-native-llvm-none-flat-jemalloc-system/lib -lqthread -L/opt/homebrew/Cellar/hwloc/2.12.0/lib -L${CHPL_THIRD_PARTY}/re2/install/darwin-arm64-native-llvm-none/lib -lre2 -Wl,-rpath,${CHPL_THIRD_PARTY}/re2/install/darwin-arm64-native-llvm-none/lib -lm -lpthread -L/opt/homebrew/Cellar/gmp/6.3.0/lib -lgmp -L/opt/homebrew/Cellar/hwloc/2.12.0/lib -Wl,-rpath,/opt/homebrew/Cellar/hwloc/2.12.0/lib -lhwloc -L/opt/homebrew/Cellar/jemalloc/5.3.0/lib -Wl,-rpath,/opt/homebrew/Cellar/jemalloc/5.3.0/lib -ljemalloc -L/opt/homebrew/lib -lsmol)

Expand Down
Loading
Loading