Add scale factor and performance optimizations to style transfer demo.

Iainmon · Iainmon · commit ef9de0c18898 · 2025-05-23T01:52:43.000-07:00
diff --git a/bridge/include/bridge.h b/bridge/include/bridge.h
@@ -51,11 +51,14 @@ bridge_tensor_t load_run_model(const uint8_t* model_path, bridge_tensor_t input)
 
 bridge_pt_model_t load_model(const uint8_t* model_path);
 
-bridge_tensor_t model_forward(bridge_pt_model_t model, bridge_tensor_t input);
-
+bool_t accelerator_available(void);
 
+bridge_tensor_t model_forward(bridge_pt_model_t model, bridge_tensor_t input);
 bridge_tensor_t model_forward_style_transfer(bridge_pt_model_t model, bridge_tensor_t input);
 
+uint64_t get_cpu_frame_width(uint64_t width, float32_t scale_factor);
+uint64_t get_cpu_frame_height(uint64_t height, float32_t scale_factor);
+
 bridge_tensor_t resize(bridge_tensor_t input,int height,int width);
 bridge_tensor_t imagenet_normalize(bridge_tensor_t input);
 
diff --git a/bridge/lib/bridge.cpp b/bridge/lib/bridge.cpp
@@ -29,9 +29,48 @@
 
 
 
+// Globals
+
+
+torch::Device get_best_device();
+torch::ScalarType get_best_dtype();
+
+auto best_device = get_best_device();
+auto best_dtype = get_best_dtype();
+
 torch::NoGradGuard no_grad;
 torch::AutoGradMode enable_grad(false);
 
+
+
+
+
+
+torch::Device get_best_device() {
+    if (torch::hasMPS()) {
+        return torch::Device(torch::kMPS);
+    } else if (torch::hasCUDA()) {
+        return torch::Device(torch::kCUDA);
+    } else {
+        return torch::Device(torch::kCPU);
+    }
+}
+
+extern "C" bool_t accelerator_available() {
+    return false;
+    // NOTE(review): real detection is disabled, so callers always take the downscale path: return torch::hasMPS() || torch::hasCUDA();
+}
+
+torch::ScalarType get_best_dtype() {
+    if (torch::hasMPS()) {
+        return torch::kFloat16;
+    } else if (torch::hasCUDA()) {
+        return torch::kFloat16;
+    } else {
+        return torch::kFloat32;
+    }
+}
+
 int bridge_tensor_elements(bridge_tensor_t &bt) {
     int size = 1;
     for (int i = 0; i < bt.dim; ++i) {
@@ -149,8 +188,6 @@ extern "C" bridge_tensor_t load_run_model(const uint8_t* model_path, bridge_tens
 }
 
 
-#define DEVICE torch::kMPS
-#define DTYPE torch::kFloat16
 
 
 extern "C" bridge_pt_model_t load_model(const uint8_t* model_path) {
@@ -163,7 +200,7 @@ extern "C" bridge_pt_model_t load_model(const uint8_t* model_path) {
 
     try {
         auto* module = new torch::jit::Module(torch::jit::load(path));
-        module->to(DEVICE,DTYPE,false);
+        module->to(best_device,best_dtype,false);
         module->eval();
         std::cout << "Model loaded successfully!" << std::endl;
         std::cout.flush();
@@ -183,24 +220,24 @@ extern "C" bridge_pt_model_t load_model(const uint8_t* model_path) {
 
 
 bridge_tensor_t model_forward(bridge_pt_model_t model, bridge_tensor_t input, bool is_vgg_based_model) {
-    auto tn_mps = bridge_to_torch(input,DEVICE,true,DTYPE);
-    tn_mps = tn_mps.permute({2, 0, 1}).contiguous();
-    tn_mps.unsqueeze_(0);//.contiguous();
-    // auto tn = tn_mps.permute({2, 0, 1}).unsqueeze(0).contiguous();
+    auto tn_mps = bridge_to_torch(input,best_device,true,best_dtype);
+    // tn_mps = tn_mps.permute({2, 0, 1}).contiguous();
+    // tn_mps.unsqueeze_(0);//.contiguous();
+    auto tn = tn_mps.permute({2, 0, 1}).unsqueeze(0).contiguous();
 
     std::vector<torch::jit::IValue> ins;
-    ins.push_back(tn_mps);
+    ins.push_back(tn);
 
     auto* module = static_cast<torch::jit::Module*>(model.pt_module);
     auto o = module->forward(ins).toTensor();
-    auto tn_out = o.squeeze(0).permute({1, 2, 0}).contiguous();
-    // auto tn_out = o.squeeze(0).contiguous().permute({1, 2, 0}).contiguous();
+    // auto tn_out = o.squeeze(0).permute({1, 2, 0}).contiguous();
+    auto tn_out = o.squeeze(0).contiguous().permute({1, 2, 0}).contiguous();
 
     if (is_vgg_based_model) {
         tn_out.div_(255.0);
     }
 
-    auto tn_out_cpu = tn_out.to(torch::kCPU,torch::kFloat32,false,false);
+    auto tn_out_cpu = tn_out.to(torch::kCPU,torch::kFloat32,false,true);
 
     return torch_to_bridge(tn_out_cpu);
 
@@ -214,6 +251,22 @@ extern "C" bridge_tensor_t model_forward_style_transfer(bridge_pt_model_t model,
     return model_forward(model, input, true);
 }
 
+std::tuple<uint64_t, uint64_t> get_cpu_frame_size(uint64_t width, uint64_t height, float32_t scale_factor) {
+    // if (best_device == torch::kMPS || best_device == torch::kCUDA)
+    if (accelerator_available())
+        return std::make_tuple(width, height);
+    uint64_t new_width = static_cast<uint64_t>(width * scale_factor);
+    uint64_t new_height = static_cast<uint64_t>(height * scale_factor);
+    return std::make_tuple(new_width, new_height);
+}
+
+extern "C" uint64_t get_cpu_frame_width(uint64_t width,float32_t scale_factor) {
+    return std::get<0>(get_cpu_frame_size(width, 0, scale_factor));
+}
+extern "C" uint64_t get_cpu_frame_height(uint64_t height,float32_t scale_factor) {
+    return std::get<1>(get_cpu_frame_size(0, height, scale_factor));
+}
+
 
 extern "C" void hello_world(void) {
     std::cout << "Hello from C++!" << std::endl;
diff --git a/demos/video/chapel-webcam/lib/smol.h b/demos/video/chapel-webcam/lib/smol.h
@@ -31,6 +31,9 @@ void chpl__init_ndarrayRandom(int64_t _ln,
                               int32_t _fn);
 void chpl__init_smol(int64_t _ln,
                      int32_t _fn);
+chpl_bool acceleratorAvailable(void);
+int64_t getCPUFrameWidth(int64_t width);
+int64_t getCPUFrameHeight(int64_t height);
 int64_t square(int64_t x);
 void printArray(chpl_external_array * a);
 void globalLoadModel(void);
diff --git a/demos/video/chapel-webcam/main.cpp b/demos/video/chapel-webcam/main.cpp
@@ -113,14 +113,34 @@ int mirror() {
     const std::string windowName = "Webcam Feed";
     cv::namedWindow(windowName, cv::WINDOW_AUTOSIZE);
 
+    cv::Size frame_size;
+    cv::Size new_frame_size;
+
     while (true) {
         // Capture a new frame from webcam
         cap >> frame;
         if (frame.empty()) {
             std::cerr << "Error: Empty frame captured.\n";
             break;
         }
+        frame_size = frame.size();
+        if (!acceleratorAvailable()) {
+            const auto width = getCPUFrameWidth(frame_size.width);
+            const auto height = getCPUFrameHeight(frame_size.height);
+            new_frame_size = cv::Size(width, height);
+        } else {
+            new_frame_size = frame_size;
+        }
+        
+        cv::resize(frame, frame, new_frame_size);
+
+        std::cout << "Frame size: " << frame.size() << std::endl;
+        std::cout << "New frame size: " << new_frame_size << std::endl;
+
         cv::Mat next_frame = new_frame(frame);
+
+        cv::resize(next_frame, next_frame, frame_size);
+
         // Display the captured frame
         cv::imshow(windowName, next_frame);
 
diff --git a/demos/video/chapel-webcam/model2.ipynb b/demos/video/chapel-webcam/model2.ipynb
diff --git a/demos/video/chapel-webcam/smol.chpl b/demos/video/chapel-webcam/smol.chpl
@@ -2,6 +2,19 @@ use Tensor;
 use Layer;
 import Utilities as util;
 
+config const cpuScaleFactor: real(32) = 0.2;
+
+writeln("CPU Scale Factor: ", cpuScaleFactor);
+
+export proc acceleratorAvailable(): bool do
+    return Bridge.acceleratorAvailable();
+
+export proc getCPUFrameWidth(width: int): int do
+    return Bridge.getCPUFrameWidth(width,cpuScaleFactor : real(32));
+
+export proc getCPUFrameHeight(height: int): int do
+    return Bridge.getCPUFrameHeight(height,cpuScaleFactor : real(32));
+
 
 export proc square(x: int): int {
     writeln(x, " * ", x, " = ", x * x);
@@ -35,7 +48,9 @@ const startTime = getTime();
 
 // ../style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float32.pt
 // ../style-transfer/models/exports/mps/udnie_float32.pt
-config const modelPath: string = "../style-transfer/models/exports/mps/starry_ep3_bt4_sw1e11_cw_1e5_float32.pt";
+// ../style-transfer/models/exports/mps/starry_ep3_bt4_sw1e11_cw_1e5_float32.pt // This is the one (previous default modelPath)
+// ../style-transfer/models/exports/cpu/mosaic_float16.pt
+config const modelPath: string = "../style-transfer/models/exports/cpu/mosaic_float16.pt";
 var model : Bridge.bridge_pt_model_t;
 
 var modelLayer : shared TorchModule(real(32))?;
diff --git a/demos/video/chapel-webcam/transformer_net.py b/demos/video/chapel-webcam/transformer_net.py
@@ -0,0 +1,104 @@
+import torch
+
+
+class TransformerNet(torch.nn.Module):
+    def __init__(self):
+        super(TransformerNet, self).__init__()
+        # Initial convolution layers
+        self.conv1 = ConvLayer(3, 32, kernel_size=9, stride=1)
+        self.in1 = torch.nn.InstanceNorm2d(32, affine=True)
+        self.conv2 = ConvLayer(32, 64, kernel_size=3, stride=2)
+        self.in2 = torch.nn.InstanceNorm2d(64, affine=True)
+        self.conv3 = ConvLayer(64, 128, kernel_size=3, stride=2)
+        self.in3 = torch.nn.InstanceNorm2d(128, affine=True)
+        # Residual layers
+        self.res1 = ResidualBlock(128)
+        self.res2 = ResidualBlock(128)
+        self.res3 = ResidualBlock(128)
+        self.res4 = ResidualBlock(128)
+        self.res5 = ResidualBlock(128)
+        # Upsampling Layers
+        self.deconv1 = UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2)
+        self.in4 = torch.nn.InstanceNorm2d(64, affine=True)
+        self.deconv2 = UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2)
+        self.in5 = torch.nn.InstanceNorm2d(32, affine=True)
+        self.deconv3 = ConvLayer(32, 3, kernel_size=9, stride=1)
+        # Non-linearities
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, X):
+        y = self.relu(self.in1(self.conv1(X)))
+        y = self.relu(self.in2(self.conv2(y)))
+        y = self.relu(self.in3(self.conv3(y)))
+        y = self.res1(y)
+        y = self.res2(y)
+        y = self.res3(y)
+        y = self.res4(y)
+        y = self.res5(y)
+        y = self.relu(self.in4(self.deconv1(y)))
+        y = self.relu(self.in5(self.deconv2(y)))
+        y = self.deconv3(y)
+        return y
+
+
+class ConvLayer(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride):
+        super(ConvLayer, self).__init__()
+        reflection_padding = kernel_size // 2
+        self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
+        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
+
+    def forward(self, x):
+        out = self.reflection_pad(x)
+        out = self.conv2d(out)
+        return out
+
+
+class ResidualBlock(torch.nn.Module):
+    """ResidualBlock
+    introduced in: https://arxiv.org/abs/1512.03385
+    recommended architecture: http://torch.ch/blog/2016/02/04/resnets.html
+    """
+
+    def __init__(self, channels):
+        super(ResidualBlock, self).__init__()
+        self.conv1 = ConvLayer(channels, channels, kernel_size=3, stride=1)
+        self.in1 = torch.nn.InstanceNorm2d(channels, affine=True)
+        self.conv2 = ConvLayer(channels, channels, kernel_size=3, stride=1)
+        self.in2 = torch.nn.InstanceNorm2d(channels, affine=True)
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, x):
+        residual = x
+        out = self.relu(self.in1(self.conv1(x)))
+        out = self.in2(self.conv2(out))
+        out = out + residual
+        return out
+
+
+class UpsampleConvLayer(torch.nn.Module):
+    """UpsampleConvLayer
+    Upsamples the input and then does a convolution. This method gives better results
+    compared to ConvTranspose2d.
+    ref: http://distill.pub/2016/deconv-checkerboard/
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride, upsample):
+        super(UpsampleConvLayer, self).__init__()
+        # self.upsample = upsample
+        self.upsample = torch.nn.Upsample(scale_factor=2, mode='nearest')
+        reflection_padding = kernel_size // 2
+        self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
+        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
+
+    def forward(self, x):
+        x_in = x
+        # print('upsample', self.upsample)
+        # x_in = torch.nn.functional.interpolate(x_in, mode='nearest', scale_factor=self.upsample)
+        # if self.upsample:
+        #     x_in = torch.nn.functional.interpolate(x_in, mode='nearest', scale_factor=self.upsample)
+        out = self.upsample(x_in)
+        out = self.reflection_pad(out)
+        # out = self.reflection_pad(out.to(torch.float32)).to(x.dtype)
+        out = self.conv2d(out)
+        return out
diff --git a/lib/Bridge.chpl b/lib/Bridge.chpl
@@ -71,6 +71,14 @@ module Bridge {
         in model: bridge_pt_model_t,
         in input: bridge_tensor_t): bridge_tensor_t;
 
+    extern "accelerator_available" 
+        proc acceleratorAvailable(): bool;
+
+    extern "get_cpu_frame_width" 
+        proc getCPUFrameWidth(width: int(64), scale_factor: real(32)): int(64);
+    extern "get_cpu_frame_height" 
+        proc getCPUFrameHeight(height: int(64), scale_factor: real(32)): int(64);
+
 
     extern proc convolve2d(
         in input: bridge_tensor_t,