diff --git a/bridge/include/bridge.h b/bridge/include/bridge.h
index 39f4b9b91..d747c1e26 100644
--- a/bridge/include/bridge.h
+++ b/bridge/include/bridge.h
@@ -104,6 +104,11 @@ proto_bridge_simple(softsign);
 
 proto_bridge_simple(tanhshrink);
 
+// Debug helper: prints one line per iteration i in [0, n), tagged with idx and n.
+void split_loop(int64_t idx, int64_t n);
+// Debug helper: runs n iterations, storing the current iteration index in *ret each time.
+void split_loop_filler(int64_t n,int64_t* ret);
+
+// Opens camera 0 and shows its frames in a window until ESC is pressed or a frame read fails.
+void show_webcam(void);
+
 
 // bridge_tensor_t conv2d(
 //     bridge_tensor_t input,
diff --git a/bridge/include/bridge.h.pch b/bridge/include/bridge.h.pch
new file mode 100644
index 000000000..338831a66
Binary files /dev/null and b/bridge/include/bridge.h.pch differ
diff --git a/bridge/lib/bridge.cpp b/bridge/lib/bridge.cpp
index 9356b8ab4..b34db7554 100644
--- a/bridge/lib/bridge.cpp
+++ b/bridge/lib/bridge.cpp
@@ -13,6 +13,11 @@
 #include <cstdlib>
 #include <vector>
 #include <cstdint>
+#include <chrono>
+#include <thread>
+
+#include <opencv2/opencv.hpp>
+
 
 #define def_bridge_simple(Name) \
     extern "C" bridge_tensor_t Name(bridge_tensor_t input) { \
@@ -381,3 +386,55 @@ extern "C" float sumArray(float* arr, int* sizes, int dim) {
     // auto t = torch::from_blob(arr, shape, torch::kFloat);
     // return t.sum().item<float>();
 }
+
+
+/// Debug helper: prints one line per iteration to stdout.
+///
+/// Each line has the form `idx(<idx>,<n>) = <i>` for i in [0, n); nothing is
+/// printed when n <= 0.
+///
+/// @param idx  Caller-chosen tag echoed on every line.
+/// @param n    Number of iterations.
+extern "C" void split_loop(int64_t idx, int64_t n) {
+    // The counter must be as wide as n: an `int` counter overflows (undefined
+    // behaviour) before reaching a bound larger than INT_MAX.
+    for (int64_t i = 0; i < n; ++i) {
+        // std::endl already flushes the stream, so no separate flush() is needed.
+        std::cout << "idx(" << idx << "," << n << ") = " << i << std::endl;
+    }
+}
+
+/// Debug helper: runs n iterations, publishing the current index through `ret`.
+///
+/// After the call `*ret` holds n - 1 when n > 0 and is left untouched when
+/// n <= 0. Every iteration also performs a zero-length sleep.
+///
+/// @param n    Number of iterations.
+/// @param ret  Destination for the current index; may be null, in which case
+///             the loop still runs but nothing is written.
+extern "C" void split_loop_filler(int64_t n,int64_t* ret) {
+    // 64-bit counter so that bounds above INT_MAX do not overflow the index.
+    for (int64_t i = 0; i < n; ++i) {
+        if (ret != nullptr) {
+            *ret = i;
+        }
+        std::this_thread::sleep_for(std::chrono::seconds(0));
+    }
+}
+
+
+
+/// Opens the camera with the given index through the AVFoundation backend.
+///
+/// On success the capture is asked for a one-frame internal buffer and 60 FPS
+/// before being returned. On failure a message is written to stderr and a
+/// default-constructed (unopened) capture is returned.
+cv::VideoCapture open_camera(int cam_index) {
+    cv::VideoCapture camera(cam_index, cv::CAP_AVFOUNDATION);
+    if (camera.isOpened()) {
+        camera.set(cv::CAP_PROP_BUFFERSIZE, 1); // minimal internal buffering
+        camera.set(cv::CAP_PROP_FPS, 60);       // request higher FPS if possible
+        return camera;
+    }
+
+    std::cerr << "Could not open camera index " << cam_index << std::endl;
+    return cv::VideoCapture();
+}
+
+
+/// Shows a live preview of camera 0 in a window named "webcam".
+///
+/// Returns when ESC is pressed, when a frame cannot be read, or immediately
+/// when the camera cannot be opened. OpenCV exceptions are caught and logged
+/// here because this function has C linkage and an exception must not
+/// propagate into a non-C++ caller.
+extern "C" void show_webcam(void) {
+    try {
+        cv::VideoCapture cap = open_camera(0);
+        if (!cap.isOpened()) {
+            // open_camera() has already reported the failure; entering the loop
+            // would only add a misleading "empty frame" warning.
+            return;
+        }
+
+        cv::Mat frame_bgr;
+
+        while (true) {
+            if (!cap.read(frame_bgr) || frame_bgr.empty()) {
+                std::cerr << "[WARN] Empty frame, exiting" << std::endl;
+                break;
+            }
+
+            cv::imshow("webcam", frame_bgr);
+
+            if (cv::waitKey(1) == 27) { // ESC key
+                break;
+            }
+        }
+
+        cap.release();
+        cv::destroyAllWindows();
+    } catch (const cv::Exception& e) {
+        std::cerr << "[ERROR] OpenCV failure in show_webcam: " << e.what() << std::endl;
+    }
+}
\ No newline at end of file
diff --git a/demos/video/CMakeLists.txt b/demos/video/CMakeLists.txt
index 4a883bfd7..c0b762c78 100644
--- a/demos/video/CMakeLists.txt
+++ b/demos/video/CMakeLists.txt
@@ -9,22 +9,19 @@ find_library(METAL Metal REQUIRED)
 find_library(FOUNDATION Foundation REQUIRED)
 
 
+
 add_executable(VidStreamer
-    ${CMAKE_CURRENT_SOURCE_DIR}/webcam_infer.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/cvtool.hpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/imageops.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/webcam-capture/webcam_infer.cpp
 )
 
 target_include_directories(VidStreamer
     PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/include
         ${LIBTORCH_DIR}/include
         ${LIBTORCH_DIR}/include/torch/csrc/api/include
 )
 
-target_link_directories(VidStreamer
-    PRIVATE
-        ${LIBTORCH_DIR}/lib
-)
+target_link_directories(VidStreamer PRIVATE ${LIBTORCH_DIR}/lib)
 
 target_link_libraries(VidStreamer
     PRIVATE 
@@ -43,10 +40,58 @@ set_target_properties(VidStreamer PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
 )
 
-
 if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
   target_compile_options(VidStreamer PRIVATE -Ofast -flto -ffast-math)
   target_link_options(VidStreamer PRIVATE -flto)
 endif()
 
 
+
+
+
+
+
+# StyleTransfer: executable built from the single source file under style-transfer/.
+add_executable(StyleTransfer
+    ${CMAKE_CURRENT_SOURCE_DIR}/style-transfer/style_transfer.cpp
+)
+
+# Header search path: this directory's include/ plus the libtorch headers.
+target_include_directories(StyleTransfer
+    PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/include
+        ${LIBTORCH_DIR}/include
+        ${LIBTORCH_DIR}/include/torch/csrc/api/include
+)
+
+# The libtorch libraries below are named with raw -l flags, so the linker needs their directory.
+target_link_directories(StyleTransfer PRIVATE ${LIBTORCH_DIR}/lib)
+
+# Link libtorch, OpenCV and the Accelerate/Metal/Foundation libraries located by find_library above.
+target_link_libraries(StyleTransfer
+    PRIVATE 
+        -ltorch
+        -ltorch_cpu
+        -lc10
+        -ltorch_global_deps
+        ${OpenCV_LIBS}
+        # ${TORCH_LIBRARIES}
+        ${ACCELERATE}
+        ${METAL}
+        ${FOUNDATION}
+)
+
+# Place the executable directly in the top-level build directory.
+set_target_properties(StyleTransfer PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+)
+
+# Optimisation and LTO flags are only added when the compiler is Clang.
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  target_compile_options(StyleTransfer PRIVATE -Ofast -flto -ffast-math)
+  target_link_options(StyleTransfer PRIVATE -flto)
+endif()
+
+
+# After every StyleTransfer build, copy the model directory next to the
+# executable so the binary can find it relative to its own location.
+add_custom_command(
+    TARGET StyleTransfer
+    POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_directory
+            "${CMAKE_CURRENT_SOURCE_DIR}/style-transfer/models"
+            "$<TARGET_FILE_DIR:StyleTransfer>/style-transfer/models"
+    # Plain text only: the previous message referred to an unrelated `vgg`
+    # target and PROJECT_ROOT_DIR, neither of which is defined in this file.
+    COMMENT "Copying style-transfer/models next to the StyleTransfer executable"
+)
\ No newline at end of file
diff --git a/demos/video/cvtool.hpp b/demos/video/include/cvtool.hpp
similarity index 50%
rename from demos/video/cvtool.hpp
rename to demos/video/include/cvtool.hpp
index fed25eb3b..ba3e63d40 100644
--- a/demos/video/cvtool.hpp
+++ b/demos/video/include/cvtool.hpp
@@ -8,6 +8,41 @@
 #include <utility>
 
 
+/// Device-selection helpers shared by the video demos.
+///
+/// Everything in here is `static`, so each translation unit that includes this
+/// header gets its own copy of the state and of the functions; that keeps the
+/// header safe to include from more than one source file (no duplicate symbols).
+namespace cvtool {
+    /// Device returned by get_default_device(); CPU until resolved or overridden.
+    static torch::Device default_device(torch::kCPU);
+    /// True once default_device has been chosen explicitly or auto-detected.
+    static bool default_device_set = false;
+
+    /// Overrides the default device and returns the new value.
+    static torch::Device set_default_device(torch::Device device) {
+        default_device = device;
+        default_device_set = true;
+        return default_device;
+    }
+
+    /// Returns the default device, auto-detecting it on the first call:
+    /// MPS when available, CPU otherwise.
+    static torch::Device get_default_device() {
+        if (!default_device_set) {
+            if (torch::mps::is_available()) {
+                std::cout << "[INFO] Running on MPS" << std::endl;
+                default_device = torch::Device(torch::kMPS);
+            } else {
+                std::cout << "[INFO] MPS not available, falling back to CPU" << std::endl;
+                default_device = torch::Device(torch::kCPU);
+            }
+            // Remember the result; without this the probe and the log line
+            // above were repeated on every call.
+            default_device_set = true;
+        }
+        return default_device;
+    }
+
+    /// True when the default device is already resolved, or when MPS is not
+    /// available (so resolution can only yield CPU).
+    static bool can_get_default_device() {
+        return default_device_set || !torch::mps::is_available();
+    }
+
+    /// Device used for host-side tensors: always CPU.
+    static torch::Device get_host_device() {
+        return torch::Device(torch::kCPU);
+    }
+}
+
+// enum CVToColorPermutation {
+//     RGB_TO_BGR = cv::COLOR_RGB2BGR,
+//     BGR_TO_RGB = cv::COLOR_BGR2RGB,
+// };
+
 static torch::Device default_device(torch::kCPU);
 torch::Device get_default_device();
 
@@ -69,13 +104,76 @@ std::shared_ptr<at::Tensor> create_frame_buffer_tensor(int height,int width,torc
     return create_buffer_tensor(sizes, torch::kFloat32);
 }
 
-at::Tensor to_tensor(cv::Mat &img) {
-    auto t = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kUInt8).clone();
-    t = t.to(default_device);
-    t = t.to(torch::kFloat32).permute({0, 3, 1, 2}) / 255.0;
-    return t;//.to(default_device,true);
+/// Converts an 8-bit, 3-channel OpenCV frame into a float16 tensor.
+///
+/// @param frame   H x W image of type CV_8UC3. Channel order is kept as-is.
+/// @param device  Target device; defaults to the file-level `default_device`.
+/// @return Contiguous tensor of shape [1, 3, H, W], dtype float16, with values
+///         divided by 255. The result owns its storage.
+/// @throws c10::Error if a non-empty frame is not CV_8UC3.
+at::Tensor to_tensor(cv::Mat &frame, torch::Device device = default_device) {
+    // from_blob reinterprets the raw buffer as dense H x W x 3 bytes, so any
+    // other pixel format would be read out of bounds.
+    TORCH_CHECK(frame.empty() || frame.type() == CV_8UC3,
+                "to_tensor expects an 8-bit 3-channel (CV_8UC3) frame");
+
+    // ROI views and padded rows are not dense in memory; copy those first.
+    cv::Mat dense = frame.isContinuous() ? frame : frame.clone();
+
+    // clone() detaches the tensor from the cv::Mat buffer before any transfer.
+    auto t = at::from_blob(dense.data, {1, dense.rows, dense.cols, 3}, torch::kUInt8).permute({0, 3, 1, 2}).clone();
+    auto options = at::TensorOptions()
+                    .dtype(torch::kFloat16)
+                    .device(device)
+                    .requires_grad(false);
+    return t.to(options,true).contiguous().div_(255.0);
+}
+
+// at::Tensor to_tensor(cv::Mat &img, cv::ColorConversionCodes color_conversion = cv::COLOR_BGR2RGB) {
+//     auto t = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kUInt8).clone();
+//     t = t.to(default_device);
+//     t = t.to(torch::kFloat32).permute({0, 3, 1, 2}) / 255.0;
+//     return t;//.to(default_device,true);
+// }
+
+// at::Tensor to_tensor(cv::Mat &img, cv::ColorConversionCodes color_conversion = cv::COLOR_BGR2RGB, device = ) {
+//     auto t = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kUInt8).clone();
+//     t = t.to(default_device);
+//     t = t.to(torch::kFloat32).permute({0, 3, 1, 2}) / 255.0;
+//     return t;//.to(default_device,true);
+// }
+
+// at::Tensor to_tensor(cv::Mat &img, torch::Device device = cvtool::get_default_device()) {
+//     auto img_t = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kUInt8);
+//     auto t = img_t.clone().to(device);
+//     t = t.to(torch::kFloat32).permute({0, 3, 1, 2}) / 255.0;
+//     return t;//.to(default_device,true);
+// }
+
+//--------------------------------------------------------------------
+// Converts an OpenCV image into a float16 tensor of shape C x H x W.
+//
+// - img    : H x W x C matrix. Values are always multiplied by 1/255, so the
+//            result lies in [0, 1] only for 8-bit input.
+// - device : target device; defaults to get_default_device().
+//
+// The returned tensor owns its storage and does not alias img.
+//--------------------------------------------------------------------
+at::Tensor to_tensor_(const cv::Mat& img, torch::Device device = get_default_device())
+{
+    // 1. from_blob needs a dense buffer; copy ROI / padded matrices.
+    cv::Mat contiguous = img.isContinuous() ? img : img.clone();
+
+    // 2. Convert to 32-bit float and scale by 1/255 in one pass.
+    cv::Mat float32;
+    contiguous.convertTo(float32, CV_32F, 1.0 / 255.0);
+
+    // 3. Wrap the OpenCV buffer as an H x W x C view (no copy yet).
+    auto tmp = torch::from_blob(
+                  float32.data,
+                  {float32.rows, float32.cols, float32.channels()},
+                  torch::TensorOptions().dtype(torch::kFloat32));
+
+    // 4. Reorder to C x H x W, then cast and transfer in a single copying
+    //    to(). The transfer is blocking on purpose: `float32` is destroyed when
+    //    this function returns, so the copy has to be complete by then.
+    //    copy=true already yields a tensor that owns its memory, which makes a
+    //    trailing clone() unnecessary.
+    return tmp.permute({2, 0, 1})
+              .to(device, /*dtype=*/torch::kFloat16,
+                  /*non_blocking=*/false, /*copy=*/true);
 }
 
+
 cv::Mat to_mat(at::Tensor &tensor) {
     // Ensure the tensor is on the CPU and not on the GPU
     // at::Tensor cpu_tensor = tensor.to(torch::kCPU);
@@ -87,17 +185,31 @@ cv::Mat to_mat(at::Tensor &tensor) {
     int height = tensor.size(2);
     int width = tensor.size(3);
     auto t = tensor
-                .mul(255)
-                .squeeze()
                 .detach()
+                .squeeze()
+                .contiguous()
+                .mul(255.0)
+                .clamp(0, 255)
                 .permute({1, 2, 0})
                 .contiguous()
                 .to(torch::kUInt8)
-                // .clamp(0, 255)
                 .clone()
-                .to(torch::kCPU);
+                .to(at::kCPU,true);
+                
+
+    // auto t = tensor
+    //             .mul(255)
+    //             .squeeze()
+    //             .detach()
+    //             .permute({1, 2, 0})
+    //             .contiguous()
+    //             .to(torch::kUInt8)
+    //             // .clamp(0, 255)
+    //             .clone()
+    //             // .to(cvtool::get_default_device(), /*non_blocking=*/true, /*copy=*/true)
+    //             .to(torch::kCPU);
     cv::Mat mat = cv::Mat(height, width, CV_8UC3, t.data_ptr());
-    return mat;
+    return mat.clone();
 
 
 
@@ -112,10 +224,32 @@ cv::Mat to_mat(at::Tensor &tensor) {
     // return mat.clone();
 }
 
+
+/// Converts a [1, 3, H, W] tensor with values in [0, 1] into an 8-bit image and
+/// applies an OpenCV colour conversion to it.
+///
+/// @param tensor            Input tensor on any device; it is not modified.
+/// @param color_conversion  Code passed to cv::cvtColor (e.g. cv::COLOR_RGB2BGR).
+/// @return Image that owns its pixel data.
+/// @throws c10::Error if the tensor is not of shape [1, 3, H, W].
+cv::Mat to_mat(at::Tensor &tensor, cv::ColorConversionCodes color_conversion) {
+    // The buffer is wrapped as CV_8UC3 below, so any other layout would make
+    // OpenCV read past the end of the tensor storage.
+    TORCH_CHECK(tensor.dim() == 4 && tensor.size(0) == 1 && tensor.size(1) == 3,
+                "to_mat expects a tensor of shape [1, 3, H, W]");
+
+    const int height = static_cast<int>(tensor.size(2));
+    const int width = static_cast<int>(tensor.size(3));
+    auto t = tensor
+                .detach()
+                // .to(torch::kFloat32)
+                .mul(255.0)
+                .clamp(0.0, 255.0)
+                .to(torch::kUInt8)
+                .squeeze(0)          // drop only the batch axis; a bare squeeze()
+                                     // would also drop H or W when they are 1
+                .permute({1, 2, 0})  // CHW -> HWC, the layout OpenCV expects
+                .contiguous()
+                .to(torch::kCPU);
+
+    // `mat` only borrows the tensor memory; `t` stays alive until cvtColor has
+    // written the converted pixels into `mat2`, which owns its own buffer.
+    cv::Mat mat = cv::Mat(height, width, CV_8UC3, t.data_ptr());
+    cv::Mat mat2;
+    cv::cvtColor(mat, mat2, color_conversion);
+    return mat2;
+}
+
 torch::Device get_default_device() {
     if (torch::mps::is_available()) {
-        // default_device = torch::Device(torch::kMPS);
         std::cout << "[INFO] Running on MPS" << std::endl;
+        default_device = torch::Device(torch::kMPS);
     } else {
         std::cout << "[INFO] MPS not available, falling back to CPU" << std::endl;
     }
@@ -233,3 +367,37 @@ at::Tensor capture_webcam(int cam_index) {
     auto tensor = to_tensor(frame);
     return tensor;
 }
+
+
+/// Convolves `input` with a kernel assembled from the horizontal and vertical
+/// Sobel operators (stride 1, padding 1).
+///
+/// @param input   Tensor passed straight to torch::conv2d; the kernel is
+///                created with the same dtype and on the same device.
+/// @param device  Currently unused; the kernel always follows `input`.
+/// @return Result of torch::conv2d(input, kernel, {}, 1, 1).
+torch::Tensor sobel_edge_detection(torch::Tensor& input,torch::Device device = cvtool::get_default_device()) {
+    // Tensor::to() returns a new tensor instead of modifying in place. The
+    // previous code discarded that result, leaving the kernels on the CPU and
+    // making conv2d fail for inputs on another device.
+    torch::Tensor sobel_dx = torch::tensor({{-1, 0, 1},
+                                            {-2, 0, 2},
+                                            {-1, 0, 1}}).to(input.device(), input.scalar_type());
+    torch::Tensor sobel_dy = torch::tensor({{-1, -2, -1},
+                                            {0, 0, 0},
+                                            {1, 2, 1}}).to(input.device(), input.scalar_type());
+
+    // NOTE(review): cat along dim 0 produces one 6x3 kernel of shape
+    // [1, 1, 6, 3], not two separate 3x3 filters. Left unchanged so the output
+    // shape callers see stays the same - confirm whether stacking to
+    // [2, 1, 3, 3] was intended.
+    torch::Tensor sobel_kernel = torch::cat({sobel_dx, sobel_dy}, 0).unsqueeze(0).unsqueeze(0);
+
+    return torch::conv2d(input, sobel_kernel, {}, 1, 1);
+}
+
+
+
+
diff --git a/demos/video/imageops.hpp b/demos/video/include/imageops.hpp
similarity index 100%
rename from demos/video/imageops.hpp
rename to demos/video/include/imageops.hpp
diff --git a/demos/video/style-transfer/.gitignore b/demos/video/style-transfer/.gitignore
new file mode 100644
index 000000000..39d36c725
--- /dev/null
+++ b/demos/video/style-transfer/.gitignore
@@ -0,0 +1 @@
+train2014.zip
diff --git a/demos/video/style-transfer/README.md b/demos/video/style-transfer/README.md
new file mode 100644
index 000000000..7921f9050
--- /dev/null
+++ b/demos/video/style-transfer/README.md
@@ -0,0 +1,9 @@
+# Style transfer demo
+
+Export the saved models:
+
+```sh
+python3 neural_style.py export --model saved_models/udnie.pth --accel
+python3 neural_style.py export --model saved_models/candy.pth --accel
+python3 neural_style.py export --model saved_models/mosaic.pth --accel
+```
+
+Run an exported model on a video file, write the result to a new file, and show the output:
+
+```sh
+python3 style_transfer_test.py --model-file=models/exports/cpu/mosaic_float16.pt --input-video-file=videos/deer.mp4 --output-video-file=videos/mosaic_deer.mp4 --show-output
+```
diff --git a/demos/video/style-transfer/WORKING.sh b/demos/video/style-transfer/WORKING.sh
new file mode 100644
index 000000000..be987cbf5
--- /dev/null
+++ b/demos/video/style-transfer/WORKING.sh
@@ -0,0 +1 @@
+/usr/bin/clang++ -std=c++20 style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
\ No newline at end of file
diff --git a/demos/video/style-transfer/build.sh b/demos/video/style-transfer/build.sh
new file mode 100644
index 000000000..b8865e8ad
--- /dev/null
+++ b/demos/video/style-transfer/build.sh
@@ -0,0 +1,3 @@
+/usr/bin/clang++ -std=c++20 -c -fPIC mirror.cpp -o mirror.o -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+
+/usr/bin/clang++ -shared -o libmirror.dylib mirror.o -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
\ No newline at end of file
diff --git a/demos/video/style-transfer/download_saved_models.py b/demos/video/style-transfer/download_saved_models.py
new file mode 100644
index 000000000..569aee6ad
--- /dev/null
+++ b/demos/video/style-transfer/download_saved_models.py
@@ -0,0 +1,30 @@
+import os
+import zipfile
+
+# PyTorch 1.1 moves _download_url_to_file
+#   from torch.utils.model_zoo to torch.hub
+# PyTorch 1.0 exists another _download_url_to_file
+#   2 argument
+# TODO: If you remove support PyTorch 1.0 or older,
+#       You should remove torch.utils.model_zoo
+#       Ref. PyTorch #18758
+#         https://github.com/pytorch/pytorch/pull/18758/commits
+try:
+    from torch.utils.model_zoo import _download_url_to_file
+except ImportError:
+    try:
+        from torch.hub import download_url_to_file as _download_url_to_file
+    except ImportError:
+        from torch.hub import _download_url_to_file
+
+
+def unzip(source_filename, dest_dir):
+    with zipfile.ZipFile(source_filename) as zf:
+        zf.extractall(path=dest_dir)
+
+
+# Script entry point: download two zip archives into the current working
+# directory and extract each of them in place.
+if __name__ == '__main__':
+    # NOTE(review): the trailing (None, True) arguments are presumably
+    # hash_prefix and progress - confirm against the torch version in use.
+    _download_url_to_file('https://www.dropbox.com/s/lrvwfehqdcxoza8/saved_models.zip?dl=1', 'saved_models.zip', None, True)
+    unzip('saved_models.zip', '.')
+    # NOTE(review): this archive is fetched over plain HTTP.
+    _download_url_to_file('http://images.cocodataset.org/zips/train2014.zip', 'train2014.zip', None, True)
+    unzip('train2014.zip', '.')
diff --git a/demos/video/style-transfer/export_and_run_model.sh b/demos/video/style-transfer/export_and_run_model.sh
new file mode 100644
index 000000000..d301eec94
--- /dev/null
+++ b/demos/video/style-transfer/export_and_run_model.sh
@@ -0,0 +1,12 @@
+
+
+MODEL_NAME=$1
+
+# MODEL_NAME="nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000"
+# MODEL_NAME="nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000"
+
+python3 neural_style.py export --model saved_models/${MODEL_NAME}.model --accel \
+    || python3 neural_style.py export --model saved_models/${MODEL_NAME}.pth --accel
+
+
+python3 style_transfer_test.py --model-file=models/exports/cpu/${MODEL_NAME}_float16.pt --input-video-file=videos/deer.mp4 --show-output
\ No newline at end of file
diff --git a/demos/video/style-transfer/export_model.sh b/demos/video/style-transfer/export_model.sh
new file mode 100644
index 000000000..810321cf6
--- /dev/null
+++ b/demos/video/style-transfer/export_model.sh
@@ -0,0 +1,4 @@
+python3 neural_style.py export --model saved_models/ckpt_epoch_0_batch_id_18000.pth --accel
+# python3 style_transfer_test.py --model-file=models/exports/cpu/ckpt_epoch_0_batch_id_18000_float16.pt --use-webcam --show-output
+
+python3 style_transfer_test.py --model-file=models/exports/cpu/ckpt_epoch_0_batch_id_18000_float16.pt --input-video-file=videos/deer.mp4 --show-output
\ No newline at end of file
diff --git a/demos/video/style-transfer/helpme.txt b/demos/video/style-transfer/helpme.txt
new file mode 100644
index 000000000..926c95704
--- /dev/null
+++ b/demos/video/style-transfer/helpme.txt
@@ -0,0 +1,51 @@
+ 5378  clang++ mirror.cpp
+ 5379  clang++ mirror.cpp -o mirror -I/usr/local/include/opencv4 -L/usr/local/lib -lopencv_core -lopencv_imgcodecs -lopencv_imgproc -lopencv_highgui
+ 5380  clang++ mirror.cpp -o mirror $(pkg-config --cflags --libs opencv)
+ 5381  clang++ mirror.cpp -o mirror $(pkg-config --cflags --libs opencv4)
+ 5382  ls
+ 5383  ./mirror
+ 5384  clang++ mirror.cpp -o mirror -I $(pkg-config --cflags --libs opencv4)
+ 5385  ls
+ 5386  clang++ mirror.cpp -o mirror $(pkg-config --cflags --libs opencv4)
+ 5387  pwd
+ 5388  clang++ mirror.cpp -o mirror $(pkg-config --cflags --libs opencv4)
+ 5389  ls ../../
+ 5390  ls ../../../
+ 5391  clang++ mirror.cpp -o mirror $(pkg-config --cflags --libs opencv4) -I ../../../libtorch/include -I ../../../include/torch/csrc/api/include
+ 5392  clang++ style_transfer.cpp -o styletransfer $(pkg-config --cflags --libs opencv4) -I ../../../libtorch/include -I ../../../include/torch/csrc/api/include
+ 5393  clang++ style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4)
+ 5394  pwd
+ 5395  ls ../../
+ 5396  ls ../../..
+ 5397  ls ../../../libtorch
+ 5398  ls ../../../libtorch/include
+ 5399  clang++ style_transfer.cpp -o styletransfer -I../../../libtorch/include -I../../../include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4)
+ 5400  clang style_transfer.cpp -o styletransfer -I../../../libtorch/include -I../../../include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4)
+ 5401  clang style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4)
+ 5402  clang style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4)
+ 5403  /usr/bin/clang style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4)
+ 5404  clang style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4)
+ 5405  g++ style_transfer.cpp -o styletransfer  -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4)
+ 5406  clang -std=c++20 style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4)
+ 5407  clang -std=c++17 style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4)
+ 5408  /usr/bin/clang -std=c++17 style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4)
+ 5409  /usr/bin/clang -std=c++17 style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5410  /usr/bin/clang -std=c++20 style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5411  /usr/bin/clang++ -std=c++20 style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5412  pwd
+ 5413  git add --all
+ 5414  git commit -m "Futile compilation attempt working. see new file. "
+ 5415  git push
+ 5416  /usr/bin/clang++ -std=c++20 style_transfer.cpp -o styletransfer -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include -I ../include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5417  ls
+ 5418  otool -L styletransfer
+ 5419  /usr/bin/clang++ -std=c++20 mirror.cpp -o mirror -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5420  /usr/bin/clang++ -std=c++20 -c -fPIC mirror.cpp -o mirror -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5421  /usr/bin/clang++ -std=c++20 -c -fPIC mirror.cpp -o mirror.o -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5422  ls
+ 5423  otool -L mirror.o
+ 5424  /usr/bin/clang++ -std=c++20 -c -fPIC mirror.cpp -o mirror.o -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5425  /usr/bin/clang++ -shared -o libmirror.dylib mirror.o
+ 5426  /usr/bin/clang++ -shared -o libmirror.dylib mirror.o -I ../../../libtorch/include -I ../../../libtorch/include/torch/csrc/api/include $(pkg-config --cflags --libs opencv4) -L ../../../libtorch/lib -ltorch -ltorch_cpu -lc10 -ltorch_global_deps
+ 5427  ls
+ 5428  otool
\ No newline at end of file
diff --git a/demos/video/style-transfer/libmirror.dylib b/demos/video/style-transfer/libmirror.dylib
new file mode 100755
index 000000000..163bb2120
Binary files /dev/null and b/demos/video/style-transfer/libmirror.dylib differ
diff --git a/demos/video/style-transfer/mirror.chpl b/demos/video/style-transfer/mirror.chpl
new file mode 100644
index 000000000..9793ef6d5
--- /dev/null
+++ b/demos/video/style-transfer/mirror.chpl
@@ -0,0 +1,13 @@
+
+use CTypes;
+
+require "mirror.h", "-lmirror";
+
+// Binding for the external run_mirror() routine; the `require` statement above
+// names the header (mirror.h) and the library (-lmirror) that provide it.
+extern proc run_mirror(): void;
+
+
+// Program entry point: prints a greeting, then calls run_mirror().
+// NOTE(review): `args` is accepted but not used.
+proc main(args: [] string) {
+    writeln("Hello, world!");
+
+    run_mirror();
+}
diff --git a/demos/video/style-transfer/mirror.cpp b/demos/video/style-transfer/mirror.cpp
new file mode 100644
index 000000000..52c0f8522
--- /dev/null
+++ b/demos/video/style-transfer/mirror.cpp
@@ -0,0 +1,26 @@
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "mirror.h"
+
+/// Shows a live preview of the default camera in a window named "Webcam".
+///
+/// Returns when any key is pressed, when a frame cannot be captured, or
+/// immediately (after logging to stderr) when the camera cannot be opened.
+extern "C" void run_mirror() {
+    cv::VideoCapture cap(0); // Open the default camera (0)
+    if (!cap.isOpened()) {
+        std::cerr << "Error: Could not open camera." << std::endl;
+        // Nothing to show: without this return the loop below ran once on a
+        // closed capture and reported a second, misleading capture error.
+        return;
+    }
+
+    cv::Mat frame;
+    while (true) {
+        cap >> frame; // Capture a new frame
+        if (frame.empty()) {
+            std::cerr << "Error: Could not capture frame." << std::endl;
+            break;
+        }
+
+        cv::imshow("Webcam", frame); // Display the captured frame
+        if (cv::waitKey(30) >= 0) break; // Exit on any key press
+    }
+
+    cap.release(); // Release the camera
+    cv::destroyAllWindows(); // Close all OpenCV windows
+}
+
diff --git a/demos/video/style-transfer/mirror.h b/demos/video/style-transfer/mirror.h
new file mode 100644
index 000000000..95271acc9
--- /dev/null
+++ b/demos/video/style-transfer/mirror.h
@@ -0,0 +1,15 @@
+
+#ifndef MIRROR_H
+#define MIRROR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opens the default camera and shows its frames in a window until a key is
+ * pressed or a frame cannot be captured.
+ * Declared with (void) because an empty parameter list is not a prototype in C. */
+void run_mirror(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // MIRROR_H
\ No newline at end of file
diff --git a/demos/video/style-transfer/mirror.o b/demos/video/style-transfer/mirror.o
new file mode 100644
index 000000000..ea4bed776
Binary files /dev/null and b/demos/video/style-transfer/mirror.o differ
diff --git a/demos/video/style-transfer/model.ipynb b/demos/video/style-transfer/model.ipynb
new file mode 100644
index 000000000..acb96fccc
--- /dev/null
+++ b/demos/video/style-transfer/model.ipynb
@@ -0,0 +1,300 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6e4d2e04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "ec74c8a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class MyModule(torch.nn.Module):\n",
+    "    def __init__(self, N, M):\n",
+    "        super(MyModule, self).__init__()\n",
+    "        self.linear = torch.nn.Linear(N, M)\n",
+    "\n",
+    "    def forward(self, input):\n",
+    "        return self.linear(input)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "180e54ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "my_module = MyModule(10,20)\n",
+    "# sm = torch.jit.script(my_module)\n",
+    "sm = torch.jit.script(my_module)\n",
+    "sm.save(\"models/my_module.pt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5e377e0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89e90304",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "RuntimeError",
+     "evalue": "Parent directory models does not exist.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodels/my_module.pt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.venv/lib/python3.12/site-packages/torch/jit/_script.py:754\u001b[0m, in \u001b[0;36mRecursiveScriptModule.save\u001b[0;34m(self, f, **kwargs)\u001b[0m\n\u001b[1;32m    745\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21msave\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    746\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Save with a file-like object.\u001b[39;00m\n\u001b[1;32m    747\u001b[0m \n\u001b[1;32m    748\u001b[0m \u001b[38;5;124;03m    save(f, _extra_files={})\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    752\u001b[0m \u001b[38;5;124;03m    DO NOT confuse these two functions when it comes to the 'f' parameter functionality.\u001b[39;00m\n\u001b[1;32m    753\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 754\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_c\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: Parent directory models does not exist."
+     ]
+    }
+   ],
+   "source": [
+    "# sm.save(\"models/my_module.pt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d85b6e83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "m = torch.jit.load(\"models/my_module.pt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "7d6255fd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RecursiveScriptModule(\n",
+       "  original_name=MyModule\n",
+       "  (linear): RecursiveScriptModule(original_name=Linear)\n",
+       ")"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "m"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "0d8ff397",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = torch.randn(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "ffe62563",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([-0.5386,  0.6545,  0.4650, -0.3320,  0.2735,  0.2796, -0.4549,  0.2646,\n",
+       "        -0.9322, -0.3031, -0.3441, -0.3761,  0.6457,  0.6456, -0.2478, -0.2270,\n",
+       "         0.8485,  0.9710, -0.0596,  0.6110], grad_fn=<ViewBackward0>)"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "m(x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "51739d61",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sm = torch.jit.script(style_model.to(torch.float32))\n",
+    "# sm.save(f\"models/{model_name}_float32.pt\")\n",
+    "\n",
+    "# sm = torch.jit.script(style_model.to(torch.float16))\n",
+    "# sm.save(f\"models/{model_name}_float16.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0173e2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# torch::Tensor sobel_dx = torch::tensor({{-1, 0, 1},\n",
+    "#                                     {-2, 0, 2},\n",
+    "#                                     {-1, 0, 1}}).to(torch::kFloat32);\n",
+    "# torch::Tensor sobel_dy = torch::tensor({{-1, -2, -1},\n",
+    "#                                     {0, 0, 0},\n",
+    "#                                     {1, 2, 1}}).to(torch::kFloat32);\n",
+    "\n",
+    "# torch::Tensor sobel_kernel = torch::cat({sobel_dx, sobel_dy}, 0).unsqueeze(0).unsqueeze(0);\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "c09f3a28",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "RuntimeError",
+     "evalue": "Given groups=1, weight of size [1, 1, 6, 3], expected input[1, 3, 1428, 1904] to have 1 channels, but got 3 channels instead",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[32], line 20\u001b[0m\n\u001b[1;32m     17\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msobel_cnn(x)\n\u001b[1;32m     19\u001b[0m sobel \u001b[38;5;241m=\u001b[39m Sobel()\u001b[38;5;241m.\u001b[39mto(torch\u001b[38;5;241m.\u001b[39mfloat16)\n\u001b[0;32m---> 20\u001b[0m \u001b[43msobel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrandn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1428\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1904\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat16\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     21\u001b[0m sm \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mjit\u001b[38;5;241m.\u001b[39mscript(sobel)\n\u001b[1;32m     22\u001b[0m sm\u001b[38;5;241m.\u001b[39msave(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels/sobel.pt\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[0;32m~/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1734\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1745\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1746\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "Cell \u001b[0;32mIn[32], line 17\u001b[0m, in \u001b[0;36mSobel.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[0;32m---> 17\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msobel_cnn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1734\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1745\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1746\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
+      "File \u001b[0;32m~/.venv/lib/python3.12/site-packages/torch/nn/modules/conv.py:554\u001b[0m, in \u001b[0;36mConv2d.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m    553\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 554\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_conv_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/.venv/lib/python3.12/site-packages/torch/nn/modules/conv.py:549\u001b[0m, in \u001b[0;36mConv2d._conv_forward\u001b[0;34m(self, input, weight, bias)\u001b[0m\n\u001b[1;32m    537\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mzeros\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m    538\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m F\u001b[38;5;241m.\u001b[39mconv2d(\n\u001b[1;32m    539\u001b[0m         F\u001b[38;5;241m.\u001b[39mpad(\n\u001b[1;32m    540\u001b[0m             \u001b[38;5;28minput\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reversed_padding_repeated_twice, mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    547\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroups,\n\u001b[1;32m    548\u001b[0m     )\n\u001b[0;32m--> 549\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv2d\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    550\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdilation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroups\u001b[49m\n\u001b[1;32m    551\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: Given groups=1, weight of size [1, 1, 6, 3], expected input[1, 3, 1428, 1904] to have 1 channels, but got 3 channels instead"
+     ]
+    }
+   ],
+   "source": [
+    "class Sobel(torch.nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(Sobel, self).__init__()\n",
+    "        sobel_dx = torch.tensor([[-1, 0, 1],\n",
+    "                                [-2, 0, 2],\n",
+    "                                [-1, 0, 1]]).to(torch.float16)\n",
+    "        sobel_dy = torch.tensor([[-1, -2, -1],\n",
+    "                                [0, 0, 0],\n",
+    "                                [1, 2, 1]]).to(torch.float16)\n",
+    "        sobel_kernel = torch.cat((sobel_dx, sobel_dy), 0).unsqueeze(0).unsqueeze(0)\n",
+    "        sobel_kernel = sobel_kernel.to(torch.float16)\n",
+    "        self.sobel_kernel = torch.nn.Parameter(sobel_kernel, requires_grad=False)\n",
+    "        self.sobel_cnn = torch.nn.Conv2d(3, 3, kernel_size=3, stride=1, padding=1, bias=False)\n",
+    "        self.sobel_cnn.weight = torch.nn.Parameter(sobel_kernel, requires_grad=False)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        return self.sobel_cnn(x)\n",
+    "\n",
+    "sobel = Sobel().to(torch.float16)\n",
+    "sobel(torch.randn(3, 1428, 1904).to(torch.float16))\n",
+    "sm = torch.jit.script(sobel)\n",
+    "sm.save(\"models/sobel.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "3507a1fb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([3, 3])\n",
+      "torch.Size([3, 3])\n",
+      "torch.Size([2, 3, 3])\n"
+     ]
+    },
+    {
+     "ename": "RuntimeError",
+     "evalue": "expected stride to be a single integer value or a list of 1 values to match the convolution dimensions, but got stride=[1, 1]",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[57], line 18\u001b[0m\n\u001b[1;32m     11\u001b[0m X \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mrandn(\u001b[38;5;241m1\u001b[39m,\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m1428\u001b[39m, \u001b[38;5;241m1904\u001b[39m)\u001b[38;5;241m.\u001b[39mto(torch\u001b[38;5;241m.\u001b[39mfloat16)\n\u001b[1;32m     13\u001b[0m \u001b[38;5;66;03m# sobel_cnn = torch.nn.Conv2d(3, 3, kernel_size=3, stride=1, padding=1, bias=False)\u001b[39;00m\n\u001b[1;32m     14\u001b[0m \u001b[38;5;66;03m# sobel_cnn.weight = torch.nn.Parameter(sobel_kernel, requires_grad=False)\u001b[39;00m\n\u001b[1;32m     15\u001b[0m \u001b[38;5;66;03m# sobel_cnn = sobel_cnn.to(torch.float16)\u001b[39;00m\n\u001b[1;32m     16\u001b[0m \u001b[38;5;66;03m# sobel_cnn(X)\u001b[39;00m\n\u001b[0;32m---> 18\u001b[0m Y \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunctional\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv2d\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msobel_kernel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstride\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: expected stride to be a single integer value or a list of 1 values to match the convolution dimensions, but got stride=[1, 1]"
+     ]
+    }
+   ],
+   "source": [
+    "sobel_dx = torch.tensor([[-1, 0, 1],\n",
+    "                        [-2, 0, 2],\n",
+    "                        [-1, 0, 1]]).to(torch.float16)\n",
+    "print(sobel_dx.shape)\n",
+    "sobel_dy = torch.tensor([[-1, -2, -1],\n",
+    "                        [0, 0, 0],\n",
+    "                        [1, 2, 1]]).to(torch.float16)\n",
+    "print(sobel_dy.shape)\n",
+    "sobel_kernel = torch.cat([sobel_dx.unsqueeze(0), sobel_dy.unsqueeze(0)], 0)\n",
+    "print(sobel_kernel.shape)\n",
+    "X = torch.randn(1,3, 1428, 1904).to(torch.float16)\n",
+    "\n",
+    "# sobel_cnn = torch.nn.Conv2d(3, 3, kernel_size=3, stride=1, padding=1, bias=False)\n",
+    "# sobel_cnn.weight = torch.nn.Parameter(sobel_kernel, requires_grad=False)\n",
+    "# sobel_cnn = sobel_cnn.to(torch.float16)\n",
+    "# sobel_cnn(X)\n",
+    "\n",
+    "Y = torch.nn.functional.conv2d(X, sobel_kernel, stride=1, padding=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59d85573",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demos/video/style-transfer/models/exports/cpu/candy_float16.pt b/demos/video/style-transfer/models/exports/cpu/candy_float16.pt
new file mode 100644
index 000000000..b9ea6701f
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/candy_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/candy_float32.pt b/demos/video/style-transfer/models/exports/cpu/candy_float32.pt
new file mode 100644
index 000000000..3905901fc
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/candy_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_18000_float16.pt b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_18000_float16.pt
new file mode 100644
index 000000000..33ab0aa00
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_18000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_18000_float32.pt b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_18000_float32.pt
new file mode 100644
index 000000000..7aa93300d
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_18000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_4000_float16.pt b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_4000_float16.pt
new file mode 100644
index 000000000..a590c3536
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_4000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_4000_float32.pt b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_4000_float32.pt
new file mode 100644
index 000000000..2ded75fc3
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_4000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_6000_float16.pt b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_6000_float16.pt
new file mode 100644
index 000000000..ebc9bcb06
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_6000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_6000_float32.pt b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_6000_float32.pt
new file mode 100644
index 000000000..80abc6605
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_0_batch_id_6000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_1_batch_id_20000_float16.pt b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_1_batch_id_20000_float16.pt
new file mode 100644
index 000000000..dcf4feb9f
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_1_batch_id_20000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_1_batch_id_20000_float32.pt b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_1_batch_id_20000_float32.pt
new file mode 100644
index 000000000..9289e4479
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/ckpt_epoch_1_batch_id_20000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/mosaic_float16.pt b/demos/video/style-transfer/models/exports/cpu/mosaic_float16.pt
new file mode 100644
index 000000000..4baab793c
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/mosaic_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/mosaic_float32.pt b/demos/video/style-transfer/models/exports/cpu/mosaic_float32.pt
new file mode 100644
index 000000000..1d394c029
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/mosaic_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt
new file mode 100644
index 000000000..1a968da33
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt
new file mode 100644
index 000000000..5db556232
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt
new file mode 100644
index 000000000..b310caa58
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt
new file mode 100644
index 000000000..63473ff23
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt
new file mode 100644
index 000000000..0a39be9c4
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt
new file mode 100644
index 000000000..5d3d5c44f
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float16.pt
new file mode 100644
index 000000000..eea465323
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float32.pt
new file mode 100644
index 000000000..c007ce65a
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt
new file mode 100644
index 000000000..6eeb9f112
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt
new file mode 100644
index 000000000..c191f84f4
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt
new file mode 100644
index 000000000..ae46b430e
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt
new file mode 100644
index 000000000..5aa506f4a
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float16.pt
new file mode 100644
index 000000000..030f2c19a
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float32.pt
new file mode 100644
index 000000000..e55af10b4
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float16.pt
new file mode 100644
index 000000000..18d624f32
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float32.pt
new file mode 100644
index 000000000..3cc1bd399
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float16.pt
new file mode 100644
index 000000000..759bda665
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float32.pt
new file mode 100644
index 000000000..2d337e11e
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt
new file mode 100644
index 000000000..05484f9e8
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt
new file mode 100644
index 000000000..5c67b9ab3
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/rain_princess_float16.pt b/demos/video/style-transfer/models/exports/cpu/rain_princess_float16.pt
new file mode 100644
index 000000000..1fb0fc569
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/rain_princess_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/rain_princess_float32.pt b/demos/video/style-transfer/models/exports/cpu/rain_princess_float32.pt
new file mode 100644
index 000000000..159ff9091
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/rain_princess_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float16.pt b/demos/video/style-transfer/models/exports/cpu/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float16.pt
new file mode 100644
index 000000000..5c8fc88eb
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float32.pt b/demos/video/style-transfer/models/exports/cpu/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float32.pt
new file mode 100644
index 000000000..0d1d193bf
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float16.pt b/demos/video/style-transfer/models/exports/cpu/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float16.pt
new file mode 100644
index 000000000..4663b0888
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float32.pt b/demos/video/style-transfer/models/exports/cpu/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float32.pt
new file mode 100644
index 000000000..e6b332ddc
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/starry_v_bt4_1e10_ep2_float16.pt b/demos/video/style-transfer/models/exports/cpu/starry_v_bt4_1e10_ep2_float16.pt
new file mode 100644
index 000000000..5ba8cddba
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/starry_v_bt4_1e10_ep2_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/starry_v_bt4_1e10_ep2_float32.pt b/demos/video/style-transfer/models/exports/cpu/starry_v_bt4_1e10_ep2_float32.pt
new file mode 100644
index 000000000..cf4e461d7
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/starry_v_bt4_1e10_ep2_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/udnie_float16.pt b/demos/video/style-transfer/models/exports/cpu/udnie_float16.pt
new file mode 100644
index 000000000..a10ed7fd6
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/udnie_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/cpu/udnie_float32.pt b/demos/video/style-transfer/models/exports/cpu/udnie_float32.pt
new file mode 100644
index 000000000..c41b66671
Binary files /dev/null and b/demos/video/style-transfer/models/exports/cpu/udnie_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/candy_float16.pt b/demos/video/style-transfer/models/exports/mps/candy_float16.pt
new file mode 100644
index 000000000..af32cac44
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/candy_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/candy_float32.pt b/demos/video/style-transfer/models/exports/mps/candy_float32.pt
new file mode 100644
index 000000000..2c7d26d00
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/candy_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_18000_float16.pt b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_18000_float16.pt
new file mode 100644
index 000000000..466fd8c33
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_18000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_18000_float32.pt b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_18000_float32.pt
new file mode 100644
index 000000000..9718f4fdb
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_18000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_4000_float16.pt b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_4000_float16.pt
new file mode 100644
index 000000000..21d89acdc
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_4000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_4000_float32.pt b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_4000_float32.pt
new file mode 100644
index 000000000..2ba3cde18
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_4000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_6000_float16.pt b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_6000_float16.pt
new file mode 100644
index 000000000..d1e59ff22
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_6000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_6000_float32.pt b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_6000_float32.pt
new file mode 100644
index 000000000..e3b90e9e1
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_0_batch_id_6000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/ckpt_epoch_1_batch_id_20000_float16.pt b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_1_batch_id_20000_float16.pt
new file mode 100644
index 000000000..6186acb07
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_1_batch_id_20000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/ckpt_epoch_1_batch_id_20000_float32.pt b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_1_batch_id_20000_float32.pt
new file mode 100644
index 000000000..44d1c8ce9
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/ckpt_epoch_1_batch_id_20000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/mosaic_float16.pt b/demos/video/style-transfer/models/exports/mps/mosaic_float16.pt
new file mode 100644
index 000000000..5835d2715
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/mosaic_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/mosaic_float32.pt b/demos/video/style-transfer/models/exports/mps/mosaic_float32.pt
new file mode 100644
index 000000000..1b7f290cc
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/mosaic_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt
new file mode 100644
index 000000000..13e7b2e80
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt
new file mode 100644
index 000000000..fe05c3304
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt
new file mode 100644
index 000000000..f510f34fb
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt
new file mode 100644
index 000000000..51a466826
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt
new file mode 100644
index 000000000..47d460519
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt
new file mode 100644
index 000000000..b449ced58
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float16.pt
new file mode 100644
index 000000000..5be568f7a
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float32.pt
new file mode 100644
index 000000000..8487764d8
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt
new file mode 100644
index 000000000..d8e508a68
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt
new file mode 100644
index 000000000..2d1046684
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt
new file mode 100644
index 000000000..048d7d469
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt
new file mode 100644
index 000000000..335f647cc
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float16.pt
new file mode 100644
index 000000000..36d9256ef
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float32.pt
new file mode 100644
index 000000000..a8233ac96
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float16.pt
new file mode 100644
index 000000000..ed9a2dce8
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float32.pt
new file mode 100644
index 000000000..b46115943
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float16.pt
new file mode 100644
index 000000000..ed170257d
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float32.pt
new file mode 100644
index 000000000..1bd333e32
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt
new file mode 100644
index 000000000..1f31548f6
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt
new file mode 100644
index 000000000..41e3dc5fc
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/rain_princess_float16.pt b/demos/video/style-transfer/models/exports/mps/rain_princess_float16.pt
new file mode 100644
index 000000000..27949620e
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/rain_princess_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/rain_princess_float32.pt b/demos/video/style-transfer/models/exports/mps/rain_princess_float32.pt
new file mode 100644
index 000000000..ccf9370d0
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/rain_princess_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float16.pt b/demos/video/style-transfer/models/exports/mps/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float16.pt
new file mode 100644
index 000000000..6b66a2d55
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float32.pt b/demos/video/style-transfer/models/exports/mps/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float32.pt
new file mode 100644
index 000000000..f2f5e3022
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float16.pt b/demos/video/style-transfer/models/exports/mps/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float16.pt
new file mode 100644
index 000000000..46e1c0833
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float32.pt b/demos/video/style-transfer/models/exports/mps/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float32.pt
new file mode 100644
index 000000000..b73626b7e
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/starry_h_bt4_5e11_ep4_epoch_0_batch_id_18000_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/starry_v_bt4_1e10_ep2_float16.pt b/demos/video/style-transfer/models/exports/mps/starry_v_bt4_1e10_ep2_float16.pt
new file mode 100644
index 000000000..be2d3d54e
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/starry_v_bt4_1e10_ep2_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/starry_v_bt4_1e10_ep2_float32.pt b/demos/video/style-transfer/models/exports/mps/starry_v_bt4_1e10_ep2_float32.pt
new file mode 100644
index 000000000..4ddb06abd
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/starry_v_bt4_1e10_ep2_float32.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/udnie_float16.pt b/demos/video/style-transfer/models/exports/mps/udnie_float16.pt
new file mode 100644
index 000000000..e75e9d4db
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/udnie_float16.pt differ
diff --git a/demos/video/style-transfer/models/exports/mps/udnie_float32.pt b/demos/video/style-transfer/models/exports/mps/udnie_float32.pt
new file mode 100644
index 000000000..7b8c2f48b
Binary files /dev/null and b/demos/video/style-transfer/models/exports/mps/udnie_float32.pt differ
diff --git a/demos/video/style-transfer/models/mosaic.pt b/demos/video/style-transfer/models/mosaic.pt
new file mode 100644
index 000000000..d9504d793
Binary files /dev/null and b/demos/video/style-transfer/models/mosaic.pt differ
diff --git a/demos/video/style-transfer/models/mosaic_float16.pt b/demos/video/style-transfer/models/mosaic_float16.pt
new file mode 100644
index 000000000..9718339de
Binary files /dev/null and b/demos/video/style-transfer/models/mosaic_float16.pt differ
diff --git a/demos/video/style-transfer/models/mosaic_float32.pt b/demos/video/style-transfer/models/mosaic_float32.pt
new file mode 100644
index 000000000..e03440a7d
Binary files /dev/null and b/demos/video/style-transfer/models/mosaic_float32.pt differ
diff --git a/demos/video/style-transfer/models/my_module.pt b/demos/video/style-transfer/models/my_module.pt
new file mode 100644
index 000000000..9cda2b666
Binary files /dev/null and b/demos/video/style-transfer/models/my_module.pt differ
diff --git a/demos/video/style-transfer/models/sobel.pt b/demos/video/style-transfer/models/sobel.pt
new file mode 100644
index 000000000..b62cf687a
Binary files /dev/null and b/demos/video/style-transfer/models/sobel.pt differ
diff --git a/demos/video/style-transfer/models/sobel_edge_float32.pt b/demos/video/style-transfer/models/sobel_edge_float32.pt
new file mode 100644
index 000000000..3c522c17c
Binary files /dev/null and b/demos/video/style-transfer/models/sobel_edge_float32.pt differ
diff --git a/demos/video/style-transfer/models/sobel_float16.pt b/demos/video/style-transfer/models/sobel_float16.pt
new file mode 100644
index 000000000..3b5609fd5
Binary files /dev/null and b/demos/video/style-transfer/models/sobel_float16.pt differ
diff --git a/demos/video/style-transfer/models/sobel_float32.pt b/demos/video/style-transfer/models/sobel_float32.pt
new file mode 100644
index 000000000..a373c695e
Binary files /dev/null and b/demos/video/style-transfer/models/sobel_float32.pt differ
diff --git a/demos/video/style-transfer/neural_style.py b/demos/video/style-transfer/neural_style.py
new file mode 100644
index 000000000..4692f25c2
--- /dev/null
+++ b/demos/video/style-transfer/neural_style.py
@@ -0,0 +1,338 @@
+import argparse
+import os
+import sys
+import time
+import re
+
+import numpy as np
+import torch
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision import transforms
+import torch.onnx
+
+import utils
+from transformer_net import TransformerNet
+from vgg import Vgg16
+
+
+def available_accelerator():
+    """Tell whether torch reports a usable CUDA or MPS backend."""
+    if torch.cuda.is_available():
+        return True
+    return torch.backends.mps.is_available()
+
+def current_accelerator(args):
+    """
+    Pick the torch device to run on.
+
+    Returns the CUDA device when ``args.accel`` is truthy and CUDA is
+    available, otherwise the MPS device when ``args.accel`` is truthy and
+    MPS is available, and the CPU device in every other case.
+    """
+    if args.accel and torch.cuda.is_available():
+        return torch.device("cuda")
+    if args.accel and torch.backends.mps.is_available():
+        return torch.device("mps")
+    return torch.device("cpu")
+
+def check_paths(args):
+    """Create the save/checkpoint dirs (exist_ok avoids an exists-then-create race); exit(1) on OSError."""
+    try:
+        os.makedirs(args.save_model_dir, exist_ok=True)
+        if args.checkpoint_model_dir is not None:
+            os.makedirs(args.checkpoint_model_dir, exist_ok=True)
+    except OSError as e:
+        print(e)
+        sys.exit(1)
+
+
+def train(args):
+    """
+    Train a TransformerNet on ``args.dataset`` for one style image and save it.
+
+    Every step minimises ``args.content_weight`` * MSE of the relu2_2 features
+    plus ``args.style_weight`` * the summed MSE between utils.gram_matrix of
+    the Vgg16 features of the output batch and of ``args.style_image``.
+    Checkpoints go to ``args.checkpoint_model_dir`` (when set) and the final
+    state dict to ``args.save_model_dir``.
+    """
+    # current_accelerator() already returns the CPU device when args.accel is falsy.
+    device = current_accelerator(args)
+    print(f"Using device: {device}")
+
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+    scale_to_255 = transforms.Lambda(lambda x: x.mul(255))
+    content_pipeline = transforms.Compose([
+        transforms.Resize(args.image_size),
+        transforms.CenterCrop(args.image_size),
+        transforms.ToTensor(),
+        scale_to_255,
+    ])
+    train_dataset = datasets.ImageFolder(args.dataset, content_pipeline)
+    train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
+
+    transformer = TransformerNet().to(device)
+    optimizer = Adam(transformer.parameters(), args.lr)
+    mse_loss = torch.nn.MSELoss()
+
+    vgg = Vgg16(requires_grad=False).to(device)
+    style_pipeline = transforms.Compose([transforms.ToTensor(), scale_to_255])
+    style = style_pipeline(utils.load_image(args.style_image, size=args.style_size))
+    style = style.repeat(args.batch_size, 1, 1, 1).to(device)
+
+    # The style image is fixed, so its Gram matrices are computed once up front.
+    gram_style = [utils.gram_matrix(f) for f in vgg(utils.normalize_batch(style))]
+
+    for epoch in range(args.epochs):
+        transformer.train()
+        content_sum = 0.
+        style_sum = 0.
+        seen = 0
+        for batch_id, (x, _) in enumerate(train_loader):
+            n_batch = len(x)
+            batches_done = batch_id + 1
+            seen += n_batch
+            optimizer.zero_grad()
+
+            x = x.to(device)
+            y = transformer(x)
+            # NOTE(review): call order kept as-is; normalize_batch is opaque from here.
+            y = utils.normalize_batch(y)
+            x = utils.normalize_batch(x)
+            features_y = vgg(y)
+            features_x = vgg(x)
+
+            content_loss = args.content_weight * mse_loss(features_y.relu2_2, features_x.relu2_2)
+
+            style_loss = 0.
+            for ft_y, gm_s in zip(features_y, gram_style):
+                # Slice the style targets down to this batch's size (n_batch).
+                style_loss += mse_loss(utils.gram_matrix(ft_y), gm_s[:n_batch, :, :])
+            style_loss *= args.style_weight
+
+            (content_loss + style_loss).backward()
+            optimizer.step()
+
+            content_sum += content_loss.item()
+            style_sum += style_loss.item()
+
+            if batches_done % args.log_interval == 0:
+                print("{}\tEpoch {}:\t[{}/{}]\tcontent: {:.6f}\tstyle: {:.6f}\ttotal: {:.6f}".format(
+                    time.ctime(), epoch + 1, seen, len(train_dataset),
+                    content_sum / batches_done,
+                    style_sum / batches_done,
+                    (content_sum + style_sum) / batches_done,
+                ))
+
+            if args.checkpoint_model_dir is not None and batches_done % args.checkpoint_interval == 0:
+                transformer.eval().cpu()
+                ckpt_name = f"ckpt_epoch_{epoch}_batch_id_{batches_done}.pth"
+                torch.save(transformer.state_dict(), os.path.join(args.checkpoint_model_dir, ckpt_name))
+                transformer.to(device).train()
+
+    # Final weights are saved from the CPU copy of the model, in eval mode.
+    transformer.eval().cpu()
+    stamp = str(time.ctime()).replace(' ', '_')
+    name_parts = ["epoch", str(args.epochs), stamp, str(args.content_weight), str(args.style_weight)]
+    save_model_path = os.path.join(args.save_model_dir, "_".join(name_parts) + ".model")
+    torch.save(transformer.state_dict(), save_model_path)
+
+    print("\nDone, trained model saved at", save_model_path)
+
+
+def export_model(args,device=None):
+    """
+    Save ``args.model`` as TorchScript (float32 and float16) under
+    ``models/exports/<device type>/``; ``device`` defaults to current_accelerator(args).
+    """
+    if device is None:
+        device = current_accelerator(args)
+    device = torch.device(device)  # also accepts strings such as "cpu" or "mps"
+    print(f"Using device: {device}")
+
+    with torch.no_grad():
+        style_model = TransformerNet()
+        # Load onto the CPU so a checkpoint saved on another device still opens;
+        # the model is moved to ``device`` below anyway.
+        state_dict = torch.load(args.model, map_location="cpu")
+        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
+        for k in list(state_dict.keys()):
+            if re.search(r'in\d+\.running_(mean|var)$', k):
+                del state_dict[k]
+        style_model.load_state_dict(state_dict)
+        style_model.to(device)
+        style_model.eval()
+
+        from pathlib import Path
+        model_name = Path(args.model).stem
+        export_dir = Path("models/exports") / device.type
+        export_dir.mkdir(parents=True, exist_ok=True)
+
+        # Module.to() converts in place; each variant is scripted and saved before the next cast.
+        for dtype, tag in ((torch.float32, "float32"), (torch.float16, "float16")):
+            torch.jit.script(style_model.to(dtype)).save(str(export_dir / f"{model_name}_{tag}.pt"))
+
+
+def stylize(args):
+    """
+    Stylize ``args.content_image`` with the model at ``args.model`` and write it to
+    ``args.output_image``. ``.onnx`` models run through stylize_onnx(); other files are
+    loaded as TransformerNet weights, and TorchScript copies of that network (float32 and
+    float16) are then also saved under ``models/used/``.
+    """
+    device = current_accelerator(args) if args.accel else torch.device("cpu")
+    print(f"Using device: {device}")
+
+    content_image = utils.load_image(args.content_image, scale=args.content_scale)
+    content_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    content_image = content_transform(content_image)
+    content_image = content_image.unsqueeze(0).to(device)
+
+    if args.model.endswith(".onnx"):
+        utils.save_image(args.output_image, stylize_onnx(content_image, args)[0])
+        return
+
+    with torch.no_grad():
+        style_model = TransformerNet()
+        state_dict = torch.load(args.model)
+        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
+        for k in list(state_dict.keys()):
+            if re.search(r'in\d+\.running_(mean|var)$', k):
+                del state_dict[k]
+        style_model.load_state_dict(state_dict)
+        style_model.to(device)
+        style_model.eval()
+        if args.export_onnx:
+            assert args.export_onnx.endswith(".onnx"), "Export model file should end with .onnx"
+            output = torch.onnx._export(
+                style_model, content_image, args.export_onnx, opset_version=11,
+            ).cpu()
+        else:
+            print('Content image shape:', content_image.shape)
+            output = style_model(content_image).cpu()
+        # Save the image exactly once, before the TorchScript export below.
+        utils.save_image(args.output_image, output[0])
+        from pathlib import Path
+        model_name = Path(args.model).stem
+        Path("models/used").mkdir(parents=True, exist_ok=True)  # sm.save() needs the directory to exist
+        sm = torch.jit.script(style_model.to(torch.float32))
+        sm.save(f"models/used/{model_name}_float32.pt")
+        sm = torch.jit.script(style_model.to(torch.float16))
+        sm.save(f"models/used/{model_name}_float16.pt")
+
+
+def stylize_onnx(content_image, args):
+    """
+    Stylize ``content_image`` with the ONNX model at ``args.model`` via onnxruntime.
+
+    The image tensor is fed as the session's first input and the first output
+    is returned as a torch tensor. Combining this with ``args.export_onnx``
+    is rejected by an assertion.
+    """
+    assert not args.export_onnx
+
+    import onnxruntime
+
+    session = onnxruntime.InferenceSession(args.model)
+    input_name = session.get_inputs()[0].name
+
+    # the session is fed a numpy array; detach first if the tensor tracks gradients
+    if content_image.requires_grad:
+        image_array = content_image.detach().cpu().numpy()
+    else:
+        image_array = content_image.cpu().numpy()
+
+    stylized = session.run(None, {input_name: image_array})[0]
+    return torch.from_numpy(stylized)
+
+
+def main():
+    """Parse the train/eval/export command line and dispatch to the matching routine."""
+    main_arg_parser = argparse.ArgumentParser(description="parser for fast-neural-style")
+    subparsers = main_arg_parser.add_subparsers(title="subcommands", dest="subcommand")
+
+    train_arg_parser = subparsers.add_parser("train", help="parser for training arguments")
+    train_arg_parser.add_argument("--epochs", type=int, default=2,
+                                  help="number of training epochs, default is 2")
+    train_arg_parser.add_argument("--batch-size", type=int, default=4,
+                                  help="batch size for training, default is 4")
+    train_arg_parser.add_argument("--dataset", type=str, required=True,
+                                  help="path to training dataset, the path should point to a folder "
+                                       "containing another folder with all the training images")
+    train_arg_parser.add_argument("--style-image", type=str, default="images/style-images/mosaic.jpg",
+                                  help="path to style-image")
+    train_arg_parser.add_argument("--save-model-dir", type=str, required=True,
+                                  help="path to folder where trained model will be saved.")
+    train_arg_parser.add_argument("--checkpoint-model-dir", type=str, default=None,
+                                  help="path to folder where checkpoints of trained models will be saved")
+    train_arg_parser.add_argument("--image-size", type=int, default=256,
+                                  help="size of training images, default is 256 X 256")
+    train_arg_parser.add_argument("--style-size", type=int, default=None,
+                                  help="size of style-image, default is the original size of style image")
+    train_arg_parser.add_argument('--accel', action='store_true',
+                                  help='use accelerator')
+    train_arg_parser.add_argument("--seed", type=int, default=42,
+                                  help="random seed for training")
+    train_arg_parser.add_argument("--content-weight", type=float, default=1e5,
+                                  help="weight for content-loss, default is 1e5")
+    train_arg_parser.add_argument("--style-weight", type=float, default=1e10,
+                                  help="weight for style-loss, default is 1e10")
+    train_arg_parser.add_argument("--lr", type=float, default=1e-3,
+                                  help="learning rate, default is 1e-3")
+    train_arg_parser.add_argument("--log-interval", type=int, default=500,
+                                  help="number of images after which the training loss is logged, default is 500")
+    train_arg_parser.add_argument("--checkpoint-interval", type=int, default=2000,
+                                  help="number of batches after which a checkpoint of the trained model will be created")
+
+    eval_arg_parser = subparsers.add_parser("eval", help="parser for evaluation/stylizing arguments")
+    eval_arg_parser.add_argument("--content-image", type=str, required=True,
+                                 help="path to content image you want to stylize")
+    eval_arg_parser.add_argument("--content-scale", type=float, default=None,
+                                 help="factor for scaling down the content image")
+    eval_arg_parser.add_argument("--output-image", type=str, required=True,
+                                 help="path for saving the output image")
+    eval_arg_parser.add_argument("--model", type=str, required=True,
+                                 help="saved model to be used for stylizing the image. If file ends in .onnx - onnxruntime is used, otherwise the PyTorch path")
+    eval_arg_parser.add_argument("--export_onnx", type=str,
+                                 help="export ONNX model to a given file")
+    eval_arg_parser.add_argument('--accel', action='store_true',
+                                 help='use accelerator')
+
+    export_arg_parser = subparsers.add_parser("export", help="parser for exporting trained style transfer model")
+    export_arg_parser.add_argument("--model", type=str, required=True,
+                                 help="saved PyTorch model (state dict) to export as TorchScript")
+    export_arg_parser.add_argument('--accel', action='store_true',
+                                 help='use accelerator')
+
+    args = main_arg_parser.parse_args()
+
+    if args.subcommand is None:
+        print("ERROR: specify one of train, eval or export")
+        sys.exit(1)
+    if args.accel and not available_accelerator():
+        print("ERROR: accelerator is not available, try running on CPU")
+        sys.exit(1)
+    if not args.accel and available_accelerator():
+        print("WARNING: accelerator is available, run with --accel to enable it")
+
+    if args.subcommand == "train":
+        check_paths(args)
+        train(args)
+    elif args.subcommand == "eval":
+        stylize(args)
+    elif args.subcommand == "export":
+        # Always export a CPU build; export_model(args) would pick the CPU again without --accel.
+        export_model(args,device=torch.device('cpu'))
+        if args.accel:
+            export_model(args)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
diff --git a/demos/video/style-transfer/neural_style_dev_server_working.py b/demos/video/style-transfer/neural_style_dev_server_working.py
new file mode 100644
index 000000000..d48e9dfd1
--- /dev/null
+++ b/demos/video/style-transfer/neural_style_dev_server_working.py
@@ -0,0 +1,368 @@
+print('importing')
+import argparse
+import os
+import sys
+import time
+import re
+
+import numpy as np
+import torch
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision import transforms
+import torch.onnx
+
+import utils
+from transformer_net import TransformerNet
+from vgg import Vgg16
+
+torch.cuda.empty_cache()  # NOTE(review): runs once at import time, before any function in this file — presumably a debugging leftover; confirm it is still needed
+
+
+def available_accelerator():
+    """
+    Return True when a CUDA or MPS backend is usable, False otherwise.
+    """
+    # Query the backends instead of unconditionally reporting success.
+    return torch.cuda.is_available() or torch.backends.mps.is_available()
+
+def current_accelerator(args):
+    """
+    Pick the torch device to run on.
+
+    Returns the CUDA device if available, otherwise MPS, when ``args.accel`` is
+    set and an accelerator is reported; returns the CPU device otherwise.
+    Raises RuntimeError if an accelerator was reported but neither backend is usable.
+    """
+    if not (args.accel and available_accelerator()):
+        return torch.device("cpu")
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    if torch.backends.mps.is_available():
+        return torch.device("mps")
+    raise RuntimeError("No accelerator available")
+
+def check_paths(args):
+    """Create the save/checkpoint directories; exit(1) on OSError (e.g. the path is a regular file)."""
+    try:
+        os.makedirs(args.save_model_dir, exist_ok=True)
+        if args.checkpoint_model_dir is not None:
+            os.makedirs(args.checkpoint_model_dir, exist_ok=True)
+    except OSError as e:
+        print(e)
+        sys.exit(1)
+
+
+def train(args):
+    """
+    Train a TransformerNet on the images in ``args.dataset`` for ``args.style_image``.
+
+    The loss adds a content term (MSE on the VGG16 ``relu2_2`` features) and a style
+    term (MSE on Gram matrices), scaled by ``args.content_weight`` / ``args.style_weight``.
+    Checkpoints go to ``args.checkpoint_model_dir``; final weights to ``args.save_model_dir``.
+    """
+    if args.accel:
+        device = current_accelerator(args)
+    else:
+        device = torch.device("cpu")
+
+    print(f"Using device: {device}")
+
+    # Seed both RNGs from args.seed (no hard-coded override) so the caller controls reproducibility.
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+    transform = transforms.Compose([
+        transforms.Resize(args.image_size),
+        transforms.CenterCrop(args.image_size),
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    print('constructed transform')
+    train_dataset = datasets.ImageFolder(args.dataset, transform)
+    train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
+    print('constructed transform loaders')
+
+    print('making transformer.')
+    transformer = TransformerNet().to(device)
+    print('constructed TransformerNet')
+    optimizer = Adam(transformer.parameters(), args.lr)
+    print('created adam optimizer')
+    mse_loss = torch.nn.MSELoss()
+
+    vgg = Vgg16(requires_grad=False).to(device)
+    print('constructed vgg16')
+    style_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    print('constructed style_transform')
+    style = utils.load_image(args.style_image, size=args.style_size)
+    style = style_transform(style)
+    style = style.repeat(args.batch_size, 1, 1, 1).to(device)
+
+    # Style targets are computed once, outside the training loop.
+    features_style = vgg(utils.normalize_batch(style))
+    gram_style = [utils.gram_matrix(y) for y in features_style]
+
+    for e in range(args.epochs):
+        print(f'epochs: {e}')
+        transformer.train()
+        agg_content_loss = 0.
+        agg_style_loss = 0.
+        count = 0
+        for batch_id, (x, _) in enumerate(train_loader):
+            n_batch = len(x)
+            count += n_batch
+            optimizer.zero_grad()
+
+            x = x.to(device)
+            y = transformer(x)
+
+            y = utils.normalize_batch(y)
+            x = utils.normalize_batch(x)
+
+            features_y = vgg(y)
+            features_x = vgg(x)
+
+            content_loss = args.content_weight * mse_loss(features_y.relu2_2, features_x.relu2_2)
+
+            style_loss = 0.
+            for ft_y, gm_s in zip(features_y, gram_style):
+                gm_y = utils.gram_matrix(ft_y)
+                style_loss += mse_loss(gm_y, gm_s[:n_batch, :, :])  # trim targets to the actual batch size
+            style_loss *= args.style_weight
+
+            total_loss = content_loss + style_loss
+            total_loss.backward()
+            optimizer.step()
+
+            agg_content_loss += content_loss.item()
+            agg_style_loss += style_loss.item()
+            # Report the losses averaged over the batches seen so far in this epoch.
+            if (batch_id + 1) % args.log_interval == 0:
+                mesg = "{}\tEpoch {}:\t[{}/{}]\tcontent: {:.6f}\tstyle: {:.6f}\ttotal: {:.6f}".format(
+                    time.ctime(), e + 1, count, len(train_dataset),
+                                  agg_content_loss / (batch_id + 1),
+                                  agg_style_loss / (batch_id + 1),
+                                  (agg_content_loss + agg_style_loss) / (batch_id + 1)
+                )
+                print(mesg)
+            # Save checkpoints from the CPU, then move the model back and resume training mode.
+            if args.checkpoint_model_dir is not None and (batch_id + 1) % args.checkpoint_interval == 0:
+                transformer.eval().cpu()
+                ckpt_model_filename = "ckpt_epoch_" + str(e) + "_batch_id_" + str(batch_id + 1) + ".pth"
+                ckpt_model_path = os.path.join(args.checkpoint_model_dir, ckpt_model_filename)
+                torch.save(transformer.state_dict(), ckpt_model_path)
+                transformer.to(device).train()
+
+    # save model
+    transformer.eval().cpu()
+    save_model_filename = "epoch_" + str(args.epochs) + "_" + str(time.ctime()).replace(' ', '_') + "_" + str(
+        args.content_weight) + "_" + str(args.style_weight) + ".model"
+    save_model_path = os.path.join(args.save_model_dir, save_model_filename)
+    torch.save(transformer.state_dict(), save_model_path)
+
+    print("\nDone, trained model saved at", save_model_path)
+
+
+def export_model(args,device=None):
+    """
+    Load the checkpoint at ``args.model`` and save TorchScript copies of it.
+
+    Two files are written under ``models/exports/<device type>/``: one scripted
+    after casting the network to float32 and one after casting it to float16.
+    When ``device`` is None it is chosen from ``args.accel``.
+    """
+    from pathlib import Path
+    if device is None:
+        device = current_accelerator(args) if args.accel else torch.device("cpu")
+    print(f"Using device: {device}")
+
+    with torch.no_grad():
+        style_model = TransformerNet()
+        state_dict = torch.load(args.model)
+        # InstanceNorm checkpoints may carry deprecated running_* entries; drop them before loading
+        stale_keys = [key for key in state_dict.keys() if re.search(r'in\d+\.running_(mean|var)$', key)]
+        for key in stale_keys:
+            del state_dict[key]
+        style_model.load_state_dict(state_dict)
+        style_model.to(device)
+        style_model.eval()
+
+        model_name = Path(args.model).stem
+        Path(f'models/exports/{device.type}').mkdir(parents=True, exist_ok=True)
+
+        # the float32 copy is scripted and saved before the model is cast to float16
+        for dtype, suffix in ((torch.float32, "float32"), (torch.float16, "float16")):
+            sm = torch.jit.script(style_model.to(dtype))
+            sm.save(f"models/exports/{device.type}/{model_name}_{suffix}.pt")
+
+
+def stylize(args):
+    """
+    Stylize ``args.content_image`` with the model at ``args.model`` and write it to
+    ``args.output_image``. ``.onnx`` models run through stylize_onnx(); other files are
+    loaded as TransformerNet weights, and TorchScript copies of that network (float32 and
+    float16) are then also saved under ``models/used/``.
+    """
+    device = current_accelerator(args) if args.accel else torch.device("cpu")
+    print(f"Using device: {device}")
+
+    content_image = utils.load_image(args.content_image, scale=args.content_scale)
+    content_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    content_image = content_transform(content_image)
+    content_image = content_image.unsqueeze(0).to(device)
+
+    if args.model.endswith(".onnx"):
+        utils.save_image(args.output_image, stylize_onnx(content_image, args)[0])
+        return
+
+    with torch.no_grad():
+        style_model = TransformerNet()
+        state_dict = torch.load(args.model)
+        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
+        for k in list(state_dict.keys()):
+            if re.search(r'in\d+\.running_(mean|var)$', k):
+                del state_dict[k]
+        style_model.load_state_dict(state_dict)
+        style_model.to(device)
+        style_model.eval()
+        if args.export_onnx:
+            assert args.export_onnx.endswith(".onnx"), "Export model file should end with .onnx"
+            output = torch.onnx._export(
+                style_model, content_image, args.export_onnx, opset_version=11,
+            ).cpu()
+        else:
+            print('Content image shape:', content_image.shape)
+            output = style_model(content_image).cpu()
+        # Save the image exactly once, before the TorchScript export below.
+        utils.save_image(args.output_image, output[0])
+        from pathlib import Path
+        model_name = Path(args.model).stem
+        Path("models/used").mkdir(parents=True, exist_ok=True)  # sm.save() needs the directory to exist
+        sm = torch.jit.script(style_model.to(torch.float32))
+        sm.save(f"models/used/{model_name}_float32.pt")
+        sm = torch.jit.script(style_model.to(torch.float16))
+        sm.save(f"models/used/{model_name}_float16.pt")
+
+
+def stylize_onnx(content_image, args):
+    """
+    Stylize ``content_image`` with the ONNX model at ``args.model`` via onnxruntime.
+
+    The image tensor is fed as the session's first input and the first output
+    is returned as a torch tensor. Combining this with ``args.export_onnx``
+    is rejected by an assertion.
+    """
+    assert not args.export_onnx
+
+    import onnxruntime
+
+    session = onnxruntime.InferenceSession(args.model)
+    input_name = session.get_inputs()[0].name
+
+    # the session is fed a numpy array; detach first if the tensor tracks gradients
+    if content_image.requires_grad:
+        image_array = content_image.detach().cpu().numpy()
+    else:
+        image_array = content_image.cpu().numpy()
+
+    stylized = session.run(None, {input_name: image_array})[0]
+    return torch.from_numpy(stylized)
+
+
+def main():
+    """
+    Parse the train/eval/export command line and dispatch to the matching routine.
+    """
+    print('im in main')
+    main_arg_parser = argparse.ArgumentParser(description="parser for fast-neural-style")
+    subparsers = main_arg_parser.add_subparsers(title="subcommands", dest="subcommand")
+
+    train_arg_parser = subparsers.add_parser("train", help="parser for training arguments")
+    train_arg_parser.add_argument("--epochs", type=int, default=2,
+                                  help="number of training epochs, default is 2")
+    train_arg_parser.add_argument("--batch-size", type=int, default=4,
+                                  help="batch size for training, default is 4")
+    train_arg_parser.add_argument("--dataset", type=str, required=True,
+                                  help="path to training dataset, the path should point to a folder "
+                                       "containing another folder with all the training images")
+    train_arg_parser.add_argument("--style-image", type=str, default="images/style-images/mosaic.jpg",
+                                  help="path to style-image")
+    train_arg_parser.add_argument("--save-model-dir", type=str, required=True,
+                                  help="path to folder where trained model will be saved.")
+    train_arg_parser.add_argument("--checkpoint-model-dir", type=str, default=None,
+                                  help="path to folder where checkpoints of trained models will be saved")
+    train_arg_parser.add_argument("--image-size", type=int, default=256,
+                                  help="size of training images, default is 256 X 256")
+    train_arg_parser.add_argument("--style-size", type=int, default=None,
+                                  help="size of style-image, default is the original size of style image")
+    train_arg_parser.add_argument('--accel', action='store_true',
+                                  help='use accelerator')
+    train_arg_parser.add_argument("--seed", type=int, default=42,
+                                  help="random seed for training")
+    train_arg_parser.add_argument("--content-weight", type=float, default=1e5,
+                                  help="weight for content-loss, default is 1e5")
+    train_arg_parser.add_argument("--style-weight", type=float, default=1e10,
+                                  help="weight for style-loss, default is 1e10")
+    train_arg_parser.add_argument("--lr", type=float, default=1e-3,
+                                  help="learning rate, default is 1e-3")
+    train_arg_parser.add_argument("--log-interval", type=int, default=500,
+                                  help="number of batches after which the training loss is logged, default is 500")
+    train_arg_parser.add_argument("--checkpoint-interval", type=int, default=2000,
+                                  help="number of batches after which a checkpoint of the trained model will be created")
+
+    eval_arg_parser = subparsers.add_parser("eval", help="parser for evaluation/stylizing arguments")
+    eval_arg_parser.add_argument("--content-image", type=str, required=True,
+                                 help="path to content image you want to stylize")
+    eval_arg_parser.add_argument("--content-scale", type=float, default=None,
+                                 help="factor for scaling down the content image")
+    eval_arg_parser.add_argument("--output-image", type=str, required=True,
+                                 help="path for saving the output image")
+    eval_arg_parser.add_argument("--model", type=str, required=True,
+                                 help="saved model to be used for stylizing the image. If file ends in .onnx - onnxruntime is used, otherwise the PyTorch path")
+    eval_arg_parser.add_argument("--export_onnx", type=str,
+                                 help="export ONNX model to a given file")
+    eval_arg_parser.add_argument('--accel', action='store_true',
+                                 help='use accelerator')
+
+    export_arg_parser = subparsers.add_parser("export", help="parser for exporting trained style transfer model")
+    export_arg_parser.add_argument("--model", type=str, required=True,
+                                 help="saved PyTorch model (state dict) to export as TorchScript")
+    export_arg_parser.add_argument('--accel', action='store_true',
+                                 help='use accelerator')
+
+    args = main_arg_parser.parse_args()
+
+    print('args parsed')
+    print(vars(args))
+
+    if args.subcommand is None:
+        print("ERROR: specify one of train, eval or export")
+        sys.exit(1)
+    # if args.accel and not available_accelerator():
+    #     print("ERROR: accelerator is not available, try running on CPU")
+    #     sys.exit(1)
+    # if not args.accel and available_accelerator():
+    #     print("WARNING: accelerator is available, run with --accel to enable it")
+
+    if args.subcommand == "train":
+        check_paths(args)
+        print("Training...")
+        train(args)
+    elif args.subcommand == "eval":
+        stylize(args)
+    elif args.subcommand == "export":
+        # Always export a CPU build; export_model(args) would pick the CPU again without --accel.
+        export_model(args,device=torch.device('cpu'))
+        if args.accel:
+            export_model(args)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
diff --git a/demos/video/style-transfer/neural_style_dev_server_working_2.py b/demos/video/style-transfer/neural_style_dev_server_working_2.py
new file mode 100644
index 000000000..f0930597f
--- /dev/null
+++ b/demos/video/style-transfer/neural_style_dev_server_working_2.py
@@ -0,0 +1,378 @@
+print('importing')
+import argparse
+import os
+import sys
+import time
+import re
+
+import numpy as np
+import torch
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision import transforms
+import torch.onnx
+
+import utils
+from transformer_net import TransformerNet
+from vgg import Vgg16
+
+torch.cuda.empty_cache()  # NOTE(review): runs once at import time, before any function in this file — presumably a debugging leftover; confirm it is still needed
+
+
+def available_accelerator():
+    """
+    Return True when a CUDA or MPS backend is usable, False otherwise.
+    """
+    # Query the backends instead of unconditionally reporting success.
+    return torch.cuda.is_available() or torch.backends.mps.is_available()
+
+def current_accelerator(args):
+    """
+    Pick the torch device to run on.
+
+    Returns the CUDA device if available, otherwise MPS, when ``args.accel`` is
+    set and an accelerator is reported; returns the CPU device otherwise.
+    Raises RuntimeError if an accelerator was reported but neither backend is usable.
+    """
+    if not (args.accel and available_accelerator()):
+        return torch.device("cpu")
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    if torch.backends.mps.is_available():
+        return torch.device("mps")
+    raise RuntimeError("No accelerator available")
+
+def check_paths(args):
+    """Create the save/checkpoint directories; exit(1) on OSError (e.g. the path is a regular file)."""
+    try:
+        os.makedirs(args.save_model_dir, exist_ok=True)
+        if args.checkpoint_model_dir is not None:
+            os.makedirs(args.checkpoint_model_dir, exist_ok=True)
+    except OSError as e:
+        print(e)
+        sys.exit(1)
+
+import math
+def to_sci_not(x):
+    """Split x into (mantissa, exponent) such that x is approximately mantissa * 10**exponent; 0 maps to (0.0, 0)."""
+    exp = math.floor(math.log10(abs(x))) if x else 0  # log10(0) is undefined, so special-case zero
+    return (x / 10 ** exp), exp
+
+def get_file_name(args):
+    """Return the model file stem: style image name plus epochs, batch size and the style/content weights (each rendered as the tuple returned by to_sci_not)."""
+    return '{}_ep{}_bt{}_sw{}_cw_{}'.format(os.path.splitext(os.path.basename(args.style_image))[0], args.epochs, args.batch_size, to_sci_not(args.style_weight), to_sci_not(args.content_weight))
+
+def train(args):
+    """
+    Train a TransformerNet on the images in ``args.dataset`` for ``args.style_image``.
+
+    The loss adds a content term (MSE on the VGG16 ``relu2_2`` features) and a style
+    term (MSE on Gram matrices), scaled by ``args.content_weight`` / ``args.style_weight``.
+    Checkpoints go to ``args.checkpoint_model_dir``; final weights to ``args.save_model_dir``.
+    """
+    if args.accel:
+        device = current_accelerator(args)
+    else:
+        device = torch.device("cpu")
+
+    print(f"Using device: {device}")
+
+    # Seed both RNGs from args.seed (no hard-coded override) so the caller controls reproducibility.
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+    transform = transforms.Compose([
+        transforms.Resize(args.image_size),
+        transforms.CenterCrop(args.image_size),
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    print('constructed transform')
+    train_dataset = datasets.ImageFolder(args.dataset, transform)
+    train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
+    print('constructed transform loaders')
+
+    print('making transformer.')
+    transformer = TransformerNet().to(device)
+    print('constructed TransformerNet')
+    optimizer = Adam(transformer.parameters(), args.lr)
+    print('created adam optimizer')
+    mse_loss = torch.nn.MSELoss()
+
+    vgg = Vgg16(requires_grad=False).to(device)
+    print('constructed vgg16')
+    style_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    print('constructed style_transform')
+    style = utils.load_image(args.style_image, size=args.style_size)
+    style = style_transform(style)
+    style = style.repeat(args.batch_size, 1, 1, 1).to(device)
+
+    # Style targets are computed once, outside the training loop.
+    features_style = vgg(utils.normalize_batch(style))
+    gram_style = [utils.gram_matrix(y) for y in features_style]
+
+    for e in range(args.epochs):
+        print(f'epochs: {e}')
+        transformer.train()
+        agg_content_loss = 0.
+        agg_style_loss = 0.
+        count = 0
+        for batch_id, (x, _) in enumerate(train_loader):
+            n_batch = len(x)
+            count += n_batch
+            optimizer.zero_grad()
+
+            x = x.to(device)
+            y = transformer(x)
+
+            y = utils.normalize_batch(y)
+            x = utils.normalize_batch(x)
+
+            features_y = vgg(y)
+            features_x = vgg(x)
+
+            content_loss = args.content_weight * mse_loss(features_y.relu2_2, features_x.relu2_2)
+
+            style_loss = 0.
+            for ft_y, gm_s in zip(features_y, gram_style):
+                gm_y = utils.gram_matrix(ft_y)
+                style_loss += mse_loss(gm_y, gm_s[:n_batch, :, :])  # trim targets to the actual batch size
+            style_loss *= args.style_weight
+
+            total_loss = content_loss + style_loss
+            total_loss.backward()
+            optimizer.step()
+
+            agg_content_loss += content_loss.item()
+            agg_style_loss += style_loss.item()
+
+            # Report the losses averaged over the batches seen so far in this epoch.
+            if (batch_id + 1) % args.log_interval == 0:
+                mesg = "{}\tEpoch {}:\t[{}/{}]\tcontent: {:.6f}\tstyle: {:.6f}\ttotal: {:.6f}".format(
+                    time.ctime(), e + 1, count, len(train_dataset),
+                                  agg_content_loss / (batch_id + 1),
+                                  agg_style_loss / (batch_id + 1),
+                                  (agg_content_loss + agg_style_loss) / (batch_id + 1)
+                )
+                print(mesg)
+
+            # Save checkpoints from the CPU, then move the model back and resume training mode.
+            if args.checkpoint_model_dir is not None and (batch_id + 1) % args.checkpoint_interval == 0:
+                transformer.eval().cpu()
+                ckpt_model_filename = get_file_name(args) + "_flash_epoch_" + str(e) + "_batch_id_" + str(batch_id + 1) + ".model"
+                ckpt_model_path = os.path.join(args.checkpoint_model_dir, ckpt_model_filename)
+                torch.save(transformer.state_dict(), ckpt_model_path)
+                transformer.to(device).train()
+
+    # save model
+    transformer.eval().cpu()
+    save_model_filename = get_file_name(args) + '.model'
+    save_model_path = os.path.join(args.save_model_dir, save_model_filename)
+    torch.save(transformer.state_dict(), save_model_path)
+
+    print("\nDone, trained model saved at", save_model_path)
+
+
+def export_model(args,device=None):
+    """
+    Load the checkpoint at ``args.model`` and save TorchScript copies of it.
+
+    Two files are written under ``models/exports/<device type>/``: one scripted
+    after casting the network to float32 and one after casting it to float16.
+    When ``device`` is None it is chosen from ``args.accel``.
+    """
+    from pathlib import Path
+    if device is None:
+        device = current_accelerator(args) if args.accel else torch.device("cpu")
+    print(f"Using device: {device}")
+
+    with torch.no_grad():
+        style_model = TransformerNet()
+        state_dict = torch.load(args.model)
+        # InstanceNorm checkpoints may carry deprecated running_* entries; drop them before loading
+        stale_keys = [key for key in state_dict.keys() if re.search(r'in\d+\.running_(mean|var)$', key)]
+        for key in stale_keys:
+            del state_dict[key]
+        style_model.load_state_dict(state_dict)
+        style_model.to(device)
+        style_model.eval()
+
+        model_name = Path(args.model).stem
+        Path(f'models/exports/{device.type}').mkdir(parents=True, exist_ok=True)
+
+        # the float32 copy is scripted and saved before the model is cast to float16
+        for dtype, suffix in ((torch.float32, "float32"), (torch.float16, "float16")):
+            sm = torch.jit.script(style_model.to(dtype))
+            sm.save(f"models/exports/{device.type}/{model_name}_{suffix}.pt")
+
+
+def stylize(args):
+    """
+    Stylize ``args.content_image`` with the model at ``args.model`` and write it to
+    ``args.output_image``. ``.onnx`` models run through stylize_onnx(); other files are
+    loaded as TransformerNet weights, and TorchScript copies of that network (float32 and
+    float16) are then also saved under ``models/used/``.
+    """
+    device = current_accelerator(args) if args.accel else torch.device("cpu")
+    print(f"Using device: {device}")
+
+    content_image = utils.load_image(args.content_image, scale=args.content_scale)
+    content_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    content_image = content_transform(content_image)
+    content_image = content_image.unsqueeze(0).to(device)
+
+    if args.model.endswith(".onnx"):
+        utils.save_image(args.output_image, stylize_onnx(content_image, args)[0])
+        return
+
+    with torch.no_grad():
+        style_model = TransformerNet()
+        state_dict = torch.load(args.model)
+        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
+        for k in list(state_dict.keys()):
+            if re.search(r'in\d+\.running_(mean|var)$', k):
+                del state_dict[k]
+        style_model.load_state_dict(state_dict)
+        style_model.to(device)
+        style_model.eval()
+        if args.export_onnx:
+            assert args.export_onnx.endswith(".onnx"), "Export model file should end with .onnx"
+            output = torch.onnx._export(
+                style_model, content_image, args.export_onnx, opset_version=11,
+            ).cpu()
+        else:
+            print('Content image shape:', content_image.shape)
+            output = style_model(content_image).cpu()
+        # Save the image exactly once, before the TorchScript export below.
+        utils.save_image(args.output_image, output[0])
+        from pathlib import Path
+        model_name = Path(args.model).stem
+        Path("models/used").mkdir(parents=True, exist_ok=True)  # sm.save() needs the directory to exist
+        sm = torch.jit.script(style_model.to(torch.float32))
+        sm.save(f"models/used/{model_name}_float32.pt")
+        sm = torch.jit.script(style_model.to(torch.float16))
+        sm.save(f"models/used/{model_name}_float16.pt")
+
+
+def stylize_onnx(content_image, args):
+    """
+    Run the ONNX model at ``args.model`` on ``content_image`` with onnxruntime
+    and return the first model output as a torch tensor.
+    """
+
+    # --export_onnx cannot be combined with running an ONNX model
+    assert not args.export_onnx
+
+    import onnxruntime
+
+    session = onnxruntime.InferenceSession(args.model)
+
+    # The session is fed a numpy array; a tensor that requires grad has to be
+    # detached before it can be converted.
+    if content_image.requires_grad:
+        image_array = content_image.detach().cpu().numpy()
+    else:
+        image_array = content_image.cpu().numpy()
+
+    input_name = session.get_inputs()[0].name
+    outputs = session.run(None, {input_name: image_array})
+    return torch.from_numpy(outputs[0])
+
+
+def main():
+    """Parse the train / eval / export command line and dispatch to the matching routine."""
+    print('im in main')
+    main_arg_parser = argparse.ArgumentParser(description="parser for fast-neural-style")
+    subparsers = main_arg_parser.add_subparsers(title="subcommands", dest="subcommand")
+
+    train_arg_parser = subparsers.add_parser("train", help="parser for training arguments")
+    train_arg_parser.add_argument("--epochs", type=int, default=2,
+                                  help="number of training epochs, default is 2")
+    train_arg_parser.add_argument("--batch-size", type=int, default=4,
+                                  help="batch size for training, default is 4")
+    train_arg_parser.add_argument("--dataset", type=str, required=True,
+                                  help="path to training dataset, the path should point to a folder "
+                                       "containing another folder with all the training images")
+    train_arg_parser.add_argument("--style-image", type=str, default="images/style-images/mosaic.jpg",
+                                  help="path to style-image")
+    train_arg_parser.add_argument("--save-model-dir", type=str, required=True,
+                                  help="path to folder where trained model will be saved.")
+    train_arg_parser.add_argument("--checkpoint-model-dir", type=str, default=None,
+                                  help="path to folder where checkpoints of trained models will be saved")
+    train_arg_parser.add_argument("--image-size", type=int, default=256,
+                                  help="size of training images, default is 256 X 256")
+    train_arg_parser.add_argument("--style-size", type=int, default=None,
+                                  help="size of style-image, default is the original size of style image")
+    train_arg_parser.add_argument('--accel', action='store_true',
+                                  help='use accelerator')
+    train_arg_parser.add_argument("--seed", type=int, default=42,
+                                  help="random seed for training")
+    train_arg_parser.add_argument("--content-weight", type=float, default=1e5,
+                                  help="weight for content-loss, default is 1e5")
+    train_arg_parser.add_argument("--style-weight", type=float, default=1e10,
+                                  help="weight for style-loss, default is 1e10")
+    train_arg_parser.add_argument("--lr", type=float, default=1e-3,
+                                  help="learning rate, default is 1e-3")
+    train_arg_parser.add_argument("--log-interval", type=int, default=500,
+                                  help="number of images after which the training loss is logged, default is 500")
+    train_arg_parser.add_argument("--checkpoint-interval", type=int, default=2000,
+                                  help="number of batches after which a checkpoint of the trained model will be created")
+
+    eval_arg_parser = subparsers.add_parser("eval", help="parser for evaluation/stylizing arguments")
+    eval_arg_parser.add_argument("--content-image", type=str, required=True,
+                                 help="path to content image you want to stylize")
+    eval_arg_parser.add_argument("--content-scale", type=float, default=None,
+                                 help="factor for scaling down the content image")
+    eval_arg_parser.add_argument("--output-image", type=str, required=True,
+                                 help="path for saving the output image")
+    eval_arg_parser.add_argument("--model", type=str, required=True,
+                                 help="saved model to be used for stylizing the image. If file ends in .pth - PyTorch path is used, if in .onnx - Caffe2 path")
+    eval_arg_parser.add_argument("--export_onnx", type=str,
+                                 help="export ONNX model to a given file")
+    eval_arg_parser.add_argument('--accel', action='store_true',
+                                 help='use accelerator')
+
+    export_arg_parser = subparsers.add_parser("export", help="parser for exporting trained style transfer model")
+    # export_arg_parser.add_argument("--content-scale", type=float, default=None,
+    #                              help="factor for scaling down the content image")
+    export_arg_parser.add_argument("--model", type=str, required=True,
+                                 help="saved model to be used for stylizing the image. If file ends in .pth - PyTorch path is used, if in .onnx - Caffe2 path")
+    export_arg_parser.add_argument('--accel', action='store_true',
+                                 help='use accelerator')
+
+    args = main_arg_parser.parse_args()
+
+    print('args parsed')
+    print(vars(args))
+
+    if args.subcommand is None:
+        print("ERROR: specify either train, eval or export")
+        sys.exit(1)
+    # if args.accel and not available_accelerator():
+    #     print("ERROR: accelerator is not available, try running on CPU")
+    #     sys.exit(1)
+    # if not args.accel and available_accelerator():
+    #     print("WARNING: accelerator is available, run with --accel to enable it")
+
+    if args.subcommand == "train":
+        check_paths(args)
+        print("Training...")
+        train(args)
+    elif args.subcommand == "eval":
+        stylize(args)
+    elif args.subcommand == "export":
+        # Export twice: first pinned to the CPU, then again with whatever device
+        # export_model() selects when none is passed.
+        export_model(args,device=torch.device('cpu'))
+        export_model(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demos/video/style-transfer/saved_models.zip b/demos/video/style-transfer/saved_models.zip
new file mode 100644
index 000000000..cdb980749
Binary files /dev/null and b/demos/video/style-transfer/saved_models.zip differ
diff --git a/demos/video/style-transfer/saved_models/candy.pth b/demos/video/style-transfer/saved_models/candy.pth
new file mode 100644
index 000000000..f7767f70a
Binary files /dev/null and b/demos/video/style-transfer/saved_models/candy.pth differ
diff --git a/demos/video/style-transfer/saved_models/mosaic.pth b/demos/video/style-transfer/saved_models/mosaic.pth
new file mode 100644
index 000000000..7b75d0d82
Binary files /dev/null and b/demos/video/style-transfer/saved_models/mosaic.pth differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5.model
new file mode 100644
index 000000000..440729bd8
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000.model
new file mode 100644
index 000000000..b2d557b09
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_1_batch_id_8000.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000.model
new file mode 100644
index 000000000..33b76bff7
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_16000.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000.model
new file mode 100644
index 000000000..eb16d2571
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e10_cw_1e5_flash_epoch_2_batch_id_2000.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5.model
new file mode 100644
index 000000000..f4e67ce14
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000.model
new file mode 100644
index 000000000..02481fbb4
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_16000.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000.model
new file mode 100644
index 000000000..2a362bd23
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw1e11_cw_1e5_flash_epoch_2_batch_id_2000.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000.model
new file mode 100644
index 000000000..0dbf5eefb
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_12000.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000.model
new file mode 100644
index 000000000..4bc641533
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_0_batch_id_20000.model differ
diff --git a/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000.model b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000.model
new file mode 100644
index 000000000..c8457f6b4
Binary files /dev/null and b/demos/video/style-transfer/saved_models/nature_oil_painting_ep4_bt4_sw5e9_cw_1e5_flash_epoch_1_batch_id_8000.model differ
diff --git a/demos/video/style-transfer/saved_models/oil_h_bt4_5e11_ep4_epoch_0_batch_id_18000.pth b/demos/video/style-transfer/saved_models/oil_h_bt4_5e11_ep4_epoch_0_batch_id_18000.pth
new file mode 100644
index 000000000..71ed76034
Binary files /dev/null and b/demos/video/style-transfer/saved_models/oil_h_bt4_5e11_ep4_epoch_0_batch_id_18000.pth differ
diff --git a/demos/video/style-transfer/saved_models/rain_princess.pth b/demos/video/style-transfer/saved_models/rain_princess.pth
new file mode 100644
index 000000000..3a20f1710
Binary files /dev/null and b/demos/video/style-transfer/saved_models/rain_princess.pth differ
diff --git a/demos/video/style-transfer/saved_models/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800.pth b/demos/video/style-transfer/saved_models/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800.pth
new file mode 100644
index 000000000..521125012
Binary files /dev/null and b/demos/video/style-transfer/saved_models/starry_h_bt10_1e10_ep5_epoch_2_batch_id_4800.pth differ
diff --git a/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2.model b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2.model
new file mode 100644
index 000000000..7cbcd7233
Binary files /dev/null and b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2.model differ
diff --git a/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_18000.pth b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_18000.pth
new file mode 100644
index 000000000..c761339be
Binary files /dev/null and b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_18000.pth differ
diff --git a/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_4000.pth b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_4000.pth
new file mode 100644
index 000000000..2a4a2ad72
Binary files /dev/null and b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_4000.pth differ
diff --git a/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_6000.pth b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_6000.pth
new file mode 100644
index 000000000..ba2a8f97f
Binary files /dev/null and b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_0_batch_id_6000.pth differ
diff --git a/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_1_batch_id_20000.pth b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_1_batch_id_20000.pth
new file mode 100644
index 000000000..0374a90b0
Binary files /dev/null and b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_1_batch_id_20000.pth differ
diff --git a/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_1_batch_id_8000.pth b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_1_batch_id_8000.pth
new file mode 100644
index 000000000..735729f26
Binary files /dev/null and b/demos/video/style-transfer/saved_models/starry_v_bt4_1e10_ep2_epoch_1_batch_id_8000.pth differ
diff --git a/demos/video/style-transfer/saved_models/udnie.pth b/demos/video/style-transfer/saved_models/udnie.pth
new file mode 100644
index 000000000..85e384569
Binary files /dev/null and b/demos/video/style-transfer/saved_models/udnie.pth differ
diff --git a/demos/video/style-transfer/sobel.ipynb b/demos/video/style-transfer/sobel.ipynb
new file mode 100644
index 000000000..1eb6e48ee
--- /dev/null
+++ b/demos/video/style-transfer/sobel.ipynb
@@ -0,0 +1,238 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "8299f21a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "\n",
+    "sobel_dx = torch.tensor([[-1, 0, 1],\n",
+    "                         [-2, 0, 2],\n",
+    "                         [-1, 0, 1]], dtype=torch.float32)\n",
+    "\n",
+    "sobel_dy = torch.tensor([[-1, -2, -1],\n",
+    "                         [ 0,  0,  0],\n",
+    "                         [ 1,  2,  1]], dtype=torch.float32)\n",
+    "\n",
+    "kernel = torch.stack([sobel_dx, sobel_dy])   # [2,3,3]\n",
+    "kernel = kernel.unsqueeze(1).repeat(1, 3, 1, 1)  # [2,3,3,3]\n",
+    "\n",
+    "def sobel_filter(img: torch.Tensor) -> torch.Tensor:\n",
+    "    \"\"\"\n",
+    "    img: Nx3xHxW float32 in [0,1] or [0,255]\n",
+    "    returns: Nx2xHxW  (channel 0 = ∂I/∂x, channel 1 = ∂I/∂y)\n",
+    "    \"\"\"\n",
+    "    return F.conv2d(img, kernel, padding=1)\n",
+    "\n",
+    "def sobel_magnitude(img: torch.Tensor) -> torch.Tensor:\n",
+    "    g = sobel_filter(img)\n",
+    "    return (g ** 2).sum(1, keepdim=True).sqrt()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "76b4a97a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1, 2, 1428, 1904])"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "img = torch.rand(1, 3, 1428, 1904)\n",
+    "sobel_filter(img).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "2bb81778",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Sobel(torch.nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(Sobel, self).__init__()\n",
+    "        sobel_dx = torch.tensor([[-1, 0, 1],\n",
+    "                                [-2, 0, 2],\n",
+    "                                [-1, 0, 1]], dtype=torch.float32)\n",
+    "\n",
+    "        sobel_dy = torch.tensor([[-1, -2, -1],\n",
+    "                                [ 0,  0,  0],\n",
+    "                                [ 1,  2,  1]], dtype=torch.float32)\n",
+    "\n",
+    "        sobel_kernel = torch.stack([sobel_dx, sobel_dy])   # [2,3,3]\n",
+    "        sobel_kernel = sobel_kernel.unsqueeze(1).repeat(1, 3, 1, 1)  # [2,3,3,3]\n",
+    "        sobel_kernel = sobel_kernel.to(torch.float32)\n",
+    "\n",
+    "        self.sobel_kernel = torch.nn.Parameter(sobel_kernel, requires_grad=False)\n",
+    "        # self.sobel_cnn = torch.nn.Conv2d(3, 3, kernel_size=3, stride=1, padding=1, bias=False).to(torch.float16)\n",
+    "        # self.sobel_cnn.weight = torch.nn.Parameter(sobel_kernel, requires_grad=False)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        # return self.sobel_cnn(x)\n",
+    "        return F.conv2d(x, self.sobel_kernel, padding=1,stride=1)\n",
+    "\n",
+    "sobel = Sobel().to('mps').to(torch.float32)\n",
+    "sm = torch.jit.script(sobel)\n",
+    "sm.save(\"models/sobel_float32.pt\")\n",
+    "sobel = Sobel().to('mps').to(torch.float16)\n",
+    "sm = torch.jit.script(sobel)\n",
+    "sm.save(\"models/sobel_float16.pt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48b8c033",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "805e5f83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "m = torch.jit.load(\"models/sobel.pt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "2b79f222",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img = torch.rand(1, 3, 1428, 1904).to(torch.float16)\n",
+    "existing_model_output = sobel(img)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b29aea58",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "torch.Size([1, 2, 1428, 1904])\n",
+      "torch.Size([1, 2, 1428, 1904])\n"
+     ]
+    }
+   ],
+   "source": [
+    "loaded_model_output = m(img)\n",
+    "print(torch.allclose(existing_model_output, loaded_model_output, atol=1e-5))\n",
+    "print(existing_model_output.shape)\n",
+    "print(loaded_model_output.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24038040",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-05-06 14:52:39.256 Python[52849:2569153] WARNING: AVCaptureDeviceTypeExternal is deprecated for Continuity Cameras. Please use AVCaptureDeviceTypeContinuityCamera and add NSCameraUseContinuityCameraDeviceType to your Info.plist.\n"
+     ]
+    },
+    {
+     "ename": "",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
+      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
+      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
+      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
+     ]
+    }
+   ],
+   "source": [
+    "import cv2\n",
+    "\n",
+    "# Open the default camera\n",
+    "cam = cv2.VideoCapture(0)\n",
+    "\n",
+    "# Get the default frame width and height\n",
+    "frame_width = int(cam.get(cv2.CAP_PROP_FRAME_WIDTH))\n",
+    "frame_height = int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT))\n",
+    "\n",
+    "# Define the codec and create VideoWriter object\n",
+    "fourcc = cv2.VideoWriter_fourcc(*'mp4v')\n",
+    "out = cv2.VideoWriter('output.mp4', fourcc, 20.0, (frame_width, frame_height))\n",
+    "\n",
+    "while True:\n",
+    "    ret, frame = cam.read()\n",
+    "\n",
+    "    # Write the frame to the output file\n",
+    "    out.write(frame)\n",
+    "\n",
+    "    # Display the captured frame\n",
+    "    cv2.imshow('Camera', frame)\n",
+    "\n",
+    "    # Press 'q' to exit the loop\n",
+    "    if cv2.waitKey(1) == ord('q'):\n",
+    "        break\n",
+    "\n",
+    "# Release the capture and writer objects\n",
+    "cam.release()\n",
+    "out.release()\n",
+    "cv2.destroyAllWindows()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad2f6628",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demos/video/style-transfer/sobel.py b/demos/video/style-transfer/sobel.py
new file mode 100644
index 000000000..1f920047f
--- /dev/null
+++ b/demos/video/style-transfer/sobel.py
@@ -0,0 +1,267 @@
+import cv2
+import torch
+import numpy as np
+import utils
+import torchvision
+import argparse
+
+
+
+# default_device = torch.device('cpu')
+# if torch.backends.mps.is_available():
+#     default_device = torch.device('mps')
+#     print('using mps')
+
+# if torch.backends.cuda.is_available():
+#     default_device = torch.device('cuda')
+#     print('using cuda')
+
+# print('using default device:', default_device)
+
+# torch.set_default_device(default_device)
+
+
+
+# Open the default camera (device index 0)
+cam = cv2.VideoCapture(0)
+
+# Frame size as reported by the capture device
+frame_width = int(cam.get(cv2.CAP_PROP_FRAME_WIDTH))
+frame_height = int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT))
+# Codec for optional recording; in this script fourcc / frame_width / frame_height
+# are only referenced by the commented-out VideoWriter below.
+fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+# out = cv2.VideoWriter('output.mp4', fourcc, 20.0, (frame_width, frame_height))
+
+
+import torch.nn.functional as F
+
+
+# sobel_dx = torch.tensor([[-1, 0, 1],
+#                          [-2, 0, 2],
+#                          [-1, 0, 1]], dtype=torch.float32)
+
+# sobel_dy = torch.tensor([[-1, -2, -1],
+#                          [ 0,  0,  0],
+#                          [ 1,  2,  1]], dtype=torch.float32)
+
+# kernel = torch.cat([sobel_dx.unsqueeze(0), sobel_dy.unsqueeze(0)],0)   # [2,3,3]
+# kernel = kernel.unsqueeze(1).to('mps')  # [2,3,3,3]
+
+# # kernel = kernel.unsqueeze(1).repeat(1, 3, 1, 1).to('mps')  # [2,3,3,3]
+
+# def sobel_filter(img: torch.Tensor) -> torch.Tensor:
+#     """
+#     img: Nx3xHxW float32 in [0,1] or [0,255]
+#     returns: Nx2xHxW  (channel 0 = ∂I/∂x, channel 1 = ∂I/∂y)
+#     """
+#     return F.conv2d(img, kernel, padding=1)
+
+# def sobel_magnitude(img: torch.Tensor) -> torch.Tensor:
+#     g = sobel_filter(img)
+#     return (g ** 2).sum(1, keepdim=True).sqrt()
+
+
+def sobel_edges(rgb: torch.Tensor) -> torch.Tensor:
+    """
+    Per-channel Sobel edge magnitude of an RGB batch.
+
+    rgb : (N, 3, H, W) float tensor in the range [0, 1] or [-1, 1]
+          (any range is fine as long as it's float)
+
+    Returns
+    -------
+    edges : (N, 3, H, W) tensor – per‑channel Sobel edge magnitude,
+            same H and W as the input (border pixels see the zero padding).
+    """
+    # --- 1. Build Sobel kernels ------------------------------------------------
+    # Create the kernel with the input's dtype/device instead of hard-coding
+    # 'mps', so the function also works for CPU/CUDA inputs.
+    sobel_x = torch.tensor([[-1., 0., 1.],
+                            [-2., 0., 2.],
+                            [-1., 0., 1.]], dtype=rgb.dtype, device=rgb.device)
+    sobel_y = sobel_x.T
+
+    # Depth‑wise (grouped) convolution: every colour channel is filtered
+    # independently with the same 3x3 kernel.
+    # Weight shape for conv2d: (out_channels, in_channels/groups, kH, kW)
+    weight_x = sobel_x.expand(3, 1, 3, 3)       # (3,1,3,3)
+    weight_y = sobel_y.expand(3, 1, 3, 3)
+
+    # --- 2. Apply the 2D convolutions (padding=1 keeps H and W unchanged) ------
+    grad_x = F.conv2d(rgb, weight_x, padding=1, groups=3)
+    grad_y = F.conv2d(rgb, weight_y, padding=1, groups=3)
+
+    # --- 3. Edge magnitude per channel ----------------------------------------
+    # The small epsilon keeps the sqrt argument away from exactly zero.
+    return torch.sqrt(grad_x**2 + grad_y**2 + 1e-6)
+
+
+def tensor_to_bgr(frame_tensor, *, undo_normalise=False, mean=None, std=None):
+    """
+    Convert an RGB image tensor into a uint8 BGR array for OpenCV.
+
+    Args
+    ----
+    frame_tensor : torch.Tensor
+        (C,H,W) or (1,C,H,W)   ―  float or half   ―  RGB
+    undo_normalise : bool
+        True if you previously applied (x - mean) / std
+    mean, std : list/tuple of 3 floats
+        Same numbers you used for normalising (e.g. ImageNet)
+
+    Returns
+    -------
+    frame_bgr : np.ndarray   (H,W,3) uint8   BGR  contiguous
+
+    Raises
+    ------
+    ValueError if undo_normalise is set but mean or std is missing.
+    """
+    # A 4-D input is treated as a batch; only its first image is converted.
+    chw = frame_tensor[0] if frame_tensor.ndim == 4 else frame_tensor
+    chw = chw.detach()
+
+    if undo_normalise:
+        if mean is None or std is None:
+            raise ValueError("Supply mean and std to undo normalisation")
+        # .to(chw) matches the dtype and device of the image before broadcasting
+        mean_t = torch.tensor(mean).to(chw).view(3,1,1)
+        std_t = torch.tensor(std).to(chw).view(3,1,1)
+        chw = chw * std_t + mean_t
+
+    # Scale to 0‑255, clamp and cast to uint8 before the tensor is copied
+    # to the CPU.
+    chw = (chw * 255.0).clamp(0,255).byte()
+
+    hwc_rgb = chw.permute(1,2,0).cpu().numpy()            # H,W,C  RGB
+    return np.ascontiguousarray(cv2.cvtColor(hwc_rgb, cv2.COLOR_RGB2BGR))
+
+
+def undo_normalize(tensor):
+    """Undo ImageNet mean/std normalisation (RGB channel order) and clamp the result to [0, 1]."""
+    # Per-channel statistics shaped (3, 1, 1), created with tensor's dtype and device.
+    std = tensor.new_tensor((0.229, 0.224, 0.225)).view(3, 1, 1)
+    mean = tensor.new_tensor((0.485, 0.456, 0.406)).view(3, 1, 1)
+    return tensor.mul(std).add(mean).clamp(0, 1)
+
+
+class Sobel(torch.nn.Module):
+    """Stateless module that returns the per-channel Sobel edge magnitude of its input."""
+    def __init__(self):
+        super(Sobel, self).__init__()
+
+        # No parameters or buffers are registered here; forward() rebuilds the
+        # fixed 3x3 kernels on every call.
+
+    def forward(self, rgb):
+        # .to(rgb) below casts the kernels to the dtype and device of the input.
+        sobel_x = torch.tensor([[-1., 0., 1.],
+                            [-2., 0., 2.],
+                            [-1., 0., 1.]],requires_grad=False)
+        sobel_y = sobel_x.T
+
+        # Depth‑wise (grouped) convolution with groups=3: each input channel is
+        # filtered on its own with the same 3x3 kernel.
+        # Weight shape for conv2d: (out_channels, in_channels/groups, kH, kW)
+        # Here:  out_channels = in_channels = 3   and   groups = 3
+        weight_x = sobel_x.expand(3, 1, 3, 3).to(rgb)       # (3,1,3,3)
+        weight_y = sobel_y.expand(3, 1, 3, 3).to(rgb)
+
+        # --- Apply the 2D convolutions ---------------------------------------------
+        # Kernel size is 3 ⇒ padding=1 keeps the spatial size unchanged.
+        grad_x = F.conv2d(rgb, weight_x, padding=1, groups=3)
+        grad_y = F.conv2d(rgb, weight_y, padding=1, groups=3)
+
+        # --- Edge magnitude per channel --------------------------------------------
+        # The 1e-6 term keeps the sqrt argument away from exactly zero.
+        edges = torch.sqrt(grad_x**2 + grad_y**2 + 1e-6)
+        # edges = grad_x + grad_y
+        return edges
+
+
+# Script the Sobel module, save it, then reload the saved file; the reloaded
+# TorchScript module `sm` is the one the capture loop runs.
+sobel = Sobel().to('mps').to(torch.float32)
+sm = torch.jit.script(sobel)
+sm.save("models/sobel_edge_float32.pt")
+sm = torch.jit.load("models/sobel_edge_float32.pt")
+# sm = torch.jit.load("models/mosaic_float32.pt")
+# sm.to('mps')
+
+# NOTE(review): `mosaic` is loaded here, but its only visible uses are in
+# commented-out code inside the loop — confirm whether this load is still needed.
+mosaic = torch.jit.load("models/mosaic_float16.pt")
+mosaic.to('mps')
+
+import sys
+import time
+
+ticks = 1
+
+while True:
+    ret, frame_bgr = cam.read()
+    if not ret:
+        break  # no frame was delivered; cvtColor below cannot process it
+
+    # Write the frame to the output file
+    # out.write(frame)
+
+    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+
+    # 3) Ensure the array is contiguous (torch needs it) -------------------------
+    frame_rgb = np.ascontiguousarray(frame_rgb)
+
+    # 4) numpy -> torch, move channels, scale, add batch if wanted --------------
+    tensor = torch.from_numpy(frame_rgb)     # H x W x C, uint8 → int tensor
+    tensor = tensor.to("mps", non_blocking=True)
+    tensor = tensor.permute(2, 0, 1)         # C x H x W
+    tensor = tensor.to(torch.float32).div(255.0)       # float32, [0,1]
+
+    # normalize tensor to ImageNet mean and std
+    # mean = (0.485, 0.456, 0.406)  # ImageNet defaults (RGB)
+    # std  = (0.229, 0.224, 0.225)
+    # mean = torch.tensor(mean, dtype=tensor.dtype, device=tensor.device)[:, None, None]
+    # std  = torch.tensor(std,  dtype=tensor.dtype, device=tensor.device)[:, None, None]
+    # tensor.sub_(mean).div_(std)
+
+    # 5) add a batch dim (the tensor already lives on the device) ---------------
+    tensor = tensor.unsqueeze(0)             # 1 x C x H x W
+
+    # if ticks == 3:
+    #     tensor = tensor.to(torch.float16)
+    #     mosaic = torch.jit.load("models/mosaic_float16.pt")
+    #     mosaic.to('mps')
+    #     mosaic_output = mosaic(tensor) / 255.0
+    #     # mosaic_output = undo_normalize(mosaic_output)
+    #     print('input:',tensor.shape,tensor.dtype)
+    #     print('mosaic output:',mosaic_output.shape,mosaic_output.dtype)
+    #     torchvision.utils.save_image(tensor[0], 'input_tensor.png')
+    #     torchvision.utils.save_image(mosaic_output[0], 'mosaic_output.png')
+
+    #     sys.exit(0)
+
+    output_tensor = sm(tensor.to(torch.float16))
+    # print('input:',tensor.shape,tensor.dtype)
+    # print('output:',output_tensor.shape)
+
+    # frame_bgr_out = tensor_to_bgr(output_tensor, undo_normalise=True,mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    frame_bgr_out = tensor_to_bgr(output_tensor)
+
+    # Display the processed frame
+    cv2.imshow('Camera', frame_bgr_out)
+
+    # time.sleep(1.0)
+
+    # Press 'q' to exit the loop
+    # if ticks > 10:
+    #     break
+
+    if cv2.waitKey(1) == ord('q'):
+        break
+    ticks += 1
+
+# Release the capture and writer objects
+cam.release()
+# out.release()
+cv2.destroyAllWindows()
\ No newline at end of file
diff --git a/demos/video/style-transfer/style_transfer.cpp b/demos/video/style-transfer/style_transfer.cpp
new file mode 100644
index 000000000..09fd1462e
--- /dev/null
+++ b/demos/video/style-transfer/style_transfer.cpp
@@ -0,0 +1,312 @@
+#include <torch/torch.h>
+#include <torch/script.h>
+#include <opencv2/opencv.hpp>
+#include <chrono>
+#include <utility>
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <cvtool.hpp>
+
+
+// Forward declaration: main() calls run_webcam_model, which is defined further down in this file.
+int run_webcam_model(torch::jit::Module& module, int cam_index, int max_fps, bool is_video_loop, std::string vid_path);
+// File-local device choice; starts as MPS and is reassigned in main() after the availability check.
+static torch::Device default_device_st = torch::Device(torch::kMPS);
+
+
+// Loads the TorchScript module stored at `model_path` and moves it to cvtool's default device.
+// A c10::Error raised while loading or moving is logged to stderr and not rethrown; the
+// module is then returned in whatever state it reached (default-constructed if the load failed).
+torch::jit::Module load_model(const std::string& model_path) {
+    std::cout << "Loading model from path: " << model_path << std::endl;
+    torch::jit::Module module;
+    try {
+        // Deserialize the ScriptModule from a file using torch::jit::load().
+        module = torch::jit::load(model_path);
+        std::cout << "Model loaded successfully." << std::endl;
+        std::cout << "Moving model to device..." << std::endl;
+        module.to(cvtool::get_default_device());
+        std::cout << "Model moved to device." << std::endl;
+    } catch (const c10::Error& e) {
+        // End the report with a newline and flush it so it cannot run into later output.
+        std::cerr << "error loading the model\n" << e.msg() << std::endl;
+    }
+    // Success is only announced inside the try block, so a failed load is never reported as loaded.
+    return module;
+}
+
+torch::Tensor preprocess_input(const torch::Tensor& input) {
+    // Preprocessing hook: currently an identity pass-through that returns `input` unchanged.
+    // An ImageNet mean/std normalisation was sketched here but is disabled:
+    // auto mean = torch::tensor({0.485, 0.456, 0.406}).view({1, 3, 1, 1});
+    // auto std = torch::tensor({0.229, 0.224, 0.225}).view({1, 3, 1, 1});
+    // return (input - mean) / std;
+    return input;
+}
+
+// Runs one forward pass of `module` on `input` and returns the result as a tensor.
+//
+// Parameters:
+//   module - TorchScript module to execute; it is used as passed in (no device move or
+//            eval() call happens here).
+//   input  - tensor handed to forward() as the only argument.
+//
+// Returns the forward() result converted with toTensor().
+at::Tensor run_model(torch::jit::Module& module, const at::Tensor& input) {
+
+    // Debugging aids, disabled by default:
+    // std::cout << "Input dtype: " << input.dtype() << std::endl;
+    // std::cout << "Input sizes: " << input.sizes() << std::endl;
+    // std::cout << "Input device: " << input.device() << std::endl;
+    // std::cout.flush();
+
+    // module.to(torch::kMPS);
+    // module.eval();
+
+    // forward() takes its arguments as a vector of IValues; wrap the single input tensor.
+    std::vector<torch::jit::IValue> inputs;
+    inputs.push_back(input);
+
+    auto output = module.forward(inputs).toTensor();
+
+    // std::cout << "Model output: " << output.sizes() << std::endl;
+    return output;
+}
+
+// Single forward pass: passes `input` as the module's only argument and returns the
+// forward() result converted to a tensor.
+torch::Tensor eval_model(torch::jit::Module& module, const torch::Tensor& input) {
+    // forward() expects its arguments as a vector of IValues.
+    std::vector<torch::jit::IValue> forward_args;
+    forward_args.push_back(input);
+
+    return module.forward(forward_args).toTensor();
+}
+
+torch::indexing::Slice slice() {
+    return torch::indexing::Slice{};  // shorthand for a default-constructed Slice
+}
+
+// Debug helper: prints the device of `input` and returns a clone in which index 0 along
+// dimension 1 has been zeroed; `input` itself is not modified.
+torch::Tensor test_channel(torch::Tensor& input) {
+    std::cout << "Input device: " << input.device() << std::endl;
+    const int channel_dim = 1;
+    const int disabled_channel = 0;
+    torch::Tensor masked = input.clone();
+    masked.select(channel_dim, disabled_channel).zero_();  // zero_() writes through the selection into `masked`
+    return masked;
+}
+
+
+
+// Entry point: selects the compute device, loads the float16 mosaic TorchScript model and
+// runs the webcam loop on it. The process exit code is run_webcam_model's return value.
+int main() {
+    // Earlier smoke test, kept for reference:
+    // std::string model_path = "style-transfer/models/my_module.pt";
+    // torch::jit::Module module = load_model(model_path);
+    // torch::Tensor input = torch::randn({10});
+    // torch::Tensor output = run_model(module, input);
+
+    // Use MPS when it is available, otherwise fall back to the CPU.
+    const bool mps_available = torch::mps::is_available();
+    default_device_st = torch::Device(mps_available ? torch::kMPS : torch::kCPU);
+    if (mps_available) {
+        std::cout << "MPS is available and set as the default device." << std::endl;
+    } else {
+        std::cout << "MPS is not available. Using CPU instead. " << std::endl;
+    }
+    // Register the choice with cvtool; load_model() and run_webcam_model() read it from there.
+    cvtool::set_default_device(default_device_st);
+
+    auto device = cvtool::get_default_device();
+
+    // Alternative weights: "style-transfer/models/mosaic_float32.pt"
+    const std::string model_path = "style-transfer/models/mosaic_float16.pt";
+    torch::jit::Module module = load_model(model_path);
+/*
+    // module.to(torch::kFloat16);
+    torch::Tensor input = torch::randn({1, 3, 1080, 1920}, device);
+    std::cout << "Input tensor: " << input.sizes() << std::endl;
+    std::cout << "Input tensor dtype: " << input.dtype() << std::endl;
+    std::cout << "Input tensor device: " << input.device() << std::endl;
+    torch::Tensor output = run_model(module, input);
+
+    // Print the output tensor
+    std::cout << "Output tensor: " << output.sizes() << std::endl;
+*/
+    // Camera 0, max_fps 60, live capture (is_video_loop = false, so no video path is needed).
+    const int camera_index = 0;
+    const int max_fps = 60;
+    return run_webcam_model(module, camera_index, max_fps, /*is_video_loop=*/false, "");
+}
+
+
+// Runs `module` on every frame read from a camera (or a looping video file) and shows the
+// result in an OpenCV window named "webcam" until ESC is pressed or a frame cannot be read.
+//
+// Parameters:
+//   module        - TorchScript module; switched to eval mode and moved to cvtool's default device.
+//   cam_index     - camera index passed to open_camera() when is_video_loop is false.
+//   max_fps       - upper bound on the loop rate; a value <= 0 disables the limiter.
+//   is_video_loop - when true, frames are read from vid_path and the file is reopened at its end.
+//   vid_path      - path passed to open_camera() when is_video_loop is true.
+//
+// Side effects: prints an FPS status line and four "Elapsed time (n)" measurements per frame
+// to stdout.
+//
+// Returns 0 when the loop ends.
+int run_webcam_model(torch::jit::Module& module, int cam_index, int max_fps, bool is_video_loop, std::string vid_path = "") {
+
+    torch::Device device = cvtool::get_default_device();
+
+    module.eval();
+    module.to(device);
+
+    bool video_loop = false;
+    cv::VideoCapture cap;
+    if (is_video_loop) {
+        cap = open_camera(vid_path);
+        video_loop = true;
+    } else {
+        cap = open_camera(cam_index);
+    }
+
+    // The resolution is still queried here, but none of the code below uses the values.
+    [[maybe_unused]] const auto camera_resolution = get_camera_resolution(cap);
+
+    cv::Mat frame_bgr;
+    cv::Mat output_bgr;
+
+    torch::NoGradGuard no_grad;                 // inference only
+
+    // Frame pacing measures intervals, so it uses the monotonic clock instead of wall-clock time.
+    using pacing_clock = std::chrono::steady_clock;
+    pacing_clock::time_point last_update = pacing_clock::now();
+
+    size_t frame_count = 0;
+
+    while (true) {
+        // std::cout << "\r[INFO] Processing frame... " << frame_count + 1 << std::flush;
+
+        if (!cap.read(frame_bgr) || frame_bgr.empty()) {
+            if (video_loop && frame_count > 0) {
+                // The video file ran out: reopen it and restart the pacing state.
+                cap = open_camera(vid_path);
+                frame_count = 0;
+                last_update = pacing_clock::now();
+                cap.set(cv::CAP_PROP_POS_FRAMES, 0);
+                std::cout << "[INFO] Replaying video..." << std::endl;
+                continue;
+            }
+            std::cerr << "[WARN] Empty frame, exiting" << std::endl;
+            break;
+        }
+
+        ++frame_count;
+        const pacing_clock::time_point now = pacing_clock::now();
+        const double delta_time = std::chrono::duration<double>(now - last_update).count();
+        // On the first frame after (re)start delta_time is tiny, so that FPS reading is not meaningful.
+        auto fps = 1.0 / delta_time;
+        std::cout << "\r[INFO] FPS: " << fps << " fps" << std::flush;
+
+        // Frame limiter: sleep for what is left of this frame's time budget. It is skipped when
+        // max_fps is not positive (1.0 / max_fps would be infinite or negative) and when the
+        // budget is already used up, so sleep_for never receives a non-finite or negative value.
+        if (max_fps > 0) {
+            const double sleep_time = (1.0 / static_cast<double>(max_fps)) - delta_time;
+            if (sleep_time > 0.0) {
+                std::this_thread::sleep_for(std::chrono::duration<double>(sleep_time));
+            }
+        }
+
+        // `skip` is fixed to true, so the instrumented branch below always runs and the
+        // else branch is currently never taken.
+        bool skip = true;
+        if (skip) {
+
+            // Timed step 1: BGR -> RGB conversion followed by to_tensor().
+            auto start = std::chrono::high_resolution_clock::now();
+
+            cv::Mat frame_rgb;
+            cv::cvtColor(frame_bgr, frame_rgb, cv::COLOR_BGR2RGB);
+
+            auto input_tensor = to_tensor(frame_rgb,device);
+
+            auto end = std::chrono::high_resolution_clock::now();
+            std::chrono::duration<double> elapsed = end - start;
+            std::cout << "Elapsed time (1): " << elapsed.count() * 1000.0 << " ms\n";
+
+
+            // Timed step 2: divide the input tensor by 255 in place.
+            start = std::chrono::high_resolution_clock::now();
+            auto input = input_tensor.div_(255.0);
+            end = std::chrono::high_resolution_clock::now();
+            elapsed = end - start;
+            std::cout << "Elapsed time (2): " << elapsed.count() * 1000.0 << " ms\n";
+
+            // auto input = input_tensor.to(device,true).to(torch::kFloat16) / 255.0;
+
+            // Timed step 3: forward pass; the model output is divided by 255 in place.
+            start = std::chrono::high_resolution_clock::now();
+            auto model_output = run_model(module,input).div_(255.0);
+            end = std::chrono::high_resolution_clock::now();
+            elapsed = end - start;
+            std::cout << "Elapsed time (3): " << elapsed.count() * 1000.0 << " ms\n";
+
+            // Timed step 4: to_mat() call that produces the frame handed to imshow.
+            start = std::chrono::high_resolution_clock::now();
+            output_bgr = to_mat(model_output, cv::COLOR_RGB2BGR);
+            end = std::chrono::high_resolution_clock::now();
+            elapsed = end - start;
+            std::cout << "Elapsed time (4): " << elapsed.count() * 1000.0 << " ms\n";
+
+            // Alternative display paths from earlier experiments, kept for reference:
+            // // works
+            // auto processed_input = prepped_input;
+            // auto out_processed_input = processed_input.to(torch::kCPU,true);
+            // frame_rgb = to_mat(out_processed_input);
+            // cv::cvtColor(frame_rgb, output_bgr, cv::COLOR_RGB2BGR);
+
+            // works
+            // auto out_mps_tensor = mps_tensor.to(torch::kCPU,true);
+            // frame_rgb = to_mat(out_mps_tensor);
+            // cv::cvtColor(frame_rgb, output_bgr, cv::COLOR_RGB2BGR);
+
+            // // works
+            // frame_rgb = to_mat(input_tensor);
+            // cv::cvtColor(frame_rgb, output_bgr, cv::COLOR_RGB2BGR);
+
+        } else {
+            // Alternative pipeline (not reached while `skip` is true): no colour conversion or timing.
+            auto input_tensor = to_tensor(frame_bgr);
+
+            auto mps_tensor = input_tensor.to(device,true);
+
+            auto prepped_input = preprocess_input(mps_tensor);
+
+            // Forward pass
+            auto output = eval_model(module, prepped_input);
+            auto processed_output = output.to(torch::kCPU,true);
+
+            output_bgr = to_mat(processed_output);
+        }
+
+        cv::imshow("webcam", output_bgr);
+
+        // `now` was sampled before the limiter slept, so the next delta_time includes that sleep.
+        last_update = now;
+        if (cv::waitKey(1) == 27) { // ESC key
+            break;
+        }
+    }
+
+    // Release the capture and close the display window.
+    cap.release();
+    cv::destroyAllWindows();
+    return 0;
+}
\ No newline at end of file
diff --git a/demos/video/style-transfer/style_transfer_test.py b/demos/video/style-transfer/style_transfer_test.py
new file mode 100644
index 000000000..7bc5436bf
--- /dev/null
+++ b/demos/video/style-transfer/style_transfer_test.py
@@ -0,0 +1,205 @@
+import argparse
+import os
+import sys
+import time
+from pathlib import Path
+
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torchvision
+import cv2
+import utils
+
+def dir_path(path):
+    if os.path.isdir(path):
+        return path
+    else:
+        raise argparse.ArgumentTypeError(f'readable_dir:{path} is not a valid path')
+
+def torch_device(device_name):
+    if device_name == 'cpu':
+        return torch.device('cpu')
+    elif device_name == 'cuda':
+        if torch.cuda.is_available():
+            return torch.device('cuda')
+        else:
+            raise argparse.ArgumentTypeError(f'cuda is not available')
+    elif device_name == 'mps':
+        if torch.backends.mps.is_available():
+            return torch.device('mps')
+        else:
+            raise argparse.ArgumentTypeError(f'mps is not available')
+    elif device_name == None:
+        if torch.backends.mps.is_available():
+            return torch.device('mps')
+        elif torch.cuda.is_available():
+            return torch.device('cuda')
+        else:
+            return torch.device('cpu')
+    else:
+        raise argparse.ArgumentTypeError(f'unknown device name: {device_name}')
+
+parser = argparse.ArgumentParser(description='Process files in a directory.')
+parser.add_argument('--device', dest='device', type=torch_device, default=None,
+                    help='Device to use for computation (default: cpu).')
+
+parser.add_argument('--model-file', type=Path, required=True,
+                    help='Path to the model file (e.g., .pt).')
+
+parser.add_argument('--use-webcam', action='store_true',
+                    help='Use webcam for input (default: False).')
+
+parser.add_argument('--input-video-file', type=Path, help='Path to the input video file (default: webcam).', default=None)
+parser.add_argument('--output-video-file', type=Path, help='Path to the output video file (default: webcam).', default=None)
+
+parser.add_argument('--show-output', action='store_true',
+                    help='Show output video in a window (default: False).')
+
+
+args = parser.parse_args()
+
+arg_dict = vars(args)
+for arg in arg_dict:
+    print(f'args.{arg}: {arg_dict[arg]}')
+
+if args.use_webcam or args.input_video_file:
+    if args.input_video_file and args.use_webcam:
+        raise argparse.ArgumentTypeError('Cannot use both webcam and input video file at the same time.')
+    if args.input_video_file:
+        print('using input video file:', args.input_video_file)
+        args.use_webcam = False
+    else:
+        args.input_video_file = None
+        args.use_webcam = True
+        print('using webcam for input video')
+
+
+
+default_device = args.device
+if default_device is None:
+    default_device = torch.device('cpu')
+    if torch.backends.mps.is_available():
+        default_device = torch.device('mps')
+        print('using mps')
+
+    if torch.cuda.is_available():
+        default_device = torch.device('cuda')
+        print('using cuda')
+
+print('using default device:', default_device)
+
+torch.set_default_device(default_device)
+
+
+# Open the default camera
+cam = cv2.VideoCapture(str(args.input_video_file) if args.input_video_file else 0)
+if not cam.isOpened():
+    print("Error: Could not open video.")
+    sys.exit()
+
+# Get the default frame width and height
+frame_width = int(cam.get(cv2.CAP_PROP_FRAME_WIDTH))
+frame_height = int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT))
+capture_fps = int(cam.get(cv2.CAP_PROP_FPS))
+# Define the codec and create VideoWriter object
+if args.output_video_file:
+    if not args.output_video_file.exists():
+        args.output_video_file.parent.mkdir(parents=True, exist_ok=True)
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = out = cv2.VideoWriter(str(args.output_video_file), fourcc, capture_fps, (frame_width, frame_height))
+
+
+
+model = torch.jit.load(str(args.model_file))
+model = model.to(default_device)
+model.eval()
+print('Loaded model:', args.model_file)
+
+
+done_writing_to_output = False
+
+while True:
+    ret, frame_bgr = cam.read()
+    if not ret:
+        if args.use_webcam:
+            print("Error: Could not read frame from webcam.")
+        if args.input_video_file and args.show_output:
+            done_writing_to_output = True
+            cam = cv2.VideoCapture(str(args.input_video_file))
+            ret, frame_bgr = cam.read()
+            if not ret:
+                print("Error: Could not read frame from input video file.")
+                break
+            else:
+                continue
+        else:
+            break
+
+
+
+    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+
+    # 3) Ensure the array is contiguous (torch needs it) -------------------------
+    frame_rgb = np.ascontiguousarray(frame_rgb)
+
+    # 4) numpy -> torch, move channels, scale, add batch if wanted --------------
+    tensor = torch.from_numpy(frame_rgb)     # H x W x C, uint8 → int tensor
+    tensor = tensor.to(default_device, non_blocking=True)
+    
+    tensor = tensor.permute(2, 0, 1)         # C x H x W
+    tensor = tensor.to(torch.float32).div(255.0)       # float32, [0,1]
+
+    # normalize tensor to ImageNet mean and std
+    # mean = (0.485, 0.456, 0.406)  # ImageNet defaults (RGB)
+    # std  = (0.229, 0.224, 0.225)
+    # mean = torch.tensor(mean, dtype=tensor.dtype, device=tensor.device)[:, None, None]
+    # std  = torch.tensor(std,  dtype=tensor.dtype, device=tensor.device)[:, None, None]
+    # tensor.sub_(mean).div_(std)    
+
+    # 5) (Optional) add a batch dim and push to GPU ------------------------------
+    tensor = tensor.unsqueeze(0)             # 1 x C x H x W
+
+    # if ticks == 3:
+    #     tensor = tensor.to(torch.float16)
+    #     mosaic = torch.jit.load("models/mosaic_float16.pt")
+    #     mosaic.to('mps')
+    #     mosaic_output = mosaic(tensor) / 255.0
+    #     # mosaic_output = undo_normalize(mosaic_output)
+    #     print('input:',tensor.shape,tensor.dtype)
+    #     print('mosaic output:',mosaic_output.shape,mosaic_output.dtype)
+    #     torchvision.utils.save_image(tensor[0], 'input_tensor.png')
+    #     torchvision.utils.save_image(mosaic_output[0], 'mosaic_output.png')
+
+    #     sys.exit(0)
+
+
+    if args.model_file.name == 'sobel_edge_float32.pt':
+        output_tensor = model(tensor.to(torch.float16))
+    else:
+        output_tensor = model(tensor.to(torch.float16)) / 255.0
+    # print('input:',tensor.shape,tensor.dtype)
+    # print('output:',output_tensor.shape)
+
+
+    # frame_bgr_out = tensor_to_bgr(output_tensor, undo_normalise=True,mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    frame_bgr_out = utils.tensor_to_bgr(output_tensor)
+
+
+    if args.show_output or args.use_webcam:
+        cv2.imshow('Frame', frame_bgr_out)
+        if cv2.waitKey(25) & 0xFF == ord('q'):
+            break
+
+    if args.output_video_file and done_writing_to_output:
+        out.write(frame_bgr_out)
+
+
+# Release the capture and writer objects
+if args.output_video_file:
+    out.release()
+if args.use_webcam:
+    cam.release()
+if args.show_output:
+    cv2.destroyAllWindows()
diff --git a/demos/video/style-transfer/transformer_net.py b/demos/video/style-transfer/transformer_net.py
new file mode 100644
index 000000000..c0f69a9a3
--- /dev/null
+++ b/demos/video/style-transfer/transformer_net.py
@@ -0,0 +1,103 @@
+import torch
+
+
+class TransformerNet(torch.nn.Module):
+    def __init__(self):
+        super(TransformerNet, self).__init__()
+        # Initial convolution layers
+        self.conv1 = ConvLayer(3, 32, kernel_size=9, stride=1)
+        self.in1 = torch.nn.InstanceNorm2d(32, affine=True)
+        self.conv2 = ConvLayer(32, 64, kernel_size=3, stride=2)
+        self.in2 = torch.nn.InstanceNorm2d(64, affine=True)
+        self.conv3 = ConvLayer(64, 128, kernel_size=3, stride=2)
+        self.in3 = torch.nn.InstanceNorm2d(128, affine=True)
+        # Residual layers
+        self.res1 = ResidualBlock(128)
+        self.res2 = ResidualBlock(128)
+        self.res3 = ResidualBlock(128)
+        self.res4 = ResidualBlock(128)
+        self.res5 = ResidualBlock(128)
+        # Upsampling Layers
+        self.deconv1 = UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2)
+        self.in4 = torch.nn.InstanceNorm2d(64, affine=True)
+        self.deconv2 = UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2)
+        self.in5 = torch.nn.InstanceNorm2d(32, affine=True)
+        self.deconv3 = ConvLayer(32, 3, kernel_size=9, stride=1)
+        # Non-linearities
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, X):
+        y = self.relu(self.in1(self.conv1(X)))
+        y = self.relu(self.in2(self.conv2(y)))
+        y = self.relu(self.in3(self.conv3(y)))
+        y = self.res1(y)
+        y = self.res2(y)
+        y = self.res3(y)
+        y = self.res4(y)
+        y = self.res5(y)
+        y = self.relu(self.in4(self.deconv1(y)))
+        y = self.relu(self.in5(self.deconv2(y)))
+        y = self.deconv3(y)
+        return y
+
+
+class ConvLayer(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride):
+        super(ConvLayer, self).__init__()
+        reflection_padding = kernel_size // 2
+        self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
+        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
+
+    def forward(self, x):
+        out = self.reflection_pad(x)
+        out = self.conv2d(out)
+        return out
+
+
+class ResidualBlock(torch.nn.Module):
+    """ResidualBlock
+    introduced in: https://arxiv.org/abs/1512.03385
+    recommended architecture: http://torch.ch/blog/2016/02/04/resnets.html
+    """
+
+    def __init__(self, channels):
+        super(ResidualBlock, self).__init__()
+        self.conv1 = ConvLayer(channels, channels, kernel_size=3, stride=1)
+        self.in1 = torch.nn.InstanceNorm2d(channels, affine=True)
+        self.conv2 = ConvLayer(channels, channels, kernel_size=3, stride=1)
+        self.in2 = torch.nn.InstanceNorm2d(channels, affine=True)
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, x):
+        residual = x
+        out = self.relu(self.in1(self.conv1(x)))
+        out = self.in2(self.conv2(out))
+        out = out + residual
+        return out
+
+
+class UpsampleConvLayer(torch.nn.Module):
+    """UpsampleConvLayer
+    Upsamples the input and then does a convolution. This method gives better results
+    compared to ConvTranspose2d.
+    ref: http://distill.pub/2016/deconv-checkerboard/
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride, upsample):
+        super(UpsampleConvLayer, self).__init__()
+        # self.upsample = upsample
+        self.upsample = torch.nn.Upsample(scale_factor=2, mode='nearest')
+        reflection_padding = kernel_size // 2
+        self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
+        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
+
+    def forward(self, x):
+        x_in = x
+        # print('upsample', self.upsample)
+        # x_in = torch.nn.functional.interpolate(x_in, mode='nearest', scale_factor=self.upsample)
+        # if self.upsample:
+        #     x_in = torch.nn.functional.interpolate(x_in, mode='nearest', scale_factor=self.upsample)
+        out = self.upsample(x_in)
+        out = self.reflection_pad(out)
+        out = self.conv2d(out)
+        return out
diff --git a/demos/video/style-transfer/utils.py b/demos/video/style-transfer/utils.py
new file mode 100644
index 000000000..bbfa07a34
--- /dev/null
+++ b/demos/video/style-transfer/utils.py
@@ -0,0 +1,140 @@
+import torch
+from PIL import Image
+import PIL
+
+import cv2
+import numpy as np
+
+def load_image(filename, size=None, scale=None):
+    img = Image.open(filename).convert('RGB')
+    if size is not None:
+        img = img.resize((size, size), PIL.Image.Resampling.LANCZOS)
+    elif scale is not None:
+        img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), PIL.Image.Resampling.LANCZOS)
+    return img
+
+
+def save_image(filename, data):
+    img = data.clone().clamp(0, 255).numpy()
+    img = img.transpose(1, 2, 0).astype("uint8")
+    img = Image.fromarray(img)
+    img.save(filename)
+
+
+def gram_matrix(y):
+    (b, ch, h, w) = y.size()
+    features = y.view(b, ch, w * h)
+    features_t = features.transpose(1, 2)
+    gram = features.bmm(features_t) / (ch * h * w)
+    return gram
+
+
+def normalize_batch(batch):
+    # normalize using imagenet mean and std
+    mean = batch.new_tensor([0.485, 0.456, 0.406]).view(-1, 1, 1)
+    std = batch.new_tensor([0.229, 0.224, 0.225]).view(-1, 1, 1)
+    batch = batch.div_(255.0)
+    return (batch - mean) / std
+
+
+def bgr_to_tensor(
+    frame_bgr: np.ndarray,
+    *,
+    normalize: bool = False,
+    mean: tuple[float, float, float] | None = None,
+    std: tuple[float, float, float] | None = None,
+    add_batch: bool = False,
+    device: torch.device | str | None = None,
+) -> torch.Tensor:
+    """
+    Convert an OpenCV BGR frame (H x W x 3, uint8) to a PyTorch tensor
+    (C x H x W, float32, RGB).  Optionally normalise with mean / std.
+
+    Parameters
+    ----------
+    frame_bgr : np.ndarray
+        Raw image from cv2 (BGR, uint8, H x W x 3).
+    normalize : bool, default False
+        If True, apply  (tensor - mean) / std  after scaling to [0,1].
+    mean, std : tuple of 3 floats, optional
+        Normalisation stats in **RGB** order.  If `normalize` is True and
+        these are omitted, ImageNet values are used.
+    add_batch : bool, default False
+        If True, adds a leading batch dim → (1, C, H, W).
+    device : torch.device | str | None
+        Target device (e.g. "cuda").  If None, tensor stays on CPU.
+
+    Returns
+    -------
+    torch.Tensor
+        The converted (and optionally normalised) tensor.
+    """
+    # --- 1. BGR → RGB -------------------------------------------------------
+    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+
+    # --- 2. Ensure contiguous memory for zero‑copy conversion --------------
+    frame_rgb = np.ascontiguousarray(frame_rgb)
+
+    # --- 3. numpy → torch, reorder to C,H,W --------------------------------
+    tensor = torch.from_numpy(frame_rgb).permute(2, 0, 1).float()  # C,H,W
+
+    # --- 4. Scale to [0,1] --------------------------------------------------
+    tensor = tensor.div_(255.0)
+
+    # --- 5. Optional normalisation -----------------------------------------
+    if normalize:
+        if mean is None or std is None:
+            mean = (0.485, 0.456, 0.406)  # ImageNet defaults (RGB)
+            std  = (0.229, 0.224, 0.225)
+        mean = torch.tensor(mean, dtype=tensor.dtype, device=tensor.device)[:, None, None]
+        std  = torch.tensor(std,  dtype=tensor.dtype, device=tensor.device)[:, None, None]
+        tensor.sub_(mean).div_(std)
+
+    # --- 6. Optional batch and device move ---------------------------------
+    if add_batch:
+        tensor = tensor.unsqueeze(0)      # N,C,H,W
+    if device is not None:
+        tensor = tensor.to(device, non_blocking=True)
+
+    return tensor
+
+
+def tensor_to_bgr(frame_tensor, *, undo_normalise=False, mean=None, std=None):
+    """
+    Args
+    ----
+    frame_tensor : torch.Tensor
+        (C,H,W) or (1,C,H,W)   ―  float or half   ―  RGB
+    undo_normalise : bool
+        True if you previously applied (x - mean) / std
+    mean, std : list/tuple of 3 floats
+        Same numbers you used for normalising (e.g. ImageNet)
+    Returns
+    -------
+    frame_bgr : np.ndarray   (H,W,3) uint8   BGR  contiguous
+    """
+    # 1) squeeze batch dimension if present
+    if frame_tensor.ndim == 4:
+        frame_tensor = frame_tensor[0]
+
+    # 2) move to CPU & float32 for math
+    img = frame_tensor.detach()
+
+    # 3) (optional) reverse mean/std normalisation
+    if undo_normalise:
+        if mean is None or std is None:
+            raise ValueError("Supply mean and std to undo normalisation")
+        mean = torch.tensor(mean).to(img).view(3,1,1)
+        std  = torch.tensor(std).to(img).view(3,1,1)
+        img = img * std + mean
+
+    # 4) scale back to 0‑255, clamp, uint8
+    img = (img * 255.0)
+    # img = img # .to(torch.float16)
+    img = img.clamp(0,255).byte()
+
+    # 5) channel‑last & numpy
+    img = img.permute(1,2,0).cpu().numpy()                 # H,W,C  RGB
+    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)       # → BGR
+    img = np.ascontiguousarray(img)                  # ensure OpenCV‑happy
+    return img
diff --git a/demos/video/style-transfer/vgg.py b/demos/video/style-transfer/vgg.py
new file mode 100644
index 000000000..35fd25848
--- /dev/null
+++ b/demos/video/style-transfer/vgg.py
@@ -0,0 +1,38 @@
+from collections import namedtuple
+
+import torch
+from torchvision import models
+
+
+class Vgg16(torch.nn.Module):
+    def __init__(self, requires_grad=False):
+        super(Vgg16, self).__init__()
+        vgg_pretrained_features = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).features
+        self.slice1 = torch.nn.Sequential()
+        self.slice2 = torch.nn.Sequential()
+        self.slice3 = torch.nn.Sequential()
+        self.slice4 = torch.nn.Sequential()
+        for x in range(4):
+            self.slice1.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(4, 9):
+            self.slice2.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(9, 16):
+            self.slice3.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(16, 23):
+            self.slice4.add_module(str(x), vgg_pretrained_features[x])
+        if not requires_grad:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def forward(self, X):
+        h = self.slice1(X)
+        h_relu1_2 = h
+        h = self.slice2(h)
+        h_relu2_2 = h
+        h = self.slice3(h)
+        h_relu3_3 = h
+        h = self.slice4(h)
+        h_relu4_3 = h
+        vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3'])
+        out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3)
+        return out
diff --git a/demos/video/style-transfer/videos/candy_deer.mp4 b/demos/video/style-transfer/videos/candy_deer.mp4
new file mode 100644
index 000000000..b10efa9aa
Binary files /dev/null and b/demos/video/style-transfer/videos/candy_deer.mp4 differ
diff --git a/demos/video/style-transfer/videos/deer.mp4 b/demos/video/style-transfer/videos/deer.mp4
new file mode 100644
index 000000000..54ff7126b
Binary files /dev/null and b/demos/video/style-transfer/videos/deer.mp4 differ
diff --git a/demos/video/style-transfer/videos/edge_deer.mp4 b/demos/video/style-transfer/videos/edge_deer.mp4
new file mode 100644
index 000000000..c19d5eaf5
Binary files /dev/null and b/demos/video/style-transfer/videos/edge_deer.mp4 differ
diff --git a/demos/video/style-transfer/videos/mosaic_deer.mp4 b/demos/video/style-transfer/videos/mosaic_deer.mp4
new file mode 100644
index 000000000..b54e27793
Binary files /dev/null and b/demos/video/style-transfer/videos/mosaic_deer.mp4 differ
diff --git a/demos/video/style-transfer/videos/udnie_deer.mp4 b/demos/video/style-transfer/videos/udnie_deer.mp4
new file mode 100644
index 000000000..22d6070eb
Binary files /dev/null and b/demos/video/style-transfer/videos/udnie_deer.mp4 differ
diff --git a/demos/video/webcam_infer.cpp b/demos/video/webcam-capture/webcam_infer.cpp
similarity index 100%
rename from demos/video/webcam_infer.cpp
rename to demos/video/webcam-capture/webcam_infer.cpp
diff --git a/examples/.gitignore b/examples/.gitignore
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 361a58c0e..277b468c9 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,3 +3,6 @@
 add_subdirectory(my_example)
 
 add_subdirectory(torch_model_loading)
+
+
+add_subdirectory(split_loop)
\ No newline at end of file
diff --git a/examples/pytorch-examples/fast_neural_style/.gitignore b/examples/pytorch-examples/fast_neural_style/.gitignore
new file mode 100644
index 000000000..e5694acc2
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/.gitignore
@@ -0,0 +1 @@
+saved_models/*
diff --git a/examples/pytorch-examples/fast_neural_style/README.md b/examples/pytorch-examples/fast_neural_style/README.md
new file mode 100644
index 000000000..9b5834ed7
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/README.md
@@ -0,0 +1,66 @@
+# fast-neural-style :city_sunrise: :rocket:
+
+This repository contains a pytorch implementation of an algorithm for artistic style transfer. The algorithm can be used to mix the content of an image with the style of another image. For example, here is a photograph of a door arch rendered in the style of a stained glass painting.
+
+The model uses the method described in [Perceptual Losses for Real-Time Style Transfer and Super-Resolution](https://arxiv.org/abs/1603.08155) along with [Instance Normalization](https://arxiv.org/pdf/1607.08022.pdf). The saved-models for examples shown in the README can be downloaded from [here](https://www.dropbox.com/s/lrvwfehqdcxoza8/saved_models.zip?dl=0).
+
+<p align="center">
+    <img src="images/style-images/mosaic.jpg" height="200px">
+    <img src="images/content-images/amber.jpg" height="200px">
+    <img src="images/output-images/amber-mosaic.jpg" height="440px">
+</p>
+
+## Requirements
+
+The program is written in Python and uses [pytorch](http://pytorch.org/) and [scipy](https://www.scipy.org). A GPU is not necessary, but it can provide a significant speed-up, especially when training a new model. Regular-sized images can be styled on a laptop or desktop using saved models.
+
+## Usage
+
+Stylize image
+
+```
+python neural_style/neural_style.py eval --content-image </path/to/content/image> --model </path/to/saved/model> --output-image </path/to/output/image> --accel
+```
+
+- `--content-image`: path to content image you want to stylize.
+- `--model`: saved model to be used for stylizing the image (eg: `mosaic.pth`)
+- `--output-image`: path for saving the output image.
+- `--content-scale`: factor for scaling down the content image if memory is an issue (eg: value of 2 will halve the height and width of content-image)
+- `--accel`: use accelerator
+
+Train model
+
+```bash
+python neural_style/neural_style.py train --dataset </path/to/train-dataset> --style-image </path/to/style/image> --save-model-dir </path/to/save-model/folder> --epochs 2 --accel
+```
+
+There are several command line arguments, the important ones are listed below
+
+- `--dataset`: path to training dataset, the path should point to a folder containing another folder with all the training images. I used COCO 2014 Training images dataset [80K/13GB] [(download)](https://cocodataset.org/#download).
+- `--style-image`: path to style-image.
+- `--save-model-dir`: path to folder where trained model will be saved.
+- `--accel`: use accelerator.
+
+If the `--accel` argument is given, the script looks for an available hardware acceleration device and attempts to use it. This example is known to work on CUDA, MPS and XPU devices; note that `neural_style/neural_style.py` as included here only probes for CUDA and MPS.
+
+Refer to `neural_style/neural_style.py` for other command line arguments. For training new models you might have to tune the values of `--content-weight` and `--style-weight`. The mosaic style model shown above was trained with `--content-weight 1e5` and `--style-weight 1e10`. The remaining 3 models were also trained with similar order of weight parameters with slight variation in the `--style-weight` (`5e10` or `1e11`).
+
+## Models
+
+Models for the examples shown below can be downloaded from [here](https://www.dropbox.com/s/lrvwfehqdcxoza8/saved_models.zip?dl=0) or by running the script `download_saved_models.py`.
+
+<div align='center'>
+  <img src='images/content-images/amber.jpg' height="174px">		
+</div>
+
+<div align='center'>
+  <img src='images/style-images/mosaic.jpg' height="174px">
+  <img src='images/output-images/amber-mosaic.jpg' height="174px">
+  <img src='images/output-images/amber-candy.jpg' height="174px">
+  <img src='images/style-images/candy.jpg' height="174px">
+  <br>
+  <img src='images/style-images/rain-princess-cropped.jpg' height="174px">
+  <img src='images/output-images/amber-rain-princess.jpg' height="174px">
+  <img src='images/output-images/amber-udnie.jpg' height="174px">
+  <img src='images/style-images/udnie.jpg' height="174px">
+</div>
diff --git a/examples/pytorch-examples/fast_neural_style/accel.ipynb b/examples/pytorch-examples/fast_neural_style/accel.ipynb
new file mode 100644
index 000000000..8d6273572
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/accel.ipynb
@@ -0,0 +1,64 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "63f6c39d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d4a5bb07",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.backends.mps.is_available()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc478ec7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/pytorch-examples/fast_neural_style/download_saved_models.py b/examples/pytorch-examples/fast_neural_style/download_saved_models.py
new file mode 100644
index 000000000..691c2c0a3
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/download_saved_models.py
@@ -0,0 +1,28 @@
+import os
+import zipfile
+
+# Resolve a download helper under the single name _download_url_to_file,
+# trying these locations in order:
+#   1. torch.utils.model_zoo._download_url_to_file
+#      (per the original note, PyTorch 1.1 moved it from there to torch.hub,
+#      and PyTorch 1.0 had another variant taking 2 arguments),
+#   2. torch.hub.download_url_to_file,
+#   3. torch.hub._download_url_to_file.
+# TODO: Once support for PyTorch 1.0 or older is removed,
+#       the torch.utils.model_zoo attempt should be removed as well.
+#       Ref. PyTorch #18758
+#         https://github.com/pytorch/pytorch/pull/18758/commits
+try:
+    from torch.utils.model_zoo import _download_url_to_file
+except ImportError:
+    try:
+        from torch.hub import download_url_to_file as _download_url_to_file
+    except ImportError:
+        from torch.hub import _download_url_to_file
+
+
+def unzip(source_filename, dest_dir):
+    with zipfile.ZipFile(source_filename) as zf:
+        zf.extractall(path=dest_dir)
+
+
+if __name__ == '__main__':
+    # Download the archive into the current working directory, then unpack it there.
+    # NOTE(review): the trailing positional arguments (None, True) are presumably
+    # hash_prefix and progress, as in torch.hub.download_url_to_file — confirm for
+    # the older helpers the import fallback may have picked.
+    _download_url_to_file('https://www.dropbox.com/s/lrvwfehqdcxoza8/saved_models.zip?dl=1', 'saved_models.zip', None, True)
+    unzip('saved_models.zip', '.')
diff --git a/examples/pytorch-examples/fast_neural_style/iain_hike-mosaic.jpg b/examples/pytorch-examples/fast_neural_style/iain_hike-mosaic.jpg
new file mode 100644
index 000000000..438f5c26f
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/iain_hike-mosaic.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/iain_hike.jpg b/examples/pytorch-examples/fast_neural_style/iain_hike.jpg
new file mode 100644
index 000000000..c174227e0
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/iain_hike.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/iain_river-mosaic.jpg b/examples/pytorch-examples/fast_neural_style/iain_river-mosaic.jpg
new file mode 100644
index 000000000..d0c849105
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/iain_river-mosaic.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/iain_river.jpeg b/examples/pytorch-examples/fast_neural_style/iain_river.jpeg
new file mode 100644
index 000000000..8efcf6294
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/iain_river.jpeg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/content-images/amber.jpg b/examples/pytorch-examples/fast_neural_style/images/content-images/amber.jpg
new file mode 100644
index 000000000..22f6390f4
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/content-images/amber.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/output-images/amber-candy.jpg b/examples/pytorch-examples/fast_neural_style/images/output-images/amber-candy.jpg
new file mode 100644
index 000000000..f585fdaae
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/output-images/amber-candy.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/output-images/amber-mosaic.jpg b/examples/pytorch-examples/fast_neural_style/images/output-images/amber-mosaic.jpg
new file mode 100644
index 000000000..5af32e759
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/output-images/amber-mosaic.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/output-images/amber-rain-princess.jpg b/examples/pytorch-examples/fast_neural_style/images/output-images/amber-rain-princess.jpg
new file mode 100644
index 000000000..4f9efeb20
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/output-images/amber-rain-princess.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/output-images/amber-udnie.jpg b/examples/pytorch-examples/fast_neural_style/images/output-images/amber-udnie.jpg
new file mode 100644
index 000000000..4e7261603
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/output-images/amber-udnie.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/style-images/candy.jpg b/examples/pytorch-examples/fast_neural_style/images/style-images/candy.jpg
new file mode 100644
index 000000000..f40e5a33e
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/style-images/candy.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/style-images/mosaic.jpg b/examples/pytorch-examples/fast_neural_style/images/style-images/mosaic.jpg
new file mode 100644
index 000000000..63aa06fe4
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/style-images/mosaic.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/style-images/rain-princess-cropped.jpg b/examples/pytorch-examples/fast_neural_style/images/style-images/rain-princess-cropped.jpg
new file mode 100644
index 000000000..00a83ea48
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/style-images/rain-princess-cropped.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/style-images/rain-princess.jpg b/examples/pytorch-examples/fast_neural_style/images/style-images/rain-princess.jpg
new file mode 100644
index 000000000..520f6a227
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/style-images/rain-princess.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/images/style-images/udnie.jpg b/examples/pytorch-examples/fast_neural_style/images/style-images/udnie.jpg
new file mode 100644
index 000000000..3dbb29cf8
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/images/style-images/udnie.jpg differ
diff --git a/examples/pytorch-examples/fast_neural_style/models/mosaic_float16.pt b/examples/pytorch-examples/fast_neural_style/models/mosaic_float16.pt
new file mode 100644
index 000000000..9718339de
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/models/mosaic_float16.pt differ
diff --git a/examples/pytorch-examples/fast_neural_style/models/mosaic_float32.pt b/examples/pytorch-examples/fast_neural_style/models/mosaic_float32.pt
new file mode 100644
index 000000000..e03440a7d
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/models/mosaic_float32.pt differ
diff --git a/examples/pytorch-examples/fast_neural_style/neural_style/__init__.py b/examples/pytorch-examples/fast_neural_style/neural_style/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/pytorch-examples/fast_neural_style/neural_style/neural_style.py b/examples/pytorch-examples/fast_neural_style/neural_style/neural_style.py
new file mode 100644
index 000000000..9695d088f
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/neural_style/neural_style.py
@@ -0,0 +1,293 @@
+import argparse
+import os
+import sys
+import time
+import re
+
+import numpy as np
+import torch
+from torch.optim import Adam
+from torch.utils.data import DataLoader
+from torchvision import datasets
+from torchvision import transforms
+import torch.onnx
+
+import utils
+from transformer_net import TransformerNet
+from vgg import Vgg16
+
+
+def available_accelerator():
+    """
+    Check if accelerator is available.
+    """
+    return torch.cuda.is_available() or torch.backends.mps.is_available()
+
+def current_accelerator(args):
+    """
+    Get the current accelerator.
+    """
+    if args.accel and available_accelerator():
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            raise RuntimeError("No accelerator available")
+    else:
+        return torch.device("cpu")
+
+def check_paths(args):
+    try:
+        if not os.path.exists(args.save_model_dir):
+            os.makedirs(args.save_model_dir)
+        if args.checkpoint_model_dir is not None and not (os.path.exists(args.checkpoint_model_dir)):
+            os.makedirs(args.checkpoint_model_dir)
+    except OSError as e:
+        print(e)
+        sys.exit(1)
+
+
+def train(args):
+    """
+    Train a TransformerNet on the image folder ``args.dataset`` so that its
+    output matches the style of ``args.style_image``.
+
+    The loss is a weighted sum of a content term (MSE between the ``relu2_2``
+    VGG features of output and input) and a style term (MSE between Gram
+    matrices of the VGG features of the output and of the style image).
+    Checkpoints are written every ``args.checkpoint_interval`` batches when
+    ``args.checkpoint_model_dir`` is set, and the final state dict is saved
+    into ``args.save_model_dir``.
+    """
+    if args.accel:
+        device = current_accelerator(args)
+    else:
+        device = torch.device("cpu")
+
+    print(f"Using device: {device}")
+
+    # Seed both numpy and torch with the same value.
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+    # Training images: resize, centre-crop to image_size, then multiply the
+    # ToTensor output by 255.
+    transform = transforms.Compose([
+        transforms.Resize(args.image_size),
+        transforms.CenterCrop(args.image_size),
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    train_dataset = datasets.ImageFolder(args.dataset, transform)
+    train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
+
+    transformer = TransformerNet().to(device)
+    optimizer = Adam(transformer.parameters(), args.lr)
+    mse_loss = torch.nn.MSELoss()
+
+    # VGG is only a fixed feature extractor here (requires_grad=False).
+    vgg = Vgg16(requires_grad=False).to(device)
+    style_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    style = utils.load_image(args.style_image, size=args.style_size)
+    style = style_transform(style)
+    # Repeat the single style image so it forms a batch of batch_size copies.
+    style = style.repeat(args.batch_size, 1, 1, 1).to(device)
+
+    # Style targets are computed once, before the training loop.
+    features_style = vgg(utils.normalize_batch(style))
+    gram_style = [utils.gram_matrix(y) for y in features_style]
+
+    for e in range(args.epochs):
+        transformer.train()
+        # Running sums of the per-batch losses, reset every epoch.
+        agg_content_loss = 0.
+        agg_style_loss = 0.
+        count = 0
+        for batch_id, (x, _) in enumerate(train_loader):
+            n_batch = len(x)
+            count += n_batch
+            optimizer.zero_grad()
+
+            x = x.to(device)
+            y = transformer(x)
+
+            y = utils.normalize_batch(y)
+            x = utils.normalize_batch(x)
+
+            features_y = vgg(y)
+            features_x = vgg(x)
+
+            content_loss = args.content_weight * mse_loss(features_y.relu2_2, features_x.relu2_2)
+
+            style_loss = 0.
+            for ft_y, gm_s in zip(features_y, gram_style):
+                gm_y = utils.gram_matrix(ft_y)
+                # Slice the style Gram matrices to this batch's size (n_batch
+                # can be smaller than batch_size, e.g. for a final short batch).
+                style_loss += mse_loss(gm_y, gm_s[:n_batch, :, :])
+            style_loss *= args.style_weight
+
+            total_loss = content_loss + style_loss
+            total_loss.backward()
+            optimizer.step()
+
+            agg_content_loss += content_loss.item()
+            agg_style_loss += style_loss.item()
+
+            # log_interval is compared against the batch counter; the values
+            # printed are averages per batch seen so far in this epoch.
+            if (batch_id + 1) % args.log_interval == 0:
+                mesg = "{}\tEpoch {}:\t[{}/{}]\tcontent: {:.6f}\tstyle: {:.6f}\ttotal: {:.6f}".format(
+                    time.ctime(), e + 1, count, len(train_dataset),
+                                  agg_content_loss / (batch_id + 1),
+                                  agg_style_loss / (batch_id + 1),
+                                  (agg_content_loss + agg_style_loss) / (batch_id + 1)
+                )
+                print(mesg)
+
+            if args.checkpoint_model_dir is not None and (batch_id + 1) % args.checkpoint_interval == 0:
+                # Save from the CPU in eval mode, then move back to the
+                # training device and re-enable train mode.
+                transformer.eval().cpu()
+                ckpt_model_filename = "ckpt_epoch_" + str(e) + "_batch_id_" + str(batch_id + 1) + ".pth"
+                ckpt_model_path = os.path.join(args.checkpoint_model_dir, ckpt_model_filename)
+                torch.save(transformer.state_dict(), ckpt_model_path)
+                transformer.to(device).train()
+
+    # save model
+    transformer.eval().cpu()
+    # File name encodes epoch count, timestamp (spaces replaced by '_') and both loss weights.
+    save_model_filename = "epoch_" + str(args.epochs) + "_" + str(time.ctime()).replace(' ', '_') + "_" + str(
+        args.content_weight) + "_" + str(args.style_weight) + ".model"
+    save_model_path = os.path.join(args.save_model_dir, save_model_filename)
+    torch.save(transformer.state_dict(), save_model_path)
+
+    print("\nDone, trained model saved at", save_model_path)
+
+
+def stylize(args):
+    if args.accel:
+        device = current_accelerator(args)
+    else:
+        device = torch.device("cpu")
+    
+    print(f"Using device: {device}")
+
+    content_image = utils.load_image(args.content_image, scale=args.content_scale)
+    content_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Lambda(lambda x: x.mul(255))
+    ])
+    content_image = content_transform(content_image)
+    content_image = content_image.unsqueeze(0).to(device)
+
+    if args.model.endswith(".onnx"):
+        output = stylize_onnx(content_image, args)
+    else:
+        with torch.no_grad():
+            style_model = TransformerNet()
+            state_dict = torch.load(args.model)
+            # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
+            for k in list(state_dict.keys()):
+                if re.search(r'in\d+\.running_(mean|var)$', k):
+                    del state_dict[k]
+            style_model.load_state_dict(state_dict)
+            style_model.to(device)
+            style_model.eval()
+            if args.export_onnx:
+                assert args.export_onnx.endswith(".onnx"), "Export model file should end with .onnx"
+                output = torch.onnx._export(
+                    style_model, content_image, args.export_onnx, opset_version=11,
+                ).cpu()            
+            else:
+                print('Content image shape:', content_image.shape)
+                output = style_model(content_image).cpu()
+
+            utils.save_image(args.output_image, output[0])
+            from pathlib import Path
+            model_name = Path(args.model).stem
+
+            sm = torch.jit.script(style_model.to(torch.float32))
+            sm.save(f"models/{model_name}_float32.pt")
+
+            sm = torch.jit.script(style_model.to(torch.float16))
+            sm.save(f"models/{model_name}_float16.pt")
+
+    utils.save_image(args.output_image, output[0])
+
+
+def stylize_onnx(content_image, args):
+    """
+    Read ONNX model and run it using onnxruntime
+    """
+
+    assert not args.export_onnx
+
+    import onnxruntime
+
+    ort_session = onnxruntime.InferenceSession(args.model)
+
+    def to_numpy(tensor):
+        return (
+            tensor.detach().cpu().numpy()
+            if tensor.requires_grad
+            else tensor.cpu().numpy()
+        )
+
+    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(content_image)}
+    ort_outs = ort_session.run(None, ort_inputs)
+    img_out_y = ort_outs[0]
+
+    return torch.from_numpy(img_out_y)
+
+
+def main():
+    main_arg_parser = argparse.ArgumentParser(description="parser for fast-neural-style")
+    subparsers = main_arg_parser.add_subparsers(title="subcommands", dest="subcommand")
+
+    train_arg_parser = subparsers.add_parser("train", help="parser for training arguments")
+    train_arg_parser.add_argument("--epochs", type=int, default=2,
+                                  help="number of training epochs, default is 2")
+    train_arg_parser.add_argument("--batch-size", type=int, default=4,
+                                  help="batch size for training, default is 4")
+    train_arg_parser.add_argument("--dataset", type=str, required=True,
+                                  help="path to training dataset, the path should point to a folder "
+                                       "containing another folder with all the training images")
+    train_arg_parser.add_argument("--style-image", type=str, default="images/style-images/mosaic.jpg",
+                                  help="path to style-image")
+    train_arg_parser.add_argument("--save-model-dir", type=str, required=True,
+                                  help="path to folder where trained model will be saved.")
+    train_arg_parser.add_argument("--checkpoint-model-dir", type=str, default=None,
+                                  help="path to folder where checkpoints of trained models will be saved")
+    train_arg_parser.add_argument("--image-size", type=int, default=256,
+                                  help="size of training images, default is 256 X 256")
+    train_arg_parser.add_argument("--style-size", type=int, default=None,
+                                  help="size of style-image, default is the original size of style image")
+    train_arg_parser.add_argument('--accel', action='store_true',
+                                  help='use accelerator')
+    train_arg_parser.add_argument("--seed", type=int, default=42,
+                                  help="random seed for training")
+    train_arg_parser.add_argument("--content-weight", type=float, default=1e5,
+                                  help="weight for content-loss, default is 1e5")
+    train_arg_parser.add_argument("--style-weight", type=float, default=1e10,
+                                  help="weight for style-loss, default is 1e10")
+    train_arg_parser.add_argument("--lr", type=float, default=1e-3,
+                                  help="learning rate, default is 1e-3")
+    train_arg_parser.add_argument("--log-interval", type=int, default=500,
+                                  help="number of images after which the training loss is logged, default is 500")
+    train_arg_parser.add_argument("--checkpoint-interval", type=int, default=2000,
+                                  help="number of batches after which a checkpoint of the trained model will be created")
+
+    eval_arg_parser = subparsers.add_parser("eval", help="parser for evaluation/stylizing arguments")
+    eval_arg_parser.add_argument("--content-image", type=str, required=True,
+                                 help="path to content image you want to stylize")
+    eval_arg_parser.add_argument("--content-scale", type=float, default=None,
+                                 help="factor for scaling down the content image")
+    eval_arg_parser.add_argument("--output-image", type=str, required=True,
+                                 help="path for saving the output image")
+    eval_arg_parser.add_argument("--model", type=str, required=True,
+                                 help="saved model to be used for stylizing the image. If file ends in .pth - PyTorch path is used, if in .onnx - Caffe2 path")
+    eval_arg_parser.add_argument("--export_onnx", type=str,
+                                 help="export ONNX model to a given file")
+    eval_arg_parser.add_argument('--accel', action='store_true',
+                                 help='use accelerator')
+
+    args = main_arg_parser.parse_args()
+
+    if args.subcommand is None:
+        print("ERROR: specify either train or eval")
+        sys.exit(1)
+    if args.accel and not available_accelerator():
+        print("ERROR: accelerator is not available, try running on CPU")
+        sys.exit(1)
+    if not args.accel and available_accelerator():
+        print("WARNING: accelerator is available, run with --accel to enable it")
+
+    if args.subcommand == "train":
+        check_paths(args)
+        train(args)
+    else:
+        stylize(args)
+
+
+# Run the command-line interface only when the file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/examples/pytorch-examples/fast_neural_style/neural_style/transformer_net.py b/examples/pytorch-examples/fast_neural_style/neural_style/transformer_net.py
new file mode 100644
index 000000000..c0f69a9a3
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/neural_style/transformer_net.py
@@ -0,0 +1,103 @@
+import torch
+
+
+class TransformerNet(torch.nn.Module):
+    def __init__(self):
+        super(TransformerNet, self).__init__()
+        # Initial convolution layers
+        self.conv1 = ConvLayer(3, 32, kernel_size=9, stride=1)
+        self.in1 = torch.nn.InstanceNorm2d(32, affine=True)
+        self.conv2 = ConvLayer(32, 64, kernel_size=3, stride=2)
+        self.in2 = torch.nn.InstanceNorm2d(64, affine=True)
+        self.conv3 = ConvLayer(64, 128, kernel_size=3, stride=2)
+        self.in3 = torch.nn.InstanceNorm2d(128, affine=True)
+        # Residual layers
+        self.res1 = ResidualBlock(128)
+        self.res2 = ResidualBlock(128)
+        self.res3 = ResidualBlock(128)
+        self.res4 = ResidualBlock(128)
+        self.res5 = ResidualBlock(128)
+        # Upsampling Layers
+        self.deconv1 = UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2)
+        self.in4 = torch.nn.InstanceNorm2d(64, affine=True)
+        self.deconv2 = UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2)
+        self.in5 = torch.nn.InstanceNorm2d(32, affine=True)
+        self.deconv3 = ConvLayer(32, 3, kernel_size=9, stride=1)
+        # Non-linearities
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, X):
+        y = self.relu(self.in1(self.conv1(X)))
+        y = self.relu(self.in2(self.conv2(y)))
+        y = self.relu(self.in3(self.conv3(y)))
+        y = self.res1(y)
+        y = self.res2(y)
+        y = self.res3(y)
+        y = self.res4(y)
+        y = self.res5(y)
+        y = self.relu(self.in4(self.deconv1(y)))
+        y = self.relu(self.in5(self.deconv2(y)))
+        y = self.deconv3(y)
+        return y
+
+
+class ConvLayer(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride):
+        super(ConvLayer, self).__init__()
+        reflection_padding = kernel_size // 2
+        self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
+        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
+
+    def forward(self, x):
+        out = self.reflection_pad(x)
+        out = self.conv2d(out)
+        return out
+
+
+class ResidualBlock(torch.nn.Module):
+    """ResidualBlock
+    introduced in: https://arxiv.org/abs/1512.03385
+    recommended architecture: http://torch.ch/blog/2016/02/04/resnets.html
+    """
+
+    def __init__(self, channels):
+        super(ResidualBlock, self).__init__()
+        self.conv1 = ConvLayer(channels, channels, kernel_size=3, stride=1)
+        self.in1 = torch.nn.InstanceNorm2d(channels, affine=True)
+        self.conv2 = ConvLayer(channels, channels, kernel_size=3, stride=1)
+        self.in2 = torch.nn.InstanceNorm2d(channels, affine=True)
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, x):
+        residual = x
+        out = self.relu(self.in1(self.conv1(x)))
+        out = self.in2(self.conv2(out))
+        out = out + residual
+        return out
+
+
+class UpsampleConvLayer(torch.nn.Module):
+    """UpsampleConvLayer
+    Upsamples the input and then does a convolution. This method gives better results
+    compared to ConvTranspose2d.
+    ref: http://distill.pub/2016/deconv-checkerboard/
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride, upsample):
+        super(UpsampleConvLayer, self).__init__()
+        # self.upsample = upsample
+        self.upsample = torch.nn.Upsample(scale_factor=2, mode='nearest')
+        reflection_padding = kernel_size // 2
+        self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
+        self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
+
+    def forward(self, x):
+        x_in = x
+        # print('upsample', self.upsample)
+        # x_in = torch.nn.functional.interpolate(x_in, mode='nearest', scale_factor=self.upsample)
+        # if self.upsample:
+        #     x_in = torch.nn.functional.interpolate(x_in, mode='nearest', scale_factor=self.upsample)
+        out = self.upsample(x_in)
+        out = self.reflection_pad(out)
+        out = self.conv2d(out)
+        return out
diff --git a/examples/pytorch-examples/fast_neural_style/neural_style/utils.py b/examples/pytorch-examples/fast_neural_style/neural_style/utils.py
new file mode 100644
index 000000000..801ed82d0
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/neural_style/utils.py
@@ -0,0 +1,34 @@
+import torch
+from PIL import Image
+import PIL
+
+def load_image(filename, size=None, scale=None):
+    img = Image.open(filename).convert('RGB')
+    if size is not None:
+        img = img.resize((size, size), PIL.Image.Resampling.LANCZOS)
+    elif scale is not None:
+        img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), PIL.Image.Resampling.LANCZOS)
+    return img
+
+
+def save_image(filename, data):
+    img = data.clone().clamp(0, 255).numpy()
+    img = img.transpose(1, 2, 0).astype("uint8")
+    img = Image.fromarray(img)
+    img.save(filename)
+
+
+def gram_matrix(y):
+    (b, ch, h, w) = y.size()
+    features = y.view(b, ch, w * h)
+    features_t = features.transpose(1, 2)
+    gram = features.bmm(features_t) / (ch * h * w)
+    return gram
+
+
+def normalize_batch(batch):
+    # normalize using imagenet mean and std
+    mean = batch.new_tensor([0.485, 0.456, 0.406]).view(-1, 1, 1)
+    std = batch.new_tensor([0.229, 0.224, 0.225]).view(-1, 1, 1)
+    batch = batch.div_(255.0)
+    return (batch - mean) / std
diff --git a/examples/pytorch-examples/fast_neural_style/neural_style/vgg.py b/examples/pytorch-examples/fast_neural_style/neural_style/vgg.py
new file mode 100644
index 000000000..35fd25848
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/neural_style/vgg.py
@@ -0,0 +1,38 @@
+from collections import namedtuple
+
+import torch
+from torchvision import models
+
+
+class Vgg16(torch.nn.Module):
+    def __init__(self, requires_grad=False):
+        super(Vgg16, self).__init__()
+        vgg_pretrained_features = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).features
+        self.slice1 = torch.nn.Sequential()
+        self.slice2 = torch.nn.Sequential()
+        self.slice3 = torch.nn.Sequential()
+        self.slice4 = torch.nn.Sequential()
+        for x in range(4):
+            self.slice1.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(4, 9):
+            self.slice2.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(9, 16):
+            self.slice3.add_module(str(x), vgg_pretrained_features[x])
+        for x in range(16, 23):
+            self.slice4.add_module(str(x), vgg_pretrained_features[x])
+        if not requires_grad:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def forward(self, X):
+        h = self.slice1(X)
+        h_relu1_2 = h
+        h = self.slice2(h)
+        h_relu2_2 = h
+        h = self.slice3(h)
+        h_relu3_3 = h
+        h = self.slice4(h)
+        h_relu4_3 = h
+        vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3'])
+        out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3)
+        return out
diff --git a/examples/pytorch-examples/fast_neural_style/requirements.txt b/examples/pytorch-examples/fast_neural_style/requirements.txt
new file mode 100644
index 000000000..54d4c008f
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+torch>=2.6
+torchvision
diff --git a/examples/pytorch-examples/fast_neural_style/run.md b/examples/pytorch-examples/fast_neural_style/run.md
new file mode 100644
index 000000000..5a06a3eb9
--- /dev/null
+++ b/examples/pytorch-examples/fast_neural_style/run.md
@@ -0,0 +1,6 @@
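+# Running the fast neural style example
+
+From this directory, stylize a content image with a saved model: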
+```bash
+python3 neural_style/neural_style.py eval --content-image iain_hike.jpg --model saved_models/mosaic.pth --output-image iain_hike-mosaic.jpg --content-scale 3 --accel
+```
\ No newline at end of file
diff --git a/examples/pytorch-examples/fast_neural_style/saved_models.zip b/examples/pytorch-examples/fast_neural_style/saved_models.zip
new file mode 100644
index 000000000..cdb980749
Binary files /dev/null and b/examples/pytorch-examples/fast_neural_style/saved_models.zip differ
diff --git a/examples/split_loop/CMakeLists.txt b/examples/split_loop/CMakeLists.txt
new file mode 100644
index 000000000..0cca4d929
--- /dev/null
+++ b/examples/split_loop/CMakeLists.txt
@@ -0,0 +1,86 @@
+# OpenCV is needed by the bridge sources compiled below.
+find_package(OpenCV 4 REQUIRED)
+
+# Apple frameworks; REQUIRED makes configuration fail if one cannot be found.
+find_library(ACCELERATE Accelerate REQUIRED)
+find_library(METAL Metal REQUIRED)
+find_library(FOUNDATION Foundation REQUIRED)
+
+# Compile the C++ bridge as an OBJECT library; its object files are consumed via $<TARGET_OBJECTS:bridge_cv>.
+add_library(bridge_cv OBJECT ${BRIDGE_DIR}/include/bridge.h ${BRIDGE_DIR}/lib/bridge.cpp)
+target_link_directories(bridge_cv PRIVATE ${LIBTORCH_DIR}/lib)
+# NOTE(review): these items are PRIVATE to an OBJECT library and are not visibly listed in
+# CHAI_CV_LINKER_ARGS -- confirm OpenCV and the frameworks reach SplitLoop's link line.
+target_link_libraries(
+    bridge_cv
+    PRIVATE
+    -ltorch
+    -ltorch_cpu
+    -lc10
+    -ltorch_global_deps
+    ${OpenCV_LIBS}
+    # ${TORCH_LIBRARIES}
+    ${ACCELERATE}
+    ${METAL}
+    ${FOUNDATION}
+)
+
+target_include_directories(
+    bridge_cv
+    PRIVATE
+    ${BRIDGE_DIR}/include
+    ${LIBTORCH_DIR}/include
+    ${LIBTORCH_DIR}/include/torch/csrc/api/include
+    # ${BRIDGE_DIR}/util
+)
+
+# if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+#   target_compile_options(bridge_cv PRIVATE -Ofast -flto -ffast-math)
+#   target_link_options(bridge_cv PRIVATE -flto)
+# endif()
+
+# Generator expression that expands to the object files built by bridge_cv.
+set(BRIDGE_CV_OBJECT_FILES $<TARGET_OBJECTS:bridge_cv>)
+
+# Arguments passed as link options to SplitLoop: the project's lib directory (-M), the bridge
+# header, the bridge object files, and the libtorch library directory, libraries and rpath.
+set(CHAI_CV_LINKER_ARGS
+    -M ${PROJECT_ROOT_DIR}/lib
+    ${BRIDGE_DIR}/include/bridge.h
+    ${BRIDGE_CV_OBJECT_FILES}
+    -L ${LIBTORCH_DIR}/lib
+    ${LIBTORCH_LIBS_LINKER_ARGS}
+    --ldflags "-Wl,-rpath,${LIBTORCH_DIR}/lib"
+)
+
+
+
+
+
+
+# SplitLoop example: split_loop.chpl built together with ${CHAI_LIB_FILES}; bridge_cv is built first.
+add_executable(SplitLoop 
+    ${CMAKE_CURRENT_SOURCE_DIR}/split_loop.chpl
+    ${CHAI_LIB_FILES}
+)
+add_dependencies(SplitLoop bridge_cv)
+# add_dependencies(SplitLoop ChAI)
+target_link_options(SplitLoop
+    PRIVATE
+    ${CHAI_CV_LINKER_ARGS}
+)
+# Print the resolved link inputs while CMake processes this file.
+cmake_print_variables(CHAI_CV_LINKER_ARGS)
+cmake_print_variables(OpenCV_LIBS)
+cmake_print_variables(ACCELERATE)
+cmake_print_variables(METAL)
+cmake_print_variables(FOUNDATION)
+# Place the executable in the top-level build directory.
+set_target_properties(SplitLoop PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+)
+
+# if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+#   target_compile_options(SplitLoop PRIVATE -Ofast -flto -ffast-math)
+#   target_link_options(SplitLoop PRIVATE -flto)
+# endif()
\ No newline at end of file
diff --git a/examples/split_loop/split_loop.chpl b/examples/split_loop/split_loop.chpl
new file mode 100644
index 000000000..9c633b35a
--- /dev/null
+++ b/examples/split_loop/split_loop.chpl
@@ -0,0 +1,27 @@
+use Tensor;
+use CTypes;
+
+// Demo driver: runs the bridge's filler loop in a task while sampling its counter, then calls showWebcam.
+proc main(args: [] string) {
+    writeln("Hello, world!");
+    // cobegin {
+    //     for i in 0..<100 {
+    //         begin Bridge.splitLoop(i,100);
+    //     }
+    // }
+    var n: int(64) = 0;
+    var nr = c_ptrTo(n);
+    cobegin {
+        // Writer: the C++ split_loop_filler stores each loop index through nr.
+        begin Bridge.splitLoopFiller(1000000,nr);
+
+        // Reader: prints whatever value n holds at the time of each iteration.
+        // NOTE(review): n is read here and written by the task above without synchronization -- confirm intended.
+        for i in 0..<10 {
+            writeln("Hello from ", nr.deref());
+        }
+    }
+
+    Bridge.showWebcam();
+    writeln("Done!");
+}
\ No newline at end of file
diff --git a/lib/Bridge.chpl b/lib/Bridge.chpl
index c0216c3b5..06e90e529 100644
--- a/lib/Bridge.chpl
+++ b/lib/Bridge.chpl
@@ -87,6 +87,12 @@ module Bridge {
         in a: bridge_tensor_t, 
         in b: bridge_tensor_t): bridge_tensor_t;
 
+    // C++ bridge functions declared in bridge.h and used by examples/split_loop.
+    extern "split_loop" proc splitLoop(idx: int(64), n: int(64)): void;
+    // The C++ side writes through `ret`, so the pointee must stay valid for the whole call.
+    extern "split_loop_filler" proc splitLoopFiller(n: int(64),ret: c_ptr(int(64))): void;
+    extern "show_webcam" proc showWebcam(): void;
+
     // extern "capture_webcam_bridge" proc captureWebcam(
     //     in cam_index: int(32)): bridge_tensor_t;
 
diff --git a/syntax.tentract b/syntax.tentract
new file mode 100644
index 000000000..50b0e6316
--- /dev/null
+++ b/syntax.tentract
@@ -0,0 +1,40 @@
+
+
+// Builtin constructs
+operators + * - / ;
+
+// Kronecker delta: 1 when all indices are equal, otherwise 0
+d[i,...,j] = if i = ... = j then 1 else 0;
+
+// Operator rules in index notation.
+// `_` presumably stands for an arbitrary operator -- TODO confirm.
+
+(A _ B)[i] = A[i] _ B[i];
+
+(A + B)[i,j] = A[i,j] + B[i,j];
+// NOTE(review): the rule below is named `+` but multiplies, and i is missing on the left -- confirm.
+(A + B)[j,k] = A[i,j,k] * B[i,j,k];
+
+(A + B)[:i] = A[:i] + B[:i];
+
+// Rules whose right-hand side has an index (k or m) absent on the left -- presumably summed; TODO confirm.
+(A @ B)[i,j]   = A[i,k] * B[k,j];
+(A @ B)[:,i,j] = A[:,i,k] * B[:,k,j];
+
+(A ** B)[i,j,k,l] = A[i,m,k] * B[j,l,m];
+// Transpose swaps the two indices.
+(transpose A)[i,j] = A[j,i];
+
+
+
+// mat_mul, first spelling: a signature with a return type, followed by a defining equation.
+kernel mat_mul(A: float<2>, B: float<2>) -> float<2>;
+mat_mul A B = A @ B;
+// mat_mul, second spelling: explicit in/out parameters with a statement body.
+kernel mat_mul(in A: float<2>, in B: float<2>, out C: float<2>) {
+    C = A @ B;
+}
+// NOTE(review): this declaration has both `out C` and a `->` return type -- confirm which is intended.
+kernel mat_mul_delta(in A: float<2>, in B: float<2>, out C: float<2>) -> float<2>;
+mat_mul_delta A B = ∂(A @ B) / ∂A;
+