Commit 6ce3b8b

Expand unfold impl.

- Switched to PyTorch's return shape.
- Added burn-router support.
- Exposed the unfold calculation module.
- ndarray and candle both need either upstream support or work-arounds;
  candle has a PR in flight (from me): huggingface/candle#3091
1 parent 730567e commit 6ce3b8b
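For context on the PyTorch return shape: `Tensor.unfold(dim, size, step)` in PyTorch replaces the unfolded dimension with the number of windows and appends a new dimension of length `size` as the right-most axis. A minimal sketch of what this means for callers, assuming a public `Tensor::unfold(dim, size, step)` method that mirrors the backend ops below (the surface-level method is not part of this diff):

use burn_tensor::Tensor;
use burn_tensor::backend::Backend;

// Hypothetical shape check; `Tensor::unfold` is assumed to mirror the
// backend `*_unfold` ops added in this commit.
fn unfold_shape_demo<B: Backend>(device: &B::Device) {
    let x = Tensor::<B, 3>::zeros([2, 5, 4], device);

    // Along dim 1: windows = (5 - 3) / 1 + 1 = 3, and the window
    // elements land in a new trailing axis of length `size = 3`.
    let y = x.unfold(1, 3, 1);
    assert_eq!(y.dims(), [2, 3, 4, 3]); // new: [pre, windows, post, size]
    // The previous behavior inserted the window axis at dim + 1 instead,
    // which would have produced [2, 3, 3, 4].
}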

File tree

16 files changed: +251 −52 lines
crates/burn-candle/src/ops/base.rs

Lines changed: 6 additions & 1 deletion
@@ -1,7 +1,8 @@
+use std::cmp::max;
 use std::marker::PhantomData;
 
 use burn_tensor::{Element, Shape, TensorData, TensorMetadata, backend::Backend};
-use candle_core::WithDType;
+use candle_core::{Layout, WithDType};
 use half::{bf16, f16};
 
 use crate::{
@@ -133,6 +134,10 @@ pub fn expand(tensor: CandleTensor, shape: Shape) -> CandleTensor {
     CandleTensor::new(tensor.tensor.broadcast_as(shape.dims).unwrap())
 }
 
+pub fn unfold(tensor: CandleTensor, dim: usize, size: usize, step: usize) -> CandleTensor {
+    todo!()
+}
+
 pub fn sign(tensor: CandleTensor) -> CandleTensor {
     CandleTensor::new(tensor.tensor.sign().unwrap())
 }
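Until the upstream candle change lands, one possible interim work-around (an illustrative sketch, not this commit's code) is to materialize the windows with candle's existing `narrow`, `permute`, and `stack`, at the cost of copying instead of returning a stride view:

use candle_core::{Result, Tensor};

// Copying fallback sketch: build each window with `narrow`, move the
// window axis to the back, then stack the windows at `dim`. The empty
// `windows == 0` case is ignored here for brevity.
fn unfold_copying(t: &Tensor, dim: usize, size: usize, step: usize) -> Result<Tensor> {
    let d = t.dims()[dim];
    let windows = if size > d { 0 } else { (d - size) / step + 1 };

    // Permutation that moves `dim` to the right-most position, so each
    // slice has shape [pre, post, size].
    let mut perm: Vec<usize> = (0..t.rank()).filter(|&a| a != dim).collect();
    perm.push(dim);

    let slices = (0..windows)
        .map(|w| t.narrow(dim, w * step, size)?.permute(perm.clone()))
        .collect::<Result<Vec<_>>>()?;

    // Stacking at `dim` restores the axis order: [pre, windows, post, size].
    Tensor::stack(&slices, dim)
}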

crates/burn-cubecl/src/ops/base.rs

Lines changed: 9 additions & 5 deletions
@@ -1,11 +1,11 @@
 use crate::{CubeRuntime, element::CubeElement, kernel, tensor::CubeTensor};
 use burn_common::tensor::{ReshapeAction, reshape_action};
+use burn_tensor::ops::unfold::calculate_unfold_windows;
 use burn_tensor::{
     Shape, TensorData,
     quantization::{QTensorPrimitive, QuantLevel},
 };
 use cubecl::{server::CopyDescriptor, tensor_vectorization_factor};
-use std::cmp::max;
 
 pub(crate) fn from_data<R: CubeRuntime>(data: TensorData, device: &R::Device) -> CubeTensor<R> {
     let shape: Shape = (&data.shape).into();
@@ -222,6 +222,10 @@ pub(crate) fn max_line_size_many<R: CubeRuntime>(tensors: &[&CubeTensor<R>], dim
 ///
 /// The number of windows is `max(0, (shape[dim] - size).ceil_div(step))`.
 ///
+/// The new view will have the unfolded dimension replaced by two dimensions;
+/// one in the position of the original dimension, with size equal to the number of windows,
+/// and one appended to the right-most position, with size equal to `size`.
+///
 /// # Arguments
 ///
 /// * `tensor` - The input tensor to unfold; of shape ``[pre=..., dim shape, post=...]``
@@ -231,7 +235,7 @@
 ///
 /// # Returns
 ///
-/// A tensor view with shape ``[pre=..., windows, size, post=...]``.
+/// A tensor view with the shape ``[pre=..., windows, post=..., size]``.
 pub fn unfold<R: CubeRuntime>(
     tensor: CubeTensor<R>,
     dim: usize,
@@ -241,15 +245,15 @@
     let d_shape = tensor.shape.dims[dim];
     let d_stride = tensor.strides[dim];
 
-    let windows = max(0, (d_shape - size).div_ceil(step));
+    let windows = calculate_unfold_windows(d_shape, size, step);
 
     let mut shape = tensor.shape.clone();
     shape.dims[dim] = windows;
-    shape.dims.insert(dim + 1, size);
+    shape.dims.push(size);
 
     let mut strides = tensor.strides.clone();
     strides[dim] = step * d_stride;
-    strides.insert(dim + 1, d_stride);
+    strides.push(d_stride);
 
     CubeTensor {
         shape,
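The unfold here is a zero-copy stride trick. A standalone sketch of the same metadata arithmetic with plain vectors (illustrative; the real code above operates on `CubeTensor` and delegates the window count to `calculate_unfold_windows`):

// Window `w`, element `i` of the result reads the input at offset
// `w * (step * stride[dim]) + i * stride[dim]`, so unfold is a zero-copy
// view: only shape and strides change.
fn unfold_view_meta(
    shape: &mut Vec<usize>,
    strides: &mut Vec<usize>,
    dim: usize,
    size: usize,
    step: usize,
) {
    let d_stride = strides[dim];
    // PyTorch-style window count (assumed behavior of the helper).
    let windows = if size > shape[dim] { 0 } else { (shape[dim] - size) / step + 1 };
    shape[dim] = windows;    // window count replaces the unfolded dim
    shape.push(size);        // window elements become a trailing axis
    strides[dim] *= step;    // successive windows start `step` elements apart
    strides.push(d_stride);  // within a window, the original stride applies
}

For a contiguous `[2, 5, 4]` tensor with strides `[20, 4, 1]`, unfolding dim 1 with `size = 3`, `step = 1` yields shape `[2, 3, 4, 3]` and strides `[20, 4, 1, 4]`.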

crates/burn-fusion/src/ops/boolean.rs

Lines changed: 10 additions & 10 deletions
@@ -1,22 +1,21 @@
+use crate::{
+    Fusion, FusionBackend,
+    client::FusionClient,
+    get_client,
+    stream::{OperationStreams, StreamId, execution::Operation},
+};
 use burn_ir::{
     BaseOperationIr, BinaryOpIr, BoolOperationIr, CatOpIr, ExpandOpIr, FlipOpIr, HandleContainer,
     InitOperationIr, OperationIr, PermuteOpIr, RepeatDimOpIr, SliceAssignOpIr, SliceOpIr,
     SwapDimsOpIr, TensorIr, UnaryOpIr, UnfoldOpIr,
 };
+use burn_tensor::ops::unfold::calculate_unfold_windows;
 use burn_tensor::{
     Device, Element, Shape, TensorData, TensorMetadata,
     ops::{BoolTensor, BoolTensorOps, FloatTensor, IntTensor, binary_ops_shape},
 };
-use std::cmp::max;
 use std::marker::PhantomData;
 
-use crate::{
-    Fusion, FusionBackend,
-    client::FusionClient,
-    get_client,
-    stream::{OperationStreams, StreamId, execution::Operation},
-};
-
 use super::NoOp;
 
 impl<B: FusionBackend> BoolTensorOps<Self> for Fusion<B> {
@@ -777,9 +776,10 @@ impl<B: FusionBackend> BoolTensorOps<Self> for Fusion<B> {
 
         let mut shape = tensor.shape().dims.clone();
         let d_shape = shape[dim];
-        let windows = max(0, (d_shape - size).div_ceil(step));
+        let windows = calculate_unfold_windows(d_shape, size, step);
+
         shape[dim] = windows;
-        shape.insert(dim + 1, size);
+        shape.push(size);
 
         let out = tensor
             .client
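All the fusion and router call sites now delegate the window count to the newly exposed `burn_tensor::ops::unfold::calculate_unfold_windows`, whose body is not included in this diff. A plausible stand-in, assuming PyTorch semantics for the window count:

// Hypothetical stand-in for `calculate_unfold_windows` (the real body is
// not shown in this diff): a window starts every `step` elements for as
// long as a full `size` window still fits in the dimension.
pub fn calculate_unfold_windows(d_shape: usize, size: usize, step: usize) -> usize {
    if size > d_shape {
        0 // saturate rather than underflow `d_shape - size`
    } else {
        (d_shape - size) / step + 1
    }
}

Whatever the exact formula, centralizing it fixes a latent issue in the replaced expression: on `usize`, `max(0, …)` is a no-op and `d_shape - size` underflows whenever `size > d_shape`.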

crates/burn-fusion/src/ops/float.rs

Lines changed: 5 additions & 3 deletions
@@ -9,11 +9,11 @@ use crate::{
     unary_float_ops,
 };
 use burn_ir::*;
+use burn_tensor::ops::unfold::calculate_unfold_windows;
 use burn_tensor::{
     Device, Distribution, Element, FloatDType, Shape, TensorData, TensorMetadata,
     ops::{BoolTensor, FloatElem, FloatTensor, FloatTensorOps, IntTensor, binary_ops_shape},
 };
-use std::cmp::max;
 use std::{marker::PhantomData, ops::Range};
 
 impl<B: FusionBackend> FloatTensorOps<Self> for Fusion<B> {
@@ -2291,9 +2291,11 @@ impl<B: FusionBackend> FloatTensorOps<Self> for Fusion<B> {
 
         let mut shape = tensor.shape().dims.clone();
         let d_shape = shape[dim];
-        let windows = max(0, (d_shape - size).div_ceil(step));
+
+        let windows = calculate_unfold_windows(d_shape, size, step);
+
         shape[dim] = windows;
-        shape.insert(dim + 1, size);
+        shape.push(size);
 
         let out = tensor
             .client

crates/burn-fusion/src/ops/int.rs

Lines changed: 6 additions & 5 deletions
@@ -1,3 +1,4 @@
+use super::NoOp;
 use crate::{
     Fusion, FusionBackend, binary_int_cmp_ops, binary_int_ops,
     client::FusionClient,
@@ -6,16 +7,14 @@ use crate::{
     unary_int_ops,
 };
 use burn_ir::*;
+use burn_tensor::ops::unfold::calculate_unfold_windows;
 use burn_tensor::{
     Device, Distribution, Element, IntDType, Shape, TensorData, TensorMetadata,
     ops::{BoolTensor, FloatTensor, IntElem, IntTensor, IntTensorOps, binary_ops_shape},
 };
 use core::ops::Range;
-use std::cmp::max;
 use std::marker::PhantomData;
 
-use super::NoOp;
-
 impl<B: FusionBackend> IntTensorOps<Self> for Fusion<B> {
     fn int_empty(shape: Shape, device: &Device<Self>, dtype: IntDType) -> IntTensor<Self> {
         #[derive(new, Debug)]
@@ -2204,9 +2203,11 @@ impl<B: FusionBackend> IntTensorOps<Self> for Fusion<B> {
 
         let mut shape = tensor.shape().dims.clone();
         let d_shape = shape[dim];
-        let windows = max(0, (d_shape - size).div_ceil(step));
+
+        let windows = calculate_unfold_windows(d_shape, size, step);
+
         shape[dim] = windows;
-        shape.insert(dim + 1, size);
+        shape.push(size);
 
         let out = tensor
             .client

crates/burn-ndarray/src/ops/base.rs

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@ where
 ///
 /// # Returns
 ///
-/// A tensor view with shape ``[pre=..., windows, size, post=...]``.
+/// A tensor view with shape ``[pre=..., windows, post=..., size]``.
 #[allow(unused)]
 pub(crate) fn unfold(
     tensor: SharedArray<E>,

crates/burn-router/src/ops/op_bool.rs

Lines changed: 32 additions & 2 deletions
@@ -1,14 +1,15 @@
 use alloc::vec::Vec;
 
+use crate::{BackendRouter, RunnerChannel, RunnerClient, get_client};
 use burn_ir::{
     BaseOperationIr, BinaryOpIr, BoolOperationIr, CatOpIr, ExpandOpIr, FlipOpIr, InitOperationIr,
     OperationIr, PermuteOpIr, RepeatDimOpIr, SliceAssignOpIr, SliceOpIr, SwapDimsOpIr, UnaryOpIr,
+    UnfoldOpIr,
 };
+use burn_tensor::ops::unfold::calculate_unfold_windows;
 use burn_tensor::ops::{BoolTensor, BoolTensorOps, FloatElem, FloatTensor, IntElem, IntTensor};
 use burn_tensor::{Device, Element, Shape, TensorData, TensorMetadata};
 
-use crate::{BackendRouter, RunnerChannel, RunnerClient, get_client};
-
 impl<R: RunnerChannel> BoolTensorOps<Self> for BackendRouter<R> {
     fn bool_empty(shape: Shape, device: &Device<Self>) -> BoolTensor<Self> {
         // Get the runtime client on which to register the operation for execution.
@@ -323,4 +324,33 @@ impl<R: RunnerChannel> BoolTensorOps<Self> for BackendRouter<R> {
 
         out
     }
+
+    fn bool_unfold(
+        tensor: BoolTensor<Self>,
+        dim: usize,
+        size: usize,
+        step: usize,
+    ) -> BoolTensor<Self> {
+        let client = tensor.client.clone();
+
+        let mut shape = tensor.shape().dims.clone();
+        let d_shape = shape[dim];
+        let windows = calculate_unfold_windows(d_shape, size, step);
+        shape[dim] = windows;
+        shape.push(size);
+
+        let out = client.register_empty_tensor(shape.clone(), tensor.dtype);
+
+        let desc = UnfoldOpIr {
+            input: tensor.into_ir(),
+            out: out.to_ir_out(),
+            dim,
+            size,
+            step,
+        };
+
+        client.register(OperationIr::BaseBool(BaseOperationIr::Unfold(desc)));
+
+        out
+    }
 }
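The router methods all build an `UnfoldOpIr` record. Its definition lives in `burn-ir` and is not part of this diff, but the fields can be read off the construction sites above (sketch for orientation only):

// Inferred shape of the IR node; the authoritative definition is in burn-ir.
pub struct UnfoldOpIr {
    pub input: TensorIr, // tensor being unfolded
    pub out: TensorIr,   // pre-registered output descriptor
    pub dim: usize,      // dimension to unfold
    pub size: usize,     // window length
    pub step: usize,     // stride between window starts
}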

crates/burn-router/src/ops/op_float.rs

Lines changed: 32 additions & 3 deletions
@@ -2,20 +2,20 @@ use alloc::{vec, vec::Vec};
 use burn_tensor::backend::Backend;
 use core::ops::Range;
 
+use crate::{BackendRouter, RunnerChannel, RunnerClient, get_client};
 use burn_ir::{
     BaseOperationIr, BinaryOpIr, CatOpIr, ClampOpIr, ExpandOpIr, FlipOpIr, FloatOperationIr,
     GatherOpIr, InitOperationIr, MaskFillOpIr, MaskWhereOpIr, NumericOperationIr, OperationIr,
     PermuteOpIr, RandomOpIr, ReduceDimOpIr, ReduceDimWithIndicesOpIr, RepeatDimOpIr, ScalarIr,
     ScalarOpIr, ScatterOpIr, SelectAssignOpIr, SelectOpIr, SliceAssignOpIr, SliceOpIr,
-    SwapDimsOpIr, UnaryOpIr,
+    SwapDimsOpIr, UnaryOpIr, UnfoldOpIr,
 };
+use burn_tensor::ops::unfold::calculate_unfold_windows;
 use burn_tensor::ops::{
     BoolTensor, FloatElem, FloatTensor, FloatTensorOps, IntElem, IntTensor, binary_ops_shape,
 };
 use burn_tensor::{Device, Distribution, Element, FloatDType, Shape, TensorData, TensorMetadata};
 
-use crate::{BackendRouter, RunnerChannel, RunnerClient, get_client};
-
 impl<R: RunnerChannel> FloatTensorOps<Self> for BackendRouter<R> {
     fn float_from_data(data: TensorData, device: &Device<Self>) -> FloatTensor<Self> {
         let client = get_client::<R>(device);
@@ -1436,4 +1436,33 @@ impl<R: RunnerChannel> FloatTensorOps<Self> for BackendRouter<R> {
 
         out
     }
+
+    fn float_unfold(
+        tensor: FloatTensor<Self>,
+        dim: usize,
+        size: usize,
+        step: usize,
+    ) -> FloatTensor<Self> {
+        let client = tensor.client.clone();
+
+        let mut shape = tensor.shape().dims.clone();
+        let d_shape = shape[dim];
+        let windows = calculate_unfold_windows(d_shape, size, step);
+        shape[dim] = windows;
+        shape.push(size);
+
+        let out = client.register_empty_tensor(shape.clone(), tensor.dtype);
+
+        let desc = UnfoldOpIr {
+            input: tensor.into_ir(),
+            out: out.to_ir_out(),
+            dim,
+            size,
+            step,
+        };
+
+        client.register(OperationIr::BaseFloat(BaseOperationIr::Unfold(desc)));
+
+        out
+    }
 }

crates/burn-router/src/ops/op_int.rs

Lines changed: 32 additions & 3 deletions
@@ -2,20 +2,20 @@ use alloc::{vec, vec::Vec};
 use burn_tensor::backend::Backend;
 use core::ops::Range;
 
+use crate::{BackendRouter, RunnerChannel, RunnerClient, get_client};
 use burn_ir::{
     BaseOperationIr, BinaryOpIr, CatOpIr, ClampOpIr, ExpandOpIr, FlipOpIr, GatherOpIr,
     InitOperationIr, IntOperationIr, MaskFillOpIr, MaskWhereOpIr, NumericOperationIr, OperationIr,
     PermuteOpIr, RandomOpIr, ReduceDimOpIr, ReduceDimWithIndicesOpIr, RepeatDimOpIr, ScalarIr,
     ScalarOpIr, ScatterOpIr, SelectAssignOpIr, SelectOpIr, SliceAssignOpIr, SliceOpIr,
-    SwapDimsOpIr, UnaryOpIr,
+    SwapDimsOpIr, UnaryOpIr, UnfoldOpIr,
 };
+use burn_tensor::ops::unfold::calculate_unfold_windows;
 use burn_tensor::ops::{
     BoolTensor, FloatElem, FloatTensor, IntElem, IntTensor, IntTensorOps, binary_ops_shape,
 };
 use burn_tensor::{Device, Distribution, Element, IntDType, Shape, TensorData, TensorMetadata};
 
-use crate::{BackendRouter, RunnerChannel, RunnerClient, get_client};
-
 impl<R: RunnerChannel> IntTensorOps<Self> for BackendRouter<R> {
     fn int_empty(shape: Shape, device: &Device<Self>, dtype: IntDType) -> IntTensor<Self> {
         // Get the runtime client on which to register the operation for execution.
@@ -1416,4 +1416,33 @@ impl<R: RunnerChannel> IntTensorOps<Self> for BackendRouter<R> {
 
         out
     }
+
+    fn int_unfold(
+        tensor: IntTensor<Self>,
+        dim: usize,
+        size: usize,
+        step: usize,
+    ) -> IntTensor<Self> {
+        let client = tensor.client.clone();
+
+        let mut shape = tensor.shape().dims.clone();
+        let d_shape = shape[dim];
+        let windows = calculate_unfold_windows(d_shape, size, step);
+        shape[dim] = windows;
+        shape.push(size);
+
+        let out = client.register_empty_tensor(shape.clone(), tensor.dtype);
+
+        let desc = UnfoldOpIr {
+            input: tensor.into_ir(),
+            out: out.to_ir_out(),
+            dim,
+            size,
+            step,
+        };
+
+        client.register(OperationIr::BaseInt(BaseOperationIr::Unfold(desc)));
+
+        out
+    }
 }

crates/burn-router/src/runner.rs

Lines changed: 18 additions & 0 deletions
@@ -186,6 +186,12 @@
                 let output = B::float_expand(tensor, desc.shape.clone().into());
                 handles.register_float_tensor::<B>(&desc.out.id, output);
             }
+            BaseOperationIr::Unfold(desc) => {
+                let tensor = handles.get_float_tensor::<B>(&desc.input);
+
+                let output = B::float_unfold(tensor, desc.dim, desc.size, desc.step);
+                handles.register_float_tensor::<B>(&desc.out.id, output);
+            }
             BaseOperationIr::Slice(desc) => {
                 let tensor = handles.get_float_tensor::<B>(&desc.tensor);
 
@@ -261,6 +267,12 @@
                 let output = B::int_expand(tensor, desc.shape.clone().into());
                 handles.register_int_tensor::<B>(&desc.out.id, output);
             }
+            BaseOperationIr::Unfold(desc) => {
+                let tensor = handles.get_int_tensor::<B>(&desc.input);
+
+                let output = B::int_unfold(tensor, desc.dim, desc.size, desc.step);
+                handles.register_int_tensor::<B>(&desc.out.id, output);
+            }
             BaseOperationIr::Slice(desc) => {
                 let tensor = handles.get_int_tensor::<B>(&desc.tensor);
 
@@ -332,6 +344,12 @@
                 let output = B::bool_expand(tensor, desc.shape.clone().into());
                 handles.register_bool_tensor::<B>(&desc.out.id, output);
             }
+            BaseOperationIr::Unfold(desc) => {
+                let tensor = handles.get_bool_tensor::<B>(&desc.input);
+
+                let output = B::bool_unfold(tensor, desc.dim, desc.size, desc.step);
+                handles.register_bool_tensor::<B>(&desc.out.id, output);
+            }
             BaseOperationIr::Slice(desc) => {
                 let tensor = handles.get_bool_tensor::<B>(&desc.tensor);
 