Commit 6d0db87

[Perf] Interpolate optimizations (#3077)
* Refactor interpolate to use NHWC and fix OOB issue
* Fix slice negative range
* Fix review comment
* Add regression test to `is_contiguous`
1 parent 4a28812 commit 6d0db87

File tree

23 files changed: +349 -234 lines changed

crates/burn-cubecl/src/kernel/conv/conv2d/implicit_gemm/launch.rs

Lines changed: 4 additions & 4 deletions
@@ -18,7 +18,7 @@ use cubecl::linalg::{
 
 use crate::{
     CubeElement, CubeRuntime, FloatElement,
-    ops::{numeric::empty_device_strided, permute},
+    ops::{numeric::empty_device_strided, permute_nchw_to_nhwc, permute_nhwc_to_nchw},
     tensor::CubeTensor,
 };
 
@@ -95,8 +95,8 @@ where
         width,
     );
 
-    let input = permute(input, &[0, 2, 3, 1]);
-    let weight = permute(weight, &[0, 2, 3, 1]);
+    let input = permute_nchw_to_nhwc(input);
+    let weight = permute_nchw_to_nhwc(weight);
 
     let out_shape = Shape::new([batch_size, out_h, out_w, out_channels]);
     let out =
@@ -117,5 +117,5 @@ where
         },
     )?;
 
-    Ok(permute(out, &[0, 3, 1, 2]))
+    Ok(permute_nhwc_to_nchw(out))
 }
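
For reference, the `&[0, 2, 3, 1]` permutation being replaced here is exactly the NCHW -> NHWC axis swap, and `&[0, 3, 1, 2]` is its inverse. A minimal CPU sketch of the same data movement (this standalone `nchw_to_nhwc_cpu` helper is illustrative only, not the crate's GPU implementation):

    /// Copy a contiguous NCHW buffer into NHWC order, the layout the
    /// conv/interpolate kernels now operate on.
    fn nchw_to_nhwc_cpu(input: &[f32], n: usize, c: usize, h: usize, w: usize) -> Vec<f32> {
        let mut out = vec![0.0; input.len()];
        for b in 0..n {
            for ch in 0..c {
                for y in 0..h {
                    for x in 0..w {
                        let src = ((b * c + ch) * h + y) * w + x; // NCHW offset
                        let dst = ((b * h + y) * w + x) * c + ch; // NHWC offset
                        out[dst] = input[src];
                    }
                }
            }
        }
        out
    }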

crates/burn-cubecl/src/kernel/conv/conv2d/layout_swap.rs

Lines changed: 2 additions & 26 deletions
@@ -3,8 +3,7 @@ use cubecl::{CubeCount, CubeDim, prelude::*};
 
 use crate::{
     CubeElement, CubeRuntime,
-    kernel::into_contiguous,
-    ops::{max_vectorization, numeric::empty_device_strided},
+    ops::{max_line_size, numeric::empty_device_strided},
     tensor::CubeTensor,
 };
 
@@ -55,7 +54,7 @@ pub fn nchw_to_nhwc<R: CubeRuntime, E: CubeElement>(input: CubeTensor<R>) -> CubeTensor<R> {
     };
     let cube_count = CubeCount::Static(cube_count_x, cube_count_y, cube_count_z);
 
-    let in_vec = max_vectorization(&input);
+    let in_vec = max_line_size(&input);
     let out_vec = R::supported_line_sizes()
         .iter()
         .copied()
@@ -201,26 +200,3 @@ pub fn swizzle(offset: u32, #[comptime] bank_count: i32) -> u32 {
 
     offset ^ ((offset & yyy_mask) >> mask_shift)
 }
-
-/// Transpose an NCHW tensor to NHWC.
-pub fn permute_nchw_to_nhwc<R: CubeRuntime, E: CubeElement>(input: CubeTensor<R>) -> CubeTensor<R> {
-    // Disabled for now, need to fix hard to track down bug
-
-    // let use_plane = if cfg!(target_family = "wasm") {
-    //     // Any plane op enables subgroups on wasm so we need to make sure it is entirely supported.
-    //     // Otherwise, `nchw_to_nhwc` will cause compilation issues on wasm.
-    //     input
-    //         .client
-    //         .properties()
-    //         .feature_enabled(cubecl::Feature::Plane)
-    // } else {
-    //     true // plane broadcast was/is always used on other platforms
-    // };
-
-    // if input.is_contiguous() && use_plane {
-    //     nchw_to_nhwc::<R, E>(input)
-    // } else {
-    //     crate::ops::permute(input, &[0, 2, 3, 1])
-    // }
-    into_contiguous(crate::ops::permute(input, &[0, 2, 3, 1]))
-}
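
`max_line_size` (renamed from `max_vectorization`, presumably to match cubecl's `Line` terminology) reports the widest vector width usable for a tensor. A plausible standalone sketch of that selection, assuming it picks the largest supported width that evenly divides the innermost contiguous dimension — the real implementation lives in `crate::ops`, so `max_line_size_for` below is a hypothetical stand-in:

    /// Pick the widest line (vector) width that divides the innermost dim.
    /// `supported` would come from something like `R::supported_line_sizes()`.
    fn max_line_size_for(supported: &[u8], innermost_dim: usize) -> u8 {
        supported
            .iter()
            .copied()
            .filter(|&width| innermost_dim % width as usize == 0)
            .max()
            .unwrap_or(1)
    }

    // Example: with supported widths [1, 2, 4, 8] and 64 channels, this picks 8.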

crates/burn-cubecl/src/kernel/conv/mod.rs

Lines changed: 1 addition & 4 deletions
@@ -10,7 +10,4 @@ pub(crate) use conv3d::*;
 pub(crate) use deform_conv_transpose2d::*;
 pub(crate) use deform_conv2d::*;
 
-pub use conv2d::{
-    Conv2dStrategy, ConvTranspose2dStrategy, conv_transpose2d, conv2d, nchw_to_nhwc,
-    permute_nchw_to_nhwc,
-};
+pub use conv2d::{Conv2dStrategy, ConvTranspose2dStrategy, conv_transpose2d, conv2d, nchw_to_nhwc};
crates/burn-cubecl/src/kernel/interpolate/base.rs

Lines changed: 20 additions & 18 deletions
@@ -1,5 +1,6 @@
 use crate::{
-    CubeRuntime, FloatElement, kernel::into_contiguous, ops::numeric::empty_device,
+    CubeRuntime, FloatElement,
+    ops::{numeric::empty_device_strided, permute_nchw_to_nhwc, permute_nhwc_to_nchw},
     tensor::CubeTensor,
 };
 use burn_tensor::{
@@ -20,18 +21,22 @@ pub fn interpolate<R: CubeRuntime, E: FloatElement>(
     output_size: [usize; 2],
     options: InterpolateOptions,
 ) -> CubeTensor<R> {
-    let input = into_contiguous(input);
     let [batch_size, channels, _, _] = input.shape.dims();
     let [out_height, out_width] = output_size;
 
-    let shape_out = Shape::new([batch_size, channels, out_height, out_width]);
-    let output = empty_device::<R, E>(input.client.clone(), input.device.clone(), shape_out);
+    let input = permute_nchw_to_nhwc(input);
 
-    match options.mode {
+    let shape_out = Shape::new([batch_size, out_height, out_width, channels]);
+    let output =
+        empty_device_strided::<R, E>(input.client.clone(), input.device.clone(), shape_out);
+
+    let output = match options.mode {
         InterpolateMode::Nearest => interpolate_nearest_launch::<R, E>(input, output),
         InterpolateMode::Bilinear => interpolate_bilinear_launch::<R, E>(input, output),
         InterpolateMode::Bicubic => interpolate_bicubic_launch::<R, E>(input, output),
-    }
+    };
+
+    permute_nhwc_to_nchw(output)
 }
 
 /// Backward interpolate operation
@@ -43,25 +48,22 @@ pub fn interpolate_backward<R: CubeRuntime, E: FloatElement>(
     _output_size: [usize; 2],
     options: InterpolateOptions,
 ) -> CubeTensor<R> {
-    let out_grad = into_contiguous(out_grad);
+    let input = permute_nchw_to_nhwc(input);
+    let out_grad = permute_nchw_to_nhwc(out_grad);
+
     let output_shape = input.shape.clone();
-    let num_elems = input.shape.num_elements();
-    let buffer = input.client.empty(num_elems * core::mem::size_of::<E>());
-    let output = CubeTensor::new_contiguous(
-        input.client.clone(),
-        input.device.clone(),
-        output_shape,
-        buffer,
-        input.dtype,
-    );
+    let output =
+        empty_device_strided::<R, E>(input.client.clone(), input.device.clone(), output_shape);
 
-    match options.mode {
+    let output = match options.mode {
         InterpolateMode::Nearest => interpolate_nearest_backward_launch::<R, E>(out_grad, output),
         InterpolateMode::Bilinear => {
             panic!("bilinear interpolation backward is not supported by JIT backend")
         }
         InterpolateMode::Bicubic => {
             panic!("bicubic interpolation backward is not supported by JIT backend")
         }
-    }
+    };
+
+    permute_nhwc_to_nchw(output)
 }
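
The forward and backward paths now share one pattern: permute the NCHW inputs to NHWC, allocate the output with `empty_device_strided`, launch the kernel, and permute the result back so callers still see NCHW. The payoff is that in contiguous NHWC the stride-1 axis is the channel axis, so one vectorized load or store covers adjacent channels of a single pixel instead of striding across the whole image. A small standalone check of the two layouts (`contiguous_strides` is a hypothetical helper, not from the crate):

    /// Row-major strides for a contiguous tensor of the given shape.
    fn contiguous_strides(shape: &[usize]) -> Vec<usize> {
        let mut strides = vec![1; shape.len()];
        for i in (0..shape.len() - 1).rev() {
            strides[i] = strides[i + 1] * shape[i + 1];
        }
        strides
    }

    fn main() {
        let (n, c, h, w) = (2, 64, 32, 32);
        // NCHW: one pixel's channels sit 1024 elements apart, so per-pixel
        // channel access strides across the image.
        assert_eq!(contiguous_strides(&[n, c, h, w]), [65536, 1024, 32, 1]);
        // NHWC: one pixel's channels are adjacent, ideal for `Line<F>` access.
        assert_eq!(contiguous_strides(&[n, h, w, c]), [65536, 2048, 64, 1]);
    }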

crates/burn-cubecl/src/kernel/interpolate/bicubic.rs

Lines changed: 68 additions & 37 deletions
@@ -1,48 +1,61 @@
-use cubecl::{calculate_cube_count_elemwise, prelude::*};
+use cubecl::{calculate_cube_count_elemwise, linalg::tensor::StridedLayout, prelude::*};
+use cubecl_std::FastDivmod;
 
-use crate::{CubeRuntime, FloatElement, tensor::CubeTensor};
+use crate::{
+    CubeRuntime, FloatElement,
+    kernel::utils::{shape_divmod, strided_layout},
+    ops::max_line_size,
+    tensor::CubeTensor,
+};
 
 #[cube(launch)]
-fn interpolate_bicubic_kernel<F: Float>(input: &Tensor<F>, output: &mut Tensor<F>) {
+fn interpolate_bicubic_kernel<F: Float>(
+    input: &Tensor<Line<F>>,
+    output: &mut Tensor<Line<F>>,
+    shape_out: Sequence<FastDivmod>,
+    out_layout: StridedLayout,
+) {
     if ABSOLUTE_POS >= output.len() {
         terminate!();
     }
 
-    let batch = ABSOLUTE_POS / output.stride(0) % output.shape(0);
-    let channel = ABSOLUTE_POS / output.stride(1) % output.shape(1);
-    let y = ABSOLUTE_POS / output.stride(2) % output.shape(2);
-    let x = ABSOLUTE_POS / output.stride(3) % output.shape(3);
+    let line_size = input.line_size();
+    let out_idx = out_layout.index(output, ABSOLUTE_POS);
 
-    let input_height = input.shape(2) - 1;
-    let output_height = F::cast_from(Max::max(output.shape(2) - 1, 1));
-    let numerator = F::cast_from(y * input_height);
+    let (rem, c) = shape_out.index(3).div_mod(ABSOLUTE_POS * line_size);
+    let (rem, x) = shape_out.index(2).div_mod(rem);
+    let (b, y) = shape_out.index(1).div_mod(rem);
 
-    let frac = numerator / output_height;
+    let input_height = input.shape(1) - 1;
+    let output_height = f32::cast_from(Max::max(output.shape(1) - 1, 1));
+    let numerator = f32::cast_from(y * input_height);
+
+    let frac = f32::cast_from(numerator / output_height);
     let y_in_f = Floor::floor(frac);
     let y_in = u32::cast_from(y_in_f);
-    let yw = frac - y_in_f;
+    let yw = Line::empty(line_size).fill(F::cast_from(frac - y_in_f));
 
     let y0 = select(y_in != 0, y_in - 1, 0);
     let y1 = y_in;
     let y2 = Min::min(y_in + 1, input_height);
     let y3 = Min::min(y_in + 2, input_height);
 
-    let input_width = input.shape(3) - 1;
-    let output_width = F::cast_from(Max::max(output.shape(3) - 1, 1));
-    let numerator = F::cast_from(x * input_width);
+    let input_width = input.shape(2) - 1;
+    let output_width = f32::cast_from(Max::max(output.shape(2) - 1, 1));
+    let numerator = f32::cast_from(x * input_width);
     let frac = numerator / output_width;
     let x_in_f = Floor::floor(frac);
     let x_in = u32::cast_from(x_in_f);
-    let xw = frac - x_in_f;
+    let xw = Line::empty(line_size).fill(F::cast_from(frac - x_in_f));
 
     let x0 = select(x_in != 0, x_in - 1, 0);
     let x1 = x_in;
    let x2 = Min::min(x_in + 1, input_width);
     let x3 = Min::min(x_in + 2, input_width);
 
-    let index_base = batch * input.stride(0) + channel * input.stride(1);
-    let in_stride_y = input.stride(2);
-    let in_stride_x = input.stride(3);
+    let index_base = b * input.stride(0) + c * input.stride(3);
+    let in_stride_y = input.stride(1);
+    let in_stride_x = input.stride(2);
 
     let y0_stride = y0 * in_stride_y;
     let y1_stride = y1 * in_stride_y;
@@ -89,51 +102,69 @@ fn interpolate_bicubic_kernel<F: Float>(input: &Tensor<F>, output: &mut Tensor<F>) {
         yw,
     );
 
-    output[ABSOLUTE_POS] = val;
+    output[out_idx] = val;
 }
 
 #[cube]
-fn cubic_interp_1d<F: Float>(x0: F, x1: F, x2: F, x3: F, t: F) -> F {
-    let a = F::new(-0.75);
-
-    let coeffs0 = cubic_convolution_2::<F>(t + F::new(1.0), a);
+fn cubic_interp_1d<F: Float>(
+    x0: Line<F>,
+    x1: Line<F>,
+    x2: Line<F>,
+    x3: Line<F>,
+    t: Line<F>,
+) -> Line<F> {
+    let a = lined(&x0, -0.75);
+
+    let coeffs0 = cubic_convolution_2::<F>(t + lined(&x0, 1.0), a);
     let coeffs1 = cubic_convolution_1::<F>(t, a);
-    let coeffs2 = cubic_convolution_1::<F>(F::new(1.0) - t, a);
-    let coeffs3 = cubic_convolution_2::<F>(F::new(2.0) - t, a);
+    let coeffs2 = cubic_convolution_1::<F>(lined(&x0, 1.0) - t, a);
+    let coeffs3 = cubic_convolution_2::<F>(lined(&x0, 2.0) - t, a);
 
     x0 * coeffs0 + x1 * coeffs1 + x2 * coeffs2 + x3 * coeffs3
 }
 
 #[cube]
-fn cubic_convolution_1<F: Float>(x: F, a: F) -> F {
-    let conv = (a + F::new(2.0)) * x;
-    let tmp = a + F::new(3.0);
-    (conv - tmp) * x * x + F::new(1.0)
+fn cubic_convolution_1<F: Float>(x: Line<F>, a: Line<F>) -> Line<F> {
+    let conv = (a + lined(&x, 2.0)) * x;
+    let tmp = a + lined(&x, 3.0);
+    (conv - tmp) * x * x + lined(&x, 1.0)
 }
 
 #[cube]
-fn cubic_convolution_2<F: Float>(x: F, a: F) -> F {
+fn cubic_convolution_2<F: Float>(x: Line<F>, a: Line<F>) -> Line<F> {
     let conv = a * x;
-    let conv = (conv - F::new(5.0) * a) * x;
-    let tmp = F::new(8.0) * a;
+    let conv = (conv - lined(&x, 5.0) * a) * x;
+    let tmp = lined(&x, 8.0) * a;
     let conv = (conv + tmp) * x;
 
-    conv - F::new(4.0) * a
+    conv - lined(&x, 4.0) * a
+}
+
+#[cube]
+fn lined<F: Float>(x: &Line<F>, #[comptime] v: f32) -> Line<F> {
+    Line::empty(x.size()).fill(F::new(v))
 }
 
 pub(crate) fn interpolate_bicubic_launch<R: CubeRuntime, E: FloatElement>(
     input: CubeTensor<R>,
     output: CubeTensor<R>,
 ) -> CubeTensor<R> {
+    let line_size = max_line_size(&input);
+    let out_shape = shape_divmod(&output);
+    let out_layout = strided_layout(&output);
+
     let cube_dim = CubeDim::default();
-    let cube_count = calculate_cube_count_elemwise(output.shape.num_elements(), cube_dim);
+    let cube_count =
+        calculate_cube_count_elemwise(output.shape.num_elements() / line_size as usize, cube_dim);
 
     interpolate_bicubic_kernel::launch::<E, R>(
         &input.client,
         cube_count,
         cube_dim,
-        input.as_tensor_arg::<E>(1),
-        output.as_tensor_arg::<E>(1),
+        input.as_tensor_arg::<E>(line_size),
+        output.as_tensor_arg::<E>(line_size),
+        out_shape,
+        out_layout,
    );
 
     output
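
Instead of deriving `(batch, channel, y, x)` from output strides with a division and a modulo per axis, the kernel now peels the flat element index apart with precomputed `FastDivmod` entries (the idea behind `FastDivmod` is to precompute per-axis division constants so the GPU avoids runtime integer division) and writes through a `StridedLayout`, which lets the output keep the strided NHWC layout produced by `empty_device_strided`. Note the chain is fed `ABSOLUTE_POS * line_size`, since each thread handles one line of channels. In plain integer arithmetic the decomposition looks like the following standalone sketch (`decompose_nhwc` is a hypothetical helper for illustration):

    /// Decompose a flat NHWC element index into (b, y, x, c),
    /// innermost axis first, mirroring the kernel's div_mod chain.
    fn decompose_nhwc(flat: usize, h: usize, w: usize, c: usize) -> (usize, usize, usize, usize) {
        let (rem, ch) = (flat / c, flat % c); // shape_out.index(3).div_mod(..)
        let (rem, x) = (rem / w, rem % w);    // shape_out.index(2).div_mod(..)
        let (b, y) = (rem / h, rem % h);      // shape_out.index(1).div_mod(..)
        (b, y, x, ch)
    }

    fn main() {
        // For shape [N=2, H=4, W=5, C=8]: 317 == ((1*4 + 3)*5 + 4)*8 + 5.
        assert_eq!(decompose_nhwc(317, 4, 5, 8), (1, 3, 4, 5));
    }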
