36 changes: 18 additions & 18 deletions Cargo.lock

(Generated file; diff not rendered by default.)

6 changes: 3 additions & 3 deletions Cargo.toml

@@ -158,9 +158,9 @@ portable-atomic = { version = "1.11.0" }
 portable-atomic-util = { version = "0.2.4", features = ["alloc"] }

 ### For the main burn branch. ###
-cubecl = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "cdf67a222ce973955fba439bb019139f5b33d40c" }
-cubecl-common = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "cdf67a222ce973955fba439bb019139f5b33d40c" }
-cubecl-std = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "cdf67a222ce973955fba439bb019139f5b33d40c" }
+cubecl = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "b34cffd711e25c1438967c19dc14376de8427df2" }
+cubecl-common = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "b34cffd711e25c1438967c19dc14376de8427df2" }
+cubecl-std = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "b34cffd711e25c1438967c19dc14376de8427df2" }
 ### For local development. ###
 # cubecl = { path = "../cubecl/crates/cubecl", default-features = false }
 # cubecl-common = { path = "../cubecl/crates/cubecl-common", default-features = false }
8 changes: 8 additions & 0 deletions crates/burn-cubecl-fusion/src/shared/builder.rs

@@ -36,6 +36,14 @@ impl OptimizationBuilder<FuseTrace> for FuseOptimizationBuilder {
         }

         match op {
+            OperationIr::Drop(tensor) => {
+                if self.num_ops == 0 {
+                    self.status = OptimizationStatus::Closed;
+                    return;
+                }
+
+                self.builder.builder.tag_dropped(tensor.id);
+            }
             OperationIr::BaseFloat(ops) => {
                 if !self.register_base(ops) {
                     self.status = OptimizationStatus::Closed;
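The new OperationIr::Drop arm closes the builder when a drop arrives before any fused operation (there is nothing to fuse over), and otherwise only records the drop so later planning stages can skip the tensor. A minimal sketch of that control flow, using hypothetical stand-in types rather than the real burn-ir enums:

    use std::collections::HashSet;

    #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
    struct TensorId(u64);

    enum Op {
        Drop(TensorId),
        Compute(TensorId),
    }

    #[derive(Debug, PartialEq)]
    enum Status {
        Open,
        Closed,
    }

    struct FusionBuilder {
        num_ops: usize,
        status: Status,
        dropped: HashSet<TensorId>,
    }

    impl FusionBuilder {
        fn register(&mut self, op: Op) {
            match op {
                Op::Drop(id) => {
                    // A drop as the very first operation leaves nothing to fuse.
                    if self.num_ops == 0 {
                        self.status = Status::Closed;
                        return;
                    }
                    // Otherwise, record it so the trace can skip this tensor.
                    self.dropped.insert(id);
                }
                Op::Compute(_) => self.num_ops += 1,
            }
        }
    }

    fn main() {
        let mut builder = FusionBuilder {
            num_ops: 0,
            status: Status::Open,
            dropped: HashSet::new(),
        };
        builder.register(Op::Drop(TensorId(0)));
        assert_eq!(builder.status, Status::Closed);
    }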
6 changes: 5 additions & 1 deletion crates/burn-cubecl-fusion/src/shared/trace/base.rs

@@ -12,7 +12,10 @@ use burn_fusion::stream::Context;
 use burn_ir::{TensorId, TensorIr};
 use cubecl::prelude::*;
 use serde::{Deserialize, Serialize};
-use std::{collections::BTreeMap, marker::PhantomData};
+use std::{
+    collections::{BTreeMap, HashSet},
+    marker::PhantomData,
+};

 #[cfg(feature = "autotune-checks")]
 use burn_tensor::TensorData;

@@ -127,6 +130,7 @@ pub struct FuseResources {
     pub inputs_unhandled: Vec<TensorId>,
     pub outputs_unhandled: Vec<Arg>,
     pub num_reshaped: usize,
+    pub dropped: HashSet<TensorId>,
 }

 #[derive(Debug)]
9 changes: 7 additions & 2 deletions crates/burn-cubecl-fusion/src/shared/trace/block.rs

@@ -518,7 +518,10 @@ impl FuseBlockBuilder {
         for entry in local_tensor_ids_output {
             let is_read = local_tensor_ids_input.contains(&entry);

-            if !is_read && !self.local_outputs.contains(&entry.0) {
+            if !is_read
+                && !self.local_outputs.contains(&entry.0)
+                && !resources.dropped.contains(&entry.0)
+            {
                 let (tensor_id, precision) = entry;
                 let (tensor, _) = resources.outputs.get(tensor_id).unwrap();
                 result.insert(precision, tensor.clone());

@@ -529,7 +532,9 @@
         // are going to be used after the fused kernel by other operations.
         for (tensor, precision) in self.outputs.iter() {
             if let TensorStatus::ReadOnly = tensor.status {
-                result.insert(*precision, tensor.clone());
+                if !resources.dropped.contains(&tensor.id) {
+                    result.insert(*precision, tensor.clone());
+                }
             }
         }
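Both guards consult the same dropped set: a tensor is materialized as a kernel output only if no later fused operation reads it locally, it is not already a local output, and it was not tagged as dropped. A small sketch of that predicate, with hypothetical names standing in for the real FuseBlockBuilder state:

    use std::collections::HashSet;

    #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
    struct TensorId(u64);

    // Hypothetical reduction of the output-selection rule: a tensor becomes a
    // kernel output only when no later fused op reads it locally, it is not
    // already a local output, and it was not tagged as dropped.
    fn is_kernel_output(
        id: TensorId,
        is_read_locally: bool,
        local_outputs: &HashSet<TensorId>,
        dropped: &HashSet<TensorId>,
    ) -> bool {
        !is_read_locally && !local_outputs.contains(&id) && !dropped.contains(&id)
    }

    fn main() {
        let dropped: HashSet<TensorId> = [TensorId(7)].into_iter().collect();
        let local_outputs: HashSet<TensorId> = HashSet::new();
        // A dropped tensor is never written back, even if nothing reads it.
        assert!(!is_kernel_output(TensorId(7), false, &local_outputs, &dropped));
        assert!(is_kernel_output(TensorId(1), false, &local_outputs, &dropped));
    }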
7 changes: 6 additions & 1 deletion crates/burn-cubecl-fusion/src/shared/trace/builder.rs

@@ -8,7 +8,7 @@ use super::{
 };
 use super::{FuseTrace, RegisteredTensors};
 use burn_fusion::stream::ScalarId;
-use burn_ir::TensorIr;
+use burn_ir::{TensorId, TensorIr};
 use burn_tensor::{DType, Element};

 #[derive(Clone, Debug)]

@@ -36,6 +36,11 @@ impl FuseTraceBuilder {
         }
     }

+    /// Tag a tensor as dropped.
+    pub fn tag_dropped(&mut self, id: TensorId) {
+        self.resources.dropped.insert(id);
+    }
+
     /// Register an operation.
     pub fn register_operation(&mut self, op: FuseOp) {
         self.block_current.ops.push(op);
12 changes: 7 additions & 5 deletions crates/burn-cubecl-fusion/src/shared/trace/input.rs

@@ -27,11 +27,13 @@ impl<'a, R: Runtime> InputPlanner<'a, R> {
     pub fn run(self, context: &mut Context<'_, CubeFusionHandle<R>>, plan: &mut LaunchPlan<'a, R>) {
         for (pos, (tensor_relative, precision)) in self.resources.inputs.iter().enumerate() {
             let mut tensor_global = context.tensors.get(&tensor_relative.id).unwrap().clone();
-            // Important to take the status of the relative graph and not
-            // the global graph, since the status of the global graph
-            // might be of a later operation on the same tensor id.
-            let status = &tensor_relative.status;
-            let mut handle = context.handles.get_handle(&tensor_global.id, status);
+            let mut handle = context
+                .handles
+                .get_handle(&tensor_global.id, &TensorStatus::ReadOnly);
+
+            if let TensorStatus::ReadWrite = tensor_relative.status {
+                plan.cleared.push(tensor_global.id);
+            }

             self.analyze(plan, pos, tensor_relative, &handle);
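Input handles are now always fetched as read-only so they stay valid while the launch plan is built; tensors whose relative status is ReadWrite are merely recorded in plan.cleared for later cleanup instead of being consumed during lookup. A minimal sketch of that planning-side bookkeeping, with hypothetical stand-ins for the handle container and TensorStatus (the real burn-fusion API differs):

    use std::collections::HashMap;

    #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
    struct TensorId(u64);

    enum TensorStatus {
        ReadOnly,
        ReadWrite,
    }

    fn main() {
        let mut handles: HashMap<TensorId, Vec<u8>> = HashMap::new();
        handles.insert(TensorId(0), vec![1, 2, 3]);

        let relative_status = TensorStatus::ReadWrite;
        let mut cleared: Vec<TensorId> = Vec::new();

        // Fetch without consuming: the handle must stay valid while the
        // launch plan is still being built.
        let handle = handles.get(&TensorId(0)).cloned();
        assert!(handle.is_some());

        // Record the eventual clear instead of performing it now; the handle
        // itself is removed only after output planning.
        if let TensorStatus::ReadWrite = relative_status {
            cleared.push(TensorId(0));
        }
        assert_eq!(cleared, vec![TensorId(0)]);
    }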
10 changes: 10 additions & 0 deletions crates/burn-cubecl-fusion/src/shared/trace/output.rs

@@ -164,6 +164,16 @@
                 Self::add_layout_info_inputs(block, &plan.handle_inputs);
             }
         }
+
+        // Make sure dropped tensors are correctly executed.
+        for id in self.resources.dropped.iter() {
+            if let Some(tensor_global) = context.tensors.get(id) {
+                context.handles.remove_handle(tensor_global.id);
+            }
+        }
+        for id in plan.cleared.drain(..) {
+            context.handles.remove_handle(id);
+        }
     }

     fn select_reference_from_inputs(
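A companion sketch to the two loops above, again with hypothetical stand-ins: once outputs are planned, handles for dropped tensors and for inputs recorded in cleared are released in one place.

    use std::collections::{HashMap, HashSet};

    #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
    struct TensorId(u64);

    fn main() {
        let mut handles: HashMap<TensorId, Vec<u8>> = HashMap::new();
        handles.insert(TensorId(0), vec![0; 16]);
        handles.insert(TensorId(1), vec![0; 16]);

        let dropped: HashSet<TensorId> = [TensorId(0)].into_iter().collect();
        let mut cleared: Vec<TensorId> = vec![TensorId(1)];

        // Dropped tensors may never be materialized as outputs, but any
        // existing handle still has to be released to reclaim memory.
        for id in dropped.iter() {
            handles.remove(id);
        }
        // Inputs consumed by the fused kernel are cleared only now, after
        // planning no longer needs their handles.
        for id in cleared.drain(..) {
            handles.remove(&id);
        }
        assert!(handles.is_empty());
    }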
2 changes: 2 additions & 0 deletions crates/burn-cubecl-fusion/src/shared/trace/plan.rs

@@ -20,6 +20,7 @@ pub(crate) struct LaunchPlan<'a, R: Runtime> {
     pub rank: usize,
     pub blocks: Vec<BlockPlan<'a>>,
     pub vectorizations: BTreeMap<TensorId, Vect>,
+    pub cleared: Vec<TensorId>,
 }

 #[derive(Debug)]

@@ -135,6 +136,7 @@ impl<R: Runtime> LaunchPlan<'_, R> {
             rank,
             blocks,
             vectorizations: Default::default(),
+            cleared: Default::default(),
         }
     }
 }
4 changes: 1 addition & 3 deletions crates/burn-fusion/src/client/base.rs

@@ -4,7 +4,7 @@ use crate::{
     FusionBackend, FusionDevice, FusionHandle, FusionRuntime, FusionTensor,
     stream::{StreamId, execution::Operation},
 };
-use burn_ir::{OperationIr, TensorId, TensorIr};
+use burn_ir::{OperationIr, TensorIr};
 use burn_tensor::{DType, TensorData};

 /// Define how to interact with the fusion server.

@@ -112,6 +112,4 @@
     ) -> FusionTensor<R>
     where
         B: FusionBackend<FusionRuntime = R>;
-    /// Drop the tensor with the given [tensor id](TensorId).
-    fn register_orphan(&self, id: &TensorId);
 }
7 changes: 1 addition & 6 deletions crates/burn-fusion/src/client/mutex.rs

@@ -3,7 +3,7 @@ use crate::{
     FusionBackend, FusionDevice, FusionHandle, FusionRuntime, FusionServer, FusionTensor,
     stream::{StreamId, execution::Operation},
 };
-use burn_ir::{OperationIr, TensorId, TensorIr};
+use burn_ir::{OperationIr, TensorIr};
 use burn_tensor::{DType, TensorData};
 use spin::Mutex;
 use std::sync::Arc;

@@ -206,11 +206,6 @@
         FusionTensor::new(id, tensor.shape, tensor.dtype, client, StreamId::current())
     }

-    fn register_orphan(&self, id: &TensorId) {
-        // TODO: Make drop into a tensor operation so that optimizations can know about it.
-        self.server.lock().drop_tensor_handle(*id);
-    }
-
     fn resolve_tensor_float<B>(&self, tensor: FusionTensor<R>) -> B::FloatTensorPrimitive
     where
         B: FusionBackend<FusionRuntime = R>,
14 changes: 11 additions & 3 deletions crates/burn-fusion/src/search/optimization/blocks.rs

@@ -41,6 +41,10 @@ enum BlockOptimizationStep<O> {
     Contiguous {
         strategy: ExecutionStrategy<O>,
     },
+    /// Only happens when we fall back to executing a single operation.
+    Operation {
+        strategy: ExecutionStrategy<O>,
+    },
     WithHoles {
         strategy: ExecutionStrategy<O>,
         holes: Vec<usize>,

@@ -78,6 +82,10 @@ impl<O: NumOperations> BlocksOptimizer<O> {
                 BlockOptimizationStep::Contiguous { strategy } => {
                     strategies.push(Box::new(strategy));
                 }
+                BlockOptimizationStep::Operation { strategy } => {
+                    strategies.push(Box::new(strategy));
+                    break;
+                }
                 BlockOptimizationStep::WithHoles { strategy, holes } => {
                     strategies.push(Box::new(strategy));

@@ -143,10 +151,9 @@
         last_index: usize,
         ordering_global: &mut Vec<usize>,
     ) -> BlockOptimizationStep<O> {
-        ordering_global.append(&mut optimization.ordering);
-
         match optimization.strategy {
             ExecutionStrategy::Optimization { opt, ordering } => {
+                ordering_global.append(&mut optimization.ordering);
                 let holes = self.find_holes(last_index);

                 if holes.is_empty() {

@@ -159,11 +166,12 @@
             }
             ExecutionStrategy::Operations { ordering } => {
                 let min = ordering.iter().min().unwrap();
+                ordering_global.push(*min);

                 let strategy = ExecutionStrategy::Operations {
                     ordering: Arc::new(vec![*min]),
                 };
-                BlockOptimizationStep::Contiguous { strategy }
+                BlockOptimizationStep::Operation { strategy }
             }
             _ => unreachable!(),
         }
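The new Operation step marks the single-operation fallback: only the first remaining operation was committed to the global ordering, so the stepping loop must stop after executing it rather than continue as if the block were contiguous. A simplified sketch of that loop, using hypothetical types rather than burn-fusion's:

    // Hypothetical simplified steps; the real ExecutionStrategy carries more data.
    enum Step {
        Contiguous { ops: Vec<usize> },
        Operation { op: usize },
    }

    fn run(steps: Vec<Step>) -> Vec<usize> {
        let mut ordering = Vec::new();
        for step in steps {
            match step {
                Step::Contiguous { ops } => ordering.extend(ops),
                // The fallback commits a single operation and stops: nothing
                // after it has been ordered yet, so continuing would be wrong.
                Step::Operation { op } => {
                    ordering.push(op);
                    break;
                }
            }
        }
        ordering
    }

    fn main() {
        let steps = vec![
            Step::Contiguous { ops: vec![0, 1, 2] },
            Step::Operation { op: 3 },
            Step::Contiguous { ops: vec![4, 5] }, // never reached
        ];
        assert_eq!(run(steps), vec![0, 1, 2, 3]);
    }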
4 changes: 0 additions & 4 deletions crates/burn-fusion/src/server.rs

@@ -200,8 +200,4 @@

         id
     }
-
-    pub fn drop_tensor_handle(&mut self, id: TensorId) {
-        self.handles.handles_orphan.push(id);
-    }
 }