@@ -44,6 +44,7 @@ def __init__(
         bias: bool = True,
         dtype=torch.bfloat16,
         mode: str | Mode = Mode.FORWARD,
+        forwarded_args_dtype: torch.dtype | None = None,
     ):
         if (
             len(normalized_shape) > len(input_shape)
@@ -60,6 +61,7 @@ def __init__(
         self.bias = bias
         self.dtype = dtype
         self.mode = Mode.parse(mode)
+        self.forwarded_args_dtype = forwarded_args_dtype or dtype

     @property
     def output_shape(self) -> list[int]:
@@ -116,6 +118,7 @@ def func_name(self) -> str:
             "layer_norm",
             f"{len(self.normalized_shape)}d",
             str(self.dtype).removeprefix("torch."),
+            str(self.forwarded_args_dtype).removeprefix("torch."),
             self.mode.name.lower(),
             "x".join(str(i) for i in self.input_shape),
             "w" if self.elementwise_affine is not None else "",
@@ -175,6 +178,7 @@ def as_init_kwargs(self) -> dict[str, Any]:
             "bias": self.bias,
             "dtype": self.dtype,
             "mode": self.Mode,
+            "forwarded_args_dtype": self.forwarded_args_dtype,
         }

     def get_output_size(self) -> int:
@@ -229,16 +233,16 @@ def get(shape: Sequence[int]) -> torch.Tensor:
                 get(self.output_shape),
                 get(self.input_shape),
                 get(self.normalized_shape),
-                get(self.aggregate_shape),
-                get(self.aggregate_shape),
+                get(self.aggregate_shape).to(dtype=self.forwarded_args_dtype),
+                get(self.aggregate_shape).to(dtype=self.forwarded_args_dtype),
             )
         if self.mode == Mode.WEIGHT_BACKWARD:
             # (dLdy, input, mean, rstd)
             return (
                 get(self.output_shape),
                 get(self.input_shape),
-                get(self.aggregate_shape),
-                get(self.aggregate_shape),
+                get(self.aggregate_shape).to(dtype=self.forwarded_args_dtype),
+                get(self.aggregate_shape).to(dtype=self.forwarded_args_dtype),
             )
         if self.mode == Mode.BIAS_BACKWARD:
             # (dLdy,)
@@ -253,6 +257,7 @@ def __init__(self, signature: LayerNormSignature):
         super().__init__()
         self.normalized_shape = signature.normalized_shape
         self.eps = signature.eps
+        self.forwarded_args_dtype = signature.forwarded_args_dtype

     def forward(
         self,
@@ -272,9 +277,14 @@ def forward(
         # torch.layer_norm(input, self.normalized_shape, weight, bias, self.eps)
         #
         # wrapper hides. We want those too so we can save them for backward.
-        return torch.ops.aten.native_layer_norm(
+        output, mean, rstd = torch.ops.aten.native_layer_norm(
             input, self.normalized_shape, weight, bias, self.eps
         )
+        return (
+            output,
+            mean.to(dtype=self.forwarded_args_dtype),
+            rstd.to(dtype=self.forwarded_args_dtype),
+        )


 class LayerNormBackwardInput(torch.nn.Module):
@@ -285,6 +295,7 @@ def __init__(self, signature: LayerNormSignature):
         super().__init__()
         self.normalized_shape = signature.normalized_shape
         self.eps = signature.eps
+        self.dtype = signature.dtype

         self.normalized_dim = list(
             range(len(signature.input_shape))[-len(self.normalized_shape) :]
@@ -303,6 +314,8 @@ def forward(
     ) -> torch.Tensor:
         # Recompute norm instead of saving it. Judging by the signature, this is the same
         # decision as ATen.
+        mean = mean.to(dtype=self.dtype)
+        rstd = rstd.to(dtype=self.dtype)
         norm = (input - mean) * rstd
         dnorm = grad_output * weight if weight is not None else grad_output
         dx = (
@@ -321,6 +334,8 @@ def __init__(self, signature: LayerNormSignature):
         super().__init__()
         self.normalized_shape = signature.normalized_shape
         self.eps = signature.eps
+        self.dtype = signature.dtype
+
         self.normalized_dim = list(
             range(len(signature.input_shape))[-len(self.normalized_shape) :]
         )
@@ -337,6 +352,8 @@ def forward(
     ):
         # Recompute norm instead of saving it. Judging by the signature, this is the same
         # decision as ATen.
+        mean = mean.to(dtype=self.dtype)
+        rstd = rstd.to(dtype=self.dtype)
         norm = (input - mean) * rstd
         return (grad_output * norm).sum(self.keep_dim)

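Usage sketch (not part of the diff): the new forwarded_args_dtype lets the layer norm compute in one dtype while handing mean/rstd to the backward modules in another; it falls back to dtype when left as None. The keyword names input_shape and normalized_shape below are assumptions inferred from the attributes in the diff, not confirmed parameter names.

import torch

# Hypothetical construction: only bias, dtype, mode, and forwarded_args_dtype
# are visible as __init__ parameters in the diff; the rest is illustrative.
sig = LayerNormSignature(
    input_shape=[32, 1024],        # assumed keyword name
    normalized_shape=[1024],       # assumed keyword name
    bias=True,
    dtype=torch.bfloat16,          # compute/storage dtype of the op
    mode=Mode.FORWARD,
    forwarded_args_dtype=torch.float32,  # mean/rstd forwarded in fp32; None would default to dtype
)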