Commit 0b5b671

[boo] add a combined backward layer norm
Add an op export for combined computation of all gradients in layer norm. This may be more efficient than executing them one by one in some cases and requires separate testing.

Signed-off-by: Alex Zinenko <git@ozinenko.com>
1 parent b3065b6 commit 0b5b671
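
For context, a minimal sketch (not part of the commit, illustrative shapes) of what the combined backward amounts to: a single torch.ops.aten.native_layer_norm_backward call returns all three gradients that the existing INPUT_BACKWARD, WEIGHT_BACKWARD, and BIAS_BACKWARD modes otherwise produce in separate launches.

    import torch

    # Illustrative shapes; normalization over the last dimension.
    x = torch.randn(4, 8)
    w = torch.randn(8)
    b = torch.randn(8)
    # The forward op also returns mean and rstd, which the backward reuses.
    y, mean, rstd = torch.ops.aten.native_layer_norm(x, [8], w, b, 1e-5)
    grad_y = torch.ones_like(y)
    # One call produces (grad_input, grad_weight, grad_bias) per the output mask.
    dx, dw, db = torch.ops.aten.native_layer_norm_backward(
        grad_y, x, [8], mean, rstd, w, b, (True, True, True)
    )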

File tree (2 files changed, +158 -0 lines changed):
iree/turbine/kernel/boo/op_exports/layer_norm.py
tests/kernel/boo/op_exports/layer_norm_backward_impl_test.py


iree/turbine/kernel/boo/op_exports/layer_norm.py

Lines changed: 81 additions & 0 deletions
@@ -21,6 +21,7 @@ class Mode(ModeBase, IntEnum):
     INPUT_BACKWARD = 1
     WEIGHT_BACKWARD = 2
     BIAS_BACKWARD = 3
+    FULL_BACKWARD = 4


 class LayerNormSignature(OpSignature):
@@ -160,13 +161,16 @@ def arrange_backward_launch_args(
         input = forward_args[0]
         # TODO: is this possible at this level?
         weight = forward_args[1] if len(forward_args) > 1 else None
+        bias = forward_args[2] if len(forward_args) > 2 else None
         _, mean, rstd = forward_results
         if self.mode == Mode.INPUT_BACKWARD:
             return (input, weight, mean, rstd)
         if self.mode == Mode.WEIGHT_BACKWARD:
             return (input, mean, rstd)
         if self.mode == Mode.BIAS_BACKWARD:
             return ()
+        if self.mode == Mode.FULL_BACKWARD:
+            return (input, weight, bias, mean, rstd)
         assert False, "Unsupported mode."

     def as_init_kwargs(self) -> dict[str, Any]:
@@ -202,6 +206,9 @@ def get_nn_module(self, use_custom: bool) -> torch.nn.Module:
             return LayerNormBackwardWeight(self)
         if self.mode == Mode.BIAS_BACKWARD:
             return LayerNormBackwardBias(self)
+        if self.mode == Mode.FULL_BACKWARD:
+            return LayerNormBackwardFull(self)
+        assert False, f"Unknown mode: {self.mode.name}."

     def get_sample_args(
         self,
@@ -247,6 +254,15 @@ def get(shape: Sequence[int]) -> torch.Tensor:
         if self.mode == Mode.BIAS_BACKWARD:
             # (dLdy,)
             return (get(self.output_shape),)
+        if self.mode == Mode.FULL_BACKWARD:
+            return (
+                get(self.output_shape),
+                get(self.input_shape),
+                get(self.normalized_shape) if self.elementwise_affine else None,
+                get(self.normalized_shape) if self.bias else None,
+                get(self.aggregate_shape).to(dtype=self.forwarded_args_dtype),
+                get(self.aggregate_shape).to(dtype=self.forwarded_args_dtype),
+            )
         raise ValueError(f"Unknown mode: {self.mode}")

@@ -374,6 +390,69 @@ def forward(self, grad_output: torch.Tensor) -> torch.Tensor:
         return grad_output.sum(dim=self.keep_dim)


+class LayerNormBackwardFull(torch.nn.Module):
+    """Module computing, as its forward computation, the gradients of the input,
+    weights, and bias of the layer normalization given the gradient of its
+    output."""
+
+    def __init__(self, signature: LayerNormSignature, *, use_aten=True):
+        super().__init__()
+        self.use_aten = use_aten
+        self.normalized_shape = signature.normalized_shape
+        self.need_bias = signature.bias
+        self.need_weight = signature.elementwise_affine
+        self.normalized_dim = list(
+            range(len(signature.input_shape))[-len(self.normalized_shape) :]
+        )
+        self.keep_dim = list(
+            range(len(signature.input_shape))[: -len(signature.normalized_shape)]
+        )
+
+    def forward(
+        self,
+        grad_output: torch.Tensor,
+        input: torch.Tensor,
+        weight: torch.Tensor | None,
+        bias: torch.Tensor | None,
+        mean: torch.Tensor,
+        rstd: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        assert self.need_weight != (
+            weight is None
+        ), "Weight must be provided if its gradient is requested."
+        assert self.need_bias != (
+            bias is None
+        ), "Bias must be provided if its gradient is requested."
+        if self.use_aten:
+            return torch.ops.aten.native_layer_norm_backward(
+                grad_output,
+                input,
+                self.normalized_shape,
+                mean,
+                rstd,
+                weight,
+                bias,
+                (True, self.need_weight, self.need_bias),
+            )
+
+        # Recompute norm instead of saving it. Judging by the signature, this is the same
+        # decision as ATen.
+        norm = (input - mean) * rstd
+        dnorm = grad_output * weight if weight is not None else grad_output
+        dx = (
+            dnorm
+            - dnorm.mean(dim=self.normalized_dim, keepdim=True)
+            - norm * (dnorm * norm).mean(dim=self.normalized_dim, keepdim=True)
+        ) * rstd
+        dw = None
+        if self.need_weight:
+            dw = (grad_output * norm).sum(self.keep_dim)
+        db = None
+        if self.need_bias:
+            db = grad_output.sum(dim=self.keep_dim)
+        return dx, dw, db
+
+
 def _parse_shape(shape: str) -> list[int]:
     for symbol in shape:
         assert symbol in "0123456789x", "Unsupported shape syntax."
@@ -419,6 +498,8 @@ def get_signature(args: argparse.Namespace) -> LayerNormSignature:
             mode = Mode.WEIGHT_BACKWARD
         case 4:
             mode = Mode.BIAS_BACKWARD
+        case 5:
+            mode = Mode.FULL_BACKWARD
         case _:
             raise ValueError(f"Unsupported mode {args.forw}.")

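For reference, a sketch of the identities the manual (non-ATen) branch of LayerNormBackwardFull implements, writing \hat{x} = (x - \mu)\,\mathrm{rstd} and g = \frac{\partial L}{\partial y} \odot w (or just \frac{\partial L}{\partial y} when there is no weight), with means \overline{\,\cdot\,} taken over the normalized dimensions and sums over the remaining batch dimensions:

    \frac{\partial L}{\partial x} = \mathrm{rstd}\left(g - \overline{g} - \hat{x}\,\overline{g\hat{x}}\right), \qquad
    \frac{\partial L}{\partial w} = \sum_{\text{batch}} \frac{\partial L}{\partial y} \odot \hat{x}, \qquad
    \frac{\partial L}{\partial b} = \sum_{\text{batch}} \frac{\partial L}{\partial y}

These correspond to the dx, dw, and db expressions above; the ATen path computes the same quantities in one fused call.
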
tests/kernel/boo/op_exports/layer_norm_backward_impl_test.py

Lines changed: 77 additions & 0 deletions
@@ -13,6 +13,16 @@
 )


+def _marked_xfail(*args):
+    return pytest.param(
+        *args,
+        marks=pytest.mark.xfail(
+            condition=not torch.cuda.is_available(),
+            reason="Cannot run on GPU with no GPU.",
+        ),
+    )
+
+
 # Note that elementwise_affine and bias flags are grouped together to avoid an
 # invalid combination.
 @pytest.mark.parametrize("dtype", [torch.float32])
@@ -96,3 +106,70 @@ def test_layer_norm_impl(
         print(f"Expected for gradient #{i}: ", args[i].grad)
         print(f"Actual for gradient #{i}: ", grads[i])
     raise RuntimeError(f"Tensor matches: {results}")
+
+
+@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
+@pytest.mark.parametrize("device", ["cpu", _marked_xfail("cuda")])
+@pytest.mark.parametrize("input_shape", [(10, 12, 14, 16), (11, 13, 15)])
+@pytest.mark.parametrize(
+    "elementwise_affine_bias", [(False, False), (True, False), (True, True)]
+)
+def test_layer_norm_combined_impl(
+    input_shape: tuple[int, ...],
+    device: str,
+    dtype: torch.dtype,
+    elementwise_affine_bias: tuple[bool, bool],
+):
+    # Account for ATen weirdness on GPU.
+    if device == "cuda" and dtype == torch.bfloat16:
+        forwarded_args_dtype = torch.float32
+    else:
+        forwarded_args_dtype = dtype
+
+    elementwise_affine, bias = elementwise_affine_bias
+    kwargs = {
+        "input_shape": input_shape,
+        "normalized_shape": input_shape[-1:],
+        "elementwise_affine": elementwise_affine,
+        "bias": bias,
+        "dtype": dtype,
+        "forwarded_args_dtype": forwarded_args_dtype,
+    }
+    fwd_sig = LayerNormSignature(**kwargs)
+    args = fwd_sig.get_sample_args(seed=1)
+
+    args = tuple(
+        arg.to(device=device).requires_grad_(True) if arg is not None else None
+        for arg in args
+    )
+    fwd = fwd_sig.get_nn_module(use_custom=True).to(device=device)
+    bwd_sig = LayerNormSignature(**kwargs, mode=Mode.FULL_BACKWARD)
+    bwd = bwd_sig.get_nn_module(use_custom=True).to(device=device)
+
+    fwd_results = fwd(*args)
+
+    main_result = fwd_results[fwd_sig.main_result_index]
+    main_result.retain_grad()
+    # TODO: this is not a good loss function (#1021).
+    loss = main_result.sum()
+    loss.backward(retain_graph=True)
+
+    bwd_input_args = bwd_sig.arrange_backward_launch_args(args, fwd_results)
+    grads = tuple(x for x in bwd(main_result.grad, *bwd_input_args) if x is not None)
+
+    rtol = 1e-4
+    atol = 1e-4
+    assert len(grads) == len(args)
+    results = [
+        torch.allclose(arg.grad, grad, rtol=rtol, atol=atol)
+        for arg, grad in zip(args, grads)
+    ]
+    if all(results):
+        return
+
+    for i, r in enumerate(results):
+        if r:
+            continue
+        print(f"Expected for gradient #{i}: ", args[i].grad)
+        print(f"Actual for gradient #{i}: ", grads[i])
+    raise RuntimeError(f"Tensor matches: {results}")
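
With a standard pytest setup, the new test can be selected by keyword, e.g. pytest tests/kernel/boo/op_exports/layer_norm_backward_impl_test.py -k combined; per _marked_xfail, the CUDA parametrizations are expected to xfail on machines without a GPU.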
