
Commit 4ad7186

IvyZX authored and Google-ML-Automation committed
Support core axis index in the device_id dict for async copy and semaphore.
PiperOrigin-RevId: 787329175
1 parent 413c976 commit 4ad7186
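The user-facing change: the TensorCore axis created with pltpu.create_tensorcore_mesh can now be named directly in the device_id dict of pltpu.semaphore_signal and pltpu.make_async_remote_copy, either as its own key or folded into a joint (tuple) axis. The fragments below are condensed from the tests added in this commit; the ref and semaphore names are placeholders from those tests, the code assumes a kernel body running under pl.core_map with a 'core' axis and under shard_map with a 'device' mesh axis, and it is not a standalone program.

    # Signal a semaphore on device i, core j (axis names come from the
    # surrounding device mesh and TensorCore mesh in the tests below).
    pltpu.semaphore_signal(
        sem0, 1,
        device_id={'device': i, 'core': j},
        device_id_type=pltpu.DeviceIdType.MESH)

    # Remote copy addressed per axis ...
    copy = pltpu.make_async_remote_copy(
        src_ref=src_ref, dst_ref=dst_vmem_ref,
        send_sem=send_sem, recv_sem=recv_sem,
        device_id={'device': 1, 'core': 1},
        device_id_type=pltpu.DeviceIdType.MESH)

    # ... or via a joint axis: a row-major flat index over ('device', 'core'),
    # with 'core' as the minor axis. For num_cores >= 2, num_cores + 1
    # resolves to device 1, core 1.
    copy = pltpu.make_async_remote_copy(
        src_ref=src_ref, dst_ref=dst_vmem_ref,
        send_sem=send_sem, recv_sem=recv_sem,
        device_id={('device', 'core'): num_cores + 1},
        device_id_type=pltpu.DeviceIdType.MESH)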

4 files changed, +211 −17 lines changed


jax/_src/pallas/mosaic/lowering.py

Lines changed: 38 additions & 17 deletions
@@ -3607,42 +3607,54 @@ def _run_scoped_lowering_rule(ctx: LoweringRuleContext, *consts, jaxpr, collecti
 def _device_id_dict_to_mesh(ctx: LoweringRuleContext, device_id_dict):
   mesh_context = ctx.lowering_context.mesh_context
   assert mesh_context is not None
-  zipped_metadata = zip(mesh_context.axis_names, mesh_context.mesh_shape)
+  mesh_axis_sizes = dict(zip(mesh_context.axis_names, mesh_context.mesh_shape))
+  core_axis_name, grid_names = None, ctx.lowering_context.grid_names
+  if grid_names:
+    if len(grid_names) > 1:
+      raise NotImplementedError(
+          "Unable to determine core axis name if grid_names is more than 1."
+      )
+    mesh_axis_sizes.update(
+        dict(zip(grid_names, ctx.lowering_context.grid_sizes))
+    )
+    core_axis_name = grid_names[0]
   physical_axis_dict = {}
   # Handle joint axes (i.e., one logical axis over >1 physical axes)
   for axis, idx in device_id_dict.items():
     if isinstance(axis, tuple):
-      axis_names, mesh_shape = unzip2(
-          (name, shape) for name, shape in zipped_metadata if name in axis
-      )
-      for axis_index, axis_name in enumerate(axis_names):
-        axis_size = ir_constant(mesh_shape[axis_index])
+      axes_dimensions = [mesh_axis_sizes[name] for name in axis]
+      for axis_index, axis_name in enumerate(axis):
+        axis_size = ir_constant(mesh_axis_sizes[axis_name])
         minor_divisor = ir_constant(
-            np.prod(mesh_shape[axis_index + 1 :], dtype=np.int32)
+            np.prod(axes_dimensions[axis_index + 1 :], dtype=np.int32)
         )
         device_idx = arith.remsi(arith.divsi(idx, minor_divisor), axis_size)
         physical_axis_dict[axis_name] = device_idx
     else:
       physical_axis_dict[axis] = idx
+  core_index = None
+  if core_axis_name and core_axis_name in physical_axis_dict:
+    core_index = physical_axis_dict.pop(grid_names[0])
   device_id = []
   for axis in mesh_context.axis_names:
     if axis in physical_axis_dict:
       device_id.append(physical_axis_dict[axis])
     else:
       device_id.append(_axis_index_rule(ctx, axis_name=axis))
-  return tuple(device_id)
+  return tuple(device_id), core_index
 
 
 def _device_id_to_logical(
     ctx: LoweringRuleContext, device_id,
     device_id_type: primitives.DeviceIdType):
+  core_index = None
   if isinstance(device_id, dict):
     if device_id_type is not primitives.DeviceIdType.MESH:
       raise ValueError(
           "`device_id_type` must be MESH if `device_id` is a dict,"
           f" got: {device_id_type = }."
       )
-    device_id = _device_id_dict_to_mesh(ctx, device_id)
+    device_id, core_index = _device_id_dict_to_mesh(ctx, device_id)
   if device_id_type is primitives.DeviceIdType.MESH:
     assert (mesh_context := ctx.lowering_context.mesh_context)
     # Mesh means we are passed the mesh coordinates for the device
@@ -3651,16 +3663,16 @@ def _device_id_to_logical(
 
     i32 = ir.IntegerType.get_signless(32)
     if len(device_ids) == 0:
-      return arith.constant(i32, 0)
+      return arith.constant(i32, 0), core_index
     return functools.reduce(
         arith.addi,
         (
             arith.muli(a, arith.constant(i32, b))
             for a, b in zip(device_ids, mesh_strides)
         ),
-    )
+    ), core_index
   elif device_id_type is primitives.DeviceIdType.LOGICAL:
-    return device_id
+    return device_id, None
   raise NotImplementedError(f"Unsupported device id type: {device_id_type}")
 
 
@@ -3704,7 +3716,13 @@ def _semaphore_signal_lowering_rule(
   )
   sem, _ = _transform_ref(sem, sem_aval.dtype, sem_aval.shape, transforms)
   if device_id is not None:
-    device_id = _device_id_to_logical(ctx, device_id, device_id_type)
+    device_id, core_id = _device_id_to_logical(ctx, device_id, device_id_type)
+    if core_id is not None:
+      if core_index is not None:
+        raise ValueError(
+            "Cannot specify both `core_index` and the core axis in `device_id`."
+        )
+      core_index = core_id
   tpu.sem_signal(sem, value, device_id=device_id, core_id=core_index)
   return []
 
@@ -3757,8 +3775,9 @@ def _dma_start_lowering_rule(
       dst_ref, dst_ref_aval.dtype, dst_ref_block_shape, dst_transforms
   )
   sem, _ = _transform_ref(sem, sem_aval.dtype, sem_aval.shape, sem_transforms)
+  core_id = None
   if device_id is not None:
-    device_id = _device_id_to_logical(ctx, device_id, device_id_type)
+    device_id, core_id = _device_id_to_logical(ctx, device_id, device_id_type)
   priority_kwarg = {"priority": priority}
   if jaxlib_version < (0, 5, 4):
     priority_kwarg = {}
@@ -3768,6 +3787,7 @@ def _dma_start_lowering_rule(
       sem,
       source_semaphore=src_sem,
      device_id=device_id,
+      core_id=core_id,
       **priority_kwarg,
   )
   return []
@@ -3796,13 +3816,14 @@ def _dma_wait_lowering_rule(ctx: LoweringRuleContext, *args, tree,
   dst, _ = _transform_ref(dst, dst_aval.dtype, ref_block_shape, transforms)
   sem, _ = _transform_ref(sem, sem_aval.dtype, sem_aval.shape, sem_transforms)
 
+  core_id = None
   if device_id is not None:
-    device_id = _device_id_to_logical(ctx, device_id, device_id_type)
+    device_id, core_id = _device_id_to_logical(ctx, device_id, device_id_type)
 
   if ctx.forward_compatible or is_cloud_tpu_older_than(2025, 7, 27):
-    tpu.wait_dma2(sem, src, dst)
+    tpu.wait_dma2(sem, src, dst, core_id=core_id)
   else:
-    tpu.wait_dma2(sem, src, dst, device_id=device_id)
+    tpu.wait_dma2(sem, src, dst, device_id=device_id, core_id=core_id)
   return []
 
 
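As an aside, the resolution that _device_id_dict_to_mesh now performs can be modeled in plain Python: joint (tuple) axes are decomposed as a row-major flat index over the named axis sizes, and the core axis, when it names a grid axis, is popped out of the dict and returned separately. The sketch below is illustrative only, with the MLIR arith.divsi/remsi ops replaced by Python // and %, an illustrative axis_sizes mapping, and without the real code's fallback to the caller's own index for axes missing from the dict.

    import numpy as np

    def device_id_dict_to_mesh(device_id_dict, axis_sizes, core_axis_name):
      # axis_sizes maps every addressable axis (device mesh axes plus the core
      # grid axis) to its size; names and values here are illustrative.
      physical = {}
      for axis, idx in device_id_dict.items():
        if isinstance(axis, tuple):  # joint axis: decompose a row-major flat index
          dims = [axis_sizes[name] for name in axis]
          for i, name in enumerate(axis):
            minor_divisor = int(np.prod(dims[i + 1:], dtype=np.int32))
            physical[name] = (idx // minor_divisor) % dims[i]
        else:
          physical[axis] = idx
      core_index = physical.pop(core_axis_name, None)  # split the core axis out
      mesh_axes = [a for a in axis_sizes if a != core_axis_name]
      # Axes left unspecified stay None here; the real lowering fills in the
      # caller's own axis index instead.
      return tuple(physical.get(a) for a in mesh_axes), core_index

    sizes = {'device': 2, 'core': 2}
    print(device_id_dict_to_mesh({('device', 'core'): 3}, sizes, 'core'))
    # ((1,), 1): flat index 3 over a 2x2 (device, core) grid is device 1, core 1.
    print(device_id_dict_to_mesh({'device': 0, 'core': 1}, sizes, 'core'))
    # ((0,), 1)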

tests/pallas/BUILD

Lines changed: 1 addition & 0 deletions
@@ -523,6 +523,7 @@ jax_multiplatform_test(
     enable_configs = [
         "tpu_v5e_x8",
         "tpu_v5p",
+        "tpu_v5p_x4",
     ],
     deps = [
         "//jax:pallas_tpu",

tests/pallas/tpu_pallas_async_test.py

Lines changed: 78 additions & 0 deletions
@@ -454,6 +454,84 @@ def _():
     np.testing.assert_array_equal(pallas_out[:xlocal],
                                   pallas_out[xlocal:(2*xlocal)])
 
+  def test_axis_dict_with_core_single_device(self):
+    if jax.device_count() > 2 or (jax.devices()[0].num_cores) != 2:
+      self.skipTest('Testing single device two cores')
+    mesh = jax.make_mesh((jax.device_count(),), ('device',))
+    ddim = jax.device_count()
+    tcmesh = pltpu.create_tensorcore_mesh('core')
+    pspec = P('device', None)
+    sharding = jax.sharding.NamedSharding(mesh, pspec)
+
+    # Array is fully sharded.
+    xlocal, ylocal = 8, 256
+    input_arr = jnp.arange(xlocal * ddim * ylocal, dtype=jnp.int32).reshape(
+        (xlocal * ddim, ylocal)
+    )
+    input_arr = jax.device_put(input_arr, sharding)
+
+    def core_copy(refs):
+      in_ref, out_ref = refs
+
+      @pl.core_map(tcmesh, compiler_params=pltpu.CompilerParams(collective_id=7))
+      def _():
+        num_cores = jax.lax.axis_size('core')
+        slc_size = ylocal // num_cores
+        vmem_shape = (xlocal, slc_size)
+
+        # This runs on every core, for every vmem iterations
+        def alloc(out_vmem_ref, sem, send_sem, recv_sem):
+          core_index = jax.lax.axis_index('core')
+          slc = pl.ds(core_index * slc_size, slc_size)
+
+          # Make sure all cores have entered run_scoped.
+          sem0 = pltpu.get_barrier_semaphore()
+          for i in range(ddim):
+            for j in range(num_cores):
+              pltpu.semaphore_signal(
+                  sem0, 1, device_id={'device': i, 'core': j},
+                  device_id_type=pltpu.DeviceIdType.MESH)
+          pltpu.semaphore_wait(sem0, ddim * num_cores)
+
+          # Identity function by default
+          pltpu.async_copy(in_ref.at[:, slc], out_ref.at[:, slc], sem).wait()
+
+          copy_c0_to_c1 = pltpu.make_async_remote_copy(
+              src_ref=in_ref.at[:, slc],
+              dst_ref=out_vmem_ref,
+              send_sem=send_sem,
+              recv_sem=recv_sem,
+              device_id={'core': 1},
+              device_id_type=pltpu.DeviceIdType.MESH,
+          )
+
+          @pl.when(core_index == 0)
+          def _():
+            copy_c0_to_c1.start()
+            copy_c0_to_c1.wait_send()
+
+          @pl.when(core_index == 1)
+          def _():
+            copy_c0_to_c1.wait_recv()
+            pltpu.async_copy(out_vmem_ref, out_ref.at[:, slc], sem).wait()
+
+        pl.run_scoped(
+            alloc,
+            pltpu.VMEM(vmem_shape, out_ref.dtype),
+            *([pltpu.SemaphoreType.DMA] * 3),
+        )
+
+    @partial(jax.shard_map, mesh=mesh, in_specs=pspec, out_specs=pspec, check_vma=False)
+    def run_core_kernel(input):
+      output = jnp.zeros_like(input)
+      _, output = pl.run_state(core_copy)((input, output))
+      return output
+    pallas_out = jax.jit(run_core_kernel)(input_arr)
+
+    # The device=0 core=1 slice was flushed with device=0 core=0 contents
+    np.testing.assert_array_equal(pallas_out[:, 128:], input_arr[:, :128])
+    np.testing.assert_array_equal(pallas_out[:, :128], input_arr[:, :128])
+
 
 def make_async_remote_copy(axis_name: str, direction: str = 'right',
                            target_memory_space=None):

tests/pallas/tpu_pallas_distributed_test.py

Lines changed: 94 additions & 0 deletions
@@ -326,6 +326,100 @@ def body(x):
     )(x)
     np.testing.assert_allclose(y, x)
 
+  @parameterized.product(joint_axis=[True, False])
+  def test_axis_dict_with_core_multi_device(self, joint_axis):
+    if jax.device_count() < 2:
+      self.skipTest('Requires at least 2 devices for DMAs.')
+    if (cdim := jax.devices()[0].num_cores) < 2:
+      self.skipTest('Requires a TPU with at least 2 cores.')
+    mesh = jax.make_mesh((jax.device_count(),), ('device',))
+    ddim = jax.device_count()
+    tcmesh = pltpu.create_tensorcore_mesh('core')
+    pspec = P('device', None)
+    sharding = jax.sharding.NamedSharding(mesh, pspec)
+
+    # Array is fully sharded.
+    xlocal, ylocal = 8, 256
+    input_arr = jnp.arange(xlocal * ddim * ylocal, dtype=jnp.int32).reshape(
+        (xlocal * ddim, ylocal)
+    )
+    input_arr = jax.device_put(input_arr, sharding)
+
+    def core_copy(refs):
+      in_ref, out_ref = refs
+
+      @pl.core_map(tcmesh, compiler_params=pltpu.CompilerParams(collective_id=7))
+      def _():
+        num_cores = jax.lax.axis_size('core')
+        slc_size = ylocal // num_cores
+        vmem_shape = (xlocal, slc_size)
+
+        # This runs on every core, for every vmem iterations
+        def alloc(out_vmem_ref, sem, send_sem, recv_sem):
+          core_index = jax.lax.axis_index('core')
+          device_index = jax.lax.axis_index('device')
+          slc = pl.ds(core_index * slc_size, slc_size)
+
+          # Make sure all cores have entered run_scoped.
+          sem0 = pltpu.get_barrier_semaphore()
+          for i in range(ddim):
+            for j in range(num_cores):
+              pltpu.semaphore_signal(
+                  sem0, 1, device_id={'device': i, 'core': j},
+                  device_id_type=pltpu.DeviceIdType.MESH)
+          pltpu.semaphore_wait(sem0, ddim * num_cores)
+
+          # Identity function by default
+          pltpu.async_copy(in_ref.at[:, slc], out_ref.at[:, slc], sem).wait()
+
+          if joint_axis:
+            device_id = {('device', 'core'): cdim + 1}
+          else:
+            device_id = {'device': 1, 'core': 1}
+          copy_d0c0_to_d1c1 = pltpu.make_async_remote_copy(
+              src_ref=in_ref.at[:, slc],
+              dst_ref=out_vmem_ref,
+              send_sem=send_sem,
+              recv_sem=recv_sem,
+              device_id=device_id,
+              device_id_type=pltpu.DeviceIdType.MESH,
+          )
+
+          @pl.when(device_index == 0)
+          def _():
+            @pl.when(core_index == 0)
+            def _():
+              copy_d0c0_to_d1c1.start()
+              copy_d0c0_to_d1c1.wait_send()
+
+          @pl.when(device_index == 1)
+          def _():
+            @pl.when(core_index == 1)
+            def _():
+              copy_d0c0_to_d1c1.wait_recv()
+              pltpu.async_copy(out_vmem_ref, out_ref.at[:, slc], sem).wait()
+
+        pl.run_scoped(
+            alloc,
+            pltpu.VMEM(vmem_shape, out_ref.dtype),
+            *([pltpu.SemaphoreType.DMA] * 3),
+        )
+
+    @partial(jax.shard_map, mesh=mesh, in_specs=pspec, out_specs=pspec, check_vma=False)
+    def run_core_kernel(input):
+      output = jnp.zeros_like(input)
+      _, output = pl.run_state(core_copy)((input, output))
+      return output
+    pallas_out = jax.jit(run_core_kernel)(input_arr)
+
+    # The device=1 core=1 slice was flushed with device=0 core=0 contents
+    np.testing.assert_array_equal(pallas_out[8:16, 128:], input_arr[:8, :128])
+    # Mask that slice out and all should be the same.
+    mask = jnp.zeros((8, 128), jnp.int32)
+    masked_in = jax.lax.dynamic_update_slice(input_arr, mask, (8, 128))
+    masked_out = jax.lax.dynamic_update_slice(pallas_out, mask, (8, 128))
+    np.testing.assert_array_equal(masked_in, masked_out)
+
 
 class PallasCallRemoteDMAInterpretTest(parameterized.TestCase):
 
