r"""
Distributed SUMMA Matrix Multiplication
=======================================
This example shows how to use the :py:class:`pylops_mpi.basicoperators._MPISummaMatrixMult`
operator to perform matrix-matrix multiplication between a matrix :math:`\mathbf{A}`
distributed in 2D blocks across a square process grid and matrices :math:`\mathbf{X}`
and :math:`\mathbf{Y}` distributed in 2D blocks across the same grid. Similarly,
import math
import numpy as np
from mpi4py import MPI
from matplotlib import pyplot as plt

import pylops_mpi
from pylops import Conj
from pylops_mpi.basicoperators.MatrixMult import (local_block_split, MPIMatrixMult, active_grid_comm)

plt.close("all")

###############################################################################
# We set the seed such that all processes can create the input matrices filled
# with the same random numbers. In practical applications, such matrices will
# be filled with data appropriate for the use case.
np.random.seed(42)

N, M, K = 6, 6, 6
A_shape, x_shape, y_shape = (N, K), (K, M), (N, M)


base_comm = MPI.COMM_WORLD
comm, rank, row_id, col_id, is_active = active_grid_comm(base_comm, N, M)
print(f"Process {base_comm.Get_rank()} is {'active' if is_active else 'inactive'}")

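###############################################################################
# Note (an assumption about intended usage rather than something shown above):
# ranks flagged as inactive by ``active_grid_comm`` take no part in the
# computation that follows, so a production script might simply let them skip
# the remainder of the example, e.g. along the lines of::
#
#     if not is_active:
#         raise SystemExit(0)
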
###############################################################################
# We are now ready to create the input matrices for our distributed matrix
# multiplication example. We need to set up:
#
# - Matrix :math:`\mathbf{A}` of size :math:`N \times K` (the left operand)
# - Matrix :math:`\mathbf{X}` of size :math:`K \times M` (the right operand)
# - The result will be :math:`\mathbf{Y} = \mathbf{A} \mathbf{X}` of size :math:`N \times M`
#
# For distributed computation, we arrange processes in a square grid of size
# :math:`P' \times P'` where :math:`P' = \sqrt{P}` and :math:`P` is the total
# number of MPI processes. Each process will own a block of each matrix
# according to this 2D grid layout.

p_prime = math.isqrt(comm.Get_size())
print(f"Process grid: {p_prime} x {p_prime} = {comm.Get_size()} processes")

# Create global test matrices with sequential values for easy verification:
# Matrix A: each element :math:`A_{i,j} = i \cdot K + j` (row-major ordering)
# Matrix X: each element :math:`X_{i,j} = i \cdot M + j`
A_data = np.arange(int(A_shape[0] * A_shape[1])).reshape(A_shape)
x_data = np.arange(int(x_shape[0] * x_shape[1])).reshape(x_shape)

print(f"Global matrix A shape: {A_shape} (N={A_shape[0]}, K={A_shape[1]})")
print(f"Global matrix X shape: {x_shape} (K={x_shape[0]}, M={x_shape[1]})")
print(f"Expected global result Y shape: ({A_shape[0]}, {x_shape[1]}) = (N, M)")

###############################################################################
# Determine which block of each matrix this process should own. The 2D block
# distribution ensures that:
#
# - the process at grid position :math:`(i,j)` gets block
#   :math:`\mathbf{A}[i_{start}:i_{end}, j_{start}:j_{end}]`
# - block sizes are approximately :math:`\lceil N/P' \rceil \times \lceil K/P' \rceil`,
#   with edge processes handling the remainder
#
# .. raw:: html
#
#    <div style="text-align: left; font-family: monospace; white-space: pre;">
#    <b>Example: 2x2 Process Grid with 6x6 Matrices</b>
#
#    Matrix A (6x6):                Matrix X (6x6):
#    ┌───────────┬───────────┐      ┌───────────┬───────────┐
#    │  0  1  2  │  3  4  5  │      │  0  1  2  │  3  4  5  │
#    │  6  7  8  │  9 10 11  │      │  6  7  8  │  9 10 11  │
#    │ 12 13 14  │ 15 16 17  │      │ 12 13 14  │ 15 16 17  │
#    ├───────────┼───────────┤      ├───────────┼───────────┤
#    │ 18 19 20  │ 21 22 23  │      │ 18 19 20  │ 21 22 23  │
#    │ 24 25 26  │ 27 28 29  │      │ 24 25 26  │ 27 28 29  │
#    │ 30 31 32  │ 33 34 35  │      │ 30 31 32  │ 33 34 35  │
#    └───────────┴───────────┘      └───────────┴───────────┘
#
#    Process (0,0): A[0:3, 0:3], X[0:3, 0:3]
#    Process (0,1): A[0:3, 3:6], X[0:3, 3:6]
#    Process (1,0): A[3:6, 0:3], X[3:6, 0:3]
#    Process (1,1): A[3:6, 3:6], X[3:6, 3:6]
#    </div>
#

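###############################################################################
# As a rough illustration of the block sizes quoted above, here is a sketch of
# how such slices could be computed with a simple ceil-division layout. This is
# only an illustration: the slices actually used below come from
# :py:func:`local_block_split`, whose implementation may assign blocks and
# remainders differently.
def _block_slice_sketch(global_shape, grid_row, grid_col, p_prime):
    """Hypothetical helper: slices of the (grid_row, grid_col) block."""
    nr = math.ceil(global_shape[0] / p_prime)  # nominal block height
    nc = math.ceil(global_shape[1] / p_prime)  # nominal block width
    return (slice(grid_row * nr, min((grid_row + 1) * nr, global_shape[0])),
            slice(grid_col * nc, min((grid_col + 1) * nc, global_shape[1])))

print(f"Process {rank} (sketch): A block {_block_slice_sketch(A_shape, row_id, col_id, p_prime)}")
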
A_slice = local_block_split(A_shape, rank, comm)
x_slice = local_block_split(x_shape, rank, comm)

###############################################################################
# Extract the local portion of each matrix for this process
A_local = A_data[A_slice]
x_local = x_data[x_slice]

print(f"Process {rank}: A_local shape {A_local.shape}, X_local shape {x_local.shape}")
print(f"Process {rank}: A slice {A_slice}, X slice {x_slice}")

x_dist = pylops_mpi.DistributedArray(global_shape=(K * M),
                                     local_shapes=comm.allgather(x_local.shape[0] * x_local.shape[1]),
                                     base_comm=comm,
                                     partition=pylops_mpi.Partition.SCATTER,
                                     dtype=x_local.dtype)
x_dist[:] = x_local.flatten()

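# As a quick sanity check, the local block sizes gathered across the active
# ranks should add up to the :math:`K \times M` elements of the global matrix
# :math:`\mathbf{X}`.
assert sum(comm.allgather(x_local.size)) == K * M
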
###############################################################################
# We are now ready to create the SUMMA :py:class:`pylops_mpi.basicoperators.MPIMatrixMult`
# operator. Given that we chose a block-block distribution of the data, we use
# the SUMMA variant of the operator (``kind="summa"``).
Aop = MPIMatrixMult(A_local, M, base_comm=comm, kind="summa", dtype=A_local.dtype)

###############################################################################
# We can now apply the forward pass :math:`\mathbf{y} = \mathbf{Ax}` (which
# effectively implements a distributed matrix-matrix multiplication
# :math:`\mathbf{Y} = \mathbf{AX}`). Note that :math:`\mathbf{Y}` is distributed
# in the same way as the input :math:`\mathbf{X}`, in a block-block fashion.
y_dist = Aop @ x_dist

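###############################################################################
# Since every rank holds the full ``A_data`` and ``x_data`` in this toy
# example, we can sketch a local check of the forward result. Assuming the
# local block of ``y_dist`` corresponds to the ``(row_id, col_id)`` block of
# the global product (as stated above), it should match the same block of the
# densely computed :math:`\mathbf{A}\mathbf{X}`.
y_slice = local_block_split(y_shape, rank, comm)
y_expected_local = (A_data @ x_data)[y_slice]
print(f"Process {rank}: forward block matches dense product:",
      np.allclose(y_dist.local_array.reshape(y_expected_local.shape), y_expected_local))
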
###############################################################################
# Next we apply the adjoint pass :math:`\mathbf{x}_{adj} = \mathbf{A}^H \mathbf{y}`
# (which effectively implements a distributed SUMMA matrix-matrix multiplication
# :math:`\mathbf{X}_{adj} = \mathbf{A}^H \mathbf{Y}`). Note that
# :math:`\mathbf{X}_{adj}` is again distributed in the same way as the input
# :math:`\mathbf{X}`, in a block-block fashion.
xadj_dist = Aop.H @ y_dist

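###############################################################################
# Similarly, a sketched local check for the adjoint: assuming ``xadj_dist`` is
# distributed like :math:`\mathbf{X}` (as stated above), its local block should
# match the corresponding block of the densely computed
# :math:`\mathbf{A}^H (\mathbf{A}\mathbf{X})`.
xadj_expected_local = (A_data.conj().T @ (A_data @ x_data))[x_slice]
print(f"Process {rank}: adjoint block matches dense result:",
      np.allclose(xadj_dist.local_array.reshape(xadj_expected_local.shape), xadj_expected_local))
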
###############################################################################
# Finally, we show that the SUMMA :py:class:`pylops_mpi.basicoperators.MPIMatrixMult`
# operator can be combined with any other PyLops-MPI operator. We are going to
# apply here a conjugate operator to the output of the matrix multiplication.
Dop = Conj(dims=(A_local.shape[0], x_local.shape[1]))
DBop = pylops_mpi.MPIBlockDiag(ops=[Dop, ])
Op = DBop @ Aop
y1 = Op @ x_dist
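
###############################################################################
# As a final sketched check: ``Conj`` simply conjugates its input, so the local
# array of ``y1`` is expected to equal the conjugate of the local array of
# ``y_dist`` computed earlier.
print(f"Process {rank}: y1 == conj(y):",
      np.allclose(y1.local_array, np.conj(y_dist.local_array)))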