charmplusplus · ZwFink · Dec 6, 2021 · Dec 21, 2021 · Dec 21, 2021 · Dec 21, 2021
diff --git a/examples/cannon/README.md b/examples/cannon/README.md
@@ -0,0 +1,49 @@
+# Cannon's Matrix Multiplication Example
+
+This example demonstrates an implementation of Cannon's Matrix Multiplication algorithm using Charm4py. It showcases several features of Charm4py, including 2-dimensional chare arrays, Channels, Reductions, and Futures. The example also utilizes Numba to accelerate computation.
+
+## What it does
+
+Cannon's algorithm is a distributed algorithm for matrix multiplication. It divides the input matrices into sub-matrices and distributes them among a 2D grid of processors (or chares, in this case). The algorithm then performs a series of local multiplications and data shifts to compute the final result.
+
+Key features demonstrated:
+
+1. Use of Numba for accelerated matrix multiplication
+2. 2-dimensional chare arrays for distributed computation
+3. Channels for efficient communication between chares
+4. Reductions and Futures for synchronization
+
+## How to run
+
+1. Ensure Numba is installed for improved performance (optional):
+   ```
+   pip install numba
+   ```
+
+2. Run the example with the following command:
+   ```
+   python3 -m charmrun.start +pN cannon.py <matrix_dim> <chare_dim>
+   ```
+   Where:
+   - `N` is the number of PEs
+   - `<matrix_dim>` is the dimension of the square input matrices (each matrix is `<matrix_dim>` x `<matrix_dim>`)
+   - `<chare_dim>` is the dimension of the square chare grid (the grid holds `<chare_dim>` x `<chare_dim>` chares)
+   - `<matrix_dim>` must be divisible by `<chare_dim>`
+
+   For example:
+   ```
+   python3 -m charmrun.start +p4 cannon.py 1000 10
+   ```
+   This will multiply two 1000x1000 matrices using a 10x10 grid of chares.
+
+3. The program will output the size of each chare's sub-array and the total execution time.
+
+## Requirements
+
+- Charm4py (assumed to be installed)
+- Numpy
+- Numba (optional, for improved performance)
+
+## Note
+
+This example is designed to showcase Charm4py features and distributed computing concepts. For production use, consider using optimized libraries like ScaLAPACK for large-scale matrix operations.
diff --git a/examples/cannon/cannon.py b/examples/cannon/cannon.py
@@ -0,0 +1,128 @@
+from charm4py import charm, Chare, Array, Future, coro, Channel
+import time
+import numpy as np
+
+try:
+    from numba import njit
+except ImportError:
+    # Numba is optional: when it cannot be imported, substitute a stand-in
+    # for numba.njit that returns the decorated function unchanged, so the
+    # kernel below simply runs as ordinary Python/NumPy code.
+    def njit(func):
+        return func
+
+@njit
+def matmul(C, A, B):
+    C += A @ B
+
+class SubMatrix(Chare):
+    def __init__(self, subdim_size, charedim, init_done):
+        super().__init__()
+        self.subdim_size = subdim_size
+        self.charedim = charedim
+
+        self.neighbor_cache = {}
+
+        self.sub_a = np.ones((subdim_size, subdim_size), dtype=np.float64)
+        self.sub_a[:,:] = (charedim*self.thisIndex[1]) + self.thisIndex[0]
+        self.sub_b = np.ones((subdim_size, subdim_size), dtype=np.float64)
+        self.sub_b[:,:] = (charedim*self.thisIndex[0]) + self.thisIndex[1]
+
+        self.recv_a = np.ndarray((subdim_size,subdim_size), dtype=np.float64)
+        self.recv_b = np.ndarray((subdim_size,subdim_size), dtype=np.float64)
+
+        self.sub_c = np.zeros((subdim_size, subdim_size), dtype=np.float64)
+
+        warmup_c = np.zeros((subdim_size, subdim_size), dtype=np.float64)
+
+        # ensure the kernel is compiled
+        matmul(warmup_c, self.sub_a, self.sub_b)
+
+        self.reduce(init_done)
+
+    def get_neighbor_channel(self, target_idx):
+        if target_idx not in self.neighbor_cache:
+            self.neighbor_cache[target_idx] = Channel(self,
+                                                      self.thisProxy[target_idx]
+                                                      )
+        return self.neighbor_cache[target_idx]
+
+    @coro
+    def cannons_multiplication(self, mult_done_future):
+        # do initial shift
+        # left-shift
+        if self.thisIndex[0] > 0:
+            self.shift(0, self.thisIndex[0])
+            self.sub_a, self.recv_a = self.recv_a, self.sub_a
+
+        # up-shift
+        if self.thisIndex[1] > 0:
+            self.shift(self.thisIndex[1], 0)
+            self.sub_b, self.recv_b = self.recv_b, self.sub_b
+
+        # todo multiplication kernel, will be interesting to see how they compare
+        matmul(self.sub_c, self.sub_a, self.sub_b)
+
+        for iter in range(self.charedim - 1):
+            self.shift(0, 1)
+            self.shift(1, 0)
+
+            self.sub_a, self.recv_a = self.recv_a, self.sub_a
+            self.sub_b, self.recv_b = self.recv_b, self.sub_b
+
+            matmul(self.sub_c, self.sub_a, self.sub_b)
+
+        self.reduce(mult_done_future)
+
+    # the communication routines should be optimized so both sends/receives can complete in parallel
+    def shift(self, up_shift, left_shift):
+        send_target_idx = ((self.thisIndex[0] - up_shift) % self.charedim,
+                           (self.thisIndex[1] - left_shift) % self.charedim
+                           )
+        recv_target_idx = ((self.thisIndex[0] + up_shift) % self.charedim,
+                           (self.thisIndex[1] + left_shift) % self.charedim
+                           )
+
+        send_ch = self.get_neighbor_channel(send_target_idx)
+        recv_ch = self.get_neighbor_channel(recv_target_idx)
+
+        if left_shift:
+            send_ch.send(self.sub_a)
+            self.recv_a = recv_ch.recv()
+        if up_shift:
+            send_ch.send(self.sub_b)
+            self.recv_b = recv_ch.recv()
+
+
+def main(args):
+    if len(args) < 3:
+        print(f"USAGE: {args[0]} matrix_dim chare_dim")
+        print("matrix_dim and chare_dim must be perfect squares "
+              "where matrix_dim is divisible by chare_dim"
+              )
+        charm.exit(1)
+    matrix_dim = int(args[1])
+    chare_dim = int(args[2])
+
+    if matrix_dim % chare_dim:
+        print("ERROR: Matrix dim must evenly divide chare dim.")
+        charm.exit(1)
+
+    # size of each chare's sub-matrix
+    subdim_size = matrix_dim // chare_dim
+    print(f"Size of each chare's sub-array: {8*(subdim_size**2)/(1024**2)}MiB")
+
+    init_done = Future()
+    chares = Array(SubMatrix, (chare_dim, chare_dim),
+                   args=[subdim_size, chare_dim, init_done]
+                   )
+    init_done.get()
+
+    mult_done_future = Future()
+    tstart = time.time()
+    chares.cannons_multiplication(mult_done_future)
+    mult_done_future.get()
+    tend = time.time()
+
+    print(f"Elapsed time: {tend-tstart}")
+    charm.exit()
+
+charm.start(main)