Replace npu_sync with dma_wait in programming examples (#1791)
hunhoffe authored Sep 25, 2024
1 parent 1cd4c27 commit 9cf8c0e
Showing 46 changed files with 390 additions and 398 deletions.
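Every file in this commit follows the same pattern: runtime sequences used to refer to object FIFOs by their string names in npu_dma_memcpy_nd and then poll a shim DMA channel with npu_sync; they now pass the object_fifo handles themselves and block on them with dma_wait. A minimal before/after sketch of that pattern (not copied from any single file; of_in, of_out, A, C, and N are illustrative placeholders, and the calls assume the star import of aie.dialects.aiex that these examples typically use):

# Before: FIFOs named by string, completion polled on a fixed shim channel.
npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
npu_sync(column=0, row=0, direction=0, channel=0)

# After: FIFO handles passed directly, completion tied to those FIFOs' DMAs.
npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
dma_wait(of_out)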
7 changes: 4 additions & 3 deletions programming_examples/basic/dma_transpose/aie2.py
@@ -52,17 +52,18 @@ def core_body():

@runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):
npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
# The strides below are configured to read across all rows in the same column
# Stride of K in dim/wrap 2 skips an entire row to read a full column
npu_dma_memcpy_nd(
metadata="in",
metadata=of_in,
bd_id=1,
mem=A,
sizes=[1, K, M, 1],
strides=[1, 1, K, 1],
issue_token=True,
)
npu_sync(column=0, row=0, direction=0, channel=0)
npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
dma_wait(of_in, of_out)

print(ctx.module)

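The stride comment in the hunk above can be sanity-checked on the host. Assuming a 4-byte element type (so element indices coincide with the 4-byte units npu_dma_memcpy_nd works in), sizes=[1, K, M, 1] with strides=[1, 1, K, 1] steps down a column (stride K) in the inner wrap and advances to the next column (stride 1) in the wrap outside it, which reads a row-major M x K matrix in transposed order. An illustrative NumPy check, not part of the commit:

import numpy as np

M, K = 4, 6
A = np.arange(M * K, dtype=np.int32)  # flattened row-major M x K matrix

# Emulate the DMA loop nest: outer wrap of size K with stride 1 (next column),
# inner wrap of size M with stride K (next row within that column).
dma_order = [col + row * K for col in range(K) for row in range(M)]

assert (A[dma_order] == A.reshape(M, K).T.flatten()).all()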
@@ -344,7 +344,7 @@ def sequence(A, B, C):
C_col_offset = col * n
C_offset = C_col_offset + C_row_offset
npu_dma_memcpy_nd(
metadata=C_l2l3_fifos[col].sym_name.value,
metadata=C_l2l3_fifos[col],
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_offset],
Expand All @@ -357,7 +357,7 @@ def sequence(A, B, C):
A_offset = A_block_offset + A_row_offset
B_col_offset = col * n
npu_dma_memcpy_nd(
metadata=A_l3l2_fifos[col].sym_name.value,
metadata=A_l3l2_fifos[col],
bd_id=2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_offset],
Expand All @@ -370,7 +370,7 @@ def sequence(A, B, C):
strides=[0, k * n_aie_rows, K, 1],
)
npu_dma_memcpy_nd(
metadata=B_l3l2_fifos[col].sym_name.value,
metadata=B_l3l2_fifos[col],
bd_id=2 * tile_row + 2,
mem=B,
offsets=[0, 0, 0, B_col_offset],
Expand All @@ -382,8 +382,7 @@ def sequence(A, B, C):
],
strides=[n * n_aie_cols, k * n_aie_rows * N, N, 1],
)
for col in range(n_aie_cols):
npu_sync(column=col, row=0, direction=0, channel=0)
dma_wait(*C_l2l3_fifos)


if __name__ == "__main__":
115 changes: 40 additions & 75 deletions programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
@@ -81,71 +81,53 @@ def device_body():
ComputeTile2 = tile(2, 2)
ComputeTile3 = tile(3, 2)
cores = [ComputeTile0, ComputeTile1, ComputeTile2, ComputeTile3]
memA_fifo_names = ["memA0", "memA1", "memA2", "memA3"]
memA_fifos = {}
inA_fifo_names = ["inA0", "inA1", "inA2", "inA3"]
inA_fifos = {}
inB_fifo_names = ["inB"]
inB_fifos = {}
outC_fifo_names = ["outC0", "outC1", "outC2", "outC3"]
outC_fifos = {}
memA_fifos = []
inA_fifos = []
outC_fifos = []

# AIE-array data movement with object fifos
# Input A
for i in range(n_cores):
memA_fifos[memA_fifo_names[i]] = object_fifo(
memA_fifo_names[i],
ShimTiles[i],
MemTiles[i],
2,
memRef_inA_ty,
memA_fifos.append(
object_fifo(f"memA{i}", ShimTiles[i], MemTiles[i], 2, memRef_inA_ty)
)
inA_fifos[inA_fifo_names[i]] = object_fifo(
inA_fifo_names[i],
MemTiles[i],
cores[i],
2,
memRef_A_ty,
(
[
(k // 2 // 2, 2),
(m, k),
(2, 1),
]
if vectorized
else []
), # transpose at 4-byte (2xbf16) granularity
inA_fifos.append(
object_fifo(
f"inA{i}",
MemTiles[i],
cores[i],
2,
memRef_A_ty,
(
[
(k // 2 // 2, 2),
(m, k),
(2, 1),
]
if vectorized
else []
), # transpose at 4-byte (2xbf16) granularity
)
)
object_fifo_link(
memA_fifos[memA_fifo_names[i]], inA_fifos[inA_fifo_names[i]]
object_fifo_link(memA_fifos[i], inA_fifos[i])

# Output C
outC_fifos.append(
object_fifo(f"outC{i}", cores[i], ShimTiles[i], 2, memRef_outC_ty)
)

# Input B
inB_fifos[inB_fifo_names[0]] = object_fifo(
inB_fifo_names[0],
ShimTiles[1 % n_cores],
cores[0:n_cores],
2,
memRef_inB_ty,
inB_fifo = object_fifo(
"inB", ShimTiles[1 % n_cores], cores[0:n_cores], 2, memRef_inB_ty
)

# Output C
for i in range(n_cores):
outC_fifos[outC_fifo_names[i]] = object_fifo(
outC_fifo_names[i],
cores[i],
ShimTiles[i],
2,
memRef_outC_ty,
)

# Set up compute tiles
for i in range(n_cores):
# Compute tile i
@core(cores[i], f"mv_{m}x{k}.o")
def core_body():
for _ in range_(0xFFFFFFFF):
elem_out = outC_fifos[outC_fifo_names[i]].acquire(
elem_out = outC_fifos[i].acquire(
ObjectFifoPort.Produce,
1,
)
Expand All @@ -155,31 +137,16 @@ def core_body():
call(zero_scalar, [elem_out])

for _ in range_(K_div_k):
elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
ObjectFifoPort.Consume,
1,
)
elem_in_b = inB_fifos[inB_fifo_names[0]].acquire(
ObjectFifoPort.Consume,
1,
)
elem_in_a = inA_fifos[i].acquire(ObjectFifoPort.Consume, 1)
elem_in_b = inB_fifo.acquire(ObjectFifoPort.Consume, 1)
if vectorized:
call(matvec, [elem_in_a, elem_in_b, elem_out])
else:
call(matvec_scalar, [elem_in_a, elem_in_b, elem_out])
inA_fifos[inA_fifo_names[i]].release(
ObjectFifoPort.Consume,
1,
)
inB_fifos[inB_fifo_names[0]].release(
ObjectFifoPort.Consume,
1,
)

outC_fifos[outC_fifo_names[i]].release(
ObjectFifoPort.Produce,
1,
)
inA_fifos[i].release(ObjectFifoPort.Consume, 1)
inB_fifo.release(ObjectFifoPort.Consume, 1)

outC_fifos[i].release(ObjectFifoPort.Produce, 1)

# To/from AIE-array data movement

Expand All @@ -190,7 +157,7 @@ def core_body():
)
def sequence(A, B, C):
npu_dma_memcpy_nd(
metadata=inB_fifo_names[0],
metadata=inB_fifo,
bd_id=2,
mem=B,
sizes=[M_div_m_div_n_cores, 1, 1, K],
Expand All @@ -200,24 +167,22 @@ def sequence(A, B, C):
A_offset = i * M_div_m_div_n_cores * m * K
C_offset = i * M_div_m_div_n_cores * m
npu_dma_memcpy_nd(
metadata=memA_fifo_names[i],
metadata=memA_fifos[i],
bd_id=1,
mem=A,
offsets=[0, 0, 0, A_offset],
sizes=[M_div_m_div_n_cores, K_div_k, m, k],
strides=[m_x_K, k, K, 1],
)
npu_dma_memcpy_nd(
metadata=outC_fifo_names[i],
metadata=outC_fifos[i],
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_offset],
sizes=[1, 1, 1, C_sz_div_n_cores],
strides=[0, 0, 0, 1],
)

for i in range(n_cores):
npu_sync(column=i, row=0, direction=0, channel=0)
dma_wait(*outC_fifos)

print(ctx.module)

@@ -117,10 +117,6 @@ def device_body():
memref_b_ty = T.memref(k, n, dtype_in())
memref_c_ty = T.memref(m, n, dtype_out())

ofifo_memref_a_ty = TypeAttr.get(ObjectFifoType.get(memref_a_ty))
ofifo_memref_b_ty = TypeAttr.get(ObjectFifoType.get(memref_b_ty))
ofifo_memref_c_ty = TypeAttr.get(ObjectFifoType.get(memref_c_ty))

# AIE Core Function declarations
zero_scalar = external_func(
f"zero_scalar_{dtype_out_str}", inputs=[memref_c_ty]
@@ -296,7 +292,7 @@ def sequence(A, B, C):
# At the very last iteration, we may not need a 'pong' iteration
break
npu_dma_memcpy_nd(
metadata="outC",
metadata=outC,
bd_id=bd_id_base,
mem=C,
offsets=[0, 0, 0, C_row_offset],
Expand All @@ -306,23 +302,23 @@ def sequence(A, B, C):
for tile_row in range(num_tile_rows):
A_row_offset = (row_base + tile_row) * m * K
npu_dma_memcpy_nd(
metadata="inA",
metadata=inA,
bd_id=bd_id_base + 2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_row_offset],
sizes=[N_div_n, K_div_k, m, k],
strides=[0, k, K, 1],
)
npu_dma_memcpy_nd(
metadata="inB",
metadata=inB,
bd_id=bd_id_base + 2 * tile_row + 2,
mem=B,
sizes=[N_div_n, K_div_k, k, n],
strides=[n, k_x_N, N, 1],
)
if tile_row_block > 0 or (tile_row_block == 0 and pingpong > 0):
npu_sync(column=0, row=0, direction=0, channel=0)
npu_sync(column=0, row=0, direction=0, channel=0)
dma_wait(outC)
dma_wait(outC)

print(ctx.module)

@@ -209,7 +209,7 @@ The signature of the `aie.runtime_sequence()` operation lists as its arguments a
* Analogously to the data layout transformations described [further above](#tiling-and-data-layout-transformations) to translate a `m`×`k` matrix into blocks of `r`×`s`-submatrices, this transfer translates the input `M`×`K` and `K`×`N` matrices into submatrices of size `m`×`k` and `k`×`n`.
> Note that data layout transformations in the `npu_dma_memcpy_nd` operation are expressed in units of 4 bytes. This is why you will see all strides and the lowest-dimension length multiplied by a factor of `word_size_in` or `word_size_out` (to get the size in bytes) and then divided by four (to get the size in units of 4 bytes). This discrepancy will be streamlined in future versions.
* The DMA transfer function `npu_dma_memcpy_nd` sends a segment of matrix C data (submatrix c) from the corresponding `outC_fifos` for the respective column, back to the host while maintaining the appropriate strides and offsets.
* After completing DMA transfers for each column, `npu_sync` is used to synchronize their completion.
* After completing DMA transfers for each column, `dma_wait` is used to synchronize their completion.
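To make the 4-byte-unit note above concrete with made-up numbers: for a 2-byte input element type (word_size_in = 2, e.g. bfloat16) and a lowest-dimension length of K = 256 elements, the length handed to npu_dma_memcpy_nd is K * word_size_in // 4 = 128 four-byte words, and strides are scaled the same way. A purely illustrative arithmetic sketch:

word_size_in = 2                               # bytes per element (assumed bfloat16)
K = 256                                        # elements in the lowest dimension (made up)
length_in_4byte_words = K * word_size_in // 4  # 128, what the transfer is given
stride_in_4byte_words = K * word_size_in // 4  # a stride of K elements, likewise 128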

The aforementioned transfers of rows of tiles of the `A` matrix are further split into a "ping" and a "pong" phase.
This allows us to reconfigure half of the buffer descriptors used for transferring `A` concurrently with the other half running (transferring data).
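This ping/pong split is also why the runtime sequences touched by this commit call dma_wait(outC) twice: every phase except the very first waits for the previously enqueued output transfer before its half of the buffer descriptors is reprogrammed, and one final wait drains the last phase. A compressed, pseudocode-level sketch of that control flow (enqueue_phase and num_blocks are hypothetical stand-ins, not part of the API):

for block in range(num_blocks):
    for phase in range(2):               # 0 = "ping", 1 = "pong"
        if block > 0 or phase > 0:
            dma_wait(outC)               # previous phase's C transfer must be done
        enqueue_phase(block, phase)      # the npu_dma_memcpy_nd calls for this half
dma_wait(outC)                           # drain the final in-flight phase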
@@ -372,7 +372,7 @@ def sequence(A, B, C):
C_col_offset = col * n
C_offset = C_col_offset + C_row_offset
npu_dma_memcpy_nd(
metadata=C_l2l3_fifos[col].sym_name.value,
metadata=C_l2l3_fifos[col],
bd_id=bd_id_base,
mem=C,
offsets=[0, 0, 0, C_offset],
@@ -408,7 +408,7 @@ def sequence(A, B, C):
) # base address for the shim in this column
A_offset = A_block_offset + A_row_offset
npu_dma_memcpy_nd(
metadata=A_l3l2_fifos[col].sym_name.value,
metadata=A_l3l2_fifos[col],
bd_id=bd_id_base + 2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_offset],
@@ -441,7 +441,7 @@
# ----------------
B_col_offset = col * n if not b_col_maj else col * n * K
npu_dma_memcpy_nd(
metadata=B_l3l2_fifos[col].sym_name.value,
metadata=B_l3l2_fifos[col],
bd_id=bd_id_base + 2 * tile_row + 2,
mem=B,
offsets=[0, 0, 0, B_col_offset],
Expand All @@ -457,12 +457,8 @@ def sequence(A, B, C):
),
)
if tb > 0 or (tb == 0 and pingpong > 0):
for col in range(n_aie_cols):
npu_sync(
column=col, row=0, direction=0, channel=0
) # C done
for col in range(n_aie_cols):
npu_sync(column=col, row=0, direction=0, channel=0)
dma_wait(*C_l2l3_fifos)
dma_wait(*C_l2l3_fifos)


if __name__ == "__main__":
16 changes: 9 additions & 7 deletions programming_examples/basic/matrix_scalar_add/aie2.py
@@ -79,20 +79,22 @@ def core_body():
@runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
def sequence(inTensor, notUsed, outTensor):
npu_dma_memcpy_nd(
metadata="out0",
bd_id=0,
mem=outTensor,
metadata=of_in1,
bd_id=1,
mem=inTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH, 1],
issue_token=True,
)

npu_dma_memcpy_nd(
metadata="in0",
bd_id=1,
mem=inTensor,
metadata=of_out1,
bd_id=0,
mem=outTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH, 1],
)
npu_sync(column=0, row=0, direction=0, channel=0)
dma_wait(of_in1, of_out1)


with mlir_mod_ctx() as ctx:
@@ -99,11 +99,12 @@ def core_body():

@runtime_sequence(tensor_in_ty, tensor_in_ty, tensor_out_ty)
def sequence(A, B, C):
npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(
metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, out_size]
metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, out_size]
)
npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_sync(column=0, row=0, direction=0, channel=0)
# of_out will only complete after of_in completes, so we just wait on of_out instead of both
dma_wait(of_out)

print(ctx.module)

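The single dma_wait(of_out) in the hunk above is the common case in this commit: completion of the output transfer implies the input was consumed, so no token is requested for the input transfer. When a sequence also needs to observe the input transfer directly, the examples set issue_token=True on it and pass both handles to dma_wait, as the dma_transpose and matrix_scalar_add hunks do. A hedged fragment of that contrasting style (of_in, of_out, A, C, and N are again illustrative):

# Wait on both transfers; the input transfer requests a token so it can be waited on.
npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True)
npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
dma_wait(of_in, of_out)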
@@ -55,14 +55,15 @@ def device_body():

@runtime_sequence(tensor_ty, tensor_ty, tensor_out_ty)
def sequence(A, B, C):
npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(
metadata="out",
metadata=of_out,
bd_id=0,
mem=C,
sizes=[1, 1, 1, N * (memtile_repeat_count + 1)],
)
npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_sync(column=0, row=0, direction=0, channel=0)
# of_out will only complete after of_in completes, so we just wait on of_out instead of both
dma_wait(of_out)

print(ctx.module)
