Replace npu_sync with dma_wait in programming examples (#1791)
hunhoffe authored Sep 25, 2024
1 parent 1cd4c27 commit 9cf8c0e
Showing 46 changed files with 390 additions and 398 deletions.
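Every file in this commit follows the same pattern: runtime sequences used to refer to object FIFOs by their string names in npu_dma_memcpy_nd and then poll a shim DMA channel with npu_sync; they now pass the object_fifo handles themselves and block on them with dma_wait. A minimal before/after sketch of that pattern (not copied from any single file; of_in, of_out, A, C, and N are illustrative placeholders, and the calls assume the star import of aie.dialects.aiex that these examples typically use):

# Before: FIFOs named by string, completion polled on a fixed shim channel.
npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
npu_sync(column=0, row=0, direction=0, channel=0)

# After: FIFO handles passed directly, completion tied to those FIFOs' DMAs.
npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
dma_wait(of_out)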
7 changes: 4 additions & 3 deletions programming_examples/basic/dma_transpose/aie2.py
@@ -52,17 +52,18 @@ def core_body():

@runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):
npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
# The strides below are configured to read across all rows in the same column
# Stride of K in dim/wrap 2 skips an entire row to read a full column
npu_dma_memcpy_nd(
metadata="in",
metadata=of_in,
bd_id=1,
mem=A,
sizes=[1, K, M, 1],
strides=[1, 1, K, 1],
issue_token=True,
)
npu_sync(column=0, row=0, direction=0, channel=0)
npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
dma_wait(of_in, of_out)

print(ctx.module)

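The stride comment in the hunk above can be sanity-checked on the host. Assuming a 4-byte element type (so element indices coincide with the 4-byte units npu_dma_memcpy_nd works in), sizes=[1, K, M, 1] with strides=[1, 1, K, 1] steps down a column (stride K) in the inner wrap and advances to the next column (stride 1) in the wrap outside it, which reads a row-major M x K matrix in transposed order. An illustrative NumPy check, not part of the commit:

import numpy as np

M, K = 4, 6
A = np.arange(M * K, dtype=np.int32)  # flattened row-major M x K matrix

# Emulate the DMA loop nest: outer wrap of size K with stride 1 (next column),
# inner wrap of size M with stride K (next row within that column).
dma_order = [col + row * K for col in range(K) for row in range(M)]

assert (A[dma_order] == A.reshape(M, K).T.flatten()).all()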
@@ -344,7 +344,7 @@ def sequence(A, B, C):
C_col_offset = col * n
C_offset = C_col_offset + C_row_offset
npu_dma_memcpy_nd(
metadata=C_l2l3_fifos[col].sym_name.value,
metadata=C_l2l3_fifos[col],
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_offset],
Expand All @@ -357,7 +357,7 @@ def sequence(A, B, C):
A_offset = A_block_offset + A_row_offset
B_col_offset = col * n
npu_dma_memcpy_nd(
metadata=A_l3l2_fifos[col].sym_name.value,
metadata=A_l3l2_fifos[col],
bd_id=2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_offset],
Expand All @@ -370,7 +370,7 @@ def sequence(A, B, C):
strides=[0, k * n_aie_rows, K, 1],
)
npu_dma_memcpy_nd(
metadata=B_l3l2_fifos[col].sym_name.value,
metadata=B_l3l2_fifos[col],
bd_id=2 * tile_row + 2,
mem=B,
offsets=[0, 0, 0, B_col_offset],
Expand All @@ -382,8 +382,7 @@ def sequence(A, B, C):
],
strides=[n * n_aie_cols, k * n_aie_rows * N, N, 1],
)
for col in range(n_aie_cols):
npu_sync(column=col, row=0, direction=0, channel=0)
dma_wait(*C_l2l3_fifos)


if __name__ == "__main__":
115 changes: 40 additions & 75 deletions programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
@@ -81,71 +81,53 @@ def device_body():
ComputeTile2 = tile(2, 2)
ComputeTile3 = tile(3, 2)
cores = [ComputeTile0, ComputeTile1, ComputeTile2, ComputeTile3]
memA_fifo_names = ["memA0", "memA1", "memA2", "memA3"]
memA_fifos = {}
inA_fifo_names = ["inA0", "inA1", "inA2", "inA3"]
inA_fifos = {}
inB_fifo_names = ["inB"]
inB_fifos = {}
outC_fifo_names = ["outC0", "outC1", "outC2", "outC3"]
outC_fifos = {}
memA_fifos = []
inA_fifos = []
outC_fifos = []

# AIE-array data movement with object fifos
# Input A
for i in range(n_cores):
memA_fifos[memA_fifo_names[i]] = object_fifo(
memA_fifo_names[i],
ShimTiles[i],
MemTiles[i],
2,
memRef_inA_ty,
memA_fifos.append(
object_fifo(f"memA{i}", ShimTiles[i], MemTiles[i], 2, memRef_inA_ty)
)
inA_fifos[inA_fifo_names[i]] = object_fifo(
inA_fifo_names[i],
MemTiles[i],
cores[i],
2,
memRef_A_ty,
(
[
(k // 2 // 2, 2),
(m, k),
(2, 1),
]
if vectorized
else []
), # transpose at 4-byte (2xbf16) granularity
inA_fifos.append(
object_fifo(
f"inA{i}",
MemTiles[i],
cores[i],
2,
memRef_A_ty,
(
[
(k // 2 // 2, 2),
(m, k),
(2, 1),
]
if vectorized
else []
), # transpose at 4-byte (2xbf16) granularity
)
)
object_fifo_link(
memA_fifos[memA_fifo_names[i]], inA_fifos[inA_fifo_names[i]]
object_fifo_link(memA_fifos[i], inA_fifos[i])

# Output C
outC_fifos.append(
object_fifo(f"outC{i}", cores[i], ShimTiles[i], 2, memRef_outC_ty)
)

# Input B
inB_fifos[inB_fifo_names[0]] = object_fifo(
inB_fifo_names[0],
ShimTiles[1 % n_cores],
cores[0:n_cores],
2,
memRef_inB_ty,
inB_fifo = object_fifo(
"inB", ShimTiles[1 % n_cores], cores[0:n_cores], 2, memRef_inB_ty
)

# Output C
for i in range(n_cores):
outC_fifos[outC_fifo_names[i]] = object_fifo(
outC_fifo_names[i],
cores[i],
ShimTiles[i],
2,
memRef_outC_ty,
)

# Set up compute tiles
for i in range(n_cores):
# Compute tile i
@core(cores[i], f"mv_{m}x{k}.o")
def core_body():
for _ in range_(0xFFFFFFFF):
elem_out = outC_fifos[outC_fifo_names[i]].acquire(
elem_out = outC_fifos[i].acquire(
ObjectFifoPort.Produce,
1,
)
Expand All @@ -155,31 +137,16 @@ def core_body():
call(zero_scalar, [elem_out])

for _ in range_(K_div_k):
elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
ObjectFifoPort.Consume,
1,
)
elem_in_b = inB_fifos[inB_fifo_names[0]].acquire(
ObjectFifoPort.Consume,
1,
)
elem_in_a = inA_fifos[i].acquire(ObjectFifoPort.Consume, 1)
elem_in_b = inB_fifo.acquire(ObjectFifoPort.Consume, 1)
if vectorized:
call(matvec, [elem_in_a, elem_in_b, elem_out])
else:
call(matvec_scalar, [elem_in_a, elem_in_b, elem_out])
inA_fifos[inA_fifo_names[i]].release(
ObjectFifoPort.Consume,
1,
)
inB_fifos[inB_fifo_names[0]].release(
ObjectFifoPort.Consume,
1,
)

outC_fifos[outC_fifo_names[i]].release(
ObjectFifoPort.Produce,
1,
)
inA_fifos[i].release(ObjectFifoPort.Consume, 1)
inB_fifo.release(ObjectFifoPort.Consume, 1)

outC_fifos[i].release(ObjectFifoPort.Produce, 1)

# To/from AIE-array data movement

Expand All @@ -190,7 +157,7 @@ def core_body():
)
def sequence(A, B, C):
npu_dma_memcpy_nd(
metadata=inB_fifo_names[0],
metadata=inB_fifo,
bd_id=2,
mem=B,
sizes=[M_div_m_div_n_cores, 1, 1, K],
Expand All @@ -200,24 +167,22 @@ def sequence(A, B, C):
A_offset = i * M_div_m_div_n_cores * m * K
C_offset = i * M_div_m_div_n_cores * m
npu_dma_memcpy_nd(
metadata=memA_fifo_names[i],
metadata=memA_fifos[i],
bd_id=1,
mem=A,
offsets=[0, 0, 0, A_offset],
sizes=[M_div_m_div_n_cores, K_div_k, m, k],
strides=[m_x_K, k, K, 1],
)
npu_dma_memcpy_nd(
metadata=outC_fifo_names[i],
metadata=outC_fifos[i],
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_offset],
sizes=[1, 1, 1, C_sz_div_n_cores],
strides=[0, 0, 0, 1],
)

for i in range(n_cores):
npu_sync(column=i, row=0, direction=0, channel=0)
dma_wait(*outC_fifos)

print(ctx.module)

@@ -117,10 +117,6 @@ def device_body():
memref_b_ty = T.memref(k, n, dtype_in())
memref_c_ty = T.memref(m, n, dtype_out())

ofifo_memref_a_ty = TypeAttr.get(ObjectFifoType.get(memref_a_ty))
ofifo_memref_b_ty = TypeAttr.get(ObjectFifoType.get(memref_b_ty))
ofifo_memref_c_ty = TypeAttr.get(ObjectFifoType.get(memref_c_ty))

# AIE Core Function declarations
zero_scalar = external_func(
f"zero_scalar_{dtype_out_str}", inputs=[memref_c_ty]
@@ -296,7 +292,7 @@ def sequence(A, B, C):
# At the very last iteration, we may not need a 'pong' iteration
break
npu_dma_memcpy_nd(
metadata="outC",
metadata=outC,
bd_id=bd_id_base,
mem=C,
offsets=[0, 0, 0, C_row_offset],
Expand All @@ -306,23 +302,23 @@ def sequence(A, B, C):
for tile_row in range(num_tile_rows):
A_row_offset = (row_base + tile_row) * m * K
npu_dma_memcpy_nd(
metadata="inA",
metadata=inA,
bd_id=bd_id_base + 2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_row_offset],
sizes=[N_div_n, K_div_k, m, k],
strides=[0, k, K, 1],
)
npu_dma_memcpy_nd(
metadata="inB",
metadata=inB,
bd_id=bd_id_base + 2 * tile_row + 2,
mem=B,
sizes=[N_div_n, K_div_k, k, n],
strides=[n, k_x_N, N, 1],
)
if tile_row_block > 0 or (tile_row_block == 0 and pingpong > 0):
npu_sync(column=0, row=0, direction=0, channel=0)
npu_sync(column=0, row=0, direction=0, channel=0)
dma_wait(outC)
dma_wait(outC)

print(ctx.module)

@@ -209,7 +209,7 @@ The signature of the `aie.runtime_sequence()` operation lists as its arguments a
* Analogously to the data layout transformations described [further above](#tiling-and-data-layout-transformations) to translate a `m`×`k` matrix into blocks of `r`×`s`-submatrices, this transfer translates the input `M`×`K` and `K`×`N` matrices into submatrices of size `m`×`k` and `k`×`n`.
> Note that data layout transformations in the `npu_dma_memcpy_nd` operation are expressed in units of 4 bytes. This is why you will see all strides and the lowest-dimension length multiplied by a factor of `word_size_in` or `word_size_out` (to get the size in bytes) and then divided by four (to get the size in units of 4 bytes). This discrepancy will be streamlined in future versions.
* The DMA transfer function `npu_dma_memcpy_nd` sends a segment of matrix C data (submatrix c) from the corresponding `outC_fifos` for the respective column, back to the host while maintaining the appropriate strides and offsets.
* After completing DMA transfers for each column, `npu_sync` is used to synchronize their completion.
* After completing DMA transfers for each column, `dma_wait` is used to synchronize their completion.
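To make the 4-byte-unit note above concrete with made-up numbers: for a 2-byte input element type (word_size_in = 2, e.g. bfloat16) and a lowest-dimension length of K = 256 elements, the length handed to npu_dma_memcpy_nd is K * word_size_in // 4 = 128 four-byte words, and strides are scaled the same way. A purely illustrative arithmetic sketch:

word_size_in = 2                               # bytes per element (assumed bfloat16)
K = 256                                        # elements in the lowest dimension (made up)
length_in_4byte_words = K * word_size_in // 4  # 128, what the transfer is given
stride_in_4byte_words = K * word_size_in // 4  # a stride of K elements, likewise 128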

The aforementioned transfers of rows of tiles of the `A` matrix are further split into a "ping" and a "pong" phase.
This allows us to reconfigure half of the buffer descriptors used for transferring `A` concurrently with the other half running (transferring data).
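This ping/pong split is also why the runtime sequences touched by this commit call dma_wait(outC) twice: every phase except the very first waits for the previously enqueued output transfer before its half of the buffer descriptors is reprogrammed, and one final wait drains the last phase. A compressed, pseudocode-level sketch of that control flow (enqueue_phase and num_blocks are hypothetical stand-ins, not part of the API):

for block in range(num_blocks):
    for phase in range(2):               # 0 = "ping", 1 = "pong"
        if block > 0 or phase > 0:
            dma_wait(outC)               # previous phase's C transfer must be done
        enqueue_phase(block, phase)      # the npu_dma_memcpy_nd calls for this half
dma_wait(outC)                           # drain the final in-flight phase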
@@ -372,7 +372,7 @@ def sequence(A, B, C):
C_col_offset = col * n
C_offset = C_col_offset + C_row_offset
npu_dma_memcpy_nd(
metadata=C_l2l3_fifos[col].sym_name.value,
metadata=C_l2l3_fifos[col],
bd_id=bd_id_base,
mem=C,
offsets=[0, 0, 0, C_offset],
@@ -408,7 +408,7 @@ def sequence(A, B, C):
) # base address for the shim in this column
A_offset = A_block_offset + A_row_offset
npu_dma_memcpy_nd(
metadata=A_l3l2_fifos[col].sym_name.value,
metadata=A_l3l2_fifos[col],
bd_id=bd_id_base + 2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_offset],
@@ -441,7 +441,7 @@
# ----------------
B_col_offset = col * n if not b_col_maj else col * n * K
npu_dma_memcpy_nd(
metadata=B_l3l2_fifos[col].sym_name.value,
metadata=B_l3l2_fifos[col],
bd_id=bd_id_base + 2 * tile_row + 2,
mem=B,
offsets=[0, 0, 0, B_col_offset],
Expand All @@ -457,12 +457,8 @@ def sequence(A, B, C):
),
)
if tb > 0 or (tb == 0 and pingpong > 0):
for col in range(n_aie_cols):
npu_sync(
column=col, row=0, direction=0, channel=0
) # C done
for col in range(n_aie_cols):
npu_sync(column=col, row=0, direction=0, channel=0)
dma_wait(*C_l2l3_fifos)
dma_wait(*C_l2l3_fifos)


if __name__ == "__main__":
16 changes: 9 additions & 7 deletions programming_examples/basic/matrix_scalar_add/aie2.py
@@ -79,20 +79,22 @@ def core_body():
@runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
def sequence(inTensor, notUsed, outTensor):
npu_dma_memcpy_nd(
metadata="out0",
bd_id=0,
mem=outTensor,
metadata=of_in1,
bd_id=1,
mem=inTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH, 1],
issue_token=True,
)

npu_dma_memcpy_nd(
metadata="in0",
bd_id=1,
mem=inTensor,
metadata=of_out1,
bd_id=0,
mem=outTensor,
sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
strides=[1, 1, IMAGE_WIDTH, 1],
)
npu_sync(column=0, row=0, direction=0, channel=0)
dma_wait(of_in1, of_out1)


with mlir_mod_ctx() as ctx:
@@ -99,11 +99,12 @@ def core_body():

@runtime_sequence(tensor_in_ty, tensor_in_ty, tensor_out_ty)
def sequence(A, B, C):
npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(
metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, out_size]
metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, out_size]
)
npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_sync(column=0, row=0, direction=0, channel=0)
# of_out will only complete after of_in completes, so we just wait on of_out instead of both
dma_wait(of_out)

print(ctx.module)

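The single dma_wait(of_out) in the hunk above is the common case in this commit: completion of the output transfer implies the input was consumed, so no token is requested for the input transfer. When a sequence also needs to observe the input transfer directly, the examples set issue_token=True on it and pass both handles to dma_wait, as the dma_transpose and matrix_scalar_add hunks do. A hedged fragment of that contrasting style (of_in, of_out, A, C, and N are again illustrative):

# Wait on both transfers; the input transfer requests a token so it can be waited on.
npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True)
npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
dma_wait(of_in, of_out)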
@@ -55,14 +55,15 @@ def device_body():

@runtime_sequence(tensor_ty, tensor_ty, tensor_out_ty)
def sequence(A, B, C):
npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_dma_memcpy_nd(
metadata="out",
metadata=of_out,
bd_id=0,
mem=C,
sizes=[1, 1, 1, N * (memtile_repeat_count + 1)],
)
npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
npu_sync(column=0, row=0, direction=0, channel=0)
# of_out will only complete after of_in completes, so we just wait on of_out instead of both
dma_wait(of_out)

print(ctx.module)
