diff --git a/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex b/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex index 7bee800377e..be2854f5170 100644 --- a/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex +++ b/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex @@ -2,18 +2,18 @@ Hello, world! ---- ---- Basic counts tool results: Total counts: - 655 total \(fetched\) instructions - 243 total unique \(fetched\) instructions + 685 total \(fetched\) instructions + 255 total unique \(fetched\) instructions 0 total non-fetched instructions 0 total prefetches #if (__ARM_FEATURE_SVE_BITS == 128) - 1069 total data loads + 1137 total data loads 861 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 1967 total data loads + 2035 total data loads 1595 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3763 total data loads + 3831 total data loads 3063 total data stores #endif 0 total icache flushes @@ -22,18 +22,18 @@ Total counts: .* total scheduling markers .* Thread .* counts: - 655 \(fetched\) instructions - 243 unique \(fetched\) instructions + 685 \(fetched\) instructions + 255 unique \(fetched\) instructions 0 non-fetched instructions 0 prefetches #if (__ARM_FEATURE_SVE_BITS == 128) - 1069 data loads + 1137 data loads 861 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 1967 data loads + 2035 data loads 1595 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3763 data loads + 3831 data loads 3063 data stores #endif 0 icache flushes diff --git a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm index 2c0f86c15cc..b1105e6a7b2 100644 --- a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm +++ b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm @@ -366,6 +366,21 @@ test_scalar_plus_immediate: ret +test_replicating_loads: + ld1rqb DEST_REG1.b, B_MASK_REG/z, 
[BUFFER_REG, X_INDEX_REG] // 16 + ld1rqh DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 8 + ld1rqw DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 4 + ld1rqd DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #3] // 2 + // Total: 30 + + ld1rqb DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #0] // 16 + ld1rqh DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #0] // 8 + ld1rqw DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #0] // 4 + ld1rqd DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0] // 2 + // Total: 30 + + ret + _start: #ifdef __APPLE__ adrp BUFFER_REG, buffer@PAGE @@ -407,8 +422,10 @@ _start: bl test_scalar_plus_immediate // +(374 * vl_bytes/16) loads // +(322 * vl_bytes/16) stores + bl test_replicating_loads // +60 loads + // +0 stores // Running total: - // Loads: (136 + 14 + 374 + 374) * vl_bytes/16 = 898 * vl_bytes/16 + // Loads: (136 + 14 + 374 + 374) * vl_bytes/16 + 60 = 898 * vl_bytes/16 + 60 // Stores: (82 + 8 + 322 + 322) * vl_bytes/16 = 734 * vl_bytes/16 /* Run all the instructions with no active elements */ @@ -422,9 +439,10 @@ _start: bl test_vector_plus_immediate // +0 loads, +0 stores bl test_scalar_plus_scalar // +0 loads, +0 stores bl test_scalar_plus_immediate // +0 loads, +0 stores + bl test_replicating_loads // +0 loads, +0 stores // Running total (unchanged from above): - // Loads: 898 * vl_bytes/16 + // Loads: (898 * vl_bytes/16) + 60 // Stores: 734 * vl_bytes/16 /* Run all instructions with one active element */ @@ -437,26 +455,28 @@ _start: bl test_vector_plus_immediate // +7 loads, +4 stores bl test_scalar_plus_scalar // +56 loads, +46 stores bl test_scalar_plus_immediate // +56 loads, +46 stores + bl test_replicating_loads // +8 loads, +0 stores // Running total: - // Loads: (898 * vl_bytes/16) + 52 + 7 + 56 + 56 = (898 * vl_bytes/16) + 171 + // Loads: (898 * vl_bytes/16) + 60 + 52 + 7 + 56 + 56 + 8 = (898 * vl_bytes/16) + 239 // Stores: (734 * vl_bytes/16) + 41 + 4 + 46 + 46 = (734 * vl_bytes/16) + 127 // The functions in 
this file have the following instructions counts: - // _start 37 + // _start 40 // test_scalar_plus_vector 84 // test_vector_plus_immediate 12 // test_scalar_plus_scalar 55 // test_scalar_plus_immediate 55 - // So there are 37 + 84 + 12 + 55 + 55 = 243 unique instructions - // We run the test_* functions 3 times each so the totoal instruction executed is - // ((84 + 12 + 55 + 55) * 3) + 37 = (206 * 3) + 37 = 655 + // test_replicating_loads 9 + // So there are 40 + 84 + 12 + 55 + 55 + 9 = 255 unique instructions + // We run the test_* functions 3 times each so the total instruction executed is + // ((84 + 12 + 55 + 55 + 9) * 3) + 40 = (215 * 3) + 40 = 685 // Totals: - // Loads: (898 * vl_bytes/16) + 171 + // Loads: (898 * vl_bytes/16) + 239 // Stores: (734 * vl_bytes/16) + 127 - // Instructions: 703 - // Unique instructions: 259 + // Instructions: 685 + // Unique instructions: 255 // Exit. mov w0, #1 // stdout diff --git a/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex b/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex index 04f579ec8b8..3cda0549822 100644 --- a/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex +++ b/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex @@ -1,18 +1,18 @@ Hello, world! 
Basic counts tool results: Total counts: - 655 total \(fetched\) instructions - 243 total unique \(fetched\) instructions + 685 total \(fetched\) instructions + 255 total unique \(fetched\) instructions 0 total non-fetched instructions 0 total prefetches #if (__ARM_FEATURE_SVE_BITS == 128) - 1069 total data loads + 1137 total data loads 861 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 1967 total data loads + 2035 total data loads 1595 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3763 total data loads + 3831 total data loads 3063 total data stores #endif 0 total icache flushes @@ -21,18 +21,18 @@ Total counts: .* total scheduling markers .* Thread .* counts: - 655 \(fetched\) instructions - 243 unique \(fetched\) instructions + 685 \(fetched\) instructions + 255 unique \(fetched\) instructions 0 non-fetched instructions 0 prefetches #if (__ARM_FEATURE_SVE_BITS == 128) - 1069 data loads + 1137 data loads 861 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 1967 data loads + 2035 data loads 1595 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 3763 data loads + 3831 data loads 3063 data stores #endif 0 icache flushes diff --git a/clients/drcachesim/tests/scattergather-aarch64.templatex b/clients/drcachesim/tests/scattergather-aarch64.templatex index c3c7a9762b7..6efdb5b4544 100644 --- a/clients/drcachesim/tests/scattergather-aarch64.templatex +++ b/clients/drcachesim/tests/scattergather-aarch64.templatex @@ -144,6 +144,10 @@ ldnt1w scalar\+scalar: PASS ld1sw scalar\+scalar: PASS ld1d scalar\+scalar: PASS ldnt1d scalar\+scalar: PASS +ld1rqb scalar\+scalar: PASS +ld1rqh scalar\+scalar: PASS +ld1rqw scalar\+scalar: PASS +ld1rqd scalar\+scalar: PASS ld2b scalar\+scalar: PASS ld2h scalar\+scalar: PASS ld2w scalar\+scalar: PASS @@ -212,6 +216,12 @@ ld1d scalar\+immediate 64bit element: PASS ld1d scalar\+immediate 64bit element \(min index\): PASS ld1d scalar\+immediate 64bit element \(max index\): PASS ldnt1d scalar\+immediate 64bit element: 
PASS +ld1rqb scalar\+immediate: PASS +ld1rqh scalar\+immediate: PASS +ld1rqw scalar\+immediate: PASS +ld1rqd scalar\+immediate: PASS +ld1rqd scalar\+immediate \(min index\): PASS +ld1rqd scalar\+immediate \(max index\): PASS ld2b scalar\+immediate: PASS ld2h scalar\+immediate: PASS ld2w scalar\+immediate: PASS diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c index d8b70dfc292..e1fd2db8362 100644 --- a/ext/drx/scatter_gather_aarch64.c +++ b/ext/drx/scatter_gather_aarch64.c @@ -68,7 +68,7 @@ typedef struct _per_thread_t { * This corresponds to the spill slot storage in per_thread_t. */ typedef struct _spill_slot_state_t { -#define NUM_PRED_SLOTS 1 +#define NUM_PRED_SLOTS 2 reg_id_t pred_slots[NUM_PRED_SLOTS]; #define NUM_VECTOR_SLOTS 1 @@ -621,12 +621,37 @@ static void expand_contiguous(void *drcontext, instrlist_t *bb, instr_t *sg_instr, const scatter_gather_info_t *sg_info, reg_id_t new_base, reg_id_t scalar_index, reg_id_t scalar_src_or_dst, - reg_id_t scratch_pred, reg_id_t scratch_vec, app_pc orig_app_pc) + reg_id_t scratch_pred, reg_id_t governing_pred, reg_id_t scratch_vec, + app_pc orig_app_pc) { #define EMIT(op, ...) \ instrlist_preinsert( \ bb, sg_instr, INSTR_XL8(INSTR_CREATE_##op(drcontext, __VA_ARGS__), orig_app_pc)) + if (sg_info->is_replicating && proc_get_vector_length_bytes() > 16) { + /* This instruction loads a fixed size 16-byte vector which is replicated to + * all quadword elements on hardware with a vector length > 16 bytes. + * Only the bottom 16 bits of the governing predicate register are used so we + * need to mask out any higher bits than that. 
+ */ + DR_ASSERT(sg_info->scatter_gather_size == OPSZ_16); + + /* Set scratch_pred to a value with the first 16 elements active */ + /* ptrue scratch_pred.b, vl16 */ + EMIT(ptrue_sve, opnd_create_reg_element_vector(scratch_pred, OPSZ_1), + opnd_create_immed_pred_constr(DR_PRED_CONSTR_VL16)); + + /* Create a new governing predicate by applying the mask we created in + * scratch_pred to the instruction's mask_reg. + */ + + /* and governing_pred.b, mask_reg/z, mask_reg.b, scratch_pred.b */ + EMIT(and_sve_pred_b, opnd_create_reg_element_vector(governing_pred, OPSZ_1), + opnd_create_predicate_reg(sg_info->mask_reg, /*merging=*/false), + opnd_create_reg_element_vector(sg_info->mask_reg, OPSZ_1), + opnd_create_reg_element_vector(scratch_pred, OPSZ_1)); + } + /* Calculate the new base address in scratch_gpr0. * Note that we can't use drutil_insert_get_mem_addr() here because we don't want the * BSD licensed drx to have a dependency on the LGPL licensed drutil. @@ -686,6 +711,7 @@ expand_contiguous(void *drcontext, instrlist_t *bb, instr_t *sg_instr, } else { /* scalar+scalar: Keep the original modifier copied from sg_info */ } + modified_sg_info.mask_reg = governing_pred; /* Note that modified_sg_info might not describe a valid SVE instruction. * For example if we are expanding: @@ -702,6 +728,21 @@ expand_contiguous(void *drcontext, instrlist_t *bb, instr_t *sg_instr, /* Expand the instruction as if it were a scalar+vector scatter/gather instruction */ expand_scatter_gather(drcontext, bb, sg_instr, &modified_sg_info, scalar_index, scalar_src_or_dst, scratch_pred, orig_app_pc); + + if (sg_info->is_replicating && proc_get_vector_length_bytes() > 16) { + /* All supported replicating loads load a 16-byte vector. */ + DR_ASSERT(sg_info->scatter_gather_size == OPSZ_16); + + /* Replicate the first quadword element (16 bytes) to the other elements in the + * vector. 
+ */ + + /* dup gather_dst.q, gather_dst.q[0]*/ + EMIT(dup_sve_idx, + opnd_create_reg_element_vector(sg_info->gather_dst_reg, OPSZ_16), + opnd_create_reg_element_vector(sg_info->gather_dst_reg, OPSZ_16), + opnd_create_immed_uint(0, OPSZ_2b)); + } #undef EMIT } @@ -713,13 +754,20 @@ expand_contiguous(void *drcontext, instrlist_t *bb, instr_t *sg_instr, reg_id_t reserve_sve_register(void *drcontext, instrlist_t *bb, instr_t *where, reg_id_t scratch_gpr0, reg_id_t min_register, reg_id_t max_register, - size_t slot_tls_offset, opnd_size_t reg_size, uint slot_num) + size_t slot_tls_offset, opnd_size_t reg_size, uint slot_num, + reg_id_t *already_allocated_regs, uint num_already_allocated) { /* Search the instruction for an unused register we will use as a temp. */ reg_id_t reg; for (reg = min_register; reg <= max_register; ++reg) { - if (!instr_uses_reg(where, reg)) - break; + if (!instr_uses_reg(where, reg)) { + bool reg_already_allocated = false; + for (uint i = 0; !reg_already_allocated && i < num_already_allocated; i++) { + reg_already_allocated = already_allocated_regs[i] == reg; + } + if (!reg_already_allocated) + break; + } } DR_ASSERT(!instr_uses_reg(where, reg)); @@ -759,10 +807,11 @@ reserve_pred_register(void *drcontext, instrlist_t *bb, instr_t *where, /* Some instructions require the predicate to be in the range p0 - p7. This includes * LASTB which we use to extract elements from the vector register. 
*/ - const reg_id_t reg = reserve_sve_register( - drcontext, bb, where, scratch_gpr0, DR_REG_P0, DR_REG_P7, - offsetof(per_thread_t, scratch_pred_spill_slots), - opnd_size_from_bytes(proc_get_vector_length_bytes() / 8), slot); + const reg_id_t reg = + reserve_sve_register(drcontext, bb, where, scratch_gpr0, DR_REG_P0, DR_REG_P7, + offsetof(per_thread_t, scratch_pred_spill_slots), + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8), + slot, slot_state->pred_slots, slot); slot_state->pred_slots[slot] = reg; return reg; @@ -783,7 +832,8 @@ reserve_vector_register(void *drcontext, instrlist_t *bb, instr_t *where, const reg_id_t reg = reserve_sve_register(drcontext, bb, where, scratch_gpr0, DR_REG_Z0, DR_REG_Z31, offsetof(per_thread_t, scratch_vector_spill_slots_aligned), - opnd_size_from_bytes(proc_get_vector_length_bytes()), slot); + opnd_size_from_bytes(proc_get_vector_length_bytes()), slot, + slot_state->vector_slots, slot); slot_state->vector_slots[slot] = reg; return reg; @@ -897,10 +947,6 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e /* TODO i#5036: Add support for first-fault and non-fault accesses. */ return true; } - if (sg_info.is_replicating) { - /* TODO i#5036: Add support for ld1rq* replicating loads. 
*/ - return true; - } const bool is_contiguous = !(reg_is_z(sg_info.base_reg) || reg_is_z(sg_info.index_reg)); @@ -967,6 +1013,12 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e &spill_slot_state); } + reg_id_t governing_pred = sg_info.mask_reg; + if (sg_info.is_replicating && proc_get_vector_length_bytes() > 16) { + governing_pred = reserve_pred_register(drcontext, bb, sg_instr, scratch_gpr, + &spill_slot_state); + } + const app_pc orig_app_pc = instr_get_app_pc(sg_instr); emulated_instr_t emulated_instr; @@ -980,8 +1032,8 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e if (is_contiguous) { /* scalar+scalar or scalar+immediate predicated contiguous access */ expand_contiguous(drcontext, bb, sg_instr, &sg_info, contiguous_new_base, - scratch_gpr, scalar_src_or_dst, scratch_pred, scratch_vec, - orig_app_pc); + scratch_gpr, scalar_src_or_dst, scratch_pred, governing_pred, + scratch_vec, orig_app_pc); } else { /* scalar+vector or vector+immediate scatter/gather */ expand_scatter_gather(drcontext, bb, sg_instr, &sg_info, scratch_gpr, @@ -990,13 +1042,21 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e drmgr_insert_emulation_end(drcontext, bb, sg_instr); - if (scratch_vec != DR_REG_INVALID) { - unreserve_vector_register(drcontext, bb, sg_instr, scratch_gpr, scratch_vec, - &spill_slot_state); + for (uint i = 0; i < NUM_VECTOR_SLOTS; i++) { + const reg_id_t reg = spill_slot_state.vector_slots[i]; + if (reg != DR_REG_NULL) { + unreserve_vector_register(drcontext, bb, sg_instr, scratch_gpr, reg, + &spill_slot_state); + } } - unreserve_pred_register(drcontext, bb, sg_instr, scratch_gpr, scratch_pred, - &spill_slot_state); + for (uint i = 0; i < NUM_PRED_SLOTS; i++) { + const reg_id_t reg = spill_slot_state.pred_slots[i]; + if (reg != DR_REG_NULL) { + unreserve_pred_register(drcontext, bb, sg_instr, scratch_gpr, reg, + &spill_slot_state); + } + } if 
(drreg_unreserve_register(drcontext, bb, sg_instr, scratch_gpr) != DRREG_SUCCESS) { DR_ASSERT_MSG(false, "drreg_unreserve_register should not fail"); diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.cpp b/suite/tests/client-interface/drx-scattergather-aarch64.cpp index e0fabdf0836..7661cc862a8 100644 --- a/suite/tests/client-interface/drx-scattergather-aarch64.cpp +++ b/suite/tests/client-interface/drx-scattergather-aarch64.cpp @@ -2843,6 +2843,55 @@ test_ld1_scalar_plus_scalar() INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE), /*index=*/2, }, + // Load and replicate instructions + { + "ld1rqb scalar+scalar", + TEST_FUNC("ld1rqb z21.b, p1/z, [%[base], %[index]]"), + { /*zt=*/ { 21 }, /*pg=*/1 }, + std::array, 1> { + { 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, + 0x17, 0x18, 0x19, 0x20, 0x21, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x06, + 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x20, 0x21, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, + 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::BYTE), + /*index=*/6, + }, + { + "ld1rqh scalar+scalar", + TEST_FUNC("ld1rqh z25.h, p0/z, [%[base], %[index], lsl #1]"), + { /*zt=*/ { 25 }, /*pg=*/0 }, + std::array, 1> { + { 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, + 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, + 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, + 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::HALF), + /*index=*/12, + }, + { + "ld1rqw scalar+scalar", + TEST_FUNC("ld1rqw z29.s, p1/z, [%[base], %[index], lsl #2]"), + { /*zt=*/ { 29 }, /*pg=*/1 }, + std::array, 1> { + { 0x00000020, 0x00000021, 0x00000022, 0x00000023, 0x00000020, 0x00000021, + 0x00000022, 0x00000023, 0x00000020, 
0x00000021, 0x00000022, 0x00000023, + 0x00000020, 0x00000021, 0x00000022, 0x00000023 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::SINGLE), + /*index=*/-12, + }, + { + "ld1rqd scalar+scalar", + TEST_FUNC("ld1rqd z31.d, p2/z, [%[base], %[index], lsl #3]"), + { /*zt=*/ { 31 }, /*pg=*/2 }, + std::array, 1> { + { 0xfffffffffffffff6, 0xfffffffffffffff5, 0xfffffffffffffff6, + 0xfffffffffffffff5, 0xfffffffffffffff6, 0xfffffffffffffff5, + 0xfffffffffffffff6, 0xfffffffffffffff5 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE), + /*index=*/-6, + }, }); # undef TEST_FUNC } @@ -4371,6 +4420,104 @@ test_ld1_scalar_plus_immediate() 0x0000000000000022, 0x0000000000000023 } }, INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE), }, + // Load and replicate instructions + { + "ld1rqb scalar+immediate", + TEST_FUNC("ld1rqb z26.b, p5/z, [%[base], #80]"), + { /*zt=*/26, /*pg=*/5 }, + std::array, 1> { { 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, + 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, + 0xf4, 0xf3, 0xf2, 0xf1 } }, + std::array, 1> { + { 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0xf8, 0xf7, 0xf6, + 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, + 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 } }, + std::array, 1> { + { 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0xf8, 0xf7, 0xf6, + 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, + 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x16, + 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, + 0xf4, 0xf3, 0xf2, 0xf1, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, + 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::BYTE), + }, + { + "ld1rqh scalar+immediate", + TEST_FUNC("ld1rqh z27.h, p4/z, [%[base], #48]"), + { /*zt=*/27, /*pg=*/4 }, + std::array, 1> { + { 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1 } }, + std::array, 1> { { + + 0xfff8, 0xfff7, 0xfff6, 
0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1, 0xfff8, + 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1 } }, + std::array, 1> { + { 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1, + 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1, + 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1, + 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::HALF), + }, + { + "ld1rqw scalar+immediate", + TEST_FUNC("ld1rqw z28.s, p3/z, [%[base], #-16]"), + { /*zt=*/28, /*pg=*/3 }, + std::array, 1> { + { 0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1 } }, + std::array, 1> { { 0xfffffff4, 0xfffffff3, 0xfffffff2, + 0xfffffff1, 0xfffffff4, 0xfffffff3, + 0xfffffff2, 0xfffffff1 } }, + std::array, 1> { + { 0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1, 0xfffffff4, 0xfffffff3, + 0xfffffff2, 0xfffffff1, 0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1, + 0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::SINGLE), + }, + { + "ld1rqd scalar+immediate", + TEST_FUNC("ld1rqd z29.d, p2/z, [%[base], #-32]"), + { /*zt=*/29, /*pg=*/2 }, + std::array, 1> { + { 0xfffffffffffffff4, 0xfffffffffffffff3 } }, + std::array, 1> { + { 0xfffffffffffffff4, 0xfffffffffffffff3, 0xfffffffffffffff4, + 0xfffffffffffffff3 } }, + std::array, 1> { + { 0xfffffffffffffff4, 0xfffffffffffffff3, 0xfffffffffffffff4, + 0xfffffffffffffff3, 0xfffffffffffffff4, 0xfffffffffffffff3, + 0xfffffffffffffff4, 0xfffffffffffffff3 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE), + }, + { + "ld1rqd scalar+immediate (min index)", + TEST_FUNC("ld1rqd z30.d, p1/z, [%[base], #-128]"), + { /*zt=*/30, /*pg=*/1 }, + std::array, 1> { + { 0x0000000000000016, 0x0000000000000017 } }, + std::array, 1> { + { 0x0000000000000016, 0x0000000000000017, 0x0000000000000016, + 0x0000000000000017 } }, + std::array, 1> { + { 0x0000000000000016, 0x0000000000000017, 0x0000000000000016, + 
0x0000000000000017, 0x0000000000000016, 0x0000000000000017, + 0x0000000000000016, 0x0000000000000017 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE), + }, + { + "ld1rqd scalar+immediate (max index)", + TEST_FUNC("ld1rqd z31.d, p0/z, [%[base], #112]"), + { /*zt=*/31, /*pg=*/0 }, + std::array, 1> { + { 0x0000000000000014, 0x0000000000000015 } }, + std::array, 1> { + { 0x0000000000000014, 0x0000000000000015, 0x0000000000000014, + 0x0000000000000015 } }, + std::array, 1> { + { 0x0000000000000014, 0x0000000000000015, 0x0000000000000014, + 0x0000000000000015, 0x0000000000000014, 0x0000000000000015, + 0x0000000000000014, 0x0000000000000015 } }, + INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE), + }, }); # undef TEST_FUNC } diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.templatex b/suite/tests/client-interface/drx-scattergather-aarch64.templatex index 3215ba8dae4..12bdc719316 100644 --- a/suite/tests/client-interface/drx-scattergather-aarch64.templatex +++ b/suite/tests/client-interface/drx-scattergather-aarch64.templatex @@ -144,6 +144,10 @@ ldnt1w scalar\+scalar: PASS ld1sw scalar\+scalar: PASS ld1d scalar\+scalar: PASS ldnt1d scalar\+scalar: PASS +ld1rqb scalar\+scalar: PASS +ld1rqh scalar\+scalar: PASS +ld1rqw scalar\+scalar: PASS +ld1rqd scalar\+scalar: PASS ld2b scalar\+scalar: PASS ld2h scalar\+scalar: PASS ld2w scalar\+scalar: PASS @@ -212,6 +216,12 @@ ld1d scalar\+immediate 64bit element: PASS ld1d scalar\+immediate 64bit element \(min index\): PASS ld1d scalar\+immediate 64bit element \(max index\): PASS ldnt1d scalar\+immediate 64bit element: PASS +ld1rqb scalar\+immediate: PASS +ld1rqh scalar\+immediate: PASS +ld1rqw scalar\+immediate: PASS +ld1rqd scalar\+immediate: PASS +ld1rqd scalar\+immediate \(min index\): PASS +ld1rqd scalar\+immediate \(max index\): PASS ld2b scalar\+immediate: PASS ld2h scalar\+immediate: PASS ld2w scalar\+immediate: PASS @@ -269,7 +279,7 @@ st4d scalar\+immediate \(max index\): PASS 
#endif /* __ARM_FEATURE_SVE */ #ifndef TEST_SAMPLE_CLIENT #ifdef __ARM_FEATURE_SVE -event_exit, 1068 scatter/gather instructions +event_exit, 1108 scatter/gather instructions #else event_exit, 0 scatter/gather instructions #endif /* __ARM_FEATURE_SVE */