diff --git a/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex b/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex
index 7bee800377e..be2854f5170 100644
--- a/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex
+++ b/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex
@@ -2,18 +2,18 @@ Hello, world!
 ---- <application exited with code 0> ----
 Basic counts tool results:
 Total counts:
-         655 total \(fetched\) instructions
-         243 total unique \(fetched\) instructions
+         685 total \(fetched\) instructions
+         255 total unique \(fetched\) instructions
            0 total non-fetched instructions
            0 total prefetches
 #if (__ARM_FEATURE_SVE_BITS == 128)
-        1069 total data loads
+        1137 total data loads
          861 total data stores
 #elif (__ARM_FEATURE_SVE_BITS == 256)
-        1967 total data loads
+        2035 total data loads
         1595 total data stores
 #elif (__ARM_FEATURE_SVE_BITS == 512)
-        3763 total data loads
+        3831 total data loads
         3063 total data stores
 #endif
            0 total icache flushes
@@ -22,18 +22,18 @@ Total counts:
      .* total scheduling markers
 .*
 Thread .* counts:
-         655 \(fetched\) instructions
-         243 unique \(fetched\) instructions
+         685 \(fetched\) instructions
+         255 unique \(fetched\) instructions
            0 non-fetched instructions
            0 prefetches
 #if (__ARM_FEATURE_SVE_BITS == 128)
-        1069 data loads
+        1137 data loads
          861 data stores
 #elif (__ARM_FEATURE_SVE_BITS == 256)
-        1967 data loads
+        2035 data loads
         1595 data stores
 #elif (__ARM_FEATURE_SVE_BITS == 512)
-        3763 data loads
+        3831 data loads
         3063 data stores
 #endif
            0 icache flushes
diff --git a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm
index 2c0f86c15cc..b1105e6a7b2 100644
--- a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm
+++ b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm
@@ -366,6 +366,21 @@ test_scalar_plus_immediate:
 
         ret
 
+test_replicating_loads:
+        ld1rqb  DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, X_INDEX_REG]         // 16
+        ld1rqh  DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 8
+        ld1rqw  DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 4
+        ld1rqd  DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #3] // 2
+                                                                             // Total: 30
+
+        ld1rqb  DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #0] // 16
+        ld1rqh  DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #0] // 8
+        ld1rqw  DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #0] // 4
+        ld1rqd  DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #0] // 2
+                                                            // Total: 30
+
+        ret
+
 _start:
 #ifdef __APPLE__
         adrp     BUFFER_REG, buffer@PAGE
@@ -407,8 +422,10 @@ _start:
 
         bl      test_scalar_plus_immediate // +(374 * vl_bytes/16) loads
                                            // +(322 * vl_bytes/16) stores
+        bl      test_replicating_loads     // +60 loads
+                                           // +0 stores
         // Running total:
-        // Loads: (136 + 14 + 374 + 374) * vl_bytes/16 = 898 * vl_bytes/16
+        // Loads: (136 + 14 + 374 + 374) * vl_bytes/16 + 60 = 898 * vl_bytes/16 + 60
         // Stores: (82 + 8 + 322 + 322) * vl_bytes/16 = 734 * vl_bytes/16
 
         /* Run all the instructions with no active elements */
@@ -422,9 +439,10 @@ _start:
         bl      test_vector_plus_immediate // +0 loads, +0 stores
         bl      test_scalar_plus_scalar    // +0 loads, +0 stores
         bl      test_scalar_plus_immediate // +0 loads, +0 stores
+        bl      test_replicating_loads     // +0 loads, +0 stores
 
         // Running total (unchanged from above):
-        // Loads:  898 * vl_bytes/16
+        // Loads:  (898 * vl_bytes/16) + 60
         // Stores: 734 * vl_bytes/16
 
         /* Run all instructions with one active element */
@@ -437,26 +455,28 @@ _start:
         bl      test_vector_plus_immediate // +7 loads,  +4 stores
         bl      test_scalar_plus_scalar    // +56 loads, +46 stores
         bl      test_scalar_plus_immediate // +56 loads, +46 stores
+        bl      test_replicating_loads     // +8 loads, +0 stores
 
         // Running total:
-        // Loads:  (898 * vl_bytes/16) + 52 + 7 + 56 + 56 = (898 * vl_bytes/16) + 171
+        // Loads:  (898 * vl_bytes/16) + 60 + 52 + 7 + 56 + 56 + 8 = (898 * vl_bytes/16) + 239
         // Stores: (734 * vl_bytes/16) + 41 + 4 + 46 + 46 = (734 * vl_bytes/16) + 127
 
         // The functions in this file have the following instructions counts:
-        //     _start                       37
+        //     _start                       40
         //     test_scalar_plus_vector      84
         //     test_vector_plus_immediate   12
         //     test_scalar_plus_scalar      55
         //     test_scalar_plus_immediate   55
-        // So there are 37 + 84 + 12 + 55 + 55 = 243 unique instructions
-        // We run the test_* functions 3 times each so the totoal instruction executed is
-        //     ((84 + 12 + 55 + 55) * 3) + 37 = (206 * 3) + 37 = 655
+        //     test_replicating_loads       9
+        // So there are 40 + 84 + 12 + 55 + 55 + 9 = 255 unique instructions
+        // We run the test_* functions 3 times each so the total instruction count is
+        //     ((84 + 12 + 55 + 55 + 9) * 3) + 40 = (215 * 3) + 40 = 685
 
         // Totals:
-        // Loads:  (898 * vl_bytes/16) + 171
+        // Loads:  (898 * vl_bytes/16) + 239
         // Stores: (734 * vl_bytes/16) + 127
-        // Instructions: 703
-        // Unique instructions: 259
+        // Instructions: 685
+        // Unique instructions: 255
 
 // Exit.
         mov      w0, #1            // stdout
diff --git a/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex b/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex
index 04f579ec8b8..3cda0549822 100644
--- a/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex
+++ b/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex
@@ -1,18 +1,18 @@
 Hello, world!
 Basic counts tool results:
 Total counts:
-         655 total \(fetched\) instructions
-         243 total unique \(fetched\) instructions
+         685 total \(fetched\) instructions
+         255 total unique \(fetched\) instructions
            0 total non-fetched instructions
            0 total prefetches
 #if (__ARM_FEATURE_SVE_BITS == 128)
-        1069 total data loads
+        1137 total data loads
          861 total data stores
 #elif (__ARM_FEATURE_SVE_BITS == 256)
-        1967 total data loads
+        2035 total data loads
         1595 total data stores
 #elif (__ARM_FEATURE_SVE_BITS == 512)
-        3763 total data loads
+        3831 total data loads
         3063 total data stores
 #endif
            0 total icache flushes
@@ -21,18 +21,18 @@ Total counts:
      .* total scheduling markers
 .*
 Thread .* counts:
-         655 \(fetched\) instructions
-         243 unique \(fetched\) instructions
+         685 \(fetched\) instructions
+         255 unique \(fetched\) instructions
            0 non-fetched instructions
            0 prefetches
 #if (__ARM_FEATURE_SVE_BITS == 128)
-        1069 data loads
+        1137 data loads
          861 data stores
 #elif (__ARM_FEATURE_SVE_BITS == 256)
-        1967 data loads
+        2035 data loads
         1595 data stores
 #elif (__ARM_FEATURE_SVE_BITS == 512)
-        3763 data loads
+        3831 data loads
         3063 data stores
 #endif
            0 icache flushes
diff --git a/clients/drcachesim/tests/scattergather-aarch64.templatex b/clients/drcachesim/tests/scattergather-aarch64.templatex
index c3c7a9762b7..6efdb5b4544 100644
--- a/clients/drcachesim/tests/scattergather-aarch64.templatex
+++ b/clients/drcachesim/tests/scattergather-aarch64.templatex
@@ -144,6 +144,10 @@ ldnt1w scalar\+scalar: PASS
 ld1sw scalar\+scalar: PASS
 ld1d scalar\+scalar: PASS
 ldnt1d scalar\+scalar: PASS
+ld1rqb scalar\+scalar: PASS
+ld1rqh scalar\+scalar: PASS
+ld1rqw scalar\+scalar: PASS
+ld1rqd scalar\+scalar: PASS
 ld2b scalar\+scalar: PASS
 ld2h scalar\+scalar: PASS
 ld2w scalar\+scalar: PASS
@@ -212,6 +216,12 @@ ld1d scalar\+immediate 64bit element: PASS
 ld1d scalar\+immediate 64bit element \(min index\): PASS
 ld1d scalar\+immediate 64bit element \(max index\): PASS
 ldnt1d scalar\+immediate 64bit element: PASS
+ld1rqb scalar\+immediate: PASS
+ld1rqh scalar\+immediate: PASS
+ld1rqw scalar\+immediate: PASS
+ld1rqd scalar\+immediate: PASS
+ld1rqd scalar\+immediate \(min index\): PASS
+ld1rqd scalar\+immediate \(max index\): PASS
 ld2b scalar\+immediate: PASS
 ld2h scalar\+immediate: PASS
 ld2w scalar\+immediate: PASS
diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c
index d8b70dfc292..e1fd2db8362 100644
--- a/ext/drx/scatter_gather_aarch64.c
+++ b/ext/drx/scatter_gather_aarch64.c
@@ -68,7 +68,7 @@ typedef struct _per_thread_t {
  * This corresponds to the spill slot storage in per_thread_t.
  */
 typedef struct _spill_slot_state_t {
-#define NUM_PRED_SLOTS 1
+#define NUM_PRED_SLOTS 2
     reg_id_t pred_slots[NUM_PRED_SLOTS];
 
 #define NUM_VECTOR_SLOTS 1
@@ -621,12 +621,37 @@ static void
 expand_contiguous(void *drcontext, instrlist_t *bb, instr_t *sg_instr,
                   const scatter_gather_info_t *sg_info, reg_id_t new_base,
                   reg_id_t scalar_index, reg_id_t scalar_src_or_dst,
-                  reg_id_t scratch_pred, reg_id_t scratch_vec, app_pc orig_app_pc)
+                  reg_id_t scratch_pred, reg_id_t governing_pred, reg_id_t scratch_vec,
+                  app_pc orig_app_pc)
 {
 #define EMIT(op, ...)    \
     instrlist_preinsert( \
         bb, sg_instr, INSTR_XL8(INSTR_CREATE_##op(drcontext, __VA_ARGS__), orig_app_pc))
 
+    if (sg_info->is_replicating && proc_get_vector_length_bytes() > 16) {
+        /* This instruction loads a fixed size 16-byte vector which is replicated to
+         * all quadword elements on hardware with a vector length > 16 bytes.
+         * Only the bottom 16 bits of the governing predicate register are used so we
+         * need to mask out any higher bits than that.
+         */
+        DR_ASSERT(sg_info->scatter_gather_size == OPSZ_16);
+
+        /* Set scratch_pred to a value with the first 16 elements active */
+        /* ptrue    scratch_pred.b, vl16 */
+        EMIT(ptrue_sve, opnd_create_reg_element_vector(scratch_pred, OPSZ_1),
+             opnd_create_immed_pred_constr(DR_PRED_CONSTR_VL16));
+
+        /* Create a new governing predicate by applying the mask we created in
+         * scratch_pred to the instruction's mask_reg.
+         */
+
+        /* and      governing_pred.b, mask_reg/z, mask_reg.b, scratch_pred.b */
+        EMIT(and_sve_pred_b, opnd_create_reg_element_vector(governing_pred, OPSZ_1),
+             opnd_create_predicate_reg(sg_info->mask_reg, /*merging=*/false),
+             opnd_create_reg_element_vector(sg_info->mask_reg, OPSZ_1),
+             opnd_create_reg_element_vector(scratch_pred, OPSZ_1));
+    }
+
     /* Calculate the new base address in scratch_gpr0.
      * Note that we can't use drutil_insert_get_mem_addr() here because we don't want the
      * BSD licensed drx to have a dependency on the LGPL licensed drutil.
@@ -686,6 +711,7 @@ expand_contiguous(void *drcontext, instrlist_t *bb, instr_t *sg_instr,
     } else {
         /* scalar+scalar: Keep the original modifier copied from sg_info */
     }
+    modified_sg_info.mask_reg = governing_pred;
 
     /* Note that modified_sg_info might not describe a valid SVE instruction.
      * For example if we are expanding:
@@ -702,6 +728,21 @@ expand_contiguous(void *drcontext, instrlist_t *bb, instr_t *sg_instr,
     /* Expand the instruction as if it were a scalar+vector scatter/gather instruction */
     expand_scatter_gather(drcontext, bb, sg_instr, &modified_sg_info, scalar_index,
                           scalar_src_or_dst, scratch_pred, orig_app_pc);
+
+    if (sg_info->is_replicating && proc_get_vector_length_bytes() > 16) {
+        /* All supported replicating loads load a 16-byte vector. */
+        DR_ASSERT(sg_info->scatter_gather_size == OPSZ_16);
+
+        /* Replicate the first quadword element (16 bytes) to the other elements in the
+         * vector.
+         */
+
+        /* dup      gather_dst.q, gather_dst.q[0] */
+        EMIT(dup_sve_idx,
+             opnd_create_reg_element_vector(sg_info->gather_dst_reg, OPSZ_16),
+             opnd_create_reg_element_vector(sg_info->gather_dst_reg, OPSZ_16),
+             opnd_create_immed_uint(0, OPSZ_2b));
+    }
 #undef EMIT
 }
 
@@ -713,13 +754,20 @@ expand_contiguous(void *drcontext, instrlist_t *bb, instr_t *sg_instr,
 reg_id_t
 reserve_sve_register(void *drcontext, instrlist_t *bb, instr_t *where,
                      reg_id_t scratch_gpr0, reg_id_t min_register, reg_id_t max_register,
-                     size_t slot_tls_offset, opnd_size_t reg_size, uint slot_num)
+                     size_t slot_tls_offset, opnd_size_t reg_size, uint slot_num,
+                     reg_id_t *already_allocated_regs, uint num_already_allocated)
 {
     /* Search the instruction for an unused register we will use as a temp. */
     reg_id_t reg;
     for (reg = min_register; reg <= max_register; ++reg) {
-        if (!instr_uses_reg(where, reg))
-            break;
+        if (!instr_uses_reg(where, reg)) {
+            bool reg_already_allocated = false;
+            for (uint i = 0; !reg_already_allocated && i < num_already_allocated; i++) {
+                reg_already_allocated = already_allocated_regs[i] == reg;
+            }
+            if (!reg_already_allocated)
+                break;
+        }
     }
     DR_ASSERT(!instr_uses_reg(where, reg));
 
@@ -759,10 +807,11 @@ reserve_pred_register(void *drcontext, instrlist_t *bb, instr_t *where,
     /* Some instructions require the predicate to be in the range p0 - p7. This includes
      * LASTB which we use to extract elements from the vector register.
      */
-    const reg_id_t reg = reserve_sve_register(
-        drcontext, bb, where, scratch_gpr0, DR_REG_P0, DR_REG_P7,
-        offsetof(per_thread_t, scratch_pred_spill_slots),
-        opnd_size_from_bytes(proc_get_vector_length_bytes() / 8), slot);
+    const reg_id_t reg =
+        reserve_sve_register(drcontext, bb, where, scratch_gpr0, DR_REG_P0, DR_REG_P7,
+                             offsetof(per_thread_t, scratch_pred_spill_slots),
+                             opnd_size_from_bytes(proc_get_vector_length_bytes() / 8),
+                             slot, slot_state->pred_slots, slot);
 
     slot_state->pred_slots[slot] = reg;
     return reg;
@@ -783,7 +832,8 @@ reserve_vector_register(void *drcontext, instrlist_t *bb, instr_t *where,
     const reg_id_t reg =
         reserve_sve_register(drcontext, bb, where, scratch_gpr0, DR_REG_Z0, DR_REG_Z31,
                              offsetof(per_thread_t, scratch_vector_spill_slots_aligned),
-                             opnd_size_from_bytes(proc_get_vector_length_bytes()), slot);
+                             opnd_size_from_bytes(proc_get_vector_length_bytes()), slot,
+                             slot_state->vector_slots, slot);
 
     slot_state->vector_slots[slot] = reg;
     return reg;
@@ -897,10 +947,6 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e
         /* TODO i#5036: Add support for first-fault and non-fault accesses. */
         return true;
     }
-    if (sg_info.is_replicating) {
-        /* TODO i#5036: Add support for ld1rq* replicating loads. */
-        return true;
-    }
 
     const bool is_contiguous =
         !(reg_is_z(sg_info.base_reg) || reg_is_z(sg_info.index_reg));
@@ -967,6 +1013,12 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e
                                               &spill_slot_state);
     }
 
+    reg_id_t governing_pred = sg_info.mask_reg;
+    if (sg_info.is_replicating && proc_get_vector_length_bytes() > 16) {
+        governing_pred = reserve_pred_register(drcontext, bb, sg_instr, scratch_gpr,
+                                               &spill_slot_state);
+    }
+
     const app_pc orig_app_pc = instr_get_app_pc(sg_instr);
 
     emulated_instr_t emulated_instr;
@@ -980,8 +1032,8 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e
     if (is_contiguous) {
         /* scalar+scalar or scalar+immediate predicated contiguous access */
         expand_contiguous(drcontext, bb, sg_instr, &sg_info, contiguous_new_base,
-                          scratch_gpr, scalar_src_or_dst, scratch_pred, scratch_vec,
-                          orig_app_pc);
+                          scratch_gpr, scalar_src_or_dst, scratch_pred, governing_pred,
+                          scratch_vec, orig_app_pc);
     } else {
         /* scalar+vector or vector+immediate scatter/gather */
         expand_scatter_gather(drcontext, bb, sg_instr, &sg_info, scratch_gpr,
@@ -990,13 +1042,21 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e
 
     drmgr_insert_emulation_end(drcontext, bb, sg_instr);
 
-    if (scratch_vec != DR_REG_INVALID) {
-        unreserve_vector_register(drcontext, bb, sg_instr, scratch_gpr, scratch_vec,
-                                  &spill_slot_state);
+    for (uint i = 0; i < NUM_VECTOR_SLOTS; i++) {
+        const reg_id_t reg = spill_slot_state.vector_slots[i];
+        if (reg != DR_REG_NULL) {
+            unreserve_vector_register(drcontext, bb, sg_instr, scratch_gpr, reg,
+                                      &spill_slot_state);
+        }
     }
 
-    unreserve_pred_register(drcontext, bb, sg_instr, scratch_gpr, scratch_pred,
-                            &spill_slot_state);
+    for (uint i = 0; i < NUM_PRED_SLOTS; i++) {
+        const reg_id_t reg = spill_slot_state.pred_slots[i];
+        if (reg != DR_REG_NULL) {
+            unreserve_pred_register(drcontext, bb, sg_instr, scratch_gpr, reg,
+                                    &spill_slot_state);
+        }
+    }
 
     if (drreg_unreserve_register(drcontext, bb, sg_instr, scratch_gpr) != DRREG_SUCCESS) {
         DR_ASSERT_MSG(false, "drreg_unreserve_register should not fail");
diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.cpp b/suite/tests/client-interface/drx-scattergather-aarch64.cpp
index e0fabdf0836..7661cc862a8 100644
--- a/suite/tests/client-interface/drx-scattergather-aarch64.cpp
+++ b/suite/tests/client-interface/drx-scattergather-aarch64.cpp
@@ -2843,6 +2843,55 @@ test_ld1_scalar_plus_scalar()
             INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE),
             /*index=*/2,
         },
+        // Load and replicate instructions
+        {
+            "ld1rqb scalar+scalar",
+            TEST_FUNC("ld1rqb z21.b, p1/z, [%[base], %[index]]"),
+            { /*zt=*/ { 21 }, /*pg=*/1 },
+            std::array<std::array<uint8_t, 64>, 1> {
+                { 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
+                  0x17, 0x18, 0x19, 0x20, 0x21, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11,
+                  0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x06,
+                  0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                  0x18, 0x19, 0x20, 0x21, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12,
+                  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::BYTE),
+            /*index=*/6,
+        },
+        {
+            "ld1rqh scalar+scalar",
+            TEST_FUNC("ld1rqh z25.h, p0/z, [%[base], %[index], lsl #1]"),
+            { /*zt=*/ { 25 }, /*pg=*/0 },
+            std::array<std::array<uint16_t, 32>, 1> {
+                { 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019,
+                  0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019,
+                  0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019,
+                  0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::HALF),
+            /*index=*/12,
+        },
+        {
+            "ld1rqw scalar+scalar",
+            TEST_FUNC("ld1rqw z29.s, p1/z, [%[base], %[index], lsl #2]"),
+            { /*zt=*/ { 29 }, /*pg=*/1 },
+            std::array<std::array<uint32_t, 16>, 1> {
+                { 0x00000020, 0x00000021, 0x00000022, 0x00000023, 0x00000020, 0x00000021,
+                  0x00000022, 0x00000023, 0x00000020, 0x00000021, 0x00000022, 0x00000023,
+                  0x00000020, 0x00000021, 0x00000022, 0x00000023 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::SINGLE),
+            /*index=*/-12,
+        },
+        {
+            "ld1rqd scalar+scalar",
+            TEST_FUNC("ld1rqd z31.d, p2/z, [%[base], %[index], lsl #3]"),
+            { /*zt=*/ { 31 }, /*pg=*/2 },
+            std::array<std::array<uint64_t, 8>, 1> {
+                { 0xfffffffffffffff6, 0xfffffffffffffff5, 0xfffffffffffffff6,
+                  0xfffffffffffffff5, 0xfffffffffffffff6, 0xfffffffffffffff5,
+                  0xfffffffffffffff6, 0xfffffffffffffff5 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE),
+            /*index=*/-6,
+        },
     });
 #    undef TEST_FUNC
 }
@@ -4371,6 +4420,104 @@ test_ld1_scalar_plus_immediate()
                   0x0000000000000022, 0x0000000000000023 } },
             INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE),
         },
+        // Load and replicate instructions
+        {
+            "ld1rqb scalar+immediate",
+            TEST_FUNC("ld1rqb z26.b, p5/z, [%[base], #80]"),
+            { /*zt=*/26, /*pg=*/5 },
+            std::array<std::array<uint8_t, 16>, 1> { { 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
+                                                       0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5,
+                                                       0xf4, 0xf3, 0xf2, 0xf1 } },
+            std::array<std::array<uint8_t, 32>, 1> {
+                { 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0xf8, 0xf7, 0xf6,
+                  0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
+                  0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 } },
+            std::array<std::array<uint8_t, 64>, 1> {
+                { 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0xf8, 0xf7, 0xf6,
+                  0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21,
+                  0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x16,
+                  0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5,
+                  0xf4, 0xf3, 0xf2, 0xf1, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22,
+                  0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::BYTE),
+        },
+        {
+            "ld1rqh scalar+immediate",
+            TEST_FUNC("ld1rqh z27.h, p4/z, [%[base], #48]"),
+            { /*zt=*/27, /*pg=*/4 },
+            std::array<std::array<uint16_t, 8>, 1> {
+                { 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1 } },
+            std::array<std::array<uint16_t, 16>, 1> { {
+
+                0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1, 0xfff8,
+                0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1 } },
+            std::array<std::array<uint16_t, 32>, 1> {
+                { 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1,
+                  0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1,
+                  0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1,
+                  0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, 0xfff1 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::HALF),
+        },
+        {
+            "ld1rqw scalar+immediate",
+            TEST_FUNC("ld1rqw z28.s, p3/z, [%[base], #-16]"),
+            { /*zt=*/28, /*pg=*/3 },
+            std::array<std::array<uint32_t, 4>, 1> {
+                { 0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1 } },
+            std::array<std::array<uint32_t, 8>, 1> { { 0xfffffff4, 0xfffffff3, 0xfffffff2,
+                                                       0xfffffff1, 0xfffffff4, 0xfffffff3,
+                                                       0xfffffff2, 0xfffffff1 } },
+            std::array<std::array<uint32_t, 16>, 1> {
+                { 0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1, 0xfffffff4, 0xfffffff3,
+                  0xfffffff2, 0xfffffff1, 0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1,
+                  0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::SINGLE),
+        },
+        {
+            "ld1rqd scalar+immediate",
+            TEST_FUNC("ld1rqd z29.d, p2/z, [%[base], #-32]"),
+            { /*zt=*/29, /*pg=*/2 },
+            std::array<std::array<uint64_t, 2>, 1> {
+                { 0xfffffffffffffff4, 0xfffffffffffffff3 } },
+            std::array<std::array<uint64_t, 4>, 1> {
+                { 0xfffffffffffffff4, 0xfffffffffffffff3, 0xfffffffffffffff4,
+                  0xfffffffffffffff3 } },
+            std::array<std::array<uint64_t, 8>, 1> {
+                { 0xfffffffffffffff4, 0xfffffffffffffff3, 0xfffffffffffffff4,
+                  0xfffffffffffffff3, 0xfffffffffffffff4, 0xfffffffffffffff3,
+                  0xfffffffffffffff4, 0xfffffffffffffff3 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE),
+        },
+        {
+            "ld1rqd scalar+immediate (min index)",
+            TEST_FUNC("ld1rqd z30.d, p1/z, [%[base], #-128]"),
+            { /*zt=*/30, /*pg=*/1 },
+            std::array<std::array<uint64_t, 2>, 1> {
+                { 0x0000000000000016, 0x0000000000000017 } },
+            std::array<std::array<uint64_t, 4>, 1> {
+                { 0x0000000000000016, 0x0000000000000017, 0x0000000000000016,
+                  0x0000000000000017 } },
+            std::array<std::array<uint64_t, 8>, 1> {
+                { 0x0000000000000016, 0x0000000000000017, 0x0000000000000016,
+                  0x0000000000000017, 0x0000000000000016, 0x0000000000000017,
+                  0x0000000000000016, 0x0000000000000017 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE),
+        },
+        {
+            "ld1rqd scalar+immediate (max index)",
+            TEST_FUNC("ld1rqd z31.d, p0/z, [%[base], #112]"),
+            { /*zt=*/31, /*pg=*/0 },
+            std::array<std::array<uint64_t, 2>, 1> {
+                { 0x0000000000000014, 0x0000000000000015 } },
+            std::array<std::array<uint64_t, 4>, 1> {
+                { 0x0000000000000014, 0x0000000000000015, 0x0000000000000014,
+                  0x0000000000000015 } },
+            std::array<std::array<uint64_t, 8>, 1> {
+                { 0x0000000000000014, 0x0000000000000015, 0x0000000000000014,
+                  0x0000000000000015, 0x0000000000000014, 0x0000000000000015,
+                  0x0000000000000014, 0x0000000000000015 } },
+            INPUT_DATA.base_addr_for_data_size(element_size_t::DOUBLE),
+        },
     });
 #    undef TEST_FUNC
 }
diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.templatex b/suite/tests/client-interface/drx-scattergather-aarch64.templatex
index 3215ba8dae4..12bdc719316 100644
--- a/suite/tests/client-interface/drx-scattergather-aarch64.templatex
+++ b/suite/tests/client-interface/drx-scattergather-aarch64.templatex
@@ -144,6 +144,10 @@ ldnt1w scalar\+scalar: PASS
 ld1sw scalar\+scalar: PASS
 ld1d scalar\+scalar: PASS
 ldnt1d scalar\+scalar: PASS
+ld1rqb scalar\+scalar: PASS
+ld1rqh scalar\+scalar: PASS
+ld1rqw scalar\+scalar: PASS
+ld1rqd scalar\+scalar: PASS
 ld2b scalar\+scalar: PASS
 ld2h scalar\+scalar: PASS
 ld2w scalar\+scalar: PASS
@@ -212,6 +216,12 @@ ld1d scalar\+immediate 64bit element: PASS
 ld1d scalar\+immediate 64bit element \(min index\): PASS
 ld1d scalar\+immediate 64bit element \(max index\): PASS
 ldnt1d scalar\+immediate 64bit element: PASS
+ld1rqb scalar\+immediate: PASS
+ld1rqh scalar\+immediate: PASS
+ld1rqw scalar\+immediate: PASS
+ld1rqd scalar\+immediate: PASS
+ld1rqd scalar\+immediate \(min index\): PASS
+ld1rqd scalar\+immediate \(max index\): PASS
 ld2b scalar\+immediate: PASS
 ld2h scalar\+immediate: PASS
 ld2w scalar\+immediate: PASS
@@ -269,7 +279,7 @@ st4d scalar\+immediate \(max index\): PASS
 #endif /* __ARM_FEATURE_SVE */
 #ifndef TEST_SAMPLE_CLIENT
 #ifdef __ARM_FEATURE_SVE
-event_exit, 1068 scatter/gather instructions
+event_exit, 1108 scatter/gather instructions
 #else
 event_exit, 0 scatter/gather instructions
 #endif /* __ARM_FEATURE_SVE */