Skip to content

Commit

Permalink
[all]: Rework Neon load/store and load/store pair
Browse files Browse the repository at this point in the history
Currently, Neon load/store is split into two instructions: one for
handling the immediate/register offset variants and another for handling
the post-index variant. The same applies to Neon load/store pair. Note
that support for the pre-index variant of Neon load/store is missing.

Let's refactor those instructions to take advantage of the existing
infrastructure for handling addressing modes. That removes the need for
a special instruction for the post-index variant and, as a bonus,
naturally brings support for the pre-index variant of Neon load/store.

Signed-off-by: Vladimir Murzin <[email protected]>
  • Loading branch information
Vladimir Murzin committed Mar 19, 2024
1 parent d9f2328 commit 6cdf3cf
Show file tree
Hide file tree
Showing 7 changed files with 301 additions and 450 deletions.
18 changes: 9 additions & 9 deletions gen/AArch64Compile_gen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ module Make(Cfg:Config) : XXXCompile_gen.S =
let emit_load_reg temporal st init rA =
let r1,st = next_vreg st in
let r2,st = next_vreg st in
let ldp = [I_LDP_SIMD(temporal,A64.VSIMD32,to_scalar r1,to_scalar r2,rA,0)] in
let ldp = [I_LDP_SIMD(temporal,A64.VSIMD32,to_scalar r1,to_scalar r2,rA,(0,A64.Idx))] in
let r3,st = next_vreg st in
let add = [I_ADD_SIMD (r3,r1,r2)] in
let rX,st = next_reg st in
Expand All @@ -603,7 +603,7 @@ module Make(Cfg:Config) : XXXCompile_gen.S =
module LDUR = struct
let emit_load_reg st init rA =
let r,st = next_scalar_reg st in
let ldur = [I_LDUR_SIMD(A64.VSIMD32,r,rA,None)] in
let ldur = [I_LDUR_SIMD(A64.VSIMD32,r,rA,0)] in
let rX,st = next_reg st in
let fmov = [I_FMOV_TG(A64.V32,rX,A64.VSIMD32,r)] in
rX,init,lift_code (ldur@fmov),st
Expand All @@ -622,7 +622,7 @@ module Make(Cfg:Config) : XXXCompile_gen.S =
module LDAPUR = struct
let emit_load_reg st init rA =
let r,st = next_scalar_reg st in
let ldur = [I_LDAPUR_SIMD(A64.VSIMD32,r,rA,None)] in
let ldur = [I_LDAPUR_SIMD(A64.VSIMD32,r,rA,0)] in
let rX,st = next_reg st in
let fmov = [I_FMOV_TG(A64.V32,rX,A64.VSIMD32,r)] in
rX,init,lift_code (ldur@fmov),st
Expand Down Expand Up @@ -834,7 +834,7 @@ module Make(Cfg:Config) : XXXCompile_gen.S =
let r1,st = next_vreg st in
let r2,st = next_vreg st in
let movi = List.mapi (fun i r -> movi_reg r (v+i)) [r1;r2] in
let stp = [I_STP_SIMD(temporal,A64.VSIMD32,to_scalar r1,to_scalar r2,rA,0)] in
let stp = [I_STP_SIMD(temporal,A64.VSIMD32,to_scalar r1,to_scalar r2,rA,(0,A64.Idx))] in
init,pseudo movi@pseudo stp,st

let emit_store n st p init loc v =
Expand All @@ -854,15 +854,15 @@ module Make(Cfg:Config) : XXXCompile_gen.S =
let r2,st = next_vreg st in
let movi = List.mapi (fun i r -> movi_reg r (v+i)) [r1;r2] in
let adds = List.map (fun v -> add_simd v rB) [r1;r2] in
let stp = [I_STP_SIMD(temporal,A64.VSIMD32,to_scalar r1,to_scalar r2,rA,0)] in
let stp = [I_STP_SIMD(temporal,A64.VSIMD32,to_scalar r1,to_scalar r2,rA,(0,A64.Idx))] in
init,lift_code(dup@movi@adds@stp),st
end

module STUR = struct
let emit_store_reg st init rA v =
let r,st = next_vreg st in
let movi = [movi_reg r v] in
let stur = [I_STUR_SIMD(A64.VSIMD32,to_scalar r,rA,None)] in
let stur = [I_STUR_SIMD(A64.VSIMD32,to_scalar r,rA,0)] in
init,lift_code(movi@stur),st

let emit_store st p init loc v =
Expand All @@ -881,15 +881,15 @@ module Make(Cfg:Config) : XXXCompile_gen.S =
let r1,st = next_vreg st in
let movi = [movi_reg r1 v] in
let adds = [add_simd r1 rB]in
let stur = [I_STUR_SIMD(A64.VSIMD32,to_scalar r1,rA,None)] in
let stur = [I_STUR_SIMD(A64.VSIMD32,to_scalar r1,rA,0)] in
init,lift_code(dup@movi@adds@stur),st
end

module STLUR = struct
let emit_store_reg st init rA v =
let r,st = next_vreg st in
let movi = [movi_reg r v] in
let stlur = [I_STLUR_SIMD(A64.VSIMD32,to_scalar r,rA,None)] in
let stlur = [I_STLUR_SIMD(A64.VSIMD32,to_scalar r,rA,0)] in
init,lift_code(movi@stlur),st

let emit_store st p init loc v =
Expand All @@ -908,7 +908,7 @@ module Make(Cfg:Config) : XXXCompile_gen.S =
let r1,st = next_vreg st in
let movi = [movi_reg r1 v] in
let adds = [add_simd r1 rB]in
let stlur = [I_STLUR_SIMD(A64.VSIMD32,to_scalar r1,rA,None)] in
let stlur = [I_STLUR_SIMD(A64.VSIMD32,to_scalar r1,rA,0)] in
init,lift_code(dup@movi@adds@stlur),st
end

Expand Down
26 changes: 11 additions & 15 deletions herd/AArch64Arch_herd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,16 @@ module Make (C:Arch_herd.Config)(V:Value.AArch64) =
| I_EOR_SIMD _| I_ERET| I_FENCE _| I_GC _| I_IC _| I_LD1 _| I_LD1M _| I_LD1R _ | I_LDAP1 _
| I_LD2 _| I_LD2M _| I_LD2R _| I_LD3 _| I_LD3M _| I_LD3R _| I_LD4 _| I_LD4M _
| I_LD4R _| I_LDAR _| I_LDARBH _| I_LDCT _| I_LDG _| I_LDOP _| I_LDOPBH _
| I_LDP _| I_LDP_P_SIMD _| I_LDP_SIMD _| I_LDPSW _| I_LDR _
| I_LDRSW _ | I_LDR_P_SIMD _ | I_LDAPUR_SIMD _
| I_LDR_SIMD _| I_LDRBH _| I_LDRS _| I_LDUR _| I_LDUR_SIMD _| I_LDXP _| I_MOV _ | I_FMOV_TG _
| I_LDP _| I_LDP_SIMD _| I_LDPSW _| I_LDR _
| I_LDRSW _ | I_LDR_SIMD _ | I_LDAPUR_SIMD _
| I_LDRBH _| I_LDRS _| I_LDUR _| I_LDUR_SIMD _| I_LDXP _| I_MOV _ | I_FMOV_TG _
| I_ADDV _| I_DUP _ | I_MOV_FG _| I_MOV_S _| I_MOV_TG _| I_MOV_V _| I_MOV_VE _| I_MOVI_S _
| I_MOVI_V _| I_MOVK _| I_MOVZ _| I_MOVN _| I_MRS _| I_MSR _| I_OP3 _| I_RBIT _
| I_RET _
| I_SBFM _| I_SC _| I_SEAL _| I_ST1 _| I_STL1 _| I_ST1M _| I_ST2 _| I_ST2M _| I_ST3 _
| I_ST3M _| I_ST4 _| I_ST4M _| I_STCT _| I_STG _| I_STLR _| I_STLRBH _| I_STOP _
| I_STOPBH _| I_STP _| I_STP_P_SIMD _| I_STP_SIMD _| I_STR _ | I_STLUR_SIMD _
| I_STR_P_SIMD _| I_STR_SIMD _| I_STRBH _| I_STUR_SIMD _| I_STXP _| I_STXR _
| I_STOPBH _| I_STP _| I_STP_SIMD _| I_STR _ | I_STLUR_SIMD _
| I_STR_SIMD _| I_STRBH _| I_STUR_SIMD _| I_STXP _| I_STXR _
| I_STXRBH _| I_STZG _| I_STZ2G _
| I_SWP _| I_SWPBH _| I_SXTW _| I_TLBI _| I_UBFM _
| I_UDF _| I_UNSEAL _ | I_ADDSUBEXT _ | I_ABS _ | I_REV _ | I_EXTR _
Expand Down Expand Up @@ -229,10 +229,10 @@ module Make (C:Arch_herd.Config)(V:Value.AArch64) =
| I_LDOP (_,v,_,_,_,_) | I_STOP (_,v,_,_,_) ->
Some (tr_variant v)
| I_STZG _|I_STZ2G _ -> Some MachSize.granule
| I_LDR_SIMD (v,_,_,_,_) | I_LDR_P_SIMD (v,_,_,_)
| I_LDP_SIMD (_,v,_,_,_,_) | I_LDP_P_SIMD (_,v,_,_,_,_)
| I_STR_SIMD (v,_,_,_,_) | I_STR_P_SIMD (v,_,_,_)
| I_STP_SIMD (_,v,_,_,_,_) | I_STP_P_SIMD (_,v,_,_,_,_)
| I_LDR_SIMD (v,_,_,_)
| I_LDP_SIMD (_,v,_,_,_,_)
| I_STR_SIMD (v,_,_,_)
| I_STP_SIMD (_,v,_,_,_,_)
| I_LDUR_SIMD (v,_,_,_) | I_STUR_SIMD (v,_,_,_)
| I_LDAPUR_SIMD (v,_,_,_) | I_STLUR_SIMD (v,_,_,_) ->
Some (tr_simd_variant v)
Expand Down Expand Up @@ -344,10 +344,8 @@ module Make (C:Arch_herd.Config)(V:Value.AArch64) =
| I_LD3R _|I_LD4 _|I_LD4M _|I_LD4R _
| I_ST1 _|I_ST1M _|I_ST2 _|I_ST2M _
| I_ST3 _|I_ST3M _|I_ST4 _|I_ST4M _
| I_LDP_P_SIMD _|I_STP_P_SIMD _
| I_LDP_SIMD _|I_STP_SIMD _
| I_LDR_SIMD _|I_LDR_P_SIMD _
| I_STR_SIMD _|I_STR_P_SIMD _
| I_LDR_SIMD _|I_STR_SIMD _
| I_LDUR_SIMD _|I_LDAPUR_SIMD _|I_STUR_SIMD _|I_STLUR_SIMD _
| I_MOV_VE _
| I_MOV_V _|I_MOV_TG _|I_MOV_FG _
Expand Down Expand Up @@ -376,10 +374,8 @@ module Make (C:Arch_herd.Config)(V:Value.AArch64) =
| I_LD4 _|I_LD4M _|I_LD4R _|I_ST1 _|I_STL1 _
| I_ST1M _|I_ST2 _|I_ST2M _|I_ST3 _
| I_ST3M _|I_ST4 _|I_ST4M _
| I_LDP_P_SIMD _|I_STP_P_SIMD _
| I_LDP_SIMD _|I_STP_SIMD _
| I_LDR_SIMD _|I_LDR_P_SIMD _
| I_STR_SIMD _|I_STR_P_SIMD _
| I_LDR_SIMD _|I_STR_SIMD _
| I_LDUR_SIMD _|I_LDAPUR_SIMD _|I_STUR_SIMD _|I_STLUR_SIMD _
| I_MOV_VE _
| I_MOV_V _|I_MOV_TG _|I_MOV_FG _
Expand Down
118 changes: 73 additions & 45 deletions herd/AArch64Sem.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1890,9 +1890,9 @@ module Make
let simd_ldr = do_simd_ldr Annot.N
let simd_ldar = do_simd_ldr Annot.Q

let do_simd_str an sz rs rd kr s ii =
get_ea rs kr s ii >>|
read_reg_neon true rd ii >>= fun (addr, v) ->
let do_simd_str an sz ma rd ii =
ma >>|
read_reg_neon true rd ii >>= fun (addr,v) ->
if sz == MachSize.S128 then
do_write_mem_2_ops sz an aexp Access.VIR addr v ii >>= B.next2T
else
Expand All @@ -1901,9 +1901,9 @@ module Make
let simd_str = do_simd_str Annot.N
let simd_stlr = do_simd_str Annot.L

let simd_str_p sz rs rd k ii =
read_reg_ord rs ii >>|
read_reg_neon true rd ii >>= fun (addr, v) ->
let simd_str_p sz ma rd rs k ii =
ma >>|
read_reg_neon true rd ii >>= fun (addr,v) ->
if sz == MachSize.S128 then
(* 128-bit Neon LDR/STR and friends are split into two 64-bit
* single-copy atomic accesses. *)
Expand Down Expand Up @@ -2625,57 +2625,85 @@ module Make
!!!!(read_reg_ord rA ii >>= fun addr ->
(store_m addr rs ii >>|
post_kr rA addr kr ii))

| I_LDR_SIMD(var,r1,rA,kr,s) ->
| I_LDR_SIMD(var,r1,rA,MemExt.Reg(v,kr,sext,s)) ->
let access_size = tr_simd_variant var in
get_ea_reg rA v kr sext s ii >>= fun addr ->
simd_ldr access_size addr r1 ii >>= B.next1T
| I_LDR_SIMD(var,r1,rA,MemExt.Imm (k,Idx)) ->
let access_size = tr_simd_variant var in
get_ea_idx rA k ii >>= fun addr ->
simd_ldr access_size addr r1 ii >>= B.next1T
| I_LDR_SIMD(var,r1,rA,MemExt.Imm (k,PreIdx)) ->
let access_size = tr_simd_variant var in
get_ea rA kr s ii >>= fun addr ->
get_ea_preindexed rA k ii >>= fun addr ->
simd_ldr access_size addr r1 ii >>= B.next1T
| I_LDR_P_SIMD(var,r1,rA,k) ->
| I_LDR_SIMD(var,r1,rA,MemExt.Imm (k,PostIdx)) ->
let access_size = tr_simd_variant var in
read_reg_ord rA ii >>= fun addr ->
simd_ldr access_size addr r1 ii >>|
post_kr rA addr (K k) ii >>= B.next2T
| I_LDUR_SIMD(var,r1,rA,k) ->
let access_size = tr_simd_variant var and
k = K (match k with Some k -> k | None -> 0) in
(get_ea rA k S_NOEXT ii >>= fun addr ->
simd_ldr access_size addr r1 ii) >>= B.next1T
let access_size = tr_simd_variant var in
get_ea rA (K k) S_NOEXT ii >>= fun addr ->
simd_ldr access_size addr r1 ii >>= B.next1T
| I_LDAPUR_SIMD(var,r1,rA,k) ->
let access_size = tr_simd_variant var and
k = K (match k with Some k -> k | None -> 0) in
(get_ea rA k S_NOEXT ii >>= fun addr ->
simd_ldar access_size addr r1 ii) >>= B.next1T
| I_STR_SIMD(var,r1,rA,kr,s) ->
let access_size = tr_simd_variant var in
simd_str access_size rA r1 kr s ii
| I_STR_P_SIMD(var,r1,rA,k) ->
get_ea rA (K k) S_NOEXT ii >>= fun addr ->
simd_ldar access_size addr r1 ii >>= B.next1T
| I_STR_SIMD(var,r1,rA,MemExt.Reg (v,kr,sext,s)) ->
let access_size = tr_simd_variant var in
let ma = get_ea_reg rA v kr sext s ii in
simd_str access_size ma r1 ii
| I_STR_SIMD(var,r1,rA,MemExt.Imm (k,Idx)) ->
let access_size = tr_simd_variant var in
let ma = get_ea_idx rA k ii in
simd_str access_size ma r1 ii
| I_STR_SIMD(var,r1,rA,MemExt.Imm (k,PreIdx)) ->
let access_size = tr_simd_variant var in
simd_str_p access_size rA r1 (K k) ii
let ma = get_ea_preindexed rA k ii in
simd_str access_size ma r1 ii
| I_STR_SIMD(var,r1,rA,MemExt.Imm (k,PostIdx)) ->
let access_size = tr_simd_variant var in
let ma = read_reg_ord rA ii in
simd_str_p access_size ma r1 rA (K k) ii
| I_STUR_SIMD(var,r1,rA,k) ->
let access_size = tr_simd_variant var and
k = K (match k with Some k -> k | None -> 0) in
simd_str access_size rA r1 k S_NOEXT ii
let access_size = tr_simd_variant var in
let ma = get_ea_idx rA k ii in
simd_str access_size ma r1 ii
| I_STLUR_SIMD(var,r1,rA,k) ->
let access_size = tr_simd_variant var and
k = K (match k with Some k -> k | None -> 0) in
simd_stlr access_size rA r1 k S_NOEXT ii
| I_LDP_SIMD(tnt,var,r1,r2,r3,k) ->
get_ea_idx r3 k ii >>= fun addr ->
simd_ldp tnt var addr r1 r2 ii
| I_LDP_P_SIMD(tnt,var,r1,r2,r3,k) ->
read_reg_ord r3 ii >>= fun addr ->
(simd_ldp tnt var addr r1 r2 ii >>|
post_kr r3 addr (K k) ii) >>=
fun (b,()) -> M.unitT b
| I_STP_SIMD(tnt,var,r1,r2,r3,k) ->
get_ea_idx r3 k ii >>= fun addr ->
simd_stp tnt var addr r1 r2 ii
| I_STP_P_SIMD(tnt,var,r1,r2,r3,k) ->
read_reg_ord r3 ii >>= fun addr ->
simd_stp tnt var addr r1 r2 ii >>|
post_kr r3 addr (K k) ii >>=
fun (b,()) -> M.unitT b

let access_size = tr_simd_variant var in
let ma = get_ea_idx rA k ii in
simd_stlr access_size ma r1 ii
| I_LDP_SIMD(tnt,var,r1,r2,r3,idx) ->
begin
match idx with
| k,Idx ->
get_ea_idx r3 k ii >>= fun addr ->
simd_ldp tnt var addr r1 r2 ii
| k,PreIdx ->
get_ea_preindexed r3 k ii >>= fun addr ->
simd_ldp tnt var addr r1 r2 ii
| k,PostIdx ->
read_reg_ord r3 ii >>= fun addr ->
(simd_ldp tnt var addr r1 r2 ii >>|
post_kr r3 addr (K k) ii) >>=
fun (b,()) -> M.unitT b
end
| I_STP_SIMD(tnt,var,r1,r2,r3,idx) ->
begin
match idx with
| k,Idx ->
get_ea_idx r3 k ii >>= fun addr ->
simd_stp tnt var addr r1 r2 ii
| k,PreIdx ->
get_ea_preindexed r3 k ii >>= fun addr ->
simd_stp tnt var addr r1 r2 ii
| k,PostIdx ->
read_reg_ord r3 ii >>= fun addr ->
(simd_stp tnt var addr r1 r2 ii >>|
post_kr r3 addr (K k) ii) >>=
fun (b,()) -> M.unitT b
end
(* Morello instructions *)
| I_ALIGND(rd,rn,k) ->
check_morello inst ;
Expand Down
7 changes: 3 additions & 4 deletions jingle/AArch64Arch_jingle.ml
Original file line number Diff line number Diff line change
Expand Up @@ -659,10 +659,9 @@ include Arch.MakeArch(struct
| I_ST2 _ | I_ST2M _
| I_ST3 _ | I_ST3M _
| I_ST4 _ | I_ST4M _
| I_LDP_SIMD _ | I_LDP_P_SIMD _
| I_STP_SIMD _ | I_STP_P_SIMD _
| I_LDR_SIMD _ | I_LDR_P_SIMD _
| I_STR_SIMD _ | I_STR_P_SIMD _
| I_LDP_SIMD _
| I_STP_SIMD _
| I_LDR_SIMD _ | I_STR_SIMD _
| I_LDUR_SIMD _ | I_LDAPUR_SIMD _
| I_STUR_SIMD _ | I_STLUR_SIMD _
| I_ADDV _ | I_DUP _ | I_FMOV_TG _
Expand Down
Loading

0 comments on commit 6cdf3cf

Please sign in to comment.