From c6edd43ee488b734019665e270de057d6f7c2ec9 Mon Sep 17 00:00:00 2001 From: Konstantin Schwarz Date: Fri, 2 Aug 2024 23:14:06 +0100 Subject: [PATCH] [AIE2] Add type stubs and intrinsics declarations that are not supported yet This allows us to compile kernels using AIE_API abstractions while bringing up complete support for the intrinsic API. --- clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/aiev2_aie_api_compat.h | 1461 ++++++++++++++++++++++ clang/lib/Headers/aiev2_core.h | 105 ++ clang/lib/Headers/aiev2intrin.h | 16 + 4 files changed, 1583 insertions(+) create mode 100644 clang/lib/Headers/aiev2_aie_api_compat.h diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index a5d84b47197d..022b9c7958c3 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -74,6 +74,7 @@ set(aie_files aiev2_addlog.h aiev2_ldst.h aiev2intrin.h + aiev2_aie_api_compat.h ) set(arm_common_files diff --git a/clang/lib/Headers/aiev2_aie_api_compat.h b/clang/lib/Headers/aiev2_aie_api_compat.h new file mode 100644 index 000000000000..67c8bee0cf6d --- /dev/null +++ b/clang/lib/Headers/aiev2_aie_api_compat.h @@ -0,0 +1,1461 @@ +//===- aiev2_aie_api_compat.h -----------------------------------*- C++-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates +// +//===----------------------------------------------------------------------===// + +#ifndef __AIEV2_AIE_API_COMPAT_H +#define __AIEV2_AIE_API_COMPAT_H + +#define __AIE_MODEL_VERSION__ 10500 +#define __AIE_ARCH_MODEL_VERSION__ 20010500 +#define __AIE_MODEL_VERSION_NAME_STR__ "aie2_arch_r1p5" + +struct acc16 {}; +struct acc24 {}; +struct acc40 {}; +struct acc56 {}; + +struct cacc16 {}; +struct cacc24 {}; +struct cacc32 {}; +struct cacc40 {}; +struct cacc56 {}; +struct cacc64 {}; + +struct v8cint16_compress {}; +struct v4cint32_compress {}; +struct v8cint32_compress {}; +struct v16cint16_compress {}; + +typedef cint32_t cint32_w64; + +struct v2cacc64 {}; +struct v4cacc64 { + v4cacc64(v8cint32); + operator v8cint32(); +}; +struct v8cacc64; +struct v8cacc64 {}; + +struct caccfloat {}; +struct v4caccfloat {}; +struct v8caccfloat {}; +struct v16caccfloat {}; + +struct v1cfloat {}; +struct v2cfloat {}; +struct v4cfloat {}; +struct v8cfloat { + v8cfloat(v8cint32); +}; +struct v16cfloat {}; + +struct cbfloat16 {}; +struct v2cbfloat16 {}; +struct v4cbfloat16 {}; +struct v8cbfloat16 {}; +struct v16cbfloat16 { + v16cbfloat16(v16cint16); +}; +struct v32cbfloat16 {}; + +// clang-format off +inline __attribute__((always_inline)) v2cbfloat16 undef_v2cbfloat16(); +inline __attribute__((always_inline)) v4cint16 undef_v4cint16(); +inline __attribute__((always_inline)) v2cint32 undef_v2cint32(); +inline __attribute__((always_inline)) v4float undef_v4float(); +inline __attribute__((always_inline)) v2cfloat undef_v2cfloat(); +inline __attribute__((always_inline)) v4cbfloat16 undef_v4cbfloat16(); +inline __attribute__((always_inline)) v8cint16 undef_v8cint16(); +inline __attribute__((always_inline)) v4cint32 undef_v4cint32(); +inline __attribute__((always_inline)) v2cacc64 undef_v2cacc64(); +inline __attribute__((always_inline)) v4caccfloat undef_v4caccfloat(); +inline __attribute__((always_inline)) v4cfloat undef_v4cfloat(); +inline __attribute__((always_inline)) 
v8cbfloat16 undef_v8cbfloat16(); +inline __attribute__((always_inline)) v16cint16 undef_v16cint16(); +inline __attribute__((always_inline)) v8cint32 undef_v8cint32(); +inline __attribute__((always_inline)) v4cacc64 undef_v4cacc64(); +inline __attribute__((always_inline)) v8caccfloat undef_v8caccfloat(); +inline __attribute__((always_inline)) v8cfloat undef_v8cfloat(); +inline __attribute__((always_inline)) v16cbfloat16 undef_v16cbfloat16(); +inline __attribute__((always_inline)) v32cint16 undef_v32cint16(); +inline __attribute__((always_inline)) v16cint32 undef_v16cint32(); +inline __attribute__((always_inline)) v8cacc64 undef_v8cacc64(); +inline __attribute__((always_inline)) v16caccfloat undef_v16caccfloat(); +inline __attribute__((always_inline)) v16cfloat undef_v16cfloat(); +inline __attribute__((always_inline)) v32cbfloat16 undef_v32cbfloat16(); +inline __attribute__((always_inline)) unsigned int as_uint32(cint16 ); +inline __attribute__((always_inline)) int as_int32(cint16 ); +inline __attribute__((always_inline)) cint16 as_cint16(int ); +inline __attribute__((always_inline)) cint16 as_cint16(unsigned int ); +inline __attribute__((always_inline)) int get_real(cint32_w64 ); +inline __attribute__((always_inline)) int get_imag(cint32_w64 ); +inline __attribute__((always_inline)) cint32_w64 upd_real(cint32_w64 , int ); +inline __attribute__((always_inline)) cint32_w64 upd_imag(cint32_w64 , int ); +inline __attribute__((always_inline)) int zeroextend4(int ); +inline __attribute__((always_inline)) int signextend4(int ); +inline __attribute__((always_inline)) v8cint16 extract_v8cint16(v16cint16 , int ); +inline __attribute__((always_inline)) v16cint16 insert(v16cint16 , int , v8cint16 ); +inline __attribute__((always_inline)) v16cint16 set_v16cint16(int , v8cint16 ); +inline __attribute__((always_inline)) v16cint16 concat(v8cint16 , v8cint16 ); +inline __attribute__((always_inline)) v4cint32 extract_v4cint32(v8cint32 , int ); +inline __attribute__((always_inline)) 
v8cint32 insert(v8cint32 , int , v4cint32 ); +inline __attribute__((always_inline)) v8cint32 set_v8cint32(int , v4cint32 ); +inline __attribute__((always_inline)) v8cint32 concat(v4cint32 , v4cint32 ); +inline __attribute__((always_inline)) v2cacc64 extract_v2cacc64(v4cacc64 , int ); +inline __attribute__((always_inline)) v4cacc64 insert(v4cacc64 , int , v2cacc64 ); +inline __attribute__((always_inline)) v4cacc64 set_v4cacc64(int , v2cacc64 ); +inline __attribute__((always_inline)) v4cacc64 concat(v2cacc64 , v2cacc64 ); +inline __attribute__((always_inline)) v4caccfloat extract_v4caccfloat(v8caccfloat , int ); +inline __attribute__((always_inline)) v8caccfloat insert(v8caccfloat , int , v4caccfloat ); +inline __attribute__((always_inline)) v8caccfloat set_v8caccfloat(int , v4caccfloat ); +inline __attribute__((always_inline)) v8caccfloat concat(v4caccfloat , v4caccfloat ); +inline __attribute__((always_inline)) v4cfloat extract_v4cfloat(v8cfloat , int ); +inline __attribute__((always_inline)) v8cfloat insert(v8cfloat , int , v4cfloat ); +inline __attribute__((always_inline)) v8cfloat set_v8cfloat(int , v4cfloat ); +inline __attribute__((always_inline)) v8cfloat concat(v4cfloat , v4cfloat ); +inline __attribute__((always_inline)) v8cbfloat16 extract_v8cbfloat16(v16cbfloat16 , int ); +inline __attribute__((always_inline)) v16cbfloat16 insert(v16cbfloat16 , int , v8cbfloat16 ); +inline __attribute__((always_inline)) v16cbfloat16 set_v16cbfloat16(int , v8cbfloat16 ); +inline __attribute__((always_inline)) v16cbfloat16 concat(v8cbfloat16 , v8cbfloat16 ); +inline __attribute__((always_inline)) v8cint16 extract_v8cint16(v32cint16 , int ); +inline __attribute__((always_inline)) v32cint16 insert(v32cint16 , int , v8cint16 ); +inline __attribute__((always_inline)) v32cint16 set_v32cint16(int , v8cint16 ); +inline __attribute__((always_inline)) v32cint16 concat(v8cint16 , v8cint16 , v8cint16 , v8cint16 ); +inline __attribute__((always_inline)) v16cint16 
extract_v16cint16(v32cint16 , int ); +inline __attribute__((always_inline)) v32cint16 insert(v32cint16 , int , v16cint16 ); +inline __attribute__((always_inline)) v32cint16 set_v32cint16(int , v16cint16 ); +inline __attribute__((always_inline)) v32cint16 concat(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v4cint32 extract_v4cint32(v16cint32 , int ); +inline __attribute__((always_inline)) v16cint32 insert(v16cint32 , int , v4cint32 ); +inline __attribute__((always_inline)) v16cint32 set_v16cint32(int , v4cint32 ); +inline __attribute__((always_inline)) v16cint32 concat(v4cint32 , v4cint32 , v4cint32 , v4cint32 ); +inline __attribute__((always_inline)) v8cint32 extract_v8cint32(v16cint32 , int ); +inline __attribute__((always_inline)) v16cint32 insert(v16cint32 , int , v8cint32 ); +inline __attribute__((always_inline)) v16cint32 set_v16cint32(int , v8cint32 ); +inline __attribute__((always_inline)) v16cint32 concat(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v2cacc64 extract_v2cacc64(v8cacc64 , int ); +inline __attribute__((always_inline)) v8cacc64 insert(v8cacc64 , int , v2cacc64 ); +inline __attribute__((always_inline)) v8cacc64 set_v8cacc64(int , v2cacc64 ); +inline __attribute__((always_inline)) v8cacc64 concat(v2cacc64 , v2cacc64 , v2cacc64 , v2cacc64 ); +inline __attribute__((always_inline)) v4cacc64 extract_v4cacc64(v8cacc64 , int ); +inline __attribute__((always_inline)) v8cacc64 insert(v8cacc64 , int , v4cacc64 ); +inline __attribute__((always_inline)) v8cacc64 set_v8cacc64(int , v4cacc64 ); +inline __attribute__((always_inline)) v8cacc64 concat(v4cacc64 , v4cacc64 ); +inline __attribute__((always_inline)) v4caccfloat extract_v4caccfloat(v16caccfloat , int ); +inline __attribute__((always_inline)) v16caccfloat insert(v16caccfloat , int , v4caccfloat ); +inline __attribute__((always_inline)) v16caccfloat set_v16caccfloat(int , v4caccfloat ); +inline __attribute__((always_inline)) v16caccfloat concat(v4caccfloat , 
v4caccfloat , v4caccfloat , v4caccfloat ); +inline __attribute__((always_inline)) v8caccfloat extract_v8caccfloat(v16caccfloat , int ); +inline __attribute__((always_inline)) v16caccfloat insert(v16caccfloat , int , v8caccfloat ); +inline __attribute__((always_inline)) v16caccfloat set_v16caccfloat(int , v8caccfloat ); +inline __attribute__((always_inline)) v16caccfloat concat(v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v4cfloat extract_v4cfloat(v16cfloat , int ); +inline __attribute__((always_inline)) v16cfloat insert(v16cfloat , int , v4cfloat ); +inline __attribute__((always_inline)) v16cfloat set_v16cfloat(int , v4cfloat ); +inline __attribute__((always_inline)) v16cfloat concat(v4cfloat , v4cfloat , v4cfloat , v4cfloat ); +inline __attribute__((always_inline)) v8cfloat extract_v8cfloat(v16cfloat , int ); +inline __attribute__((always_inline)) v16cfloat insert(v16cfloat , int , v8cfloat ); +inline __attribute__((always_inline)) v16cfloat set_v16cfloat(int , v8cfloat ); +inline __attribute__((always_inline)) v16cfloat concat(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8cbfloat16 extract_v8cbfloat16(v32cbfloat16 , int ); +inline __attribute__((always_inline)) v32cbfloat16 insert(v32cbfloat16 , int , v8cbfloat16 ); +inline __attribute__((always_inline)) v32cbfloat16 set_v32cbfloat16(int , v8cbfloat16 ); +inline __attribute__((always_inline)) v32cbfloat16 concat(v8cbfloat16 , v8cbfloat16 , v8cbfloat16 , v8cbfloat16 ); +inline __attribute__((always_inline)) v16cbfloat16 extract_v16cbfloat16(v32cbfloat16 , int ); +inline __attribute__((always_inline)) v32cbfloat16 insert(v32cbfloat16 , int , v16cbfloat16 ); +inline __attribute__((always_inline)) v32cbfloat16 set_v32cbfloat16(int , v16cbfloat16 ); +inline __attribute__((always_inline)) v32cbfloat16 concat(v16cbfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v4cint16 extract_v4cint16(v16cint16 , int ); +inline __attribute__((always_inline)) v2cint32 
extract_v2cint32(v8cint32 , int ); +inline __attribute__((always_inline)) v2cfloat extract_v2cfloat(v8cfloat , int ); +inline __attribute__((always_inline)) v4cbfloat16 extract_v4cbfloat16(v16cbfloat16 , int ); +inline __attribute__((always_inline)) v16cint16 set_v16cint16(int , v4cint16 ); +inline __attribute__((always_inline)) v8cint32 set_v8cint32(int , v2cint32 ); +inline __attribute__((always_inline)) v8cfloat set_v8cfloat(int , v2cfloat ); +inline __attribute__((always_inline)) v16cbfloat16 set_v16cbfloat16(int , v4cbfloat16 ); +inline __attribute__((always_inline)) v4cint16 extract_v4cint16(v8cint16 , int ); +inline __attribute__((always_inline)) v2cint32 extract_v2cint32(v4cint32 , int ); +inline __attribute__((always_inline)) v2cfloat extract_v2cfloat(v4cfloat , int ); +inline __attribute__((always_inline)) v4cbfloat16 extract_v4cbfloat16(v8cbfloat16 , int ); +inline __attribute__((always_inline)) v8cint16 set_v8cint16(int , v4cint16 ); +inline __attribute__((always_inline)) v4cint32 set_v4cint32(int , v2cint32 ); +inline __attribute__((always_inline)) v4cfloat set_v4cfloat(int , v2cfloat ); +inline __attribute__((always_inline)) v8cbfloat16 set_v8cbfloat16(int , v4cbfloat16 ); +inline __attribute__((always_inline)) v16cint16 insert(v16cint16 , int , v4cint16 ); +inline __attribute__((always_inline)) v8cint32 insert(v8cint32 , int , v2cint32 ); +inline __attribute__((always_inline)) v8cfloat insert(v8cfloat , int , v2cfloat ); +inline __attribute__((always_inline)) v16cbfloat16 insert(v16cbfloat16 , int , v4cbfloat16 ); +inline __attribute__((always_inline)) v8cint16 insert(v8cint16 , int , v4cint16 ); +inline __attribute__((always_inline)) v4cint32 insert(v4cint32 , int , v2cint32 ); +inline __attribute__((always_inline)) v4cfloat insert(v4cfloat , int , v2cfloat ); +inline __attribute__((always_inline)) v8cbfloat16 insert(v8cbfloat16 , int , v4cbfloat16 ); +inline __attribute__((always_inline)) v16cint16 concat(v4cint16 , v4cint16 , v4cint16 , v4cint16 
); +inline __attribute__((always_inline)) v8cint32 concat(v2cint32 , v2cint32 , v2cint32 , v2cint32 ); +inline __attribute__((always_inline)) v8cfloat concat(v2cfloat , v2cfloat , v2cfloat , v2cfloat ); +inline __attribute__((always_inline)) v16cbfloat16 concat(v4cbfloat16 , v4cbfloat16 , v4cbfloat16 , v4cbfloat16 ); +inline __attribute__((always_inline)) v8cint16 concat(v4cint16 , v4cint16 ); +inline __attribute__((always_inline)) v4cint32 concat(v2cint32 , v2cint32 ); +inline __attribute__((always_inline)) v4cfloat concat(v2cfloat , v2cfloat ); +inline __attribute__((always_inline)) v8cbfloat16 concat(v4cbfloat16 , v4cbfloat16 ); +inline __attribute__((always_inline)) v128uint4 extract_v128uint4(v256uint4_sparse ); +inline __attribute__((always_inline)) v64uint8 extract_v64uint8(v128uint8_sparse ); +inline __attribute__((always_inline)) v32uint16 extract_v32uint16(v64uint16_sparse ); +inline __attribute__((always_inline)) v128int4 extract_v128int4(v256int4_sparse ); +inline __attribute__((always_inline)) v64int8 extract_v64int8(v128int8_sparse ); +inline __attribute__((always_inline)) v32int16 extract_v32int16(v64int16_sparse ); +inline __attribute__((always_inline)) v32bfloat16 extract_v32bfloat16(v64bfloat16_sparse ); +inline __attribute__((always_inline)) v128uint4 extract_sparse_data(v256uint4_sparse ); +inline __attribute__((always_inline)) v64uint8 extract_sparse_data(v128uint8_sparse ); +inline __attribute__((always_inline)) v32uint16 extract_sparse_data(v64uint16_sparse ); +inline __attribute__((always_inline)) v128int4 extract_sparse_data(v256int4_sparse ); +inline __attribute__((always_inline)) v64int8 extract_sparse_data(v128int8_sparse ); +inline __attribute__((always_inline)) v32int16 extract_sparse_data(v64int16_sparse ); +inline __attribute__((always_inline)) v32bfloat16 extract_sparse_data(v64bfloat16_sparse ); +inline __attribute__((always_inline)) sparsity_t extract_sparsity(v256uint4_sparse ); +inline __attribute__((always_inline)) sparsity_t 
extract_sparsity(v128uint8_sparse ); +inline __attribute__((always_inline)) sparsity_t extract_sparsity(v64uint16_sparse ); +inline __attribute__((always_inline)) sparsity_t extract_sparsity(v256int4_sparse ); +inline __attribute__((always_inline)) sparsity_t extract_sparsity(v128int8_sparse ); +inline __attribute__((always_inline)) sparsity_t extract_sparsity(v64int16_sparse ); +inline __attribute__((always_inline)) sparsity_t extract_sparsity(v64bfloat16_sparse ); +inline __attribute__((always_inline)) v256uint4_sparse update(v256uint4_sparse , v128uint4 ); +inline __attribute__((always_inline)) v128uint8_sparse update(v128uint8_sparse , v64uint8 ); +inline __attribute__((always_inline)) v64uint16_sparse update(v64uint16_sparse , v32uint16 ); +inline __attribute__((always_inline)) v256int4_sparse update(v256int4_sparse , v128int4 ); +inline __attribute__((always_inline)) v128int8_sparse update(v128int8_sparse , v64int8 ); +inline __attribute__((always_inline)) v64int16_sparse update(v64int16_sparse , v32int16 ); +inline __attribute__((always_inline)) v64bfloat16_sparse update(v64bfloat16_sparse , v32bfloat16 ); +inline __attribute__((always_inline)) v256uint4_sparse update(v256uint4_sparse , sparsity_t ); +inline __attribute__((always_inline)) v128uint8_sparse update(v128uint8_sparse , sparsity_t ); +inline __attribute__((always_inline)) v64uint16_sparse update(v64uint16_sparse , sparsity_t ); +inline __attribute__((always_inline)) v256int4_sparse update(v256int4_sparse , sparsity_t ); +inline __attribute__((always_inline)) v128int8_sparse update(v128int8_sparse , sparsity_t ); +inline __attribute__((always_inline)) v64int16_sparse update(v64int16_sparse , sparsity_t ); +inline __attribute__((always_inline)) v64bfloat16_sparse update(v64bfloat16_sparse , sparsity_t ); +inline __attribute__((always_inline)) unsigned int get_symsat(); +inline __attribute__((always_inline)) unsigned int get_srs_of(); +inline __attribute__((always_inline)) void set_srs_of(); +inline 
__attribute__((always_inline)) void set_srs_of(unsigned int ); +inline __attribute__((always_inline)) void clr_srs_of(); +inline __attribute__((always_inline)) unsigned int get_ups_of(); +inline __attribute__((always_inline)) void set_ups_of(); +inline __attribute__((always_inline)) void set_ups_of(unsigned int ); +inline __attribute__((always_inline)) void clr_ups_of(); +inline __attribute__((always_inline)) unsigned int get_fpmulmac_flags(); +inline __attribute__((always_inline)) void set_fpmulmac_flags(unsigned int ); +inline __attribute__((always_inline)) unsigned int get_fp2int_flags(); +inline __attribute__((always_inline)) void set_fp2int_flags(unsigned int ); +inline __attribute__((always_inline)) unsigned int get_fpf2f_flags(); +inline __attribute__((always_inline)) void set_fpf2f_flags(unsigned int ); +inline __attribute__((always_inline)) unsigned int get_compr_uf(); +inline __attribute__((always_inline)) void set_compr_uf(); +inline __attribute__((always_inline)) void set_compr_uf(unsigned int ); +inline __attribute__((always_inline)) void clr_compr_uf(); +inline __attribute__((always_inline)) unsigned int get_sparse_of(); +inline __attribute__((always_inline)) void set_sparse_of(); +inline __attribute__((always_inline)) void set_sparse_of(unsigned int ); +inline __attribute__((always_inline)) void clr_sparse_of(); +inline __attribute__((always_inline)) v16cint16 shiftx(v16cint16 , v16cint16 , int , unsigned int ); +inline __attribute__((always_inline)) v8cint32 shiftx(v8cint32 , v8cint32 , int , unsigned int ); +inline __attribute__((always_inline)) v16cbfloat16 shiftx(v16cbfloat16 , v16cbfloat16 , int , int ); +inline __attribute__((always_inline)) v8cfloat shiftx(v8cfloat , v8cfloat , int , unsigned int ); +inline __attribute__((always_inline)) v16cint16 shift_bytes(v16cint16 , v16cint16 , unsigned int ); +inline __attribute__((always_inline)) v8cint32 shift_bytes(v8cint32 , v8cint32 , unsigned int ); +inline __attribute__((always_inline)) 
v16cbfloat16 shift_bytes(v16cbfloat16 , v16cbfloat16 , int ); +inline __attribute__((always_inline)) v8cfloat shift_bytes(v8cfloat , v8cfloat , unsigned int ); +inline __attribute__((always_inline)) v16cint16 shift(v16cint16 , v16cint16 , unsigned int ); +inline __attribute__((always_inline)) v8cint32 shift(v8cint32 , v8cint32 , unsigned int ); +inline __attribute__((always_inline)) v16cbfloat16 shift(v16cbfloat16 , v16cbfloat16 , int ); +inline __attribute__((always_inline)) v8cfloat shift(v8cfloat , v8cfloat , unsigned int ); +inline __attribute__((always_inline)) v16cint16 broadcast_c16(cint16 ); +inline __attribute__((always_inline)) v8cint32 broadcast_c32(cint32 ); +inline __attribute__((always_inline)) v16cint16 broadcast_to_v16cint16(cint16 ); +inline __attribute__((always_inline)) v16cint16 broadcast_to_v16cint16(v2cint16 ); +inline __attribute__((always_inline)) v8cint32 broadcast_to_v8cint32(cint32 ); +inline __attribute__((always_inline)) v64int8 broadcast_one_to_v64int8(); +inline __attribute__((always_inline)) v32int16 broadcast_one_to_v32int16(); +inline __attribute__((always_inline)) v16int32 broadcast_one_to_v16int32(); +inline __attribute__((always_inline)) v64uint8 broadcast_one_to_v64uint8(); +inline __attribute__((always_inline)) v32uint16 broadcast_one_to_v32uint16(); +inline __attribute__((always_inline)) v16uint32 broadcast_one_to_v16uint32(); +inline __attribute__((always_inline)) v16cint16 broadcast_one_to_v16cint16(); +inline __attribute__((always_inline)) v8cint32 broadcast_one_to_v8cint32(); +inline __attribute__((always_inline)) v16cint16 broadcast_one_c16(); +inline __attribute__((always_inline)) v8cint32 broadcast_one_c32(); +inline __attribute__((always_inline)) v16cint16 broadcast_zero_to_v16cint16(); +inline __attribute__((always_inline)) v8cint32 broadcast_zero_to_v8cint32(); +inline __attribute__((always_inline)) v16cint16 broadcast_zero_c16(); +inline __attribute__((always_inline)) v8cint32 broadcast_zero_c32(); +inline 
__attribute__((always_inline)) v16cint16 broadcast_elem(v16cint16 , int ); +inline __attribute__((always_inline)) v8cint32 broadcast_elem(v8cint32 , int ); +inline __attribute__((always_inline)) v16cint16 upd_elem(v16cint16 , int , cint16 ); +inline __attribute__((always_inline)) v8cint32 upd_elem(v8cint32 , int , cint32 ); +inline __attribute__((always_inline)) v16cint16 insert(v16cint16 , int , cint16 ); +inline __attribute__((always_inline)) v16cint16 insert(v16cint16 , int , v2cint16 ); +inline __attribute__((always_inline)) v16cint16 insert(v16cint16 , int , unsigned long long ); +inline __attribute__((always_inline)) v16float insert(v16float , int , unsigned long long ); +inline __attribute__((always_inline)) v8cint32 insert(v8cint32 , int , cint32 ); +inline __attribute__((always_inline)) v8cfloat insert(v8cfloat , int , cfloat ); +inline __attribute__((always_inline)) v16cbfloat16 insert(v16cbfloat16 , int , cbfloat16 ); +inline __attribute__((always_inline)) v32int32 insert_element(v32int32 , int , int ); +inline __attribute__((always_inline)) v32acc32 insert_element(v32acc32 , int , int ); +inline __attribute__((always_inline)) v16acc64 insert_element(v16acc64 , int , long long ); +inline __attribute__((always_inline)) v32int32 upd_elem(v32int32 , int , int ); +inline __attribute__((always_inline)) v32acc32 upd_elem(v32acc32 , int , int ); +inline __attribute__((always_inline)) v16acc64 upd_elem(v16acc64 , int , int ); +inline __attribute__((always_inline)) v16cint16 shiftl_elem(v16cint16 , cint16 ); +inline __attribute__((always_inline)) v8cint32 shiftl_elem(v8cint32 , cint32_w64 ); +inline __attribute__((always_inline)) v8cint32 shiftl_elem(v8cint32 , cint32 ); +inline __attribute__((always_inline)) v8cfloat shiftl_elem(v8cfloat , cfloat ); +inline __attribute__((always_inline)) v16cbfloat16 shiftl_elem(v16cbfloat16 , cbfloat16 ); +inline __attribute__((always_inline)) v16cint16 shiftr_elem(v16cint16 , cint16 ); +inline __attribute__((always_inline)) 
v8cint32 shiftr_elem(v8cint32 , cint32_w64 ); +inline __attribute__((always_inline)) v8cint32 shiftr_elem(v8cint32 , cint32 ); +inline __attribute__((always_inline)) v8cfloat shiftr_elem(v8cfloat , cfloat ); +inline __attribute__((always_inline)) v16cbfloat16 shiftr_elem(v16cbfloat16 , cbfloat16 ); +inline __attribute__((always_inline)) cint16 ext_elem(v16cint16 , int , int ); +inline __attribute__((always_inline)) v2cint16 ext_v2cint16(v16cint16 , int , int ); +inline __attribute__((always_inline)) unsigned long long ext_u64(v16cint16 , int , int ); +inline __attribute__((always_inline)) cint32 ext_elem(v8cint32 , int , int ); +inline __attribute__((always_inline)) cint32 extract_elem(v8cint32 , int , int ); +inline __attribute__((always_inline)) cint16 extract_elem(v16cint16 , int , int ); +inline __attribute__((always_inline)) v2cint16 extract_v2cint16(v16cint16 , int , int ); +inline __attribute__((always_inline)) float get_lo(v1cfloat ); +inline __attribute__((always_inline)) float get_hi(v1cfloat ); +inline __attribute__((always_inline)) cfloat extract_elem(v8cfloat , int ); +inline __attribute__((always_inline)) cfloat extract_elem(v8cfloat , int , int ); +inline __attribute__((always_inline)) cbfloat16 extract_elem(v16cbfloat16 , int ); +inline __attribute__((always_inline)) cbfloat16 extract_elem(v16cbfloat16 , int , int ); +inline __attribute__((always_inline)) v2uint32 ext_vu2int32(v16uint32 , int ); +inline __attribute__((always_inline)) cint16 ext_elem(v16cint16 , int ); +inline __attribute__((always_inline)) v2cint16 ext_v2cint16(v16cint16 , int ); +inline __attribute__((always_inline)) cint32 ext_elem(v8cint32 , int ); +inline __attribute__((always_inline)) cint16 extract_elem(v16cint16 , int ); +inline __attribute__((always_inline)) v2cint16 extract_v2cint16(v16cint16 , int ); +inline __attribute__((always_inline)) cint32 extract_elem(v8cint32 , int ); +inline __attribute__((always_inline)) int extract_element(v32int32 , int ); +inline 
__attribute__((always_inline)) int extract_element(v32acc32 , int ); +inline __attribute__((always_inline)) long long extract_element(v16acc64 , int ); +inline __attribute__((always_inline)) int ext_elem(v32int32 , int ); +inline __attribute__((always_inline)) int ext_elem(v32acc32 , int ); +inline __attribute__((always_inline)) long long ext_elem(v16acc64 , int ); +inline __attribute__((always_inline)) v8cint32 shuffle(v8cint32 , v8cint32 , unsigned int ); +inline __attribute__((always_inline)) v16cint16 shuffle(v16cint16 , v16cint16 , unsigned int ); +inline __attribute__((always_inline)) v16cbfloat16 shuffle(v16cbfloat16 , v16cbfloat16 , unsigned int ); +inline __attribute__((always_inline)) v8cfloat shuffle(v8cfloat , v8cfloat , unsigned int ); +inline __attribute__((always_inline)) v8cint32 shuffle(v8cint32 , unsigned int ); +inline __attribute__((always_inline)) v16cint16 shuffle(v16cint16 , unsigned int ); +inline __attribute__((always_inline)) v16cbfloat16 shuffle(v16cbfloat16 , unsigned int ); +inline __attribute__((always_inline)) v8cfloat shuffle(v8cfloat , unsigned int ); +inline __attribute__((always_inline)) v16cint16 shuffle_c16(cint16 , unsigned int ); +inline __attribute__((always_inline)) v8cint32 shuffle_c32(cint32 , unsigned int ); +inline __attribute__((always_inline)) v256int4_sparse shuffle(v256int4_sparse , int ); +inline __attribute__((always_inline)) v128int8_sparse shuffle(v128int8_sparse , int ); +inline __attribute__((always_inline)) v64int16_sparse shuffle(v64int16_sparse , int ); +inline __attribute__((always_inline)) v256uint4_sparse shuffle(v256uint4_sparse , int ); +inline __attribute__((always_inline)) v128uint8_sparse shuffle(v128uint8_sparse , int ); +inline __attribute__((always_inline)) v64uint16_sparse shuffle(v64uint16_sparse , int ); +inline __attribute__((always_inline)) v64bfloat16_sparse shuffle(v64bfloat16_sparse , int ); +inline __attribute__((always_inline)) v4cacc64 get_scd_v4cacc64(int ); +inline 
__attribute__((always_inline)) v8caccfloat get_scd_v8caccfloat(int ); +inline __attribute__((always_inline)) v16acc32 get_scd(int ); +inline __attribute__((always_inline)) v8acc64 getl_scd(int ); +inline __attribute__((always_inline)) v16accfloat getf_scd(int ); +inline __attribute__((always_inline)) v16cint16 get_scd_v16cint16(int ); +inline __attribute__((always_inline)) v8cint32 get_scd_v8cint32(int ); +inline __attribute__((always_inline)) v8cfloat get_scd_v8cfloat(int ); +inline __attribute__((always_inline)) v16cbfloat16 get_scd_v16cbfloat16(int ); +inline __attribute__((always_inline)) v4cacc64 get_scd_v4cacc64(); +inline __attribute__((always_inline)) v8caccfloat get_scd_v8caccfloat(); +inline __attribute__((always_inline)) v16acc32 get_scd(); +inline __attribute__((always_inline)) v8acc64 getl_scd(); +inline __attribute__((always_inline)) v16accfloat getf_scd(); +inline __attribute__((always_inline)) v16cint16 get_scd_v16cint16(); +inline __attribute__((always_inline)) v8cint32 get_scd_v8cint32(); +inline __attribute__((always_inline)) v8cfloat get_scd_v8cfloat(); +inline __attribute__((always_inline)) v16cbfloat16 get_scd_v16cbfloat16(); +inline __attribute__((always_inline)) v8cacc64 get_scd_v8cacc64(int ); +inline __attribute__((always_inline)) v16caccfloat get_scd_v16caccfloat(int ); +inline __attribute__((always_inline)) v32cint16 get_scd_v32cint16(int ); +inline __attribute__((always_inline)) v16cint32 get_scd_v16cint32(int ); +inline __attribute__((always_inline)) v16cfloat get_scd_v16cfloat(int ); +inline __attribute__((always_inline)) v8cacc64 get_scd_v8cacc64(); +inline __attribute__((always_inline)) v16caccfloat get_scd_v16caccfloat(); +inline __attribute__((always_inline)) v32cint16 get_scd_v32cint16(); +inline __attribute__((always_inline)) v16cint32 get_scd_v16cint32(); +inline __attribute__((always_inline)) v16cfloat get_scd_v16cfloat(); +inline __attribute__((always_inline)) v32acc32 get_scd_lo(int ); +inline __attribute__((always_inline)) 
v32acc32 get_scd_hi(int ); +inline __attribute__((always_inline)) v16acc64 getl_scd_lo(int ); +inline __attribute__((always_inline)) v16acc64 getl_scd_hi(int ); +inline __attribute__((always_inline)) v32acc32 get_scd_lo(); +inline __attribute__((always_inline)) v32acc32 get_scd_hi(); +inline __attribute__((always_inline)) v16acc64 getl_scd_lo(); +inline __attribute__((always_inline)) v16acc64 getl_scd_hi(); +inline __attribute__((always_inline)) void put_mcd(v4cacc64 , int ); +inline __attribute__((always_inline)) void put_mcd(v8caccfloat , int ); +inline __attribute__((always_inline)) void put_mcd(v16cint16 , int ); +inline __attribute__((always_inline)) void put_mcd(v8cint32 , int ); +inline __attribute__((always_inline)) void put_mcd(v8cfloat , int ); +inline __attribute__((always_inline)) void put_mcd(v16cbfloat16 , int ); +inline __attribute__((always_inline)) void put_mcd(v4cacc64 ); +inline __attribute__((always_inline)) void put_mcd(v8caccfloat ); +inline __attribute__((always_inline)) void put_mcd(v16cint16 ); +inline __attribute__((always_inline)) void put_mcd(v8cint32 ); +inline __attribute__((always_inline)) void put_mcd(v8cfloat ); +inline __attribute__((always_inline)) void put_mcd(v16cbfloat16 ); +inline __attribute__((always_inline)) void put_mcd(v8cacc64 , int ); +inline __attribute__((always_inline)) void put_mcd(v16caccfloat , int ); +inline __attribute__((always_inline)) void put_mcd(v32cint16 , int ); +inline __attribute__((always_inline)) void put_mcd(v16cint32 , int ); +inline __attribute__((always_inline)) void put_mcd(v16cfloat , int ); +inline __attribute__((always_inline)) void put_mcd(v8cacc64 ); +inline __attribute__((always_inline)) void put_mcd(v32cint16 ); +inline __attribute__((always_inline)) void put_mcd(v16cint32 ); +inline __attribute__((always_inline)) void put_mcd(v16cfloat ); +inline __attribute__((always_inline)) void put_ms(cint16 ); +inline __attribute__((always_inline)) void put_ms(cint16 , int ); +inline 
__attribute__((always_inline)) void put_ms_nb(cint16 , int , bool & ); +inline __attribute__((always_inline)) void put_ms_nb(cint16 , bool & ); +inline __attribute__((always_inline)) void put_ms(cint16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v2bfloat16 ); +inline __attribute__((always_inline)) void put_ms(v2bfloat16 , int ); +inline __attribute__((always_inline)) void put_ms_nb(v2bfloat16 , int , bool & ); +inline __attribute__((always_inline)) void put_ms_nb(v2bfloat16 , bool & ); +inline __attribute__((always_inline)) void put_ms(v2bfloat16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v4cint16 , int ); +inline __attribute__((always_inline)) void put_ms(v4cint16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v2cint32 , int ); +inline __attribute__((always_inline)) void put_ms(v2cint32 , int , int ); +inline __attribute__((always_inline)) void put_ms(v2cfloat , int ); +inline __attribute__((always_inline)) void put_ms(v2cfloat , int , int ); +inline __attribute__((always_inline)) void put_ms(v8bfloat16 , int ); +inline __attribute__((always_inline)) void put_ms(v8bfloat16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v4cbfloat16 , int ); +inline __attribute__((always_inline)) void put_ms(v4cbfloat16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v8cint16 , int ); +inline __attribute__((always_inline)) void put_ms(v8cint16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v4cint32 , int ); +inline __attribute__((always_inline)) void put_ms(v4cint32 , int , int ); +inline __attribute__((always_inline)) void put_ms(v4cfloat , int ); +inline __attribute__((always_inline)) void put_ms(v4cfloat , int , int ); +inline __attribute__((always_inline)) void put_ms(v16bfloat16 , int ); +inline __attribute__((always_inline)) void put_ms(v16bfloat16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v8cbfloat16 , int ); +inline 
__attribute__((always_inline)) void put_ms(v8cbfloat16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v16cint16 , int ); +inline __attribute__((always_inline)) void put_ms(v16cint16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v8cint32 , int ); +inline __attribute__((always_inline)) void put_ms(v8cint32 , int , int ); +inline __attribute__((always_inline)) void put_ms(v8cfloat , int ); +inline __attribute__((always_inline)) void put_ms(v8cfloat , int , int ); +inline __attribute__((always_inline)) void put_ms(v32bfloat16 , int ); +inline __attribute__((always_inline)) void put_ms(v32bfloat16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v16cbfloat16 , int ); +inline __attribute__((always_inline)) void put_ms(v16cbfloat16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v32int32 , int ); +inline __attribute__((always_inline)) void put_ms(v256int4 , int ); +inline __attribute__((always_inline)) void put_ms(v256uint4 , int ); +inline __attribute__((always_inline)) void put_ms(v128int8 , int ); +inline __attribute__((always_inline)) void put_ms(v128uint8 , int ); +inline __attribute__((always_inline)) void put_ms(v64int16 , int ); +inline __attribute__((always_inline)) void put_ms(v64uint16 , int ); +inline __attribute__((always_inline)) void put_ms(v32cint16 , int ); +inline __attribute__((always_inline)) void put_ms(v32cint16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v32uint32 , int ); +inline __attribute__((always_inline)) void put_ms(v16cint32 , int ); +inline __attribute__((always_inline)) void put_ms(v16cint32 , int , int ); +inline __attribute__((always_inline)) void put_ms(v16cfloat , int ); +inline __attribute__((always_inline)) void put_ms(v16cfloat , int , int ); +inline __attribute__((always_inline)) void put_ms(v64bfloat16 , int ); +inline __attribute__((always_inline)) void put_ms(v64bfloat16 , int , int ); +inline __attribute__((always_inline)) void 
put_ms(v32cbfloat16 , int ); +inline __attribute__((always_inline)) void put_ms(v32cbfloat16 , int , int ); +inline __attribute__((always_inline)) void put_ms(v32float , int ); +inline __attribute__((always_inline)) float getf_ss(); +inline __attribute__((always_inline)) float getf_ss(bool & ); +inline __attribute__((always_inline)) float getf_ss_nb(bool & ); +inline __attribute__((always_inline)) float getf_ss_nb(bool & , bool & ); +inline __attribute__((always_inline)) cint16 get_ss_cint16(); +inline __attribute__((always_inline)) cint16 get_ss_cint16(bool & ); +inline __attribute__((always_inline)) cint16 get_ss_nb_cint16(bool & ); +inline __attribute__((always_inline)) cint16 get_ss_nb_cint16(bool & , bool & ); +inline __attribute__((always_inline)) v2bfloat16 get_ss_v2bfloat16(); +inline __attribute__((always_inline)) v2bfloat16 get_ss_v2bfloat16(bool & ); +inline __attribute__((always_inline)) v2bfloat16 get_ss_nb_v2bfloat16(bool & ); +inline __attribute__((always_inline)) v2bfloat16 get_ss_nb_v2bfloat16(bool & , bool & ); +inline __attribute__((always_inline)) v4int32 get_ss_v4int32(bool & ); +inline __attribute__((always_inline)) v32int4 get_ss_v32int4(bool & ); +inline __attribute__((always_inline)) v32uint4 get_ss_v32uint4(bool & ); +inline __attribute__((always_inline)) v16int8 get_ss_v16int8(bool & ); +inline __attribute__((always_inline)) v16uint8 get_ss_v16uint8(bool & ); +inline __attribute__((always_inline)) v8int16 get_ss_v8int16(bool & ); +inline __attribute__((always_inline)) v8uint16 get_ss_v8uint16(bool & ); +inline __attribute__((always_inline)) v4cint16 get_ss_v4cint16(); +inline __attribute__((always_inline)) v4cint16 get_ss_v4cint16(bool & ); +inline __attribute__((always_inline)) v4uint32 get_ss_v4uint32(bool & ); +inline __attribute__((always_inline)) v2cint32 get_ss_v2cint32(); +inline __attribute__((always_inline)) v2cint32 get_ss_v2cint32(bool & ); +inline __attribute__((always_inline)) v2cfloat get_ss_v2cfloat(); +inline 
__attribute__((always_inline)) v2cfloat get_ss_v2cfloat(bool & ); +inline __attribute__((always_inline)) v8bfloat16 get_ss_v8bfloat16(); +inline __attribute__((always_inline)) v8bfloat16 get_ss_v8bfloat16(bool & ); +inline __attribute__((always_inline)) v4cbfloat16 get_ss_v4cbfloat16(); +inline __attribute__((always_inline)) v4cbfloat16 get_ss_v4cbfloat16(bool & ); +inline __attribute__((always_inline)) v4float get_ss_v4float(); +inline __attribute__((always_inline)) v4float get_ss_v4float(bool & ); +inline __attribute__((always_inline)) v8int32 get_ss_v8int32(bool & ); +inline __attribute__((always_inline)) v64int4 get_ss_v64int4(bool & ); +inline __attribute__((always_inline)) v64uint4 get_ss_v64uint4(bool & ); +inline __attribute__((always_inline)) v32int8 get_ss_v32int8(bool & ); +inline __attribute__((always_inline)) v32uint8 get_ss_v32uint8(bool & ); +inline __attribute__((always_inline)) v16int16 get_ss_v16int16(bool & ); +inline __attribute__((always_inline)) v16uint16 get_ss_v16uint16(bool & ); +inline __attribute__((always_inline)) v8cint16 get_ss_v8cint16(); +inline __attribute__((always_inline)) v8cint16 get_ss_v8cint16(bool & ); +inline __attribute__((always_inline)) v8uint32 get_ss_v8uint32(bool & ); +inline __attribute__((always_inline)) v4cint32 get_ss_v4cint32(); +inline __attribute__((always_inline)) v4cint32 get_ss_v4cint32(bool & ); +inline __attribute__((always_inline)) v4cfloat get_ss_v4cfloat(); +inline __attribute__((always_inline)) v4cfloat get_ss_v4cfloat(bool & ); +inline __attribute__((always_inline)) v16bfloat16 get_ss_v16bfloat16(); +inline __attribute__((always_inline)) v16bfloat16 get_ss_v16bfloat16(bool & ); +inline __attribute__((always_inline)) v8cbfloat16 get_ss_v8cbfloat16(); +inline __attribute__((always_inline)) v8cbfloat16 get_ss_v8cbfloat16(bool & ); +inline __attribute__((always_inline)) v8float get_ss_v8float(); +inline __attribute__((always_inline)) v8float get_ss_v8float(bool & ); +inline __attribute__((always_inline)) 
v16int32 get_ss_v16int32(bool & ); +inline __attribute__((always_inline)) v128int4 get_ss_v128int4(bool & ); +inline __attribute__((always_inline)) v128uint4 get_ss_v128uint4(bool & ); +inline __attribute__((always_inline)) v64int8 get_ss_v64int8(bool & ); +inline __attribute__((always_inline)) v64uint8 get_ss_v64uint8(bool & ); +inline __attribute__((always_inline)) v32int16 get_ss_v32int16(bool & ); +inline __attribute__((always_inline)) v32uint16 get_ss_v32uint16(bool & ); +inline __attribute__((always_inline)) v16cint16 get_ss_v16cint16(); +inline __attribute__((always_inline)) v16cint16 get_ss_v16cint16(bool & ); +inline __attribute__((always_inline)) v16uint32 get_ss_v16uint32(bool & ); +inline __attribute__((always_inline)) v8cint32 get_ss_v8cint32(); +inline __attribute__((always_inline)) v8cint32 get_ss_v8cint32(bool & ); +inline __attribute__((always_inline)) v8cfloat get_ss_v8cfloat(); +inline __attribute__((always_inline)) v8cfloat get_ss_v8cfloat(bool & ); +inline __attribute__((always_inline)) v32bfloat16 get_ss_v32bfloat16(); +inline __attribute__((always_inline)) v32bfloat16 get_ss_v32bfloat16(bool & ); +inline __attribute__((always_inline)) v16cbfloat16 get_ss_v16cbfloat16(); +inline __attribute__((always_inline)) v16cbfloat16 get_ss_v16cbfloat16(bool & ); +inline __attribute__((always_inline)) v16float get_ss_v16float(); +inline __attribute__((always_inline)) v16float get_ss_v16float(bool & ); +inline __attribute__((always_inline)) v32int32 get_ss_v32int32(bool & ); +inline __attribute__((always_inline)) v256int4 get_ss_v256int4(bool & ); +inline __attribute__((always_inline)) v256uint4 get_ss_v256uint4(bool & ); +inline __attribute__((always_inline)) v128int8 get_ss_v128int8(bool & ); +inline __attribute__((always_inline)) v128uint8 get_ss_v128uint8(bool & ); +inline __attribute__((always_inline)) v64int16 get_ss_v64int16(bool & ); +inline __attribute__((always_inline)) v64uint16 get_ss_v64uint16(bool & ); +inline __attribute__((always_inline)) 
v32cint16 get_ss_v32cint16(); +inline __attribute__((always_inline)) v32cint16 get_ss_v32cint16(bool & ); +inline __attribute__((always_inline)) v32uint32 get_ss_v32uint32(bool & ); +inline __attribute__((always_inline)) v16cint32 get_ss_v16cint32(); +inline __attribute__((always_inline)) v16cint32 get_ss_v16cint32(bool & ); +inline __attribute__((always_inline)) v16cfloat get_ss_v16cfloat(); +inline __attribute__((always_inline)) v16cfloat get_ss_v16cfloat(bool & ); +inline __attribute__((always_inline)) v64bfloat16 get_ss_v64bfloat16(); +inline __attribute__((always_inline)) v64bfloat16 get_ss_v64bfloat16(bool & ); +inline __attribute__((always_inline)) v32cbfloat16 get_ss_v32cbfloat16(); +inline __attribute__((always_inline)) v32cbfloat16 get_ss_v32cbfloat16(bool & ); +inline __attribute__((always_inline)) v32float get_ss_v32float(); +inline __attribute__((always_inline)) v32float get_ss_v32float(bool & ); +inline __attribute__((always_inline)) v8cint16 ssrs(v8cacc64 , int , int ); +inline __attribute__((always_inline)) v8cint16 ssrs_conf(v8cacc64 , int , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v8cint32 lsrs(v8cacc64 , int , int ); +inline __attribute__((always_inline)) v8cint32 lsrs_conf(v8cacc64 , int , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v4cint32 lsrs(v4cacc64 , int , int ); +inline __attribute__((always_inline)) v4cint32 lsrs_conf(v4cacc64 , int , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v8cint16 ssrs(v8cacc64 , int ); +inline __attribute__((always_inline)) v8cint16 ssrs_conf(v8cacc64 , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v4cint32 lsrs(v4cacc64 , int ); +inline __attribute__((always_inline)) v4cint32 lsrs_conf(v4cacc64 , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v8cint32 lsrs(v8cacc64 , int ); +inline __attribute__((always_inline)) v8cint32 lsrs_conf(v8cacc64 , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) 
v8cint16 srs_to_v8cint16(v8cacc64 , int , int ); +inline __attribute__((always_inline)) v8cint16 srs_to_v8cint16_conf(v8cacc64 , int , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v8cint32 srs_to_cint32(v8cacc64 , int , int ); +inline __attribute__((always_inline)) v8cint32 srs_to_cint32_conf(v8cacc64 , int , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v4cint32 srs_to_v4cint32(v4cacc64 , int , int ); +inline __attribute__((always_inline)) v4cint32 srs_to_v4cint32_conf(v4cacc64 , int , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v8cint32 srs_to_v8cint32(v8cacc64 , int , int ); +inline __attribute__((always_inline)) v8cint32 srs_to_v8cint32_conf(v8cacc64 , int , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v8cint16 srs_to_v8cint16(v8cacc64 , int ); +inline __attribute__((always_inline)) v8cint16 srs_to_v8cint16_conf(v8cacc64 , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v4cint32 srs_to_v4cint32(v4cacc64 , int ); +inline __attribute__((always_inline)) v4cint32 srs_to_v4cint32_conf(v4cacc64 , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v8cint32 srs_to_v8cint32(v8cacc64 , int ); +inline __attribute__((always_inline)) v8cint32 srs_to_v8cint32_conf(v8cacc64 , int , crsat_t , crrnd_t ); +inline __attribute__((always_inline)) v16bfloat16 srs(v16accfloat ); +inline __attribute__((always_inline)) v8cbfloat16 to_v8cbfloat16(v8caccfloat ); +inline __attribute__((always_inline)) v16cbfloat16 to_v16cbfloat16(v16caccfloat ); +inline __attribute__((always_inline)) v8acc64 lups_conf(v8int32 , int , crsat_t ); +inline __attribute__((always_inline)) v8acc64 lups_conf(v8uint32 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc32 sups_conf(v16int16 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc32 sups_conf(v16uint16 , int , crsat_t ); +inline __attribute__((always_inline)) v4cacc64 lups(v4cint32 , int ); +inline 
__attribute__((always_inline)) v4cacc64 lups_conf(v4cint32 , int , crsat_t ); +inline __attribute__((always_inline)) v8acc64 lups_conf(v8int32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v8acc64 lups_conf(v8uint32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc32 sups_conf(v16int16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc32 sups_conf(v16uint16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v4cacc64 lups(v4cint32 , int , int ); +inline __attribute__((always_inline)) v4cacc64 lups_conf(v4cint32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 sups_conf(v32int8 , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 sups_conf(v32uint8 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 lups_conf(v16int16 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 lups_conf(v16uint16 , int , crsat_t ); +inline __attribute__((always_inline)) v8cacc64 lups(v8cint16 , int ); +inline __attribute__((always_inline)) v8cacc64 lups_conf(v8cint16 , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 sups_conf(v32int8 , int , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 sups_conf(v32uint8 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 lups_conf(v16int16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 lups_conf(v16uint16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v8cacc64 lups(v8cint16 , int , int ); +inline __attribute__((always_inline)) v8cacc64 lups_conf(v8cint16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 lups_conf(v16int32 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 lups_conf(v16uint32 , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 sups_conf(v32int16 , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 sups_conf(v32uint16 , int , crsat_t ); 
+inline __attribute__((always_inline)) v8cacc64 lups(v8cint32 , int ); +inline __attribute__((always_inline)) v8cacc64 lups_conf(v8cint32 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 lups_conf(v16int32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 lups_conf(v16uint32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 sups_conf(v32int16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 sups_conf(v32uint16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v8cacc64 lups(v8cint32 , int , int ); +inline __attribute__((always_inline)) v8cacc64 lups_conf(v8cint32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v8acc64 ups_to_v8acc64_conf(v8int32 , int , crsat_t ); +inline __attribute__((always_inline)) v8acc64 ups_to_v8acc64_conf(v8uint32 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc32 ups_to_v16acc32_conf(v16int16 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc32 ups_to_v16acc32_conf(v16uint16 , int , crsat_t ); +inline __attribute__((always_inline)) v4cacc64 ups_to_v4cacc64(v4cint32 , int ); +inline __attribute__((always_inline)) v4cacc64 ups_to_v4cacc64_conf(v4cint32 , int , crsat_t ); +inline __attribute__((always_inline)) v8acc64 ups_to_v8acc64_conf(v8int32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v8acc64 ups_to_v8acc64_conf(v8uint32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc32 ups_to_v16acc32_conf(v16int16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc32 ups_to_v16acc32_conf(v16uint16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v4cacc64 ups_to_v4cacc64(v4cint32 , int , int ); +inline __attribute__((always_inline)) v4cacc64 ups_to_v4cacc64_conf(v4cint32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 ups_to_v32acc32_conf(v32int8 , int , crsat_t ); +inline __attribute__((always_inline)) 
v32acc32 ups_to_v32acc32_conf(v32uint8 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 ups_to_v16acc64_conf(v16int16 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 ups_to_v16acc64_conf(v16uint16 , int , crsat_t ); +inline __attribute__((always_inline)) v8cacc64 ups_to_v8cacc64(v8cint16 , int ); +inline __attribute__((always_inline)) v8cacc64 ups_to_v8cacc64_conf(v8cint16 , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 ups_to_v32acc32_conf(v32int8 , int , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 ups_to_v32acc32_conf(v32uint8 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 ups_to_v16acc64_conf(v16int16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 ups_to_v16acc64_conf(v16uint16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v8cacc64 ups_to_v8cacc64(v8cint16 , int , int ); +inline __attribute__((always_inline)) v8cacc64 ups_to_v8cacc64_conf(v8cint16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v8cacc64 ups_to_v8cacc64(v8cint32 , int ); +inline __attribute__((always_inline)) v8cacc64 ups_to_v8cacc64_conf(v8cint32 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 ups_to_v16acc64_conf(v16int32 , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 ups_to_v16acc64_conf(v16uint32 , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 ups_to_v32acc32_conf(v32int16 , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 ups_to_v32acc32_conf(v32uint16 , int , crsat_t ); +inline __attribute__((always_inline)) v8cacc64 ups_to_v8cacc64(v8cint32 , int , int ); +inline __attribute__((always_inline)) v8cacc64 ups_to_v8cacc64_conf(v8cint32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 ups_to_v16acc64_conf(v16int32 , int , int , crsat_t ); +inline __attribute__((always_inline)) v16acc64 ups_to_v16acc64_conf(v16uint32 , int , int , crsat_t 
); +inline __attribute__((always_inline)) v32acc32 ups_to_v32acc32_conf(v32int16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v32acc32 ups_to_v32acc32_conf(v32uint16 , int , int , crsat_t ); +inline __attribute__((always_inline)) v8caccfloat ups(v8cbfloat16 ); +inline __attribute__((always_inline)) v8caccfloat ups_to_v8caccfloat(v8cbfloat16 ); +inline __attribute__((always_inline)) v16caccfloat ups_to_v16caccfloat(v16cbfloat16 ); +inline __attribute__((always_inline)) int scalar_abs(int ); +inline __attribute__((always_inline)) void dstep(unsigned int , unsigned int , unsigned int & , unsigned int & , unsigned int ); +inline __attribute__((always_inline)) void nop(unsigned int ); +inline __attribute__((always_inline)) void nop(); +inline __attribute__((always_inline)) int compute_control(int , int , int , int , int , int , int , int , int , int , int ); +inline __attribute__((always_inline)) v16accfloat addmac_4x16_16x4_conf(v64bfloat16 , v64bfloat16_sparse , v16accfloat , v16accfloat , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_2(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_2(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_2(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_2(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_2(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_2(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_2(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_2(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_2(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 
submsc_elem_8_2(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_2_conf(v16cint16 , v16cint16 , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_2_conf(v16cint16 , v16cint16 , int , int ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_2_conf(v16cint16 , v16cint16 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_2_conf(v16cint16 , v16cint16 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_2_conf(v16cint16 , v16cint16 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_2_conf(v16cint16 , v16cint16 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_2_conf(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 , int , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_2_conf(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 , int , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_2_conf(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_2_conf(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_2_conf(v16cint16 , int , v16cint16 , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_2_conf(v16cint16 , int , v16cint16 , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_2_conf(v16cint16 , int , v16cint16 , int , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_2_conf(v16cint16 , int , v16cint16 , int , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_2_conf(v16cint16 , int , 
v16cint16 , int , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_2_conf(v16cint16 , int , v16cint16 , int , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_2_conf(v16cint16 , int , v16cint16 , int , v8cacc64 , v8cacc64 , int , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_2_conf(v16cint16 , int , v16cint16 , int , v8cacc64 , v8cacc64 , int , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_2_conf(v16cint16 , int , v16cint16 , int , v8cacc64 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_2_conf(v16cint16 , int , v16cint16 , int , v8cacc64 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_2_cc(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_2_cc(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_2_cc(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_2_cc(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_2_cc(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_2_cc(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_2_cc(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_2_cc(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_2_cc(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_2_cc(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_2_cn(v16cint16 , v16cint16 ); +inline 
__attribute__((always_inline)) v8cacc64 negmul_elem_8_2_cn(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_2_cn(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_2_cn(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_2_cn(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_2_cn(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_2_cn(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_2_cn(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_2_cn(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_2_cn(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_2_nc(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_2_nc(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_2_nc(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_2_nc(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_2_nc(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_2_nc(v16cint16 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_2_nc(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_2_nc(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_2_nc(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_2_nc(v16cint16 , v16cint16 , v8cacc64 , v8cacc64 ); +inline 
__attribute__((always_inline)) v8cacc64 mul_elem_8(v8cint32 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8(v8cint32 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_conf(v8cint32 , v16cint16 , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_conf(v8cint32 , v16cint16 , int , int ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_conf(v8cint32 , v16cint16 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_conf(v8cint32 , v16cint16 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_conf(v8cint32 , v16cint16 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_conf(v8cint32 , v16cint16 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_conf(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 , int , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_conf(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 , int , int , int , int , int , int ); +inline 
__attribute__((always_inline)) v8cacc64 submac_elem_8_conf(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_conf(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_conf(v8cint32 , int , v16cint16 , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_conf(v8cint32 , int , v16cint16 , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_conf(v8cint32 , int , v16cint16 , int , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_conf(v8cint32 , int , v16cint16 , int , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_conf(v8cint32 , int , v16cint16 , int , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_conf(v8cint32 , int , v16cint16 , int , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_conf(v8cint32 , int , v16cint16 , int , v8cacc64 , v8cacc64 , int , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_conf(v8cint32 , int , v16cint16 , int , v8cacc64 , v8cacc64 , int , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_conf(v8cint32 , int , v16cint16 , int , v8cacc64 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_conf(v8cint32 , int , v16cint16 , int , v8cacc64 , v8cacc64 , int , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_cc(v8cint32 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_cc(v8cint32 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_cc(v8cint32 , v16cint16 , v8cacc64 ); +inline 
__attribute__((always_inline)) v8cacc64 msc_elem_8_cc(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_cc(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_cc(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_cc(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_cc(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_cc(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_cc(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_cn(v8cint32 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_cn(v8cint32 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_cn(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_cn(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_cn(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_cn(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_cn(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_cn(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_cn(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_cn(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_nc(v8cint32 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_nc(v8cint32 , v16cint16 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_nc(v8cint32 , 
v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_nc(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmac_elem_8_nc(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negmsc_elem_8_nc(v8cint32 , v16cint16 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmac_elem_8_nc(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 addmsc_elem_8_nc(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submac_elem_8_nc(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 submsc_elem_8_nc(v8cint32 , v16cint16 , v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_2(v16cbfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_2(v16cbfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_2(v16cbfloat16 , v16cbfloat16 , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_2(v16cbfloat16 , v16cbfloat16 , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat addmac_elem_8_2(v16cbfloat16 , v16cbfloat16 , v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat addmsc_elem_8_2(v16cbfloat16 , v16cbfloat16 , v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_2(v16bfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_2(v16bfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_2(v16bfloat16 , v16cbfloat16 , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_2(v16bfloat16 , v16cbfloat16 , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat addmac_elem_8_2(v16bfloat16 , v16cbfloat16 , v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) 
v8caccfloat addmsc_elem_8_2(v16bfloat16 , v16cbfloat16 , v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_2(v16cbfloat16 , v16bfloat16 ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_2(v16cbfloat16 , v16bfloat16 ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_2(v16cbfloat16 , v16bfloat16 , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_2(v16cbfloat16 , v16bfloat16 , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat addmac_elem_8_2(v16cbfloat16 , v16bfloat16 , v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat addmsc_elem_8_2(v16cbfloat16 , v16bfloat16 , v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_2_conf(v16cbfloat16 , v16cbfloat16 , int ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_2_conf(v16cbfloat16 , v16cbfloat16 , int ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_2_conf(v16cbfloat16 , v16cbfloat16 , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_2_conf(v16cbfloat16 , v16cbfloat16 , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat addmac_elem_8_2_conf(v16cbfloat16 , v16cbfloat16 , v8caccfloat , v8caccfloat , int , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat addmsc_elem_8_2_conf(v16cbfloat16 , v16cbfloat16 , v8caccfloat , v8caccfloat , int , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_2_conf(v16cbfloat16 , v16bfloat16 , int ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_2_conf(v16cbfloat16 , v16bfloat16 , int ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_2_conf(v16cbfloat16 , v16bfloat16 , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_2_conf(v16cbfloat16 , v16bfloat16 , v8caccfloat , int , int , 
int ); +inline __attribute__((always_inline)) v8caccfloat addmac_elem_8_2_conf(v16cbfloat16 , v16bfloat16 , v8caccfloat , v8caccfloat , int , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat addmsc_elem_8_2_conf(v16cbfloat16 , v16bfloat16 , v8caccfloat , v8caccfloat , int , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_2_conf(v16bfloat16 , v16cbfloat16 , int ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_2_conf(v16bfloat16 , v16cbfloat16 , int ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_2_conf(v16bfloat16 , v16cbfloat16 , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_2_conf(v16bfloat16 , v16cbfloat16 , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat addmac_elem_8_2_conf(v16bfloat16 , v16cbfloat16 , v8caccfloat , v8caccfloat , int , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat addmsc_elem_8_2_conf(v16bfloat16 , v16cbfloat16 , v8caccfloat , v8caccfloat , int , int , int , int ); +inline __attribute__((always_inline)) v32acc32 negsub(v32acc32 , v32acc32 ); +inline __attribute__((always_inline)) v32acc32 negsub_conf(v32acc32 , v32acc32 , int , int , int , int ); +inline __attribute__((always_inline)) v16acc64 negsub(v16acc64 , v16acc64 ); +inline __attribute__((always_inline)) v16acc64 negsub_conf(v16acc64 , v16acc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 add(v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 sub(v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negadd(v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 negsub(v8cacc64 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 neg(v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 clr16c(); +inline __attribute__((always_inline)) v8cacc64 broadcast_zero_to_v8cacc64(); +inline 
__attribute__((always_inline)) v8cacc64 add_conf(v8cacc64 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 sub_conf(v8cacc64 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negadd_conf(v8cacc64 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 negsub_conf(v8cacc64 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 neg_conf(v8cacc64 , int , int , int ); +inline __attribute__((always_inline)) v16accfloat negsub(v16accfloat , v16accfloat ); +inline __attribute__((always_inline)) v16accfloat negsub_conf(v16accfloat , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat add(v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat sub(v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat negadd(v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat negsub(v8caccfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat neg(v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat add_conf(v8caccfloat , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat sub_conf(v8caccfloat , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat negadd_conf(v8caccfloat , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat negsub_conf(v8caccfloat , v8caccfloat , int , int , int ); +inline __attribute__((always_inline)) v8caccfloat neg_conf(v8caccfloat , int , int ); +inline __attribute__((always_inline)) v16caccfloat add(v16caccfloat , v16caccfloat ); +inline __attribute__((always_inline)) v16caccfloat sub(v16caccfloat , v16caccfloat ); +inline __attribute__((always_inline)) v16caccfloat negadd(v16caccfloat , v16caccfloat ); +inline __attribute__((always_inline)) v16caccfloat negsub(v16caccfloat , v16caccfloat ); +inline 
__attribute__((always_inline)) v16caccfloat neg(v16caccfloat ); +inline __attribute__((always_inline)) v16caccfloat add_conf(v16caccfloat , v16caccfloat , int , int , int ); +inline __attribute__((always_inline)) v16caccfloat sub_conf(v16caccfloat , v16caccfloat , int , int , int ); +inline __attribute__((always_inline)) v16caccfloat negadd_conf(v16caccfloat , v16caccfloat , int , int , int ); +inline __attribute__((always_inline)) v16caccfloat negsub_conf(v16caccfloat , v16caccfloat , int , int , int ); +inline __attribute__((always_inline)) v16caccfloat neg_conf(v16caccfloat , int , int ); +inline __attribute__((always_inline)) v4caccfloat add(v4caccfloat , v4caccfloat ); +inline __attribute__((always_inline)) v4caccfloat sub(v4caccfloat , v4caccfloat ); +inline __attribute__((always_inline)) v4caccfloat negadd(v4caccfloat , v4caccfloat ); +inline __attribute__((always_inline)) v4caccfloat negsub(v4caccfloat , v4caccfloat ); +inline __attribute__((always_inline)) v4caccfloat neg(v4caccfloat ); +inline __attribute__((always_inline)) v4caccfloat add_conf(v4caccfloat , v4caccfloat , int , int , int ); +inline __attribute__((always_inline)) v4caccfloat sub_conf(v4caccfloat , v4caccfloat , int , int , int ); +inline __attribute__((always_inline)) v4caccfloat negadd_conf(v4caccfloat , v4caccfloat , int , int , int ); +inline __attribute__((always_inline)) v4caccfloat negsub_conf(v4caccfloat , v4caccfloat , int , int , int ); +inline __attribute__((always_inline)) v4caccfloat neg_conf(v4caccfloat , int , int ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8(v8float , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8(v8cfloat , v8float ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_low(v8float , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_low(v8cfloat , v8float ); +inline 
__attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_low(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_fast(v8float , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_fast(v8cfloat , v8float ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_fast(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_safe(v8float , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_safe(v8cfloat , v8float ); +inline __attribute__((always_inline)) v8caccfloat mul_elem_8_accuracy_safe(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v16accfloat negmul_elem_16(v16float , v16float ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8(v8float , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8(v8cfloat , v8float ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_low(v8float , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_low(v8cfloat , v8float ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_low(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_fast(v8float , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_fast(v8cfloat , v8float ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_fast(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_safe(v8float , v8cfloat ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_safe(v8cfloat , v8float ); +inline __attribute__((always_inline)) v8caccfloat negmul_elem_8_accuracy_safe(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v16accfloat 
mac_elem_16(v16float , v16float , v16accfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8(v8float , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8(v8cfloat , v8float , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8(v8cfloat , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_safe(v8float , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_safe(v8cfloat , v8float , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_safe(v8cfloat , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_fast(v8float , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_fast(v8cfloat , v8float , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_fast(v8cfloat , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_low(v8float , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_low(v8cfloat , v8float , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat mac_elem_8_accuracy_low(v8cfloat , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v16accfloat addmac_elem_16(v16float , v16float , v16accfloat , v16accfloat ); +inline __attribute__((always_inline)) v16accfloat msc_elem_16(v16float , v16float , v16accfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8(v8float , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8(v8cfloat , v8float , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8(v8cfloat , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_safe(v8float , v8cfloat , v8caccfloat ); +inline 
__attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_safe(v8cfloat , v8float , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_safe(v8cfloat , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_fast(v8float , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_fast(v8cfloat , v8float , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_fast(v8cfloat , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_low(v8float , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_low(v8cfloat , v8float , v8caccfloat ); +inline __attribute__((always_inline)) v8caccfloat msc_elem_8_accuracy_low(v8cfloat , v8cfloat , v8caccfloat ); +inline __attribute__((always_inline)) v16accfloat addmsc_elem_16(v16float , v16float , v16accfloat , v16accfloat ); +inline __attribute__((always_inline)) v4caccfloat mul_2x8_8x2(v16float , v16cfloat ); +inline __attribute__((always_inline)) v4caccfloat mul_2x8_8x2_accuracy_safe(v16float , v16cfloat ); +inline __attribute__((always_inline)) v4caccfloat mul_2x8_8x2_accuracy_fast(v16float , v16cfloat ); +inline __attribute__((always_inline)) v4caccfloat mul_2x8_8x2_accuracy_low(v16float , v16cfloat ); +inline __attribute__((always_inline)) v4caccfloat mul_2x8_8x2(v16bfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8(v8cint32 , v8cint32 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8(v8cint32 , v8cint32 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_conf(v8cint32 , v8cint32 , int ); +inline __attribute__((always_inline)) v8cacc64 
negmul_elem_8_conf(v8cint32 , v8cint32 , int ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_conf(v8cint32 , v8cint32 , v8cacc64 , int , int ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_conf(v8cint32 , v8cint32 , v8cacc64 , int , int ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_conf(v8cint32 , v8cint32 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_conf(v8cint32 , v8cint32 , v8cacc64 , int , int , int , int ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_cc(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_cc(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_cc(v8cint32 , v8cint32 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_cc(v8cint32 , v8cint32 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_cn(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_cn(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_cn(v8cint32 , v8cint32 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_cn(v8cint32 , v8cint32 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 mul_elem_8_nc(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cacc64 negmul_elem_8_nc(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cacc64 mac_elem_8_nc(v8cint32 , v8cint32 , v8cacc64 ); +inline __attribute__((always_inline)) v8cacc64 msc_elem_8_nc(v8cint32 , v8cint32 , v8cacc64 ); +inline __attribute__((always_inline)) v32int8 pack_conf(v32int16 , crsat_t ); +inline __attribute__((always_inline)) v32uint8 pack_conf(v32uint16 , crsat_t ); +inline __attribute__((always_inline)) v64int4 pack_conf(v64int8 , crsat_t ); +inline __attribute__((always_inline)) v64uint4 pack_conf(v64uint8 , crsat_t ); +inline __attribute__((always_inline)) v32int8 pack_conf(v32int16 , int , 
crsat_t ); +inline __attribute__((always_inline)) v32uint8 pack_conf(v32uint16 , int , crsat_t ); +inline __attribute__((always_inline)) v64int4 pack_conf(v64int8 , int , crsat_t ); +inline __attribute__((always_inline)) v64uint4 pack_conf(v64uint8 , int , crsat_t ); +inline __attribute__((always_inline)) void * compress_add(void * ); +inline __attribute__((always_inline)) v64int4 compr_pop(v64int4_compress *& ); +inline __attribute__((always_inline)) v64int4 compr_peek(v64int4_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v64int4_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v64int4_compress *& ); +inline __attribute__((always_inline)) v64int4 compr_pop(v64int4_compress *& , v64int4_compress *& ); +inline __attribute__((always_inline)) v64int4 compr_peek(v64int4_compress *& , v64int4_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v64int4_compress *& , v64int4_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v64int4_compress *& , v64int4_compress *& ); +inline __attribute__((always_inline)) v64int4 compr_pop_and_get_pointer(v64int4_compress *& ); +inline __attribute__((always_inline)) v64int4 compr_peek_and_get_pointer(v64int4_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v64int4_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v64int4_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v64int4_compress *& , v64int4_compress *& , v64int4 & , v64int4 & , v64int4 & , v64int4 & , v64int4 & , v64int4 & , v64int4 & , v64int4 & ); +inline __attribute__((always_inline)) void compr_pop4(v64int4_compress *& , v64int4_compress *& , v64int4 & , v64int4 & , v64int4 & , v64int4 & ); +inline __attribute__((always_inline)) void compr_pop2(v64int4_compress *& , v64int4_compress *& , v64int4 & , v64int4 & ); +inline __attribute__((always_inline)) void compr_pop1(v64int4_compress *& , 
v64int4_compress *& , v64int4 & ); +inline __attribute__((always_inline)) void compr_pop8(v64int4_compress *& , v64int4 & , v64int4 & , v64int4 & , v64int4 & , v64int4 & , v64int4 & , v64int4 & , v64int4 & ); +inline __attribute__((always_inline)) void compr_pop4(v64int4_compress *& , v64int4 & , v64int4 & , v64int4 & , v64int4 & ); +inline __attribute__((always_inline)) void compr_pop2(v64int4_compress *& , v64int4 & , v64int4 & ); +inline __attribute__((always_inline)) void compr_pop1(v64int4_compress *& , v64int4 & ); +inline __attribute__((always_inline)) v64uint4 compr_pop(v64uint4_compress *& ); +inline __attribute__((always_inline)) v64uint4 compr_peek(v64uint4_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v64uint4_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v64uint4_compress *& ); +inline __attribute__((always_inline)) v64uint4 compr_pop(v64uint4_compress *& , v64uint4_compress *& ); +inline __attribute__((always_inline)) v64uint4 compr_peek(v64uint4_compress *& , v64uint4_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v64uint4_compress *& , v64uint4_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v64uint4_compress *& , v64uint4_compress *& ); +inline __attribute__((always_inline)) v64uint4 compr_pop_and_get_pointer(v64uint4_compress *& ); +inline __attribute__((always_inline)) v64uint4 compr_peek_and_get_pointer(v64uint4_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v64uint4_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v64uint4_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v64uint4_compress *& , v64uint4_compress *& , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & ); +inline __attribute__((always_inline)) void compr_pop4(v64uint4_compress *& , v64uint4_compress *& , v64uint4 & , v64uint4 & , v64uint4 & 
, v64uint4 & ); +inline __attribute__((always_inline)) void compr_pop2(v64uint4_compress *& , v64uint4_compress *& , v64uint4 & , v64uint4 & ); +inline __attribute__((always_inline)) void compr_pop1(v64uint4_compress *& , v64uint4_compress *& , v64uint4 & ); +inline __attribute__((always_inline)) void compr_pop8(v64uint4_compress *& , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & ); +inline __attribute__((always_inline)) void compr_pop4(v64uint4_compress *& , v64uint4 & , v64uint4 & , v64uint4 & , v64uint4 & ); +inline __attribute__((always_inline)) void compr_pop2(v64uint4_compress *& , v64uint4 & , v64uint4 & ); +inline __attribute__((always_inline)) void compr_pop1(v64uint4_compress *& , v64uint4 & ); +inline __attribute__((always_inline)) v32int8 compr_pop(v32int8_compress *& ); +inline __attribute__((always_inline)) v32int8 compr_peek(v32int8_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32int8_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32int8_compress *& ); +inline __attribute__((always_inline)) v32int8 compr_pop(v32int8_compress *& , v32int8_compress *& ); +inline __attribute__((always_inline)) v32int8 compr_peek(v32int8_compress *& , v32int8_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32int8_compress *& , v32int8_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32int8_compress *& , v32int8_compress *& ); +inline __attribute__((always_inline)) v32int8 compr_pop_and_get_pointer(v32int8_compress *& ); +inline __attribute__((always_inline)) v32int8 compr_peek_and_get_pointer(v32int8_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v32int8_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v32int8_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v32int8_compress *& , v32int8_compress *& , v32int8 & , v32int8 & , 
v32int8 & , v32int8 & , v32int8 & , v32int8 & , v32int8 & , v32int8 & ); +inline __attribute__((always_inline)) void compr_pop4(v32int8_compress *& , v32int8_compress *& , v32int8 & , v32int8 & , v32int8 & , v32int8 & ); +inline __attribute__((always_inline)) void compr_pop2(v32int8_compress *& , v32int8_compress *& , v32int8 & , v32int8 & ); +inline __attribute__((always_inline)) void compr_pop1(v32int8_compress *& , v32int8_compress *& , v32int8 & ); +inline __attribute__((always_inline)) void compr_pop8(v32int8_compress *& , v32int8 & , v32int8 & , v32int8 & , v32int8 & , v32int8 & , v32int8 & , v32int8 & , v32int8 & ); +inline __attribute__((always_inline)) void compr_pop4(v32int8_compress *& , v32int8 & , v32int8 & , v32int8 & , v32int8 & ); +inline __attribute__((always_inline)) void compr_pop2(v32int8_compress *& , v32int8 & , v32int8 & ); +inline __attribute__((always_inline)) void compr_pop1(v32int8_compress *& , v32int8 & ); +inline __attribute__((always_inline)) v32uint8 compr_pop(v32uint8_compress *& ); +inline __attribute__((always_inline)) v32uint8 compr_peek(v32uint8_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32uint8_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32uint8_compress *& ); +inline __attribute__((always_inline)) v32uint8 compr_pop(v32uint8_compress *& , v32uint8_compress *& ); +inline __attribute__((always_inline)) v32uint8 compr_peek(v32uint8_compress *& , v32uint8_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32uint8_compress *& , v32uint8_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32uint8_compress *& , v32uint8_compress *& ); +inline __attribute__((always_inline)) v32uint8 compr_pop_and_get_pointer(v32uint8_compress *& ); +inline __attribute__((always_inline)) v32uint8 compr_peek_and_get_pointer(v32uint8_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v32uint8_compress *& ); +inline 
__attribute__((always_inline)) void compr_fill_and_get_pointer(v32uint8_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v32uint8_compress *& , v32uint8_compress *& , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & ); +inline __attribute__((always_inline)) void compr_pop4(v32uint8_compress *& , v32uint8_compress *& , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & ); +inline __attribute__((always_inline)) void compr_pop2(v32uint8_compress *& , v32uint8_compress *& , v32uint8 & , v32uint8 & ); +inline __attribute__((always_inline)) void compr_pop1(v32uint8_compress *& , v32uint8_compress *& , v32uint8 & ); +inline __attribute__((always_inline)) void compr_pop8(v32uint8_compress *& , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & ); +inline __attribute__((always_inline)) void compr_pop4(v32uint8_compress *& , v32uint8 & , v32uint8 & , v32uint8 & , v32uint8 & ); +inline __attribute__((always_inline)) void compr_pop2(v32uint8_compress *& , v32uint8 & , v32uint8 & ); +inline __attribute__((always_inline)) void compr_pop1(v32uint8_compress *& , v32uint8 & ); +inline __attribute__((always_inline)) v16int16 compr_pop(v16int16_compress *& ); +inline __attribute__((always_inline)) v16int16 compr_peek(v16int16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16int16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16int16_compress *& ); +inline __attribute__((always_inline)) v16int16 compr_pop(v16int16_compress *& , v16int16_compress *& ); +inline __attribute__((always_inline)) v16int16 compr_peek(v16int16_compress *& , v16int16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16int16_compress *& , v16int16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16int16_compress *& , v16int16_compress *& ); +inline __attribute__((always_inline)) v16int16 
compr_pop_and_get_pointer(v16int16_compress *& ); +inline __attribute__((always_inline)) v16int16 compr_peek_and_get_pointer(v16int16_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v16int16_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v16int16_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v16int16_compress *& , v16int16_compress *& , v16int16 & , v16int16 & , v16int16 & , v16int16 & , v16int16 & , v16int16 & , v16int16 & , v16int16 & ); +inline __attribute__((always_inline)) void compr_pop4(v16int16_compress *& , v16int16_compress *& , v16int16 & , v16int16 & , v16int16 & , v16int16 & ); +inline __attribute__((always_inline)) void compr_pop2(v16int16_compress *& , v16int16_compress *& , v16int16 & , v16int16 & ); +inline __attribute__((always_inline)) void compr_pop1(v16int16_compress *& , v16int16_compress *& , v16int16 & ); +inline __attribute__((always_inline)) void compr_pop8(v16int16_compress *& , v16int16 & , v16int16 & , v16int16 & , v16int16 & , v16int16 & , v16int16 & , v16int16 & , v16int16 & ); +inline __attribute__((always_inline)) void compr_pop4(v16int16_compress *& , v16int16 & , v16int16 & , v16int16 & , v16int16 & ); +inline __attribute__((always_inline)) void compr_pop2(v16int16_compress *& , v16int16 & , v16int16 & ); +inline __attribute__((always_inline)) void compr_pop1(v16int16_compress *& , v16int16 & ); +inline __attribute__((always_inline)) v16uint16 compr_pop(v16uint16_compress *& ); +inline __attribute__((always_inline)) v16uint16 compr_peek(v16uint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16uint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16uint16_compress *& ); +inline __attribute__((always_inline)) v16uint16 compr_pop(v16uint16_compress *& , v16uint16_compress *& ); +inline __attribute__((always_inline)) v16uint16 compr_peek(v16uint16_compress *& , v16uint16_compress *& 
); +inline __attribute__((always_inline)) void compr_reset(v16uint16_compress *& , v16uint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16uint16_compress *& , v16uint16_compress *& ); +inline __attribute__((always_inline)) v16uint16 compr_pop_and_get_pointer(v16uint16_compress *& ); +inline __attribute__((always_inline)) v16uint16 compr_peek_and_get_pointer(v16uint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v16uint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v16uint16_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v16uint16_compress *& , v16uint16_compress *& , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & ); +inline __attribute__((always_inline)) void compr_pop4(v16uint16_compress *& , v16uint16_compress *& , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & ); +inline __attribute__((always_inline)) void compr_pop2(v16uint16_compress *& , v16uint16_compress *& , v16uint16 & , v16uint16 & ); +inline __attribute__((always_inline)) void compr_pop1(v16uint16_compress *& , v16uint16_compress *& , v16uint16 & ); +inline __attribute__((always_inline)) void compr_pop8(v16uint16_compress *& , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & ); +inline __attribute__((always_inline)) void compr_pop4(v16uint16_compress *& , v16uint16 & , v16uint16 & , v16uint16 & , v16uint16 & ); +inline __attribute__((always_inline)) void compr_pop2(v16uint16_compress *& , v16uint16 & , v16uint16 & ); +inline __attribute__((always_inline)) void compr_pop1(v16uint16_compress *& , v16uint16 & ); +inline __attribute__((always_inline)) v8int32 compr_pop(v8int32_compress *& ); +inline __attribute__((always_inline)) v8int32 compr_peek(v8int32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v8int32_compress *& 
); +inline __attribute__((always_inline)) void compr_fill(v8int32_compress *& ); +inline __attribute__((always_inline)) v8int32 compr_pop(v8int32_compress *& , v8int32_compress *& ); +inline __attribute__((always_inline)) v8int32 compr_peek(v8int32_compress *& , v8int32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v8int32_compress *& , v8int32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v8int32_compress *& , v8int32_compress *& ); +inline __attribute__((always_inline)) v8int32 compr_pop_and_get_pointer(v8int32_compress *& ); +inline __attribute__((always_inline)) v8int32 compr_peek_and_get_pointer(v8int32_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v8int32_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v8int32_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v8int32_compress *& , v8int32_compress *& , v8int32 & , v8int32 & , v8int32 & , v8int32 & , v8int32 & , v8int32 & , v8int32 & , v8int32 & ); +inline __attribute__((always_inline)) void compr_pop4(v8int32_compress *& , v8int32_compress *& , v8int32 & , v8int32 & , v8int32 & , v8int32 & ); +inline __attribute__((always_inline)) void compr_pop2(v8int32_compress *& , v8int32_compress *& , v8int32 & , v8int32 & ); +inline __attribute__((always_inline)) void compr_pop1(v8int32_compress *& , v8int32_compress *& , v8int32 & ); +inline __attribute__((always_inline)) void compr_pop8(v8int32_compress *& , v8int32 & , v8int32 & , v8int32 & , v8int32 & , v8int32 & , v8int32 & , v8int32 & , v8int32 & ); +inline __attribute__((always_inline)) void compr_pop4(v8int32_compress *& , v8int32 & , v8int32 & , v8int32 & , v8int32 & ); +inline __attribute__((always_inline)) void compr_pop2(v8int32_compress *& , v8int32 & , v8int32 & ); +inline __attribute__((always_inline)) void compr_pop1(v8int32_compress *& , v8int32 & ); +inline __attribute__((always_inline)) v8uint32 
compr_pop(v8uint32_compress *& ); +inline __attribute__((always_inline)) v8uint32 compr_peek(v8uint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v8uint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v8uint32_compress *& ); +inline __attribute__((always_inline)) v8uint32 compr_pop(v8uint32_compress *& , v8uint32_compress *& ); +inline __attribute__((always_inline)) v8uint32 compr_peek(v8uint32_compress *& , v8uint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v8uint32_compress *& , v8uint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v8uint32_compress *& , v8uint32_compress *& ); +inline __attribute__((always_inline)) v8uint32 compr_pop_and_get_pointer(v8uint32_compress *& ); +inline __attribute__((always_inline)) v8uint32 compr_peek_and_get_pointer(v8uint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v8uint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v8uint32_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v8uint32_compress *& , v8uint32_compress *& , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & ); +inline __attribute__((always_inline)) void compr_pop4(v8uint32_compress *& , v8uint32_compress *& , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & ); +inline __attribute__((always_inline)) void compr_pop2(v8uint32_compress *& , v8uint32_compress *& , v8uint32 & , v8uint32 & ); +inline __attribute__((always_inline)) void compr_pop1(v8uint32_compress *& , v8uint32_compress *& , v8uint32 & ); +inline __attribute__((always_inline)) void compr_pop8(v8uint32_compress *& , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & ); +inline __attribute__((always_inline)) void compr_pop4(v8uint32_compress *& , v8uint32 & , v8uint32 & , v8uint32 & , v8uint32 & ); 
+inline __attribute__((always_inline)) void compr_pop2(v8uint32_compress *& , v8uint32 & , v8uint32 & ); +inline __attribute__((always_inline)) void compr_pop1(v8uint32_compress *& , v8uint32 & ); +inline __attribute__((always_inline)) v8cint16 compr_pop(v8cint16_compress *& ); +inline __attribute__((always_inline)) v8cint16 compr_peek(v8cint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v8cint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v8cint16_compress *& ); +inline __attribute__((always_inline)) v8cint16 compr_pop(v8cint16_compress *& , v8cint16_compress *& ); +inline __attribute__((always_inline)) v8cint16 compr_peek(v8cint16_compress *& , v8cint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v8cint16_compress *& , v8cint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v8cint16_compress *& , v8cint16_compress *& ); +inline __attribute__((always_inline)) v8cint16 compr_pop_and_get_pointer(v8cint16_compress *& ); +inline __attribute__((always_inline)) v8cint16 compr_peek_and_get_pointer(v8cint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v8cint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v8cint16_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v8cint16_compress *& , v8cint16_compress *& , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & ); +inline __attribute__((always_inline)) void compr_pop4(v8cint16_compress *& , v8cint16_compress *& , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & ); +inline __attribute__((always_inline)) void compr_pop2(v8cint16_compress *& , v8cint16_compress *& , v8cint16 & , v8cint16 & ); +inline __attribute__((always_inline)) void compr_pop1(v8cint16_compress *& , v8cint16_compress *& , v8cint16 & ); +inline __attribute__((always_inline)) void 
compr_pop8(v8cint16_compress *& , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & ); +inline __attribute__((always_inline)) void compr_pop4(v8cint16_compress *& , v8cint16 & , v8cint16 & , v8cint16 & , v8cint16 & ); +inline __attribute__((always_inline)) void compr_pop2(v8cint16_compress *& , v8cint16 & , v8cint16 & ); +inline __attribute__((always_inline)) void compr_pop1(v8cint16_compress *& , v8cint16 & ); +inline __attribute__((always_inline)) v4cint32 compr_pop(v4cint32_compress *& ); +inline __attribute__((always_inline)) v4cint32 compr_peek(v4cint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v4cint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v4cint32_compress *& ); +inline __attribute__((always_inline)) v4cint32 compr_pop(v4cint32_compress *& , v4cint32_compress *& ); +inline __attribute__((always_inline)) v4cint32 compr_peek(v4cint32_compress *& , v4cint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v4cint32_compress *& , v4cint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v4cint32_compress *& , v4cint32_compress *& ); +inline __attribute__((always_inline)) v4cint32 compr_pop_and_get_pointer(v4cint32_compress *& ); +inline __attribute__((always_inline)) v4cint32 compr_peek_and_get_pointer(v4cint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v4cint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v4cint32_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v4cint32_compress *& , v4cint32_compress *& , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & ); +inline __attribute__((always_inline)) void compr_pop4(v4cint32_compress *& , v4cint32_compress *& , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & ); +inline __attribute__((always_inline)) void 
compr_pop2(v4cint32_compress *& , v4cint32_compress *& , v4cint32 & , v4cint32 & ); +inline __attribute__((always_inline)) void compr_pop1(v4cint32_compress *& , v4cint32_compress *& , v4cint32 & ); +inline __attribute__((always_inline)) void compr_pop8(v4cint32_compress *& , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & ); +inline __attribute__((always_inline)) void compr_pop4(v4cint32_compress *& , v4cint32 & , v4cint32 & , v4cint32 & , v4cint32 & ); +inline __attribute__((always_inline)) void compr_pop2(v4cint32_compress *& , v4cint32 & , v4cint32 & ); +inline __attribute__((always_inline)) void compr_pop1(v4cint32_compress *& , v4cint32 & ); +inline __attribute__((always_inline)) v16bfloat16 compr_pop(v16bfloat16_compress *& ); +inline __attribute__((always_inline)) v16bfloat16 compr_peek(v16bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16bfloat16_compress *& ); +inline __attribute__((always_inline)) v16bfloat16 compr_pop(v16bfloat16_compress *& , v16bfloat16_compress *& ); +inline __attribute__((always_inline)) v16bfloat16 compr_peek(v16bfloat16_compress *& , v16bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16bfloat16_compress *& , v16bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16bfloat16_compress *& , v16bfloat16_compress *& ); +inline __attribute__((always_inline)) v16bfloat16 compr_pop_and_get_pointer(v16bfloat16_compress *& ); +inline __attribute__((always_inline)) v16bfloat16 compr_peek_and_get_pointer(v16bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v16bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v16bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_pop8(v16bfloat16_compress *& , 
v16bfloat16_compress *& , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop4(v16bfloat16_compress *& , v16bfloat16_compress *& , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop2(v16bfloat16_compress *& , v16bfloat16_compress *& , v16bfloat16 & , v16bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop1(v16bfloat16_compress *& , v16bfloat16_compress *& , v16bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop8(v16bfloat16_compress *& , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop4(v16bfloat16_compress *& , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & , v16bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop2(v16bfloat16_compress *& , v16bfloat16 & , v16bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop1(v16bfloat16_compress *& , v16bfloat16 & ); +inline __attribute__((always_inline)) v128int4 compr_pop(v128int4_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v128int4_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v128int4_compress *& ); +inline __attribute__((always_inline)) v128int4 compr_pop(v128int4_compress *& , v128int4_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v128int4_compress *& , v128int4_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v128int4_compress *& , v128int4_compress *& ); +inline __attribute__((always_inline)) v128int4 compr_pop_and_get_pointer(v128int4_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v128int4_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v128int4_compress *& ); 
+inline __attribute__((always_inline)) void compr_pop1(v128int4_compress *& , v128int4_compress *& , v128int4 & ); +inline __attribute__((always_inline)) void compr_pop2(v128int4_compress *& , v128int4_compress *& , v128int4 & , v128int4 & ); +inline __attribute__((always_inline)) void compr_pop4(v128int4_compress *& , v128int4_compress *& , v128int4 & , v128int4 & , v128int4 & , v128int4 & ); +inline __attribute__((always_inline)) void compr_pop1(v128int4_compress *& , v128int4 & ); +inline __attribute__((always_inline)) void compr_pop2(v128int4_compress *& , v128int4 & , v128int4 & ); +inline __attribute__((always_inline)) void compr_pop4(v128int4_compress *& , v128int4 & , v128int4 & , v128int4 & , v128int4 & ); +inline __attribute__((always_inline)) v128uint4 compr_pop(v128uint4_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v128uint4_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v128uint4_compress *& ); +inline __attribute__((always_inline)) v128uint4 compr_pop(v128uint4_compress *& , v128uint4_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v128uint4_compress *& , v128uint4_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v128uint4_compress *& , v128uint4_compress *& ); +inline __attribute__((always_inline)) v128uint4 compr_pop_and_get_pointer(v128uint4_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v128uint4_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v128uint4_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v128uint4_compress *& , v128uint4_compress *& , v128uint4 & ); +inline __attribute__((always_inline)) void compr_pop2(v128uint4_compress *& , v128uint4_compress *& , v128uint4 & , v128uint4 & ); +inline __attribute__((always_inline)) void compr_pop4(v128uint4_compress *& , v128uint4_compress *& , v128uint4 & , v128uint4 & , v128uint4 & , v128uint4 & ); +inline 
__attribute__((always_inline)) void compr_pop1(v128uint4_compress *& , v128uint4 & ); +inline __attribute__((always_inline)) void compr_pop2(v128uint4_compress *& , v128uint4 & , v128uint4 & ); +inline __attribute__((always_inline)) void compr_pop4(v128uint4_compress *& , v128uint4 & , v128uint4 & , v128uint4 & , v128uint4 & ); +inline __attribute__((always_inline)) v64int8 compr_pop(v64int8_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v64int8_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v64int8_compress *& ); +inline __attribute__((always_inline)) v64int8 compr_pop(v64int8_compress *& , v64int8_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v64int8_compress *& , v64int8_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v64int8_compress *& , v64int8_compress *& ); +inline __attribute__((always_inline)) v64int8 compr_pop_and_get_pointer(v64int8_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v64int8_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v64int8_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v64int8_compress *& , v64int8_compress *& , v64int8 & ); +inline __attribute__((always_inline)) void compr_pop2(v64int8_compress *& , v64int8_compress *& , v64int8 & , v64int8 & ); +inline __attribute__((always_inline)) void compr_pop4(v64int8_compress *& , v64int8_compress *& , v64int8 & , v64int8 & , v64int8 & , v64int8 & ); +inline __attribute__((always_inline)) void compr_pop1(v64int8_compress *& , v64int8 & ); +inline __attribute__((always_inline)) void compr_pop2(v64int8_compress *& , v64int8 & , v64int8 & ); +inline __attribute__((always_inline)) void compr_pop4(v64int8_compress *& , v64int8 & , v64int8 & , v64int8 & , v64int8 & ); +inline __attribute__((always_inline)) v64uint8 compr_pop(v64uint8_compress *& ); +inline __attribute__((always_inline)) void 
compr_reset(v64uint8_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v64uint8_compress *& ); +inline __attribute__((always_inline)) v64uint8 compr_pop(v64uint8_compress *& , v64uint8_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v64uint8_compress *& , v64uint8_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v64uint8_compress *& , v64uint8_compress *& ); +inline __attribute__((always_inline)) v64uint8 compr_pop_and_get_pointer(v64uint8_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v64uint8_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v64uint8_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v64uint8_compress *& , v64uint8_compress *& , v64uint8 & ); +inline __attribute__((always_inline)) void compr_pop2(v64uint8_compress *& , v64uint8_compress *& , v64uint8 & , v64uint8 & ); +inline __attribute__((always_inline)) void compr_pop4(v64uint8_compress *& , v64uint8_compress *& , v64uint8 & , v64uint8 & , v64uint8 & , v64uint8 & ); +inline __attribute__((always_inline)) void compr_pop1(v64uint8_compress *& , v64uint8 & ); +inline __attribute__((always_inline)) void compr_pop2(v64uint8_compress *& , v64uint8 & , v64uint8 & ); +inline __attribute__((always_inline)) void compr_pop4(v64uint8_compress *& , v64uint8 & , v64uint8 & , v64uint8 & , v64uint8 & ); +inline __attribute__((always_inline)) v32int16 compr_pop(v32int16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32int16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32int16_compress *& ); +inline __attribute__((always_inline)) v32int16 compr_pop(v32int16_compress *& , v32int16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32int16_compress *& , v32int16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32int16_compress *& , v32int16_compress *& ); +inline 
__attribute__((always_inline)) v32int16 compr_pop_and_get_pointer(v32int16_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v32int16_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v32int16_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v32int16_compress *& , v32int16_compress *& , v32int16 & ); +inline __attribute__((always_inline)) void compr_pop2(v32int16_compress *& , v32int16_compress *& , v32int16 & , v32int16 & ); +inline __attribute__((always_inline)) void compr_pop4(v32int16_compress *& , v32int16_compress *& , v32int16 & , v32int16 & , v32int16 & , v32int16 & ); +inline __attribute__((always_inline)) void compr_pop1(v32int16_compress *& , v32int16 & ); +inline __attribute__((always_inline)) void compr_pop2(v32int16_compress *& , v32int16 & , v32int16 & ); +inline __attribute__((always_inline)) void compr_pop4(v32int16_compress *& , v32int16 & , v32int16 & , v32int16 & , v32int16 & ); +inline __attribute__((always_inline)) v32uint16 compr_pop(v32uint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32uint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32uint16_compress *& ); +inline __attribute__((always_inline)) v32uint16 compr_pop(v32uint16_compress *& , v32uint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32uint16_compress *& , v32uint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32uint16_compress *& , v32uint16_compress *& ); +inline __attribute__((always_inline)) v32uint16 compr_pop_and_get_pointer(v32uint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v32uint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v32uint16_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v32uint16_compress *& , v32uint16_compress *& , v32uint16 & ); +inline 
__attribute__((always_inline)) void compr_pop2(v32uint16_compress *& , v32uint16_compress *& , v32uint16 & , v32uint16 & ); +inline __attribute__((always_inline)) void compr_pop4(v32uint16_compress *& , v32uint16_compress *& , v32uint16 & , v32uint16 & , v32uint16 & , v32uint16 & ); +inline __attribute__((always_inline)) void compr_pop1(v32uint16_compress *& , v32uint16 & ); +inline __attribute__((always_inline)) void compr_pop2(v32uint16_compress *& , v32uint16 & , v32uint16 & ); +inline __attribute__((always_inline)) void compr_pop4(v32uint16_compress *& , v32uint16 & , v32uint16 & , v32uint16 & , v32uint16 & ); +inline __attribute__((always_inline)) v16int32 compr_pop(v16int32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16int32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16int32_compress *& ); +inline __attribute__((always_inline)) v16int32 compr_pop(v16int32_compress *& , v16int32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16int32_compress *& , v16int32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16int32_compress *& , v16int32_compress *& ); +inline __attribute__((always_inline)) v16int32 compr_pop_and_get_pointer(v16int32_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v16int32_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v16int32_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v16int32_compress *& , v16int32_compress *& , v16int32 & ); +inline __attribute__((always_inline)) void compr_pop2(v16int32_compress *& , v16int32_compress *& , v16int32 & , v16int32 & ); +inline __attribute__((always_inline)) void compr_pop4(v16int32_compress *& , v16int32_compress *& , v16int32 & , v16int32 & , v16int32 & , v16int32 & ); +inline __attribute__((always_inline)) void compr_pop1(v16int32_compress *& , v16int32 & ); +inline __attribute__((always_inline)) void 
compr_pop2(v16int32_compress *& , v16int32 & , v16int32 & ); +inline __attribute__((always_inline)) void compr_pop4(v16int32_compress *& , v16int32 & , v16int32 & , v16int32 & , v16int32 & ); +inline __attribute__((always_inline)) v16uint32 compr_pop(v16uint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16uint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16uint32_compress *& ); +inline __attribute__((always_inline)) v16uint32 compr_pop(v16uint32_compress *& , v16uint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16uint32_compress *& , v16uint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16uint32_compress *& , v16uint32_compress *& ); +inline __attribute__((always_inline)) v16uint32 compr_pop_and_get_pointer(v16uint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v16uint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v16uint32_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v16uint32_compress *& , v16uint32_compress *& , v16uint32 & ); +inline __attribute__((always_inline)) void compr_pop2(v16uint32_compress *& , v16uint32_compress *& , v16uint32 & , v16uint32 & ); +inline __attribute__((always_inline)) void compr_pop4(v16uint32_compress *& , v16uint32_compress *& , v16uint32 & , v16uint32 & , v16uint32 & , v16uint32 & ); +inline __attribute__((always_inline)) void compr_pop1(v16uint32_compress *& , v16uint32 & ); +inline __attribute__((always_inline)) void compr_pop2(v16uint32_compress *& , v16uint32 & , v16uint32 & ); +inline __attribute__((always_inline)) void compr_pop4(v16uint32_compress *& , v16uint32 & , v16uint32 & , v16uint32 & , v16uint32 & ); +inline __attribute__((always_inline)) v16cint16 compr_pop(v16cint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16cint16_compress *& ); +inline __attribute__((always_inline)) 
void compr_fill(v16cint16_compress *& ); +inline __attribute__((always_inline)) v16cint16 compr_pop(v16cint16_compress *& , v16cint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v16cint16_compress *& , v16cint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v16cint16_compress *& , v16cint16_compress *& ); +inline __attribute__((always_inline)) v16cint16 compr_pop_and_get_pointer(v16cint16_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v16cint16_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v16cint16_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v16cint16_compress *& , v16cint16_compress *& , v16cint16 & ); +inline __attribute__((always_inline)) void compr_pop2(v16cint16_compress *& , v16cint16_compress *& , v16cint16 & , v16cint16 & ); +inline __attribute__((always_inline)) void compr_pop4(v16cint16_compress *& , v16cint16_compress *& , v16cint16 & , v16cint16 & , v16cint16 & , v16cint16 & ); +inline __attribute__((always_inline)) void compr_pop1(v16cint16_compress *& , v16cint16 & ); +inline __attribute__((always_inline)) void compr_pop2(v16cint16_compress *& , v16cint16 & , v16cint16 & ); +inline __attribute__((always_inline)) void compr_pop4(v16cint16_compress *& , v16cint16 & , v16cint16 & , v16cint16 & , v16cint16 & ); +inline __attribute__((always_inline)) v8cint32 compr_pop(v8cint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v8cint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v8cint32_compress *& ); +inline __attribute__((always_inline)) v8cint32 compr_pop(v8cint32_compress *& , v8cint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v8cint32_compress *& , v8cint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v8cint32_compress *& , v8cint32_compress *& ); +inline __attribute__((always_inline)) v8cint32 
compr_pop_and_get_pointer(v8cint32_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v8cint32_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v8cint32_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v8cint32_compress *& , v8cint32_compress *& , v8cint32 & ); +inline __attribute__((always_inline)) void compr_pop2(v8cint32_compress *& , v8cint32_compress *& , v8cint32 & , v8cint32 & ); +inline __attribute__((always_inline)) void compr_pop4(v8cint32_compress *& , v8cint32_compress *& , v8cint32 & , v8cint32 & , v8cint32 & , v8cint32 & ); +inline __attribute__((always_inline)) void compr_pop1(v8cint32_compress *& , v8cint32 & ); +inline __attribute__((always_inline)) void compr_pop2(v8cint32_compress *& , v8cint32 & , v8cint32 & ); +inline __attribute__((always_inline)) void compr_pop4(v8cint32_compress *& , v8cint32 & , v8cint32 & , v8cint32 & , v8cint32 & ); +inline __attribute__((always_inline)) v32bfloat16 compr_pop(v32bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32bfloat16_compress *& ); +inline __attribute__((always_inline)) v32bfloat16 compr_pop(v32bfloat16_compress *& , v32bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_reset(v32bfloat16_compress *& , v32bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_fill(v32bfloat16_compress *& , v32bfloat16_compress *& ); +inline __attribute__((always_inline)) v32bfloat16 compr_pop_and_get_pointer(v32bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_reset_and_get_pointer(v32bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_fill_and_get_pointer(v32bfloat16_compress *& ); +inline __attribute__((always_inline)) void compr_pop1(v32bfloat16_compress *& , v32bfloat16_compress *& , v32bfloat16 & ); +inline 
__attribute__((always_inline)) void compr_pop2(v32bfloat16_compress *& , v32bfloat16_compress *& , v32bfloat16 & , v32bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop4(v32bfloat16_compress *& , v32bfloat16_compress *& , v32bfloat16 & , v32bfloat16 & , v32bfloat16 & , v32bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop1(v32bfloat16_compress *& , v32bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop2(v32bfloat16_compress *& , v32bfloat16 & , v32bfloat16 & ); +inline __attribute__((always_inline)) void compr_pop4(v32bfloat16_compress *& , v32bfloat16 & , v32bfloat16 & , v32bfloat16 & , v32bfloat16 & ); +inline __attribute__((always_inline)) v64uint8 neg_gtz(v64uint8 , bool , unsigned long long & ); +inline __attribute__((always_inline)) v64int8 neg_gtz(v64int8 , bool , unsigned long long & ); +inline __attribute__((always_inline)) v32uint16 neg_gtz(v32uint16 , bool , unsigned int & ); +inline __attribute__((always_inline)) v32int16 neg_gtz(v32int16 , bool , unsigned int & ); +inline __attribute__((always_inline)) v16cint16 add(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v16cint16 sub(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v16cint16 neg(v16cint16 ); +inline __attribute__((always_inline)) v16cint16 band(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v16cint16 bor(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v16cint16 bneg(v16cint16 ); +inline __attribute__((always_inline)) v16cint16 bxor(v16cint16 , v16cint16 ); +inline __attribute__((always_inline)) v16cint16 sel(v16cint16 , v16cint16 , unsigned int ); +inline __attribute__((always_inline)) v16uint32 neg_gtz(v16uint32 , bool , unsigned int & ); +inline __attribute__((always_inline)) v16int32 neg_gtz(v16int32 , bool , unsigned int & ); +inline __attribute__((always_inline)) v8cint32 add(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cint32 sub(v8cint32 , v8cint32 ); 
+inline __attribute__((always_inline)) v8cint32 neg(v8cint32 ); +inline __attribute__((always_inline)) v8cint32 band(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cint32 bor(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cint32 bneg(v8cint32 ); +inline __attribute__((always_inline)) v8cint32 bxor(v8cint32 , v8cint32 ); +inline __attribute__((always_inline)) v8cint32 sel(v8cint32 , v8cint32 , unsigned int ); +inline __attribute__((always_inline)) v8cfloat band(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8cfloat bor(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8cfloat bneg(v8cfloat ); +inline __attribute__((always_inline)) v8cfloat bxor(v8cfloat , v8cfloat ); +inline __attribute__((always_inline)) v8cfloat sel(v8cfloat , v8cfloat , unsigned int ); +inline __attribute__((always_inline)) v16cbfloat16 band(v16cbfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v16cbfloat16 bor(v16cbfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v16cbfloat16 bneg(v16cbfloat16 ); +inline __attribute__((always_inline)) v16cbfloat16 bxor(v16cbfloat16 , v16cbfloat16 ); +inline __attribute__((always_inline)) v16cbfloat16 sel(v16cbfloat16 , v16cbfloat16 , unsigned int ); +inline __attribute__((always_inline)) unsigned int le(v16float , v16float ); +inline __attribute__((always_inline)) unsigned int eq(v16float , v16float ); +inline __attribute__((always_inline)) unsigned int ne(v16float , v16float ); +inline __attribute__((always_inline)) void * byte_incr(void * , int ); +inline __attribute__((always_inline)) int delay1(int ); +inline __attribute__((always_inline)) int delay2(int ); +inline __attribute__((always_inline)) int delay3(int ); +inline __attribute__((always_inline)) int delay4(int ); +inline __attribute__((always_inline)) int delay5(int ); +inline __attribute__((always_inline)) int delay6(int ); +inline __attribute__((always_inline)) void * delay1(void * ); +inline 
__attribute__((always_inline)) void * delay2(void * ); +inline __attribute__((always_inline)) void * delay3(void * ); +inline __attribute__((always_inline)) void * delay4(void * ); +inline __attribute__((always_inline)) void * delay5(void * ); +inline __attribute__((always_inline)) void * delay6(void * ); +inline __attribute__((always_inline)) v16cint16_compress * floor(v16cint16_compress * ); +inline __attribute__((always_inline)) unsigned long long get_cycles(); +inline __attribute__((always_inline)) unsigned int packet_header(unsigned int , unsigned int ); +inline __attribute__((always_inline)) unsigned int ctrl_packet_header(unsigned int , unsigned int , unsigned int , unsigned int ); +inline __attribute__((always_inline)) v16accfloat mul_elem_16_accuracy_low_inner(v16float , v16float , int ); +inline __attribute__((always_inline)) v16accfloat mac_elem_16_accuracy_low_inner(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_elem_16_accuracy_low_inner(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_elem_16_accuracy_fast_inner(v16float , v16float , int ); +inline __attribute__((always_inline)) v16accfloat mac_elem_16_accuracy_fast_inner(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_elem_16_accuracy_fast_inner(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_elem_16_accuracy_safe_inner(v16float , v16float , int ); +inline __attribute__((always_inline)) v16accfloat mac_elem_16_accuracy_safe_inner(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_elem_16_accuracy_safe_inner(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_4x8_8x4_accuracy_safe_inner(v32float , v32float , int ); +inline 
__attribute__((always_inline)) v16accfloat mac_4x8_8x4_accuracy_safe_inner(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_4x8_8x4_accuracy_safe_inner(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_4x8_8x4_accuracy_fast_inner(v32float , v32float , int ); +inline __attribute__((always_inline)) v16accfloat mac_4x8_8x4_accuracy_fast_inner(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_4x8_8x4_accuracy_fast_inner(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_4x8_8x4_accuracy_low_inner(v32float , v32float , int ); +inline __attribute__((always_inline)) v16accfloat mac_4x8_8x4_accuracy_low_inner(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_4x8_8x4_accuracy_low_inner(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_elem_16_accuracy_low(v16float , v16float , int ); +inline __attribute__((always_inline)) v16accfloat mac_elem_16_accuracy_low(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_elem_16_accuracy_low(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_4x8_8x4_accuracy_low(v32float , v32float , int ); +inline __attribute__((always_inline)) v16accfloat mac_4x8_8x4_accuracy_low(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_4x8_8x4_accuracy_low(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_elem_16_accuracy_safe(v16float , v16float , int ); +inline __attribute__((always_inline)) v16accfloat mac_elem_16_accuracy_safe(v16float , v16float , v16accfloat , int , 
int , int ); +inline __attribute__((always_inline)) v16accfloat msc_elem_16_accuracy_safe(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_4x8_8x4_accuracy_safe(v32float , v32float , int ); +inline __attribute__((always_inline)) v16accfloat mac_4x8_8x4_accuracy_safe(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_4x8_8x4_accuracy_safe(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_elem_16_accuracy_fast(v16float , v16float , int ); +inline __attribute__((always_inline)) v16accfloat mac_elem_16_accuracy_fast(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_elem_16_accuracy_fast(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_4x8_8x4_accuracy_fast(v32float , v32float , int ); +inline __attribute__((always_inline)) v16accfloat mac_4x8_8x4_accuracy_fast(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_4x8_8x4_accuracy_fast(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_elem_16_conf(v16float , v16float , int ); +inline __attribute__((always_inline)) v16accfloat mac_elem_16_conf(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_elem_16_conf(v16float , v16float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat addmac_elem_16_conf(v16float , v16float , v16accfloat , v16accfloat , int , int , int , int ); +inline __attribute__((always_inline)) v16accfloat addmsc_elem_16_conf(v16float , v16float , v16accfloat , v16accfloat , int , int , int , int ); +inline __attribute__((always_inline)) v16accfloat mul_4x8_8x4_conf(v32float , v32float , int ); +inline 
__attribute__((always_inline)) v16accfloat mac_4x8_8x4_conf(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat msc_4x8_8x4_conf(v32float , v32float , v16accfloat , int , int , int ); +inline __attribute__((always_inline)) v16accfloat addmac_4x8_8x4_conf(v32float , v32float , v16accfloat , v16accfloat , int , int , int , int ); +inline __attribute__((always_inline)) v16accfloat addmsc_4x8_8x4_conf(v32float , v32float , v16accfloat , v16accfloat , int , int , int , int ); +// clang-format on + +#endif // __AIEV2_AIE_API_COMPAT_H diff --git a/clang/lib/Headers/aiev2_core.h b/clang/lib/Headers/aiev2_core.h index e51e2132cfa3..0fd29f517e40 100644 --- a/clang/lib/Headers/aiev2_core.h +++ b/clang/lib/Headers/aiev2_core.h @@ -10,6 +10,31 @@ #ifndef __AIEV2_CORE_H #define __AIEV2_CORE_H + +#ifndef OP_TERM_NEG_COMPLEX +#define OP_TERM_NEG_COMPLEX 0x0A +#endif + +#ifndef OP_TERM_NEG_COMPLEX_CONJUGATE_X +#define OP_TERM_NEG_COMPLEX_CONJUGATE_X 0xA0 +#endif + +#ifndef OP_TERM_NEG_COMPLEX_CONJUGATE_Y +#define OP_TERM_NEG_COMPLEX_CONJUGATE_Y 0x50 +#endif + +#ifndef OP_TERM_NEG_COMPLEX_CONJUGATE_X_Y +#define OP_TERM_NEG_COMPLEX_CONJUGATE_X_Y 0xFA +#endif + +#ifndef OP_TERM_NEG_COMPLEX_CONJUGATE_BUTTERFLY +#define OP_TERM_NEG_COMPLEX_CONJUGATE_BUTTERFLY 0xC6 +#endif + +#ifndef OP_TERM_NEG_COMPLEX_BUTTERFLY +#define OP_TERM_NEG_COMPLEX_BUTTERFLY 0x9C +#endif + #ifdef mul_elem_16 #undef mul_elem_16 #endif @@ -1890,4 +1915,84 @@ template <typename T, aie_dm_resource Resource> static constexpr bool aie_dm_resource_is_same_v = (Resource == aie_dm_resource_get_v<T>); +enum class aie_stream_resource_in { none, a, b }; +enum class aie_stream_resource_out { none, a, b }; + +#define __aie_stream_resource_in_a +#define __aie_stream_resource_in_b +#define __aie_stream_resource_out_a +#define __aie_stream_resource_out_b + +template <typename T> struct aie_stream_resource_remove { + using type = T; +}; + +template <typename T> +struct aie_stream_resource_remove<T __aie_stream_resource_in_a> { + using type = T; +}; +template <typename T> +struct 
aie_stream_resource_remove<const T __aie_stream_resource_in_a> { + using type = const T; +}; +template <typename T> +struct aie_stream_resource_remove<T __aie_stream_resource_out_a> { + using type = T; +}; +template <typename T> +struct aie_stream_resource_remove<const T __aie_stream_resource_out_a> { + using type = const T; +}; + +template <typename T> +using aie_stream_resource_remove_t = + typename aie_stream_resource_remove<T>::type; + +template <typename T, aie_stream_resource_in Resource> +struct aie_stream_resource_in_set { + using type = T; +}; + +template <typename T, aie_stream_resource_out Resource> +struct aie_stream_resource_out_set { + using type = T; +}; + +template <typename T, aie_stream_resource_in Resource> +using aie_stream_resource_in_set_t = + typename aie_stream_resource_in_set<aie_stream_resource_remove_t<T>, + Resource>::type; + +template <typename T, aie_stream_resource_out Resource> +using aie_stream_resource_out_set_t = + typename aie_stream_resource_out_set<aie_stream_resource_remove_t<T>, + Resource>::type; + +template <typename T> struct aie_stream_resource_in_get { + static constexpr aie_stream_resource_in value = aie_stream_resource_in::none; +}; + +template <typename T> struct aie_stream_resource_out_get { + static constexpr aie_stream_resource_out value = + aie_stream_resource_out::none; +}; + +template <typename T> +static constexpr aie_stream_resource_in aie_stream_resource_in_get_v = + aie_stream_resource_in_get<T>::value; + +template <typename T> +static constexpr aie_stream_resource_out aie_stream_resource_out_get_v = + aie_stream_resource_out_get<T>::value; + +template <typename T, aie_stream_resource_in Resource> +static constexpr bool aie_stream_resource_in_is_same_v = + (Resource == aie_stream_resource_in_get_v<T>); + +template <typename T, aie_stream_resource_out Resource> +static constexpr bool aie_stream_resource_out_is_same_v = + (Resource == aie_stream_resource_out_get_v<T>); + #endif // __AIEV2_CORE_H diff --git a/clang/lib/Headers/aiev2intrin.h b/clang/lib/Headers/aiev2intrin.h index 52a29ebd4999..adae70734b54 100644 --- a/clang/lib/Headers/aiev2intrin.h +++ b/clang/lib/Headers/aiev2intrin.h @@ -44,6 +44,7 @@ #include "aiev2_addr.h" #include "aiev2_core.h" #include "aiev2_vld_sparse.h" +#include "aiev2_aie_api_compat.h" // clang-format on #endif /* __cplusplus */ @@ -69,4 +70,19 @@ write_tm(uint32 regVal, uint32 regAddr, uint32 TMAddrSpaceStart = 0x80000) { +INTRINSIC(float) as_float(int x) { 
return __builtin_bit_cast(float, x); } +INTRINSIC(int) as_int32(float x) { return __builtin_bit_cast(int, x); } +INTRINSIC(long long) as_int64(double x) { + return __builtin_bit_cast(long long, x); +} +INTRINSIC(double) as_double(long long x) { + return __builtin_bit_cast(double, x); +} +INTRINSIC(bfloat16) as_bfloat16(short x) { + return __builtin_bit_cast(bfloat16, x); +} + +// FIXME: this belongs to libc's stdio.h +void printf(const char *__restrict, ...); + #endif /* __AIEV2INTRIN_H */