Successfully identified regression in *gcc* in CI configuration tcwg_gcc_bootstrap/master-aarch64-bootstrap_ubsan. So far, this commit has regressed CI configurations: - tcwg_gcc_bootstrap/master-aarch64-bootstrap_ubsan
Culprit: <cut> commit d1819df86fbe42125cccb2fc2959a0bf51e524d6 Author: Jonathan Wright jonathan.wright@arm.com Date: Mon Aug 16 14:37:18 2021 +0100
aarch64: Remove macros for vld4[q]_lane Neon intrinsics
Remove macros for vld4[q]_lane Neon intrinsics. This is a preparatory step before adding new modes for structures of Advanced SIMD vectors.
gcc/ChangeLog:
2021-08-16 Jonathan Wright jonathan.wright@arm.com
* config/aarch64/arm_neon.h (__LD4_LANE_FUNC): Delete. (__LD4Q_LANE_FUNC): Likewise. (vld4_lane_u8): Define without macro. (vld4_lane_u16): Likewise. (vld4_lane_u32): Likewise. (vld4_lane_u64): Likewise. (vld4_lane_s8): Likewise. (vld4_lane_s16): Likewise. (vld4_lane_s32): Likewise. (vld4_lane_s64): Likewise. (vld4_lane_f16): Likewise. (vld4_lane_f32): Likewise. (vld4_lane_f64): Likewise. (vld4_lane_p8): Likewise. (vld4_lane_p16): Likewise. (vld4_lane_p64): Likewise. (vld4q_lane_u8): Likewise. (vld4q_lane_u16): Likewise. (vld4q_lane_u32): Likewise. (vld4q_lane_u64): Likewise. (vld4q_lane_s8): Likewise. (vld4q_lane_s16): Likewise. (vld4q_lane_s32): Likewise. (vld4q_lane_s64): Likewise. (vld4q_lane_f16): Likewise. (vld4q_lane_f32): Likewise. (vld4q_lane_f64): Likewise. (vld4q_lane_p8): Likewise. (vld4q_lane_p16): Likewise. (vld4q_lane_p64): Likewise. (vld4_lane_bf16): Likewise. (vld4q_lane_bf16): Likewise. </cut>
Results regressed to (for first_bad == d1819df86fbe42125cccb2fc2959a0bf51e524d6) # reset_artifacts: -10 # true: 0 # build_abe binutils: 1 # First few build errors in logs: # 00:10:53 make[3]: [Makefile:1769: aarch64-unknown-linux-gnu/bits/largefile-config.h] Error 1 (ignored) # 00:10:53 make[3]: [Makefile:1770: aarch64-unknown-linux-gnu/bits/largefile-config.h] Error 1 (ignored) # 00:26:15 /home/tcwg-buildslave/workspace/tcwg_gnu_2/abe/builds/aarch64-unknown-linux-gnu/aarch64-unknown-linux-gnu/gcc-gcc.git~master-stage2/prev-gcc/include/arm_neon.h:21081:11: error: cannot convert ‘float*’ to ‘const int*’ # 00:26:15 /home/tcwg-buildslave/workspace/tcwg_gnu_2/abe/builds/aarch64-unknown-linux-gnu/aarch64-unknown-linux-gnu/gcc-gcc.git~master-stage2/prev-gcc/include/arm_neon.h:21384:9: error: cannot convert ‘long int*’ to ‘const double*’ # 00:26:16 make[3]: *** [Makefile:226: lex.o] Error 1 # 00:26:30 make[2]: *** [Makefile:9758: all-stage2-libcpp] Error 2 # 00:28:15 make[1]: *** [Makefile:25899: stage2-bubble] Error 2 # 00:28:15 make: *** [Makefile:1010: all] Error 2
from (for last_good == 08f83812e5c5fdd9a7a4a1b9e46bb33725185c5a) # reset_artifacts: -10 # true: 0 # build_abe binutils: 1 # build_abe bootstrap_ubsan: 2
Artifacts of last_good build: https://ci.linaro.org/job/tcwg_gcc_bootstrap-bisect-master-aarch64-bootstrap... Artifacts of first_bad build: https://ci.linaro.org/job/tcwg_gcc_bootstrap-bisect-master-aarch64-bootstrap... Build top page/logs: https://ci.linaro.org/job/tcwg_gcc_bootstrap-bisect-master-aarch64-bootstrap...
Configuration details:
Reproduce builds: <cut> mkdir investigate-gcc-d1819df86fbe42125cccb2fc2959a0bf51e524d6 cd investigate-gcc-d1819df86fbe42125cccb2fc2959a0bf51e524d6
git clone https://git.linaro.org/toolchain/jenkins-scripts
mkdir -p artifacts/manifests curl -o artifacts/manifests/build-baseline.sh https://ci.linaro.org/job/tcwg_gcc_bootstrap-bisect-master-aarch64-bootstrap... --fail curl -o artifacts/manifests/build-parameters.sh https://ci.linaro.org/job/tcwg_gcc_bootstrap-bisect-master-aarch64-bootstrap... --fail curl -o artifacts/test.sh https://ci.linaro.org/job/tcwg_gcc_bootstrap-bisect-master-aarch64-bootstrap... --fail chmod +x artifacts/test.sh
# Reproduce the baseline build (build all pre-requisites) ./jenkins-scripts/tcwg_gnu-build.sh @@ artifacts/manifests/build-baseline.sh
# Save baseline build state (which is then restored in artifacts/test.sh) mkdir -p ./bisect rsync -a --del --delete-excluded --exclude /bisect/ --exclude /artifacts/ --exclude /gcc/ ./ ./bisect/baseline/
cd gcc
# Reproduce first_bad build git checkout --detach d1819df86fbe42125cccb2fc2959a0bf51e524d6 ../artifacts/test.sh
# Reproduce last_good build git checkout --detach 08f83812e5c5fdd9a7a4a1b9e46bb33725185c5a ../artifacts/test.sh
cd .. </cut>
History of pending regressions and results: https://git.linaro.org/toolchain/ci/base-artifacts.git/log/?h=linaro-local/c...
Artifacts: https://ci.linaro.org/job/tcwg_gcc_bootstrap-bisect-master-aarch64-bootstrap... Build log: https://ci.linaro.org/job/tcwg_gcc_bootstrap-bisect-master-aarch64-bootstrap...
Full commit (up to 1000 lines): <cut> commit d1819df86fbe42125cccb2fc2959a0bf51e524d6 Author: Jonathan Wright jonathan.wright@arm.com Date: Mon Aug 16 14:37:18 2021 +0100
aarch64: Remove macros for vld4[q]_lane Neon intrinsics
Remove macros for vld4[q]_lane Neon intrinsics. This is a preparatory step before adding new modes for structures of Advanced SIMD vectors.
gcc/ChangeLog:
2021-08-16 Jonathan Wright jonathan.wright@arm.com
* config/aarch64/arm_neon.h (__LD4_LANE_FUNC): Delete. (__LD4Q_LANE_FUNC): Likewise. (vld4_lane_u8): Define without macro. (vld4_lane_u16): Likewise. (vld4_lane_u32): Likewise. (vld4_lane_u64): Likewise. (vld4_lane_s8): Likewise. (vld4_lane_s16): Likewise. (vld4_lane_s32): Likewise. (vld4_lane_s64): Likewise. (vld4_lane_f16): Likewise. (vld4_lane_f32): Likewise. (vld4_lane_f64): Likewise. (vld4_lane_p8): Likewise. (vld4_lane_p16): Likewise. (vld4_lane_p64): Likewise. (vld4q_lane_u8): Likewise. (vld4q_lane_u16): Likewise. (vld4q_lane_u32): Likewise. (vld4q_lane_u64): Likewise. (vld4q_lane_s8): Likewise. (vld4q_lane_s16): Likewise. (vld4q_lane_s32): Likewise. (vld4q_lane_s64): Likewise. (vld4q_lane_f16): Likewise. (vld4q_lane_f32): Likewise. (vld4q_lane_f64): Likewise. (vld4q_lane_p8): Likewise. (vld4q_lane_p16): Likewise. (vld4q_lane_p64): Likewise. (vld4_lane_bf16): Likewise. (vld4q_lane_bf16): Likewise. --- gcc/config/aarch64/arm_neon.h | 728 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 624 insertions(+), 104 deletions(-)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 29b62988a91..d8b29706a20 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -20856,110 +20856,595 @@ vld3q_lane_p64 (const poly64_t * __ptr, poly64x2x3_t __b, const int __c)
/* vld4_lane */
-#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __temp.val[2] = \ - vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ - __temp.val[3] = \ - vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[2], \ - 2); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[3], \ - 3); \ - __o = __builtin_aarch64_ld4_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \ - __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \ - __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \ - return __b; \ +__extension__ extern __inline uint8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_u8 (const uint8_t * __ptr, uint8x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint8x16x4_t __temp; + __temp.val[0] = vcombine_u8 (__b.val[0], vcreate_u8 (0)); + __temp.val[1] = vcombine_u8 (__b.val[1], vcreate_u8 (0)); + __temp.val[2] = vcombine_u8 (__b.val[2], vcreate_u8 (0)); + __temp.val[3] = vcombine_u8 (__b.val[3], vcreate_u8 (0)); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; }
-/* vld4q_lane */ +__extension__ extern __inline uint16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_u16 (const uint16_t * __ptr, uint16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint16x8x4_t __temp; + __temp.val[0] = vcombine_u16 (__b.val[0], vcreate_u16 (0)); + __temp.val[1] = vcombine_u16 (__b.val[1], vcreate_u16 (0)); + __temp.val[2] = vcombine_u16 (__b.val[2], vcreate_u16 (0)); + __temp.val[3] = vcombine_u16 (__b.val[3], vcreate_u16 (0)); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline uint32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_u32 (const uint32_t * __ptr, uint32x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint32x4x4_t __temp; + __temp.val[0] = vcombine_u32 (__b.val[0], vcreate_u32 (0)); + __temp.val[1] = vcombine_u32 (__b.val[1], vcreate_u32 (0)); + __temp.val[2] = vcombine_u32 (__b.val[2], vcreate_u32 (0)); + __temp.val[3] = vcombine_u32 (__b.val[3], vcreate_u32 (0)); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + __b.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline uint64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_u64 (const uint64_t * __ptr, uint64x1x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint64x2x4_t __temp; + __temp.val[0] = vcombine_u64 (__b.val[0], vcreate_u64 (0)); + __temp.val[1] = vcombine_u64 (__b.val[1], vcreate_u64 (0)); + __temp.val[2] = vcombine_u64 (__b.val[2], vcreate_u64 (0)); + __temp.val[3] = vcombine_u64 (__b.val[3], vcreate_u64 (0)); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline int8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_s8 (const int8_t * __ptr, int8x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int8x16x4_t __temp; + __temp.val[0] = vcombine_s8 (__b.val[0], vcreate_s8 (0)); + __temp.val[1] = vcombine_s8 (__b.val[1], vcreate_s8 (0)); + __temp.val[2] = vcombine_s8 (__b.val[2], vcreate_s8 (0)); + __temp.val[3] = vcombine_s8 (__b.val[3], vcreate_s8 (0)); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (int8x8_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (int8x8_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (int8x8_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (int8x8_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline int16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_s16 (const int16_t * __ptr, int16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int16x8x4_t __temp; + __temp.val[0] = vcombine_s16 (__b.val[0], vcreate_s16 (0)); + __temp.val[1] = vcombine_s16 (__b.val[1], vcreate_s16 (0)); + __temp.val[2] = vcombine_s16 (__b.val[2], vcreate_s16 (0)); + __temp.val[3] = vcombine_s16 (__b.val[3], vcreate_s16 (0)); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (int16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (int16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (int16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (int16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline int32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_s32 (const int32_t * __ptr, int32x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int32x4x4_t __temp; + __temp.val[0] = vcombine_s32 (__b.val[0], vcreate_s32 (0)); + __temp.val[1] = vcombine_s32 (__b.val[1], vcreate_s32 (0)); + __temp.val[2] = vcombine_s32 (__b.val[2], vcreate_s32 (0)); + __temp.val[3] = vcombine_s32 (__b.val[3], vcreate_s32 (0)); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + __b.val[0] = (int32x2_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (int32x2_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (int32x2_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (int32x2_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline int64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_s64 (const int64_t * __ptr, int64x1x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int64x2x4_t __temp; + __temp.val[0] = vcombine_s64 (__b.val[0], vcreate_s64 (0)); + __temp.val[1] = vcombine_s64 (__b.val[1], vcreate_s64 (0)); + __temp.val[2] = vcombine_s64 (__b.val[2], vcreate_s64 (0)); + __temp.val[3] = vcombine_s64 (__b.val[3], vcreate_s64 (0)); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline float16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_f16 (const float16_t * __ptr, float16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float16x8x4_t __temp; + __temp.val[0] = vcombine_f16 (__b.val[0], vcreate_f16 (0)); + __temp.val[1] = vcombine_f16 (__b.val[1], vcreate_f16 (0)); + __temp.val[2] = vcombine_f16 (__b.val[2], vcreate_f16 (0)); + __temp.val[3] = vcombine_f16 (__b.val[3], vcreate_f16 (0)); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4hf ( + (__builtin_aarch64_simd_hf *) __ptr, __o, __c); + __b.val[0] = (float16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (float16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (float16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (float16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline float32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_f32 (const float32_t * __ptr, float32x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float32x4x4_t __temp; + __temp.val[0] = vcombine_f32 (__b.val[0], vcreate_f32 (0)); + __temp.val[1] = vcombine_f32 (__b.val[1], vcreate_f32 (0)); + __temp.val[2] = vcombine_f32 (__b.val[2], vcreate_f32 (0)); + __temp.val[3] = vcombine_f32 (__b.val[3], vcreate_f32 (0)); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2si ( + (__builtin_aarch64_simd_sf *) __ptr, __o, __c); + __b.val[0] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (float32x2_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline float64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_f64 (const float64_t * __ptr, float64x1x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float64x2x4_t __temp; + __temp.val[0] = vcombine_f64 (__b.val[0], vcreate_f64 (0)); + __temp.val[1] = vcombine_f64 (__b.val[1], vcreate_f64 (0)); + __temp.val[2] = vcombine_f64 (__b.val[2], vcreate_f64 (0)); + __temp.val[3] = vcombine_f64 (__b.val[3], vcreate_f64 (0)); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanedf ( + (__builtin_aarch64_simd_df *) __ptr, __o, __c); + __b.val[0] = (float64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (float64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (float64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (float64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline poly8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_p8 (const poly8_t * __ptr, poly8x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly8x16x4_t __temp; + __temp.val[0] = vcombine_p8 (__b.val[0], vcreate_p8 (0)); + __temp.val[1] = vcombine_p8 (__b.val[1], vcreate_p8 (0)); + __temp.val[2] = vcombine_p8 (__b.val[2], vcreate_p8 (0)); + __temp.val[3] = vcombine_p8 (__b.val[3], vcreate_p8 (0)); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + __b.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +}
-__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di, - u64, int64x2_t) +__extension__ extern __inline poly16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_p16 (const poly16_t * __ptr, poly16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly16x8x4_t __temp; + __temp.val[0] = vcombine_p16 (__b.val[0], vcreate_p16 (0)); + __temp.val[1] = vcombine_p16 (__b.val[1], vcreate_p16 (0)); + __temp.val[2] = vcombine_p16 (__b.val[2], vcreate_p16 (0)); + __temp.val[3] = vcombine_p16 (__b.val[3], vcreate_p16 (0)); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + __b.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline poly64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_p64 (const poly64_t * __ptr, poly64x1x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly64x2x4_t __temp; + __temp.val[0] = vcombine_p64 (__b.val[0], vcreate_p64 (0)); + __temp.val[1] = vcombine_p64 (__b.val[1], vcreate_p64 (0)); + __temp.val[2] = vcombine_p64 (__b.val[2], vcreate_p64 (0)); + __temp.val[3] = vcombine_p64 (__b.val[3], vcreate_p64 (0)); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanedi ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + __b.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +}
/* vld4q_lane */
-#define __LD4Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \ - __o = __builtin_aarch64_ld4_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \ - ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \ - ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \ - return ret; \ -} - -__LD4Q_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD4Q_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD4Q_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) -__LD4Q_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD4Q_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD4Q_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD4Q_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD4Q_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD4Q_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32) -__LD4Q_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64) -__LD4Q_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD4Q_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD4Q_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD4Q_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64) +__extension__ extern __inline uint8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_u8 (const uint8_t * __ptr, uint8x16x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint8x16x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline uint16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_u16 (const uint16_t * __ptr, uint16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline uint32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_u32 (const uint32_t * __ptr, uint32x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint32x4x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline uint64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_u64 (const uint64_t * __ptr, uint64x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + uint64x2x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_s8 (const int8_t * __ptr, int8x16x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int8x16x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_s16 (const int16_t * __ptr, int16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_s32 (const int32_t * __ptr, int32x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int32x4x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4si ( + (__builtin_aarch64_simd_si *) __ptr, __o, __c); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_s64 (const int64_t * __ptr, int64x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + int64x2x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline float16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_f16 (const float16_t * __ptr, float16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8hf ( + (__builtin_aarch64_simd_hf *) __ptr, __o, __c); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline float32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_f32 (const float32_t * __ptr, float32x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float32x4x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4sf ( + (__builtin_aarch64_simd_sf *) __ptr, __o, __c); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline float64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_f64 (const float64_t * __ptr, float64x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + float64x2x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2df ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline poly8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_p8 (const poly8_t * __ptr, poly8x16x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly8x16x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev16qi ( + (__builtin_aarch64_simd_qi *) __ptr, __o, __c); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline poly16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_p16 (const poly16_t * __ptr, poly16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8hi ( + (__builtin_aarch64_simd_hi *) __ptr, __o, __c); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline poly64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_p64 (const poly64_t * __ptr, poly64x2x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + poly64x2x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev2di ( + (__builtin_aarch64_simd_di *) __ptr, __o, __c); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +}
/* vmax */
@@ -35441,9 +35926,47 @@ vld3q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x3_t __b, const int __c) return ret; }
-__LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, - v8bf, bf, bf16, bfloat16x8_t) -__LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4_lane_bf16 (const bfloat16_t * __ptr, bfloat16x4x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__b.val[0], vcreate_bf16 (0)); + __temp.val[1] = vcombine_bf16 (__b.val[1], vcreate_bf16 (0)); + __temp.val[2] = vcombine_bf16 (__b.val[2], vcreate_bf16 (0)); + __temp.val[3] = vcombine_bf16 (__b.val[3], vcreate_bf16 (0)); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __o = __builtin_aarch64_ld4_lanev4bf ( + (__builtin_aarch64_simd_bf *) __ptr, __o, __c); + __b.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxidi (__o, 0); + __b.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxidi (__o, 1); + __b.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxidi (__o, 2); + __b.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxidi (__o, 3); + return __b; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) +vld4q_lane_bf16 (const bfloat16_t * __ptr, bfloat16x8x4_t __b, const int __c) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t ret; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); + __o = __builtin_aarch64_ld4_lanev8bf ( + (__builtin_aarch64_simd_bf *) __ptr, __o, __c); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +}
__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -35739,7 +36262,4 @@ vaddq_p128 (poly128_t __a, poly128_t __b) #undef __aarch64_vdupq_laneq_u32 #undef __aarch64_vdupq_laneq_u64
-#undef __LD4_LANE_FUNC -#undef __LD4Q_LANE_FUNC - #endif </cut>