target/arm: Rewrite helper_sve_ld[234]*_r

Use the same *_tlb primitives as we use for ld1.

For linux-user, this hoists the setting of helper_retaddr out of the
loop. For softmmu, it hoists the computation of the current mmu_idx
outside the loop, fixes the endianness problem, and moves the main
loop out of a macro and into an inlined function.

Backports commit f27d4dc2af0de9b7b45c955882b8420905c6efe8 from qemu
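Editor's note: for reference, a rough sketch of what one of the new helper-defining macros in the diff below expands to. This expansion is illustrative only, not copied from the tree; the sve_ld1hh_le_tlb/sve_ld1hh_be_tlb primitives are the ones introduced by the earlier ld1 rewrite.

/* Hypothetical expansion of DO_LDN_2(2, hh, 2): the generated helper only
 * selects the little- or big-endian 16-bit load primitive at run time and
 * hands the whole predicated loop to the shared sve_ld2_r() routine. */
void __attribute__((flatten)) HELPER(sve_ld2hh_r)
    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc)
{
    sve_ld2_r(env, vg, addr, desc, 2, GETPC(),
              arm_cpu_data_is_big_endian(env)
              ? sve_ld1hh_be_tlb : sve_ld1hh_le_tlb);
}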
Richard Henderson 2018-10-08 11:40:37 -04:00 committed by Lioncash
parent 5b88176e1d
commit 4978d77039

@@ -3806,109 +3806,133 @@ DO_LD1_2(ld1dd, 3, 3)
 #undef DO_LD1_1
 #undef DO_LD1_2
 
-#define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
-                  target_ulong addr, uint32_t desc) \
-{ \
-    intptr_t i, oprsz = simd_oprsz(desc); \
-    intptr_t ra = GETPC(); \
-    unsigned rd = simd_data(desc); \
-    void *d1 = &env->vfp.zregs[rd]; \
-    void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
-    for (i = 0; i < oprsz; ) { \
-        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
-        do { \
-            TYPEM m1 = 0, m2 = 0; \
-            if (pg & 1) { \
-                m1 = FN(env, addr, ra); \
-                m2 = FN(env, addr + sizeof(TYPEM), ra); \
-            } \
-            *(TYPEE *)(d1 + H(i)) = m1; \
-            *(TYPEE *)(d2 + H(i)) = m2; \
-            i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
-            addr += 2 * sizeof(TYPEM); \
-        } while (i & 15); \
-    } \
-}
+/*
+ * Common helpers for all contiguous 2,3,4-register predicated loads.
+ */
+static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
+                      uint32_t desc, int size, uintptr_t ra,
+                      sve_ld1_tlb_fn *tlb_fn)
+{
+    const int mmu_idx = cpu_mmu_index(env, false);
+    intptr_t i, oprsz = simd_oprsz(desc);
+    unsigned rd = simd_data(desc);
+    ARMVectorReg scratch[2] = { };
+
+    set_helper_retaddr(ra);
+    for (i = 0; i < oprsz; ) {
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+        do {
+            if (pg & 1) {
+                tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
+                tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
+            }
+            i += size, pg >>= size;
+            addr += 2 * size;
+        } while (i & 15);
+    }
+    set_helper_retaddr(0);
+
+    /* Wait until all exceptions have been raised to write back. */
+    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
+    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
+}
 
-#define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
-                  target_ulong addr, uint32_t desc) \
-{ \
-    intptr_t i, oprsz = simd_oprsz(desc); \
-    intptr_t ra = GETPC(); \
-    unsigned rd = simd_data(desc); \
-    void *d1 = &env->vfp.zregs[rd]; \
-    void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
-    void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
-    for (i = 0; i < oprsz; ) { \
-        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
-        do { \
-            TYPEM m1 = 0, m2 = 0, m3 = 0; \
-            if (pg & 1) { \
-                m1 = FN(env, addr, ra); \
-                m2 = FN(env, addr + sizeof(TYPEM), ra); \
-                m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
-            } \
-            *(TYPEE *)(d1 + H(i)) = m1; \
-            *(TYPEE *)(d2 + H(i)) = m2; \
-            *(TYPEE *)(d3 + H(i)) = m3; \
-            i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
-            addr += 3 * sizeof(TYPEM); \
-        } while (i & 15); \
-    } \
-}
+static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
+                      uint32_t desc, int size, uintptr_t ra,
+                      sve_ld1_tlb_fn *tlb_fn)
+{
+    const int mmu_idx = cpu_mmu_index(env, false);
+    intptr_t i, oprsz = simd_oprsz(desc);
+    unsigned rd = simd_data(desc);
+    ARMVectorReg scratch[3] = { };
+
+    set_helper_retaddr(ra);
+    for (i = 0; i < oprsz; ) {
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+        do {
+            if (pg & 1) {
+                tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
+                tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
+                tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra);
+            }
+            i += size, pg >>= size;
+            addr += 3 * size;
+        } while (i & 15);
+    }
+    set_helper_retaddr(0);
+
+    /* Wait until all exceptions have been raised to write back. */
+    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
+    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
+    memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
+}
 
-#define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
-                  target_ulong addr, uint32_t desc) \
-{ \
-    intptr_t i, oprsz = simd_oprsz(desc); \
-    intptr_t ra = GETPC(); \
-    unsigned rd = simd_data(desc); \
-    void *d1 = &env->vfp.zregs[rd]; \
-    void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
-    void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
-    void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
-    for (i = 0; i < oprsz; ) { \
-        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
-        do { \
-            TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
-            if (pg & 1) { \
-                m1 = FN(env, addr, ra); \
-                m2 = FN(env, addr + sizeof(TYPEM), ra); \
-                m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
-                m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
-            } \
-            *(TYPEE *)(d1 + H(i)) = m1; \
-            *(TYPEE *)(d2 + H(i)) = m2; \
-            *(TYPEE *)(d3 + H(i)) = m3; \
-            *(TYPEE *)(d4 + H(i)) = m4; \
-            i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
-            addr += 4 * sizeof(TYPEM); \
-        } while (i & 15); \
-    } \
-}
+static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
+                      uint32_t desc, int size, uintptr_t ra,
+                      sve_ld1_tlb_fn *tlb_fn)
+{
+    const int mmu_idx = cpu_mmu_index(env, false);
+    intptr_t i, oprsz = simd_oprsz(desc);
+    unsigned rd = simd_data(desc);
+    ARMVectorReg scratch[4] = { };
+
+    set_helper_retaddr(ra);
+    for (i = 0; i < oprsz; ) {
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+        do {
+            if (pg & 1) {
+                tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
+                tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
+                tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra);
+                tlb_fn(env, &scratch[3], i, addr + 3 * size, mmu_idx, ra);
+            }
+            i += size, pg >>= size;
+            addr += 4 * size;
+        } while (i & 15);
+    }
+    set_helper_retaddr(0);
+
+    /* Wait until all exceptions have been raised to write back. */
+    memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
+    memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
+    memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
+    memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
+}
 
-DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
-DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
-DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
-
-DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
-DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
-DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
-
-DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
-DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
-DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
-
-DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
-DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
-DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
-
-#undef DO_LD2
-#undef DO_LD3
-#undef DO_LD4
+#define DO_LDN_1(N) \
+void __attribute__((flatten)) HELPER(sve_ld##N##bb_r) \
+    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
+{ \
+    sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \
+}
+
+#define DO_LDN_2(N, SUFF, SIZE) \
+void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_r) \
+    (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
+{ \
+    sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
+                  arm_cpu_data_is_big_endian(env) \
+                  ? sve_ld1##SUFF##_be_tlb : sve_ld1##SUFF##_le_tlb); \
+}
+
+DO_LDN_1(2)
+DO_LDN_1(3)
+DO_LDN_1(4)
+
+DO_LDN_2(2, hh, 2)
+DO_LDN_2(3, hh, 2)
+DO_LDN_2(4, hh, 2)
+
+DO_LDN_2(2, ss, 4)
+DO_LDN_2(3, ss, 4)
+DO_LDN_2(4, ss, 4)
+
+DO_LDN_2(2, dd, 8)
+DO_LDN_2(3, dd, 8)
+DO_LDN_2(4, dd, 8)
+
+#undef DO_LDN_1
+#undef DO_LDN_2
 
 /*
  * Load contiguous data, first-fault and no-fault.