diff --git a/qemu/target/arm/translate.c b/qemu/target/arm/translate.c index 5296b6e8..a52e7cb0 100644 --- a/qemu/target/arm/translate.c +++ b/qemu/target/arm/translate.c @@ -3104,20 +3104,6 @@ static void gen_vfp_msr(DisasContext *s, TCGv_i32 tmp) tcg_temp_free_i32(tcg_ctx, tmp); } -static void gen_neon_dup_u8(DisasContext *s, TCGv_i32 var, int shift) -{ - TCGContext *tcg_ctx = s->uc->tcg_ctx; - TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx); - if (shift) - tcg_gen_shri_i32(tcg_ctx, var, var, shift); - tcg_gen_ext8u_i32(tcg_ctx, var, var); - tcg_gen_shli_i32(tcg_ctx, tmp, var, 8); - tcg_gen_or_i32(tcg_ctx, var, var, tmp); - tcg_gen_shli_i32(tcg_ctx, tmp, var, 16); - tcg_gen_or_i32(tcg_ctx, var, var, tmp); - tcg_temp_free_i32(tcg_ctx, tmp); -} - static void gen_neon_dup_low16(DisasContext *s, TCGv_i32 var) { TCGContext *tcg_ctx = s->uc->tcg_ctx; @@ -3138,29 +3124,6 @@ static void gen_neon_dup_high16(DisasContext *s, TCGv_i32 var) tcg_temp_free_i32(tcg_ctx, tmp); } -static TCGv_i32 gen_load_and_replicate(DisasContext *s, TCGv_i32 addr, int size) -{ - /* Load a single Neon element and replicate into a 32 bit TCG reg */ - TCGContext *tcg_ctx = s->uc->tcg_ctx; - TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx); - switch (size) { - case 0: - gen_aa32_ld8u(s, tmp, addr, get_mem_index(s)); - gen_neon_dup_u8(s, tmp, 0); - break; - case 1: - gen_aa32_ld16u(s, tmp, addr, get_mem_index(s)); - gen_neon_dup_low16(s, tmp); - break; - case 2: - gen_aa32_ld32u(s, tmp, addr, get_mem_index(s)); - break; - default: /* Avoid compiler warnings. */ - abort(); - } - return tmp; -} - static int handle_vsel(DisasContext *s, uint32_t insn, uint32_t rd, uint32_t rn, uint32_t rm, uint32_t dp) { @@ -5097,6 +5060,7 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn) int load; int shift; int n; + int vec_size; TCGv_i32 addr; TCGv_i32 tmp; TCGv_i32 tmp2; @@ -5266,28 +5230,33 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn) } addr = tcg_temp_new_i32(tcg_ctx); load_reg_var(s, addr, rn); - if (nregs == 1) { - /* VLD1 to all lanes: bit 5 indicates how many Dregs to write */ - tmp = gen_load_and_replicate(s, addr, size); - tcg_gen_st_i32(tcg_ctx, tmp, tcg_ctx->cpu_env, neon_reg_offset(rd, 0)); - tcg_gen_st_i32(tcg_ctx, tmp, tcg_ctx->cpu_env, neon_reg_offset(rd, 1)); - if (insn & (1 << 5)) { - tcg_gen_st_i32(tcg_ctx, tmp, tcg_ctx->cpu_env, neon_reg_offset(rd + 1, 0)); - tcg_gen_st_i32(tcg_ctx, tmp, tcg_ctx->cpu_env, neon_reg_offset(rd + 1, 1)); - } - tcg_temp_free_i32(tcg_ctx, tmp); - } else { - /* VLD2/3/4 to all lanes: bit 5 indicates register stride */ - stride = (insn & (1 << 5)) ? 2 : 1; - for (reg = 0; reg < nregs; reg++) { - tmp = gen_load_and_replicate(s, addr, size); - tcg_gen_st_i32(tcg_ctx, tmp, tcg_ctx->cpu_env, neon_reg_offset(rd, 0)); - tcg_gen_st_i32(tcg_ctx, tmp, tcg_ctx->cpu_env, neon_reg_offset(rd, 1)); - tcg_temp_free_i32(tcg_ctx, tmp); - tcg_gen_addi_i32(tcg_ctx, addr, addr, 1 << size); - rd += stride; + + /* VLD1 to all lanes: bit 5 indicates how many Dregs to write. + * VLD2/3/4 to all lanes: bit 5 indicates register stride. + */ + stride = (insn & (1 << 5)) ? 2 : 1; + vec_size = nregs == 1 ? stride * 8 : 8; + + tmp = tcg_temp_new_i32(tcg_ctx); + for (reg = 0; reg < nregs; reg++) { + gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), + s->be_data | size); + if ((rd & 1) && vec_size == 16) { + /* We cannot write 16 bytes at once because the + * destination is unaligned. + */ + tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(rd, 0), + 8, 8, tmp); + tcg_gen_gvec_mov(tcg_ctx, 0, neon_reg_offset(rd + 1, 0), + neon_reg_offset(rd, 0), 8, 8); + } else { + tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(rd, 0), + vec_size, vec_size, tmp); } + tcg_gen_addi_i32(tcg_ctx, addr, addr, 1 << size); + rd += stride; } + tcg_temp_free_i32(tcg_ctx, tmp); tcg_temp_free_i32(tcg_ctx, addr); stride = (1 << size) * nregs; } else {