target/arm: Convert Neon VSHLL, VMOVL to decodetree

Convert the VSHLL and VMOVL insns from the 2-reg-shift group
to decodetree. Since the loop always has two passes, we unroll
it to avoid the awkward reassignment of one TCGv to another.

Backports commit 968bf842742a5ffbb0041cb31089e61a9f7a833d from qemu
Peter Maydell authored 2020-06-15 12:35:30 -04:00; committed by Lioncash
parent 6fc8fdaa2b
commit a5f903b2a5
3 changed files with 100 additions and 44 deletions

qemu/target/arm/neon-dp.decode

@@ -243,6 +243,14 @@ VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp
             &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 \
             shift=%neon_rshift_i3

# Long left shifts: again Q is part of opcode decode
@2reg_shll_s .... ... . . . 1 shift:5 .... .... 0 . . . .... \
             &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0
@2reg_shll_h .... ... . . . 01 shift:4 .... .... 0 . . . .... \
             &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0
@2reg_shll_b .... ... . . . 001 shift:3 .... .... 0 . . . .... \
             &2reg_shift vm=%vm_dp vd=%vd_dp size=0 q=0

VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h
@@ -348,3 +356,11 @@ VQSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h
VQRSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d
VQRSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s
VQRSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h

VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s
VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b

VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s
VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b
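
The @2reg_shll formats above make explicit what the old hand-written decoder
recovered arithmetically: in the long-shift encodings the position of the
leading 1 in the six-bit immediate selects the element size, and the bits
below it are the shift count (VMOVL is simply VSHLL with shift == 0). A
standalone C sketch of that relationship; the helper name is illustrative,
not from the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirror of the @2reg_shll_{s,h,b} formats: find the element size from
     * the leading 1 of imm6, then take the remaining bits as the shift. */
    static void decode_shll_imm6(uint32_t imm6, int *size, int *shift)
    {
        if (imm6 & 0x20) {          /* 1 shift:5  -> 32-bit elements */
            *size = 2;
            *shift = imm6 & 0x1f;
        } else if (imm6 & 0x10) {   /* 01 shift:4 -> 16-bit elements */
            *size = 1;
            *shift = imm6 & 0x0f;
        } else {                    /* 001 shift:3 -> 8-bit elements */
            *size = 0;
            *shift = imm6 & 0x07;
        }
    }

    int main(void)
    {
        int size, shift;
        decode_shll_imm6(0x0b, &size, &shift);     /* 0b001011 */
        printf("size=%d shift=%d\n", size, shift); /* size=0 shift=3 */
        return 0;
    }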

qemu/target/arm/translate-neon.inc.c

@@ -1599,3 +1599,85 @@ DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)

static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
                         NeonGenWidenFn *widenfn, bool u)
{
    TCGContext *tcg_ctx = s->uc->tcg_ctx;
    TCGv_i64 tmp;
    TCGv_i32 rm0, rm1;
    uint64_t widen_mask = 0;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is a widen-and-shift operation. The shift is always less
     * than the width of the source type, so after widening the input
     * vector we can simply shift the whole 64-bit widened register,
     * and then clear the potential overflow bits resulting from left
     * bits of the narrow input appearing as right bits of the left
     * neighbour narrow input. Calculate a mask of bits to clear.
     */
    if ((a->shift != 0) && (a->size < 2 || u)) {
        int esize = 8 << a->size;
        widen_mask = MAKE_64BIT_MASK(0, esize);
        widen_mask >>= esize - a->shift;
        widen_mask = dup_const(a->size + 1, widen_mask);
    }

    rm0 = neon_load_reg(s, a->vm, 0);
    rm1 = neon_load_reg(s, a->vm, 1);

    tmp = tcg_temp_new_i64(tcg_ctx);
    widenfn(tcg_ctx, tmp, rm0);
    tcg_temp_free_i32(tcg_ctx, rm0);
    if (a->shift != 0) {
        tcg_gen_shli_i64(tcg_ctx, tmp, tmp, a->shift);
        tcg_gen_andi_i64(tcg_ctx, tmp, tmp, ~widen_mask);
    }
    neon_store_reg64(s, tmp, a->vd);

    widenfn(tcg_ctx, tmp, rm1);
    tcg_temp_free_i32(tcg_ctx, rm1);
    if (a->shift != 0) {
        tcg_gen_shli_i64(tcg_ctx, tmp, tmp, a->shift);
        tcg_gen_andi_i64(tcg_ctx, tmp, tmp, ~widen_mask);
    }
    neon_store_reg64(s, tmp, a->vd + 1);
    tcg_temp_free_i64(tcg_ctx, tmp);
    return true;
}

static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    NeonGenWidenFn *widenfn[] = {
        gen_helper_neon_widen_s8,
        gen_helper_neon_widen_s16,
        tcg_gen_ext_i32_i64,
    };
    return do_vshll_2sh(s, a, widenfn[a->size], false);
}

static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    NeonGenWidenFn *widenfn[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
    };
    return do_vshll_2sh(s, a, widenfn[a->size], true);
}
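
The widen_mask arithmetic above is compact, so here is a worked example.
Take VSHLL.S8 with shift == 3 (size == 0): when the widened register is
shifted left as a whole, the sign-extension bits of each byte bleed into
the low bits of its left neighbour's 16-bit lane, and widen_mask selects
exactly those bits. A standalone sketch, using simplified stand-ins for
QEMU's MAKE_64BIT_MASK and dup_const:

    #include <stdint.h>
    #include <stdio.h>

    #define MAKE_64BIT_MASK(shift, length) \
        ((~0ULL >> (64 - (length))) << (shift))

    /* Simplified dup_const: replicate an (8 << vece)-bit constant
     * across 64 bits. */
    static uint64_t dup_const(unsigned vece, uint64_t c)
    {
        switch (vece) {
        case 0: return 0x0101010101010101ULL * (uint8_t)c;
        case 1: return 0x0001000100010001ULL * (uint16_t)c;
        case 2: return 0x0000000100000001ULL * (uint32_t)c;
        default: return c;
        }
    }

    int main(void)
    {
        int size = 0, shift = 3;                      /* VSHLL.S8 #3 */
        int esize = 8 << size;
        uint64_t widen_mask = MAKE_64BIT_MASK(0, esize); /* 0xff */
        widen_mask >>= esize - shift;                 /* 0x07: low 3 bits */
        widen_mask = dup_const(size + 1, widen_mask); /* per 16-bit lane */
        printf("mask  = 0x%016llx\n", (unsigned long long)widen_mask);
        /* mask  = 0x0007000700070007 */

        /* Elements {1, -1} widened to 16-bit lanes, then shifted:
         * lane 0 = 0xffff, lane 1 = 0x0001. */
        uint64_t widened = 0x0001ffffULL;
        uint64_t shifted = widened << shift;    /* 0x000ffff8: lane 1 is
                                                 * 0x000f, not 1 << 3 */
        uint64_t fixed = shifted & ~widen_mask; /* 0x0008fff8: both lanes
                                                 * now correct */
        printf("fixed = 0x%016llx\n", (unsigned long long)fixed);
        return 0;
    }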

qemu/target/arm/translate.c

@@ -5373,6 +5373,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
        case 7: /* VQSHL */
        case 8: /* VSHRN, VRSHRN, VQSHRUN, VQRSHRUN */
        case 9: /* VQSHRN, VQRSHRN */
        case 10: /* VSHLL, including VMOVL */
            return 1; /* handled by decodetree */
        default:
            break;
@@ -5390,50 +5391,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                size--;
            }
            shift = (insn >> 16) & ((1 << (3 + size)) - 1);
            if (op == 10) {
                /* VSHLL, VMOVL */
                if (q || (rd & 1)) {
                    return 1;
                }
                tmp = neon_load_reg(s, rm, 0);
                tmp2 = neon_load_reg(s, rm, 1);
                for (pass = 0; pass < 2; pass++) {
                    if (pass == 1)
                        tmp = tmp2;

                    gen_neon_widen(s, s->V0, tmp, size, u);

                    if (shift != 0) {
                        /* The shift is less than the width of the source
                           type, so we can just shift the whole register. */
                        tcg_gen_shli_i64(tcg_ctx, s->V0, s->V0, shift);
                        /* Widen the result of shift: we need to clear
                         * the potential overflow bits resulting from
                         * left bits of the narrow input appearing as
                         * right bits of the left neighbour narrow
                         * input. */
                        if (size < 2 || !u) {
                            uint64_t imm64;
                            if (size == 0) {
                                imm = (0xffu >> (8 - shift));
                                imm |= imm << 16;
                            } else if (size == 1) {
                                imm = 0xffff >> (16 - shift);
                            } else {
                                /* size == 2 */
                                imm = 0xffffffff >> (32 - shift);
                            }
                            if (size < 2) {
                                imm64 = imm | (((uint64_t)imm) << 32);
                            } else {
                                imm64 = imm;
                            }
                            tcg_gen_andi_i64(tcg_ctx, s->V0, s->V0, ~imm64);
                        }
                    }
                    neon_store_reg64(s, s->V0, rd + pass);
                }
            } else if (op >= 14) {
            if (op >= 14) {
                /* VCVT fixed-point. */
                TCGv_ptr fpst;
                TCGv_i32 shiftv;
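
For reference, the instruction both versions implement can be modelled in
plain integer code. The sketch below (the helper name is mine, not from
either file) follows the converted function's structure: each 32-bit half
of the input doubleword widens independently into one output doubleword,
which is why do_vshll_2sh can unroll the old two-pass loop into
straight-line code:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of VSHLL.S8 <Qd>, <Dm>, #shift (0 <= shift < 8);
     * shift == 0 is VMOVL. */
    static void vshll_s8_ref(uint64_t dm, int shift, uint64_t qd[2])
    {
        for (int half = 0; half < 2; half++) {  /* the two unrolled passes */
            uint32_t narrow = (uint32_t)(dm >> (32 * half));
            uint64_t wide = 0;
            for (int lane = 0; lane < 4; lane++) {
                int8_t e = (int8_t)(narrow >> (8 * lane));
                /* sign-extend to 16 bits, shift, truncate to the lane */
                uint16_t w = (uint16_t)((uint16_t)(int16_t)e << shift);
                wide |= (uint64_t)w << (16 * lane);
            }
            qd[half] = wide;
        }
    }

    int main(void)
    {
        uint64_t q[2];
        vshll_s8_ref(0x00000000000001ffULL, 3, q); /* elements 1 and -1 */
        printf("d0=0x%016llx d1=0x%016llx\n",
               (unsigned long long)q[0], (unsigned long long)q[1]);
        /* d0=0x000000000008fff8 d1=0x0000000000000000 */
        return 0;
    }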