target/arm: Convert Neon 'load single structure to all lanes' to decodetree
Convert the Neon "load single structure to all lanes" insns to decodetree.

Backports commit 3698747c48db871d876a398592c5a23d7580ed4a from qemu
parent 7aad825fa6
commit 302506f2f6
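A note for readers new to decodetree: the pattern added in the first hunk below is fed to QEMU's decodetree generator, which emits an argument struct and a decoder that dispatches to the trans_VLD_all_lanes() hook added in the second hunk. A rough sketch of that generated interface follows; the field comments and layout here are illustrative, not the literal generated code:

#include <stdbool.h>

typedef struct DisasContext DisasContext;   /* opaque for this sketch */

typedef struct arg_VLD_all_lanes {
    int rn;   /* base address register */
    int rm;   /* 15: no writeback; 13: post-increment by transfer size;
               * otherwise post-increment by register rm */
    int n;    /* nregs - 1, i.e. VLD1..VLD4 */
    int size; /* element size: 0 = 8-bit, 1 = 16-bit, 2 = 32-bit */
    int t;    /* T bit: Dreg count (VLD1) or register stride (VLD2/3/4) */
    int a;    /* alignment bit */
    int vd;   /* destination D register, assembled by %vd_dp from D:Vd */
} arg_VLD_all_lanes;

/* The generated decoder calls this hook when the bit pattern matches: */
static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a);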
@@ -34,3 +34,8 @@
 VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... itype:4 size:2 align:2 rm:4 \
                vd=%vd_dp
+
+# Neon load single element to all lanes
+
+VLD_all_lanes  1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \
+               vd=%vd_dp
 
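For illustration, here is a self-contained, hand-written equivalent of the match-and-extract step the generated decoder performs for this pattern. The mask and value are derived from the pattern's fixed bits (1111 0100 1 . 1 0 ... 11 ...); the sample encoding in main() is a hypothetical VLD1.32 to all lanes, chosen only to exercise the function:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the arg-struct sketch earlier on this page. */
typedef struct {
    int rn, n, size, t, a, rm, vd;
} arg_VLD_all_lanes;

static bool decode_vld_all_lanes(uint32_t insn, arg_VLD_all_lanes *a)
{
    if ((insn & 0xffb00c00) != 0xf4a00c00) {
        return false;             /* fixed opcode bits do not match */
    }
    a->rn   = (insn >> 16) & 0xf;
    a->n    = (insn >> 8) & 0x3;  /* nregs - 1 */
    a->size = (insn >> 6) & 0x3;
    a->t    = (insn >> 5) & 0x1;
    a->a    = (insn >> 4) & 0x1;
    a->rm   = insn & 0xf;
    /* %vd_dp: assemble D (bit 22) and Vd (bits 15:12) into one index */
    a->vd   = (((insn >> 22) & 1) << 4) | ((insn >> 12) & 0xf);
    return true;
}

int main(void)
{
    /* VLD1.32 {d0[],d1[]}, [r0] -- sample encoding for illustration */
    arg_VLD_all_lanes a;
    if (decode_vld_all_lanes(0xf4a00caf, &a)) {
        printf("rn=%d vd=%d n=%d size=%d t=%d a=%d rm=%d\n",
               a.rn, a.vd, a.n, a.size, a.t, a.a, a.rm);
    }
    return 0;
}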
@@ -408,3 +408,77 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
     return true;
 }
+
+static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
+{
+    /* Neon load single structure to all lanes */
+    int reg, stride, vec_size;
+    int vd = a->vd;
+    int size = a->size;
+    int nregs = a->n + 1;
+    TCGv_i32 addr, tmp;
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist */
+    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+        return false;
+    }
+
+    if (size == 3) {
+        if (nregs != 4 || a->a == 0) {
+            return false;
+        }
+        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
+        size = 2;
+    }
+    if (nregs == 1 && a->a == 1 && size == 0) {
+        return false;
+    }
+    if (nregs == 3 && a->a == 1) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * VLD1 to all lanes: T bit indicates how many Dregs to write.
+     * VLD2/3/4 to all lanes: T bit indicates register stride.
+     */
+    stride = a->t ? 2 : 1;
+    vec_size = nregs == 1 ? stride * 8 : 8;
+
+    tmp = tcg_temp_new_i32(tcg_ctx);
+    addr = tcg_temp_new_i32(tcg_ctx);
+    load_reg_var(s, addr, a->rn);
+    for (reg = 0; reg < nregs; reg++) {
+        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
+                        s->be_data | size);
+        if ((vd & 1) && vec_size == 16) {
+            /*
+             * We cannot write 16 bytes at once because the
+             * destination is unaligned.
+             */
+            tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(vd, 0),
+                                 8, 8, tmp);
+            tcg_gen_gvec_mov(tcg_ctx, 0, neon_reg_offset(vd + 1, 0),
+                             neon_reg_offset(vd, 0), 8, 8);
+        } else {
+            tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(vd, 0),
+                                 vec_size, vec_size, tmp);
+        }
+        tcg_gen_addi_i32(tcg_ctx, addr, addr, 1 << size);
+        vd += stride;
+    }
+    tcg_temp_free_i32(tcg_ctx, tmp);
+    tcg_temp_free_i32(tcg_ctx, addr);
+
+    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
+
+    return true;
+}
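The subtle part of the new handler is the T-bit handling described in its comment: for VLD1 the bit selects how many D registers receive the replicated element, while for VLD2/3/4 it selects the register stride. A standalone sketch (not part of the commit) that evaluates the same stride/vec_size expressions for a few representative cases:

#include <stdio.h>

/* Evaluates the stride/vec_size computation from trans_VLD_all_lanes
 * for representative (nregs, t) combinations. */
int main(void)
{
    static const int cases[][2] = {
        { 1, 0 }, { 1, 1 },  /* VLD1: t selects 1 or 2 Dregs */
        { 2, 0 }, { 2, 1 },  /* VLD2: t selects register stride */
        { 4, 1 },            /* VLD4 with stride 2 */
    };
    for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        int nregs = cases[i][0], t = cases[i][1];
        int stride = t ? 2 : 1;                      /* as in the handler */
        int vec_size = nregs == 1 ? stride * 8 : 8;  /* bytes per dup */
        printf("VLD%d t=%d -> stride=%d, dup width=%d bytes\n",
               nregs, t, stride, vec_size);
    }
    return 0;
}

So VLD1 with t=1 performs a single 16-byte dup spanning d<vd> and d<vd+1> (split into two 8-byte writes when vd is odd, as the unaligned path in the handler shows), whereas VLD2 with t=1 performs two 8-byte dups into d<vd> and d<vd+2>.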
@@ -3342,7 +3342,6 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
     int size;
     int reg;
     int load;
-    int vec_size;
    TCGv_i32 addr;
     TCGv_i32 tmp;
 
@@ -3372,58 +3371,8 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
     } else {
         size = (insn >> 10) & 3;
         if (size == 3) {
-            /* Load single element to all lanes.  */
-            int a = (insn >> 4) & 1;
-            if (!load) {
-                return 1;
-            }
-            size = (insn >> 6) & 3;
-            nregs = ((insn >> 8) & 3) + 1;
-
-            if (size == 3) {
-                if (nregs != 4 || a == 0) {
-                    return 1;
-                }
-                /* For VLD4 size==3 a == 1 means 32 bits at 16 byte alignment */
-                size = 2;
-            }
-            if (nregs == 1 && a == 1 && size == 0) {
-                return 1;
-            }
-            if (nregs == 3 && a == 1) {
-                return 1;
-            }
-            addr = tcg_temp_new_i32(tcg_ctx);
-            load_reg_var(s, addr, rn);
-
-            /* VLD1 to all lanes: bit 5 indicates how many Dregs to write.
-             * VLD2/3/4 to all lanes: bit 5 indicates register stride.
-             */
-            stride = (insn & (1 << 5)) ? 2 : 1;
-            vec_size = nregs == 1 ? stride * 8 : 8;
-
-            tmp = tcg_temp_new_i32(tcg_ctx);
-            for (reg = 0; reg < nregs; reg++) {
-                gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
-                                s->be_data | size);
-                if ((rd & 1) && vec_size == 16) {
-                    /* We cannot write 16 bytes at once because the
-                     * destination is unaligned.
-                     */
-                    tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(rd, 0),
-                                         8, 8, tmp);
-                    tcg_gen_gvec_mov(tcg_ctx, 0, neon_reg_offset(rd + 1, 0),
-                                     neon_reg_offset(rd, 0), 8, 8);
-                } else {
-                    tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(rd, 0),
-                                         vec_size, vec_size, tmp);
-                }
-                tcg_gen_addi_i32(tcg_ctx, addr, addr, 1 << size);
-                rd += stride;
-            }
-            tcg_temp_free_i32(tcg_ctx, tmp);
-            tcg_temp_free_i32(tcg_ctx, addr);
-            stride = (1 << size) * nregs;
+            /* Load single element to all lanes -- handled by decodetree */
+            return 1;
         } else {
             /* Single element.  */
             int idx = (insn >> 4) & 0xf;