target/arm: Expand vector registers for SVE

Change vfp.regs, an array of uint64_t, into vfp.zregs, an array of ARMVectorReg. The previous patches have made the change in representation relatively painless. Backports commit c39c2b9043ec59516c80f2c6f3e8193e99d04d4b from qemu.
2025-12-15 16:51:35 +00:00 · 2018-03-07 11:29:34 -05:00 · 2018-03-07 11:29:34 -05:00 · 834e3a1d04
parent 5439b4a542
commit 834e3a1d04
3 changed files with 46 additions and 27 deletions
--- a/qemu/target/arm/cpu.h
+++ b/qemu/target/arm/cpu.h
@@ -158,6 +158,41 @@ typedef struct {
    uint32_t base_mask;
 } TCR;

+/* Define a maximum sized vector register.
+ * For 32-bit, this is a 128-bit NEON/AdvSIMD register.
+ * For 64-bit, this is a 2048-bit SVE register.
+ *
+ * Note that the mapping between S, D, and Q views of the register bank
+ * differs between AArch64 and AArch32.
+ * In AArch32:
+ *  Qn = regs[n].d[1]:regs[n].d[0]
+ *  Dn = regs[n / 2].d[n & 1]
+ *  Sn = regs[n / 4].d[n % 4 / 2],
+ *       bits 31..0 for even n, and bits 63..32 for odd n
+ *       (and regs[16] to regs[31] are inaccessible)
+ * In AArch64:
+ *  Zn = regs[n].d[*]
+ *  Qn = regs[n].d[1]:regs[n].d[0]
+ *  Dn = regs[n].d[0]
+ *  Sn = regs[n].d[0] bits 31..0
+ *
+ * This corresponds to the architecturally defined mapping between
+ * the two execution states, and means we do not need to explicitly
+ * map these registers when changing states.
+ *
+ * Align the data for use with TCG host vector operations.
+ */
+
+#ifdef TARGET_AARCH64
+# define ARM_MAX_VQ    16
+#else
+# define ARM_MAX_VQ    1
+#endif
+
+typedef struct ARMVectorReg {
+    uint64_t QEMU_ALIGNED(16, d[2 * ARM_MAX_VQ]);
+} ARMVectorReg;
+
 typedef struct CPUARMState {
    /* Regs for current mode.  */
    uint32_t regs[16];
@@ -482,22 +517,7 @@ typedef struct CPUARMState {

    /* VFP coprocessor state.  */
    struct {
-        /* VFP/Neon register state. Note that the mapping between S, D and Q
-         * views of the register bank differs between AArch64 and AArch32:
-         * In AArch32:
-         *  Qn = regs[2n+1]:regs[2n]
-         *  Dn = regs[n]
-         *  Sn = regs[n/2] bits 31..0 for even n, and bits 63..32 for odd n
-         * (and regs[32] to regs[63] are inaccessible)
-         * In AArch64:
-         *  Qn = regs[2n+1]:regs[2n]
-         *  Dn = regs[2n]
-         *  Sn = regs[2n] bits 31..0
-         * This corresponds to the architecturally defined mapping between
-         * the two execution states, and means we do not need to explicitly
-         * map these registers when changing states.
-         */
-        uint64_t QEMU_ALIGNED(16, regs[64]);
+        ARMVectorReg zregs[32];

        uint32_t xregs[16];
        /* We store these fpcsr fields separately for convenience.  */
@@ -2768,7 +2788,7 @@ static inline void *arm_get_el_change_hook_opaque(ARMCPU *cpu)
 */
 static inline uint64_t *aa32_vfp_dreg(CPUARMState *env, unsigned regno)
 {
-    return &env->vfp.regs[regno];
+    return &env->vfp.zregs[regno >> 1].d[regno & 1];
 }

 /**
@@ -2777,7 +2797,7 @@ static inline uint64_t *aa32_vfp_dreg(CPUARMState *env, unsigned regno)
 */
 static inline uint64_t *aa32_vfp_qreg(CPUARMState *env, unsigned regno)
 {
-    return &env->vfp.regs[2 * regno];
+    return &env->vfp.zregs[regno].d[0];
 }

 /**
@@ -2786,7 +2806,7 @@ static inline uint64_t *aa32_vfp_qreg(CPUARMState *env, unsigned regno)
 */
 static inline uint64_t *aa64_vfp_qreg(CPUARMState *env, unsigned regno)
 {
-    return &env->vfp.regs[2 * regno];
+    return &env->vfp.zregs[regno].d[0];
 }

 #endif
--- a/qemu/target/arm/translate-a64.c
+++ b/qemu/target/arm/translate-a64.c
@@ -552,8 +552,8 @@ static inline int vec_reg_offset(DisasContext *s, int regno,
 {
    int offs = 0;
 #ifdef HOST_WORDS_BIGENDIAN
-    /* This is complicated slightly because vfp.regs[2n] is
-     * still the low half and  vfp.regs[2n+1] the high half
+    /* This is complicated slightly because vfp.zregs[n].d[0] is
+     * still the low half and vfp.zregs[n].d[1] the high half
     * of the 128 bit vector, even on big endian systems.
     * Calculate the offset assuming a fully bigendian 128 bits,
     * then XOR to account for the order of the two 64 bit halves.
@@ -563,7 +563,7 @@ static inline int vec_reg_offset(DisasContext *s, int regno,
 #else
    offs += element * (1 << size);
 #endif
-    offs += offsetof(CPUARMState, vfp.regs[regno * 2]);
+    offs += offsetof(CPUARMState, vfp.zregs[regno]);
    assert_fp_access_checked(s);
    return offs;
 }
@@ -572,7 +572,7 @@ static inline int vec_reg_offset(DisasContext *s, int regno,
 static inline int vec_full_reg_offset(DisasContext *s, int regno)
 {
    assert_fp_access_checked(s);
-    return offsetof(CPUARMState, vfp.regs[regno * 2]);
+    return offsetof(CPUARMState, vfp.zregs[regno]);
 }

 /* Return a newly allocated pointer to the vector register.  */
--- a/qemu/target/arm/translate.c
+++ b/qemu/target/arm/translate.c
@@ -1574,13 +1574,12 @@ static inline void gen_vfp_st(DisasContext *s, int dp, TCGv_i32 addr)
    }
 }

-static inline long
-vfp_reg_offset (int dp, int reg)
+static inline long vfp_reg_offset(bool dp, unsigned reg)
 {
    if (dp) {
-        return offsetof(CPUARMState, vfp.regs[reg]);
+        return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]);
    } else {
-        long ofs = offsetof(CPUARMState, vfp.regs[reg >> 1]);
+        long ofs = offsetof(CPUARMState, vfp.zregs[reg >> 2].d[(reg >> 1) & 1]);
        if (reg & 1) {
            ofs += offsetof(CPU_DoubleU, l.upper);
        } else {