Mirror of https://github.com/yuzu-emu/unicorn.git
target/arm: Promote consecutive memory ops for aa64
For a sequence of loads or stores from a single register, little-endian
operations can be promoted to an 8-byte op. This can reduce the number of
operations by a factor of 8.

Backports commit 87f9a7f0c8d5122c36743885158782c2348a6d21 from qemu
commit 931b49fb06
parent e6707b900c
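The idea behind the promotion, as a standalone C sketch (illustrative only, not unicorn/QEMU code): sixteen consecutive 1-byte little-endian element accesses deposit exactly the same bytes as two 8-byte little-endian accesses, so the translator may emit the wider op whenever all elements come from a single register and the element order is little-endian.

/* Illustrative sketch only -- not unicorn/QEMU code.  Models why
 * sixteen 1-byte element loads (size == 0) into a 128-bit vector
 * are equivalent to two 8-byte little-endian loads (size == 3),
 * which is the promotion this commit performs at translation time.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    uint8_t mem[16];
    uint8_t vec_a[16], vec_b[16];   /* two models of a 128-bit Vn */

    for (int i = 0; i < 16; i++) {
        mem[i] = (uint8_t)(0xA0 + i);
    }

    /* Unpromoted: one memory op per byte element. */
    for (int i = 0; i < 16; i++) {
        vec_a[i] = mem[i];
    }

    /* Promoted: two 8-byte ops.  A little-endian 64-bit load into a
     * little-endian register lane preserves byte order, so a raw
     * byte copy models it exactly. */
    uint64_t lane;
    memcpy(&lane, mem, 8);          /* first 8-byte load  */
    memcpy(vec_b, &lane, 8);
    memcpy(&lane, mem + 8, 8);      /* second 8-byte load */
    memcpy(vec_b + 8, &lane, 8);

    assert(memcmp(vec_a, vec_b, sizeof(vec_a)) == 0);
    return 0;
}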
@@ -1269,27 +1269,25 @@ static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
 /* Store from vector register to memory */
 static void do_vec_st(DisasContext *s, int srcidx, int element,
-                      TCGv_i64 tcg_addr, int size)
+                      TCGv_i64 tcg_addr, int size, TCGMemOp endian)
 {
     TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    TCGMemOp memop = s->be_data + size;
     TCGv_i64 tcg_tmp = tcg_temp_new_i64(tcg_ctx);
 
     read_vec_element(s, tcg_tmp, srcidx, element, size);
-    tcg_gen_qemu_st_i64(s->uc, tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_st_i64(s->uc, tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
 
     tcg_temp_free_i64(tcg_ctx, tcg_tmp);
 }
 
 /* Load from memory to vector register */
 static void do_vec_ld(DisasContext *s, int destidx, int element,
-                      TCGv_i64 tcg_addr, int size)
+                      TCGv_i64 tcg_addr, int size, TCGMemOp endian)
 {
     TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    TCGMemOp memop = s->be_data + size;
     TCGv_i64 tcg_tmp = tcg_temp_new_i64(tcg_ctx);
 
-    tcg_gen_qemu_ld_i64(s->uc, tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_ld_i64(s->uc, tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
     write_vec_element(s, tcg_tmp, destidx, element, size);
 
     tcg_temp_free_i64(tcg_ctx, tcg_tmp);
 }
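One subtlety in the new signature: the memory op is now composed as endian | size rather than the old s->be_data + size. Under the TCGMemOp encoding this code targets (operand size in the low bits, MO_BSWAP as a separate flag bit, with MO_LE/MO_BE resolving to 0 or MO_BSWAP depending on host byte order), the two compositions are identical. A minimal sketch, with the enum values restated here as assumptions rather than taken from the real headers:

/* Illustrative only.  Restates the assumed TCGMemOp encoding: the
 * operand size lives in the low two bits and the byte-swap flag in
 * a separate bit, so or-ing and adding build the same value. */
#include <assert.h>

enum {
    MO_8 = 0, MO_16 = 1, MO_32 = 2, MO_64 = 3, /* size field */
    MO_BSWAP = 8,                              /* swap flag  */
};

int main(void)
{
    for (int endian = 0; endian <= MO_BSWAP; endian += MO_BSWAP) {
        for (int size = MO_8; size <= MO_64; size++) {
            /* No bit overlap between the two fields... */
            assert((endian & size) == 0);
            /* ...so the old '+' and the new '|' agree. */
            assert((endian + size) == (endian | size));
        }
    }
    return 0;
}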
@@ -3088,9 +3086,10 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     bool is_postidx = extract32(insn, 23, 1);
     bool is_q = extract32(insn, 30, 1);
     TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
+    TCGMemOp endian = s->be_data;
 
-    int ebytes = 1 << size;
-    int elements = (is_q ? 128 : 64) / (8 << size);
+    int ebytes;   /* bytes per element */
+    int elements; /* elements per vector */
     int rpt;      /* num iterations */
     int selem;    /* structure elements */
     int r;
@@ -3149,6 +3148,20 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
         gen_check_sp_alignment(s);
     }
 
+    /* For our purposes, bytes are always little-endian.  */
+    if (size == 0) {
+        endian = MO_LE;
+    }
+
+    /* Consecutive little-endian elements from a single register
+     * can be promoted to a larger little-endian operation.
+     */
+    if (selem == 1 && endian == MO_LE) {
+        size = 3;
+    }
+    ebytes = 1 << size;
+    elements = (is_q ? 16 : 8) / ebytes;
+
     tcg_rn = cpu_reg_sp(s, rn);
     tcg_addr = tcg_temp_new_i64(tcg_ctx);
     tcg_gen_mov_i64(tcg_ctx, tcg_addr, tcg_rn);
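Worked example of the new arithmetic, using decode values this function derives (e.g. an LD1 of four registers of .16b elements: is_q = 1, size = 0, rpt = 4, selem = 1): before this change each register took elements = 128/8 = 16 one-byte loads. Now size is promoted to 3, so ebytes = 1 << 3 = 8 and elements = 16/8 = 2, and each register takes two 8-byte loads, which is the factor-of-8 reduction cited in the commit message.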
@@ -3157,32 +3170,33 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     for (r = 0; r < rpt; r++) {
         int e;
         for (e = 0; e < elements; e++) {
-            int tt = (rt + r) % 32;
             int xs;
             for (xs = 0; xs < selem; xs++) {
+                int tt = (rt + r + xs) % 32;
                 if (is_store) {
-                    do_vec_st(s, tt, e, tcg_addr, size);
+                    do_vec_st(s, tt, e, tcg_addr, size, endian);
                 } else {
-                    do_vec_ld(s, tt, e, tcg_addr, size);
-
-                    /* For non-quad operations, setting a slice of the low
-                     * 64 bits of the register clears the high 64 bits (in
-                     * the ARM ARM pseudocode this is implicit in the fact
-                     * that 'rval' is a 64 bit wide variable).
-                     * For quad operations, we might still need to zero the
-                     * high bits of SVE.  We optimize by noticing that we only
-                     * need to do this the first time we touch a register.
-                     */
-                    if (e == 0 && (r == 0 || xs == selem - 1)) {
-                        clear_vec_high(s, is_q, tt);
-                    }
+                    do_vec_ld(s, tt, e, tcg_addr, size, endian);
                 }
                 tcg_gen_add_i64(tcg_ctx, tcg_addr, tcg_addr, tcg_ebytes);
-                tt = (tt + 1) % 32;
             }
         }
     }
 
+    if (!is_store) {
+        /* For non-quad operations, setting a slice of the low
+         * 64 bits of the register clears the high 64 bits (in
+         * the ARM ARM pseudocode this is implicit in the fact
+         * that 'rval' is a 64 bit wide variable).
+         * For quad operations, we might still need to zero the
+         * high bits of SVE.
+         */
+        for (r = 0; r < rpt * selem; r++) {
+            int tt = (rt + r) % 32;
+            clear_vec_high(s, is_q, tt);
+        }
+    }
+
     if (is_postidx) {
         int rm = extract32(insn, 16, 5);
         if (rm == 31) {
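Note that the load-side zeroing of the high vector bits now runs in a single pass after the element loops, once per written register, instead of inside the inner loop guarded by the first-touch test. Clearing the high bits after all lanes of a register have been written yields the same architectural result, and the separate pass no longer depends on the loop order that the promotion just changed.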
@@ -3304,9 +3318,9 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
     } else {
         /* Load/store one element per register */
         if (is_load) {
-            do_vec_ld(s, rt, index, tcg_addr, scale);
+            do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
         } else {
-            do_vec_st(s, rt, index, tcg_addr, scale);
+            do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
         }
     }
     tcg_gen_add_i64(tcg_ctx, tcg_addr, tcg_addr, tcg_ebytes);
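The single-structure path is left unpromoted: do_vec_ld and do_vec_st simply receive s->be_data as the new endian argument, since a lone element per register offers nothing to coalesce.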