Mirror of https://github.com/yuzu-emu/unicorn.git, synced 2025-02-25 21:06:54 +00:00
tcg/arm: Improve tlb load for armv7
Use UBFX to avoid limitation on CPU_TLB_BITS. Since we're dropping the initial shift, we need to replace the page masking. We can use MOVW+BIC to do this without shifting. The result is the same size as the armv6 path with one less conditional instruction.

Backports commit 647ab96aaf5defeb138e48d610f7f633c587b40d from qemu
This commit is contained in:
parent b3fd6a8c8c
commit e4d05c2567
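The key idea on the v7 path is that the page-number compare and the alignment check collapse into one unconditional CMP: BIC with page_align_mask clears exactly the in-page offset bits above the alignment bits, so the result equals the page-aligned TLB comparator only if the address is in the right page and sufficiently aligned. The armv6 path instead shifts, ANDs with CPU_TLB_SIZE - 1 (an immediate that has to fit ARM's 8-bit rotated form, which is the CPU_TLB_BITS limitation that UBFX side-steps by taking the field width directly), and finishes with a TST plus a conditional CMPEQ. Below is a minimal standalone sketch of the equivalence, assuming 4 KiB pages (TARGET_PAGE_BITS = 12) and a 4-byte access (a_bits = 2); the helper names are illustrative, not part of the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed example parameters: 4 KiB target pages, 4-byte (word) accesses. */
#define TARGET_PAGE_BITS 12
#define TARGET_PAGE_MASK ((uint32_t)-1 << TARGET_PAGE_BITS)
#define A_BITS 2

/* Old (pre-patch) check: tmp = addrlo >> TARGET_PAGE_BITS was compared
 * against the TLB comparator with "cmpeq r0, tmp, lsl #TARGET_PAGE_BITS",
 * guarded by a separate "tst addrlo, #s_mask" alignment test. */
static int old_style_hit(uint32_t addrlo, uint32_t tlb_comparator)
{
    uint32_t tmp = addrlo >> TARGET_PAGE_BITS;
    int aligned = (addrlo & ((1u << A_BITS) - 1)) == 0;
    return aligned && (tlb_comparator == (tmp << TARGET_PAGE_BITS));
}

/* New v7 check: BIC clears the in-page offset bits above the alignment
 * bits, so one CMP tests both the page number and the alignment. */
static int new_style_hit(uint32_t addrlo, uint32_t tlb_comparator)
{
    uint32_t page_align_mask = ~(TARGET_PAGE_MASK | ((1u << A_BITS) - 1));
    uint32_t tmp = addrlo & ~page_align_mask;   /* bic tmp, addrlo, tmp */
    return tlb_comparator == tmp;
}

int main(void)
{
    /* The comparator stored in a valid TLB entry is page-aligned. */
    uint32_t comparator = 0x00012000;
    uint32_t probes[] = { 0x00012000, 0x00012344, 0x00012345, 0x00099000 };

    for (unsigned i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
        uint32_t addrlo = probes[i];
        assert(old_style_hit(addrlo, comparator) ==
               new_style_hit(addrlo, comparator));
        printf("addr %#010x -> %s\n", (unsigned)addrlo,
               new_style_hit(addrlo, comparator) ? "hit" : "miss (slow path)");
    }
    return 0;
}

Running the sketch, both checks agree on every probe; in the generated code the single CMP replaces the TST + conditional CMPEQ pair, which is the "one less conditional instruction" from the commit message.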
@@ -1182,18 +1182,33 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     unsigned s_bits = opc & MO_SIZE;
     unsigned a_bits = get_alignment_bits(opc);
 
-    /* Should generate something like the following:
-     *   shr    tmp, addrlo, #TARGET_PAGE_BITS                    (1)
+    /* V7 generates the following:
+     *   ubfx   r0, addrlo, #TARGET_PAGE_BITS, #CPU_TLB_BITS
+     *   add    r2, env, #high
+     *   add    r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS
+     *   ldr    r0, [r2, #cmp]
+     *   ldr    r2, [r2, #add]
+     *   movw   tmp, #page_align_mask
+     *   bic    tmp, addrlo, tmp
+     *   cmp    r0, tmp
+     *
+     * Otherwise we generate:
+     *   shr    tmp, addrlo, #TARGET_PAGE_BITS
      *   add    r2, env, #high
-     *   and    r0, tmp, #(CPU_TLB_SIZE - 1)                      (2)
-     *   add    r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS               (3)
-     *   ldr    r0, [r2, #cmp]                                    (4)
+     *   and    r0, tmp, #(CPU_TLB_SIZE - 1)
+     *   add    r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS
+     *   ldr    r0, [r2, #cmp]
+     *   ldr    r2, [r2, #add]
      *   tst    addrlo, #s_mask
-     *   ldr    r2, [r2, #add]                                    (5)
      *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS
      */
-    tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
-                    0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
+    if (use_armv7_instructions) {
+        tcg_out_extract(s, COND_AL, TCG_REG_R0, addrlo,
+                        TARGET_PAGE_BITS, CPU_TLB_BITS);
+    } else {
+        tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
+                        0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
+    }
 
     /* We checked that the offset is contained within 16 bits above.  */
     if (add_off > 0xfff || (use_armv6_instructions && cmp_off > 0xff)) {
@@ -1204,8 +1219,10 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
         cmp_off &= 0xff;
     }
 
-    tcg_out_dat_imm(s, COND_AL, ARITH_AND,
-                    TCG_REG_R0, TCG_REG_TMP, CPU_TLB_SIZE - 1);
+    if (!use_armv7_instructions) {
+        tcg_out_dat_imm(s, COND_AL, ARITH_AND,
+                        TCG_REG_R0, TCG_REG_TMP, CPU_TLB_SIZE - 1);
+    }
     tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
                     TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
 
@@ -1221,24 +1238,41 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
         }
     }
 
+    /* Load the tlb addend.  */
+    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, add_off);
+
     /* Check alignment.  We don't support inline unaligned acceses,
        but we can easily support overalignment checks.  */
    if (a_bits < s_bits) {
        a_bits = s_bits;
    }
-    if (a_bits) {
-        tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, (1 << a_bits) - 1);
-    }
 
-    /* Load the tlb addend.  */
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, add_off);
-
-    tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
-                    TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
+    if (use_armv7_instructions) {
+        tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1));
+        int rot = encode_imm(mask);
+
+        if (rot >= 0) {
+            tcg_out_dat_imm(s, COND_AL, ARITH_BIC, TCG_REG_TMP, addrlo,
+                            rotl(mask, rot) | (rot << 7));
+        } else {
+            tcg_out_movi32(s, COND_AL, TCG_REG_TMP, mask);
+            tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
+                            addrlo, TCG_REG_TMP, 0);
+        }
+        tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, 0);
+    } else {
+        if (a_bits) {
+            tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo,
+                            (1 << a_bits) - 1);
+        }
+        tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP,
+                        0, TCG_REG_R0, TCG_REG_TMP,
+                        SHIFT_IMM_LSL(TARGET_PAGE_BITS));
+    }
 
     if (TARGET_LONG_BITS == 64) {
-        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0,
-                        TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
+        tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, 0);
     }
 
     return TCG_REG_R2;
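A detail worth spelling out in the last hunk is the immediate handling for the BIC. encode_imm(mask) asks whether the mask fits ARM's data-processing immediate format, an 8-bit value rotated right by an even amount; if so, rotl(mask, rot) | (rot << 7) assembles the operand-2 field (payload in bits 7:0, rotation/2 in bits 11:8). Otherwise the mask is materialised into a register (the "movw tmp, #page_align_mask" of the comment block) and a register-form BIC is used. Below is a small self-contained sketch of that decision; encode_rot_imm is a simplified stand-in for the backend's encode_imm/rotl helpers, and the mask values are examples assuming 4 KiB pages.

#include <stdint.h>
#include <stdio.h>

/* Rotate a 32-bit value left by n (0 <= n < 32). */
static uint32_t rotl32(uint32_t val, int n)
{
    return n ? (val << n) | (val >> (32 - n)) : val;
}

/* Simplified stand-in for the backend's encode_imm(): an ARM ALU immediate
 * is an 8-bit value rotated right by an even amount.  Return the even
 * left-rotation that exposes the 8-bit payload, or -1 if none exists. */
static int encode_rot_imm(uint32_t imm)
{
    for (int rot = 0; rot < 32; rot += 2) {
        if ((rotl32(imm, rot) & ~0xffu) == 0) {
            return rot;
        }
    }
    return -1;
}

int main(void)
{
    /* page_align_mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1)) for
     * assumed 4 KiB pages and two different alignment requirements. */
    uint32_t masks[] = { 0xffc /* a_bits = 2 */, 0xff0 /* a_bits = 4 */ };

    for (unsigned i = 0; i < sizeof(masks) / sizeof(masks[0]); i++) {
        uint32_t mask = masks[i];
        int rot = encode_rot_imm(mask);

        if (rot >= 0) {
            /* Operand-2 field as built in the patch: 8-bit payload in
             * bits 7:0, rotation/2 in bits 11:8 (i.e. rot << 7). */
            uint32_t op2 = rotl32(mask, rot) | ((uint32_t)rot << 7);
            printf("mask %#06x: BIC immediate, operand2 = %#06x\n",
                   (unsigned)mask, (unsigned)op2);
        } else {
            printf("mask %#06x: not encodable, fall back to MOVW + register BIC\n",
                   (unsigned)mask);
        }
    }
    return 0;
}

With 4 KiB pages, any alignment requirement below 16 bytes gives a mask with more than eight significant bits, so the MOVW fallback is the path that usually runs, matching the "movw tmp, #page_align_mask" line in the comment block; only stricter alignments such as a_bits = 4 fold the mask into the BIC immediate.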