tcg/arm: Improve tlb load for armv7

Use UBFX to avoid the limitation on CPU_TLB_BITS. Since we're dropping
the initial shift, we need to replace the page masking. We can use
MOVW+BIC to do this without shifting. The result is the same size
as the armv6 path, with one fewer conditional instruction.

Backports commit 647ab96aaf5defeb138e48d610f7f633c587b40d from qemu
This commit is contained in:
Richard Henderson 2018-03-04 22:56:25 -05:00 committed by Lioncash
parent b3fd6a8c8c
commit e4d05c2567
No known key found for this signature in database
GPG key ID: 4E3C3CC1031BA9C7

View file

@ -1182,18 +1182,33 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
unsigned s_bits = opc & MO_SIZE;
unsigned a_bits = get_alignment_bits(opc);
/* Should generate something like the following:
* shr tmp, addrlo, #TARGET_PAGE_BITS (1)
/* V7 generates the following:
* ubfx r0, addrlo, #TARGET_PAGE_BITS, #CPU_TLB_BITS
* shr tmp, addrlo, #TARGET_PAGE_BITS
* add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS
* ldr r0, [r2, #cmp]
* ldr r2, [r2, #add]
* movw tmp, #page_align_mask
* bic tmp, addrlo, tmp
* cmp r0, tmp
*
* Otherwise we generate:
* shr tmp, addrlo, #TARGET_PAGE_BITS
* add r2, env, #high
* and r0, tmp, #(CPU_TLB_SIZE - 1) (2)
* add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3)
* ldr r0, [r2, #cmp] (4)
* and r0, tmp, #(CPU_TLB_SIZE - 1)
* add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS
* ldr r0, [r2, #cmp]
* ldr r2, [r2, #add]
* tst addrlo, #s_mask
* ldr r2, [r2, #add] (5)
* cmpeq r0, tmp, lsl #TARGET_PAGE_BITS
*/
if (use_armv7_instructions) {
tcg_out_extract(s, COND_AL, TCG_REG_R0, addrlo,
TARGET_PAGE_BITS, CPU_TLB_BITS);
} else {
tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
}
/* We checked that the offset is contained within 16 bits above. */
if (add_off > 0xfff || (use_armv6_instructions && cmp_off > 0xff)) {
@ -1204,8 +1219,10 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
cmp_off &= 0xff;
}
if (!use_armv7_instructions) {
tcg_out_dat_imm(s, COND_AL, ARITH_AND,
TCG_REG_R0, TCG_REG_TMP, CPU_TLB_SIZE - 1);
}
tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
@ -1221,24 +1238,41 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
}
}
/* Load the tlb addend. */
tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, add_off);
/* Check alignment. We don't support inline unaligned accesses,
but we can easily support overalignment checks. */
if (a_bits < s_bits) {
a_bits = s_bits;
}
if (a_bits) {
tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, (1 << a_bits) - 1);
}
/* Load the tlb addend. */
tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, add_off);
if (use_armv7_instructions) {
tcg_target_ulong mask = ~(TARGET_PAGE_MASK | ((1 << a_bits) - 1));
int rot = encode_imm(mask);
tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
if (rot >= 0) {
tcg_out_dat_imm(s, COND_AL, ARITH_BIC, TCG_REG_TMP, addrlo,
rotl(mask, rot) | (rot << 7));
} else {
tcg_out_movi32(s, COND_AL, TCG_REG_TMP, mask);
tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
addrlo, TCG_REG_TMP, 0);
}
tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, 0);
} else {
if (a_bits) {
tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo,
(1 << a_bits) - 1);
}
tcg_out_dat_reg(s, (a_bits ? COND_EQ : COND_AL), ARITH_CMP,
0, TCG_REG_R0, TCG_REG_TMP,
SHIFT_IMM_LSL(TARGET_PAGE_BITS));
}
if (TARGET_LONG_BITS == 64) {
tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0,
TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, 0);
}
return TCG_REG_R2;