tcg-mips: Move bswap code to a subroutine

Without the mips32r2 instructions to perform swapping, bswap is quite large,
dominating the size of each reverse-endian qemu_ld/qemu_st operation.

Create a subroutine in the prologue block. The subroutine requires extra
reserved registers (TCG_TMP[2, 3]). Using these within qemu_ld means that
we need not place additional restrictions on the qemu_ld outputs.

Backports commit bb08afe9f0aee1a3f5c23508e2511b882ca31e1b from qemu
This commit is contained in:
Jin Guojie 2018-03-01 12:34:44 -05:00 committed by Lioncash
parent 5d4a5e9ba7
commit d2aa49e9d3
No known key found for this signature in database
GPG key ID: 4E3C3CC1031BA9C7
2 changed files with 139 additions and 65 deletions

View file

@ -117,11 +117,11 @@ extern bool use_mips32r2_instructions;
#define TCG_TARGET_HAS_muls2_i32 (!use_mips32r6_instructions)
#define TCG_TARGET_HAS_muluh_i32 1
#define TCG_TARGET_HAS_mulsh_i32 1
#define TCG_TARGET_HAS_bswap32_i32 1
/* optional instructions detected at runtime */
#define TCG_TARGET_HAS_movcond_i32 use_movnz_instructions
#define TCG_TARGET_HAS_bswap16_i32 use_mips32r2_instructions
#define TCG_TARGET_HAS_bswap32_i32 use_mips32r2_instructions
#define TCG_TARGET_HAS_deposit_i32 use_mips32r2_instructions
#define TCG_TARGET_HAS_ext8s_i32 use_mips32r2_instructions
#define TCG_TARGET_HAS_ext16s_i32 use_mips32r2_instructions

View file

@ -74,6 +74,8 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#define TCG_TMP0 TCG_REG_AT
#define TCG_TMP1 TCG_REG_T9
#define TCG_TMP2 TCG_REG_T8
#define TCG_TMP3 TCG_REG_T7
/* check if we really need so many registers :P */
static const int tcg_target_reg_alloc_order[] = {
@ -122,6 +124,7 @@ static const TCGReg tcg_target_call_oarg_regs[2] = {
};
static tcg_insn_unit *tb_ret_addr;
static tcg_insn_unit *bswap32_addr;
static inline uint32_t reloc_pc16_val(tcg_insn_unit *pc, tcg_insn_unit *target)
{
@ -177,12 +180,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
ct->ct |= TCG_CT_REG;
tcg_regset_set(ct->u.regs, 0xffffffff);
break;
case 'L': /* qemu_ld output arg constraint */
ct->ct |= TCG_CT_REG;
tcg_regset_set(ct->u.regs, 0xffffffff);
tcg_regset_reset_reg(ct->u.regs, TCG_REG_V0);
break;
case 'l': /* qemu_ld input arg constraint */
case 'L': /* qemu_ld input arg constraint */
ct->ct |= TCG_CT_REG;
tcg_regset_set(ct->u.regs, 0xffffffff);
tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
@ -513,7 +511,13 @@ static inline void tcg_out_bswap16s(TCGContext *s, TCGReg ret, TCGReg arg)
}
}
static inline void tcg_out_bswap32(TCGContext *s, TCGReg ret, TCGReg arg)
static void tcg_out_bswap_subr(TCGContext *s, tcg_insn_unit *sub)
{
bool ok = tcg_out_opc_jmp(s, OPC_JAL, sub);
tcg_debug_assert(ok);
}
static void tcg_out_bswap32(TCGContext *s, TCGReg ret, TCGReg arg)
{
if (use_mips32r2_instructions) {
tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
@ -524,18 +528,10 @@ static inline void tcg_out_bswap32(TCGContext *s, TCGReg ret, TCGReg arg)
tcg_abort();
}
tcg_out_opc_sa(s, OPC_SLL, ret, arg, 24);
tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, arg, 24);
tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, arg, 0xff00);
tcg_out_opc_sa(s, OPC_SLL, TCG_TMP0, TCG_TMP0, 8);
tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, arg, 8);
tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_TMP0, 0xff00);
tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
tcg_out_bswap_subr(s, bswap32_addr);
/* delay slot -- never omit the insn, like tcg_out_mov might. */
tcg_out_opc_reg(s, OPC_OR, TCG_TMP0, arg, TCG_REG_ZERO);
tcg_out_mov(s, TCG_TYPE_I32, ret, TCG_TMP3);
}
}
@ -995,7 +991,7 @@ static int tcg_out_call_iarg_reg2(TCGContext *s, int i, TCGReg al, TCGReg ah)
}
/* Perform the tlb comparison operation. The complete host address is
placed in BASE. Clobbers AT, T0, A0. */
placed in BASE. Clobbers TMP0, TMP1, A0. */
static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
TCGReg addrh, TCGMemOpIdx oi,
tcg_insn_unit *label_ptr[2], bool is_load)
@ -1178,46 +1174,72 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
}
#endif
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
TCGReg base, TCGMemOp opc)
{
switch (opc & (MO_SSIZE | MO_BSWAP)) {
case MO_UB:
tcg_out_opc_imm(s, OPC_LBU, datalo, base, 0);
tcg_out_opc_imm(s, OPC_LBU, lo, base, 0);
break;
case MO_SB:
tcg_out_opc_imm(s, OPC_LB, datalo, base, 0);
tcg_out_opc_imm(s, OPC_LB, lo, base, 0);
break;
case MO_UW | MO_BSWAP:
tcg_out_opc_imm(s, OPC_LHU, TCG_TMP1, base, 0);
tcg_out_bswap16(s, datalo, TCG_TMP1);
tcg_out_bswap16(s, lo, TCG_TMP1);
break;
case MO_UW:
tcg_out_opc_imm(s, OPC_LHU, datalo, base, 0);
tcg_out_opc_imm(s, OPC_LHU, lo, base, 0);
break;
case MO_SW | MO_BSWAP:
tcg_out_opc_imm(s, OPC_LHU, TCG_TMP1, base, 0);
tcg_out_bswap16s(s, datalo, TCG_TMP1);
tcg_out_bswap16s(s, lo, TCG_TMP1);
break;
case MO_SW:
tcg_out_opc_imm(s, OPC_LH, datalo, base, 0);
tcg_out_opc_imm(s, OPC_LH, lo, base, 0);
break;
case MO_UL | MO_BSWAP:
tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, 0);
tcg_out_bswap32(s, datalo, TCG_TMP1);
if (use_mips32r2_instructions) {
tcg_out_opc_imm(s, OPC_LW, lo, base, 0);
tcg_out_bswap32(s, lo, lo);
} else {
tcg_out_bswap_subr(s, bswap32_addr);
/* delay slot */
tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, base, 0);
tcg_out_mov(s, TCG_TYPE_I32, lo, TCG_TMP3);
}
break;
case MO_UL:
tcg_out_opc_imm(s, OPC_LW, datalo, base, 0);
tcg_out_opc_imm(s, OPC_LW, lo, base, 0);
break;
case MO_Q | MO_BSWAP:
tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, HI_OFF);
tcg_out_bswap32(s, datalo, TCG_TMP1);
tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, LO_OFF);
tcg_out_bswap32(s, datahi, TCG_TMP1);
if (use_mips32r2_instructions) {
tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, base, 0);
tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, 4);
tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP0, 0, TCG_TMP0);
tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP1, 0, TCG_TMP1);
tcg_out_opc_sa(s, OPC_ROTR, MIPS_BE ? lo : hi, TCG_TMP0, 16);
tcg_out_opc_sa(s, OPC_ROTR, MIPS_BE ? hi : lo, TCG_TMP1, 16);
} else {
tcg_out_bswap_subr(s, bswap32_addr);
/* delay slot */
tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, base, 0);
tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, base, 4);
tcg_out_bswap_subr(s, bswap32_addr);
/* delay slot */
tcg_out_mov(s, TCG_TYPE_I32, MIPS_BE ? lo : hi, TCG_TMP3);
tcg_out_mov(s, TCG_TYPE_I32, MIPS_BE ? hi : lo, TCG_TMP3);
}
break;
case MO_Q:
tcg_out_opc_imm(s, OPC_LW, datalo, base, LO_OFF);
tcg_out_opc_imm(s, OPC_LW, datahi, base, HI_OFF);
/* Prefer to load from offset 0 first, but allow for overlap. */
if (MIPS_BE ? hi != base : lo == base) {
tcg_out_opc_imm(s, OPC_LW, hi, base, HI_OFF);
tcg_out_opc_imm(s, OPC_LW, lo, base, LO_OFF);
} else {
tcg_out_opc_imm(s, OPC_LW, lo, base, LO_OFF);
tcg_out_opc_imm(s, OPC_LW, hi, base, HI_OFF);
}
break;
default:
tcg_abort();
@ -1233,9 +1255,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
#if defined(CONFIG_SOFTMMU)
tcg_insn_unit *label_ptr[2];
#endif
/* Note that we've eliminated V0 from the output registers,
so we won't overwrite the base register during loading. */
TCGReg base = TCG_REG_V0;
TCGReg base = TCG_REG_A0;
data_regl = *args++;
data_regh = (is_64 ? *args++ : 0);
@ -1262,40 +1282,54 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
#endif
}
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg lo, TCGReg hi,
TCGReg base, TCGMemOp opc)
{
/* Don't clutter the code below with checks to avoid bswapping ZERO. */
if ((lo | hi) == 0) {
opc &= ~MO_BSWAP;
}
switch (opc & (MO_SIZE | MO_BSWAP)) {
case MO_8:
tcg_out_opc_imm(s, OPC_SB, datalo, base, 0);
tcg_out_opc_imm(s, OPC_SB, lo, base, 0);
break;
case MO_16 | MO_BSWAP:
tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, datalo, 0xffff);
tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, lo, 0xffff);
tcg_out_bswap16(s, TCG_TMP1, TCG_TMP1);
datalo = TCG_TMP1;
lo = TCG_TMP1;
/* FALLTHRU */
case MO_16:
tcg_out_opc_imm(s, OPC_SH, datalo, base, 0);
tcg_out_opc_imm(s, OPC_SH, lo, base, 0);
break;
case MO_32 | MO_BSWAP:
tcg_out_bswap32(s, TCG_TMP1, datalo);
datalo = TCG_TMP1;
tcg_out_bswap32(s, TCG_TMP3, lo);
lo = TCG_TMP3;
/* FALLTHRU */
case MO_32:
tcg_out_opc_imm(s, OPC_SW, datalo, base, 0);
tcg_out_opc_imm(s, OPC_SW, lo, base, 0);
break;
case MO_64 | MO_BSWAP:
tcg_out_bswap32(s, TCG_TMP1, datalo);
tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, HI_OFF);
tcg_out_bswap32(s, TCG_TMP1, datahi);
tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, LO_OFF);
if (use_mips32r2_instructions) {
tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP0, 0, MIPS_BE ? lo : hi);
tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP1, 0, MIPS_BE ? hi : lo);
tcg_out_opc_sa(s, OPC_ROTR, TCG_TMP0, TCG_TMP0, 16);
tcg_out_opc_sa(s, OPC_ROTR, TCG_TMP1, TCG_TMP1, 16);
tcg_out_opc_imm(s, OPC_SW, TCG_TMP0, base, 0);
tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, 4);
} else {
tcg_out_bswap32(s, TCG_TMP3, MIPS_BE ? lo : hi);
tcg_out_opc_imm(s, OPC_SW, TCG_TMP3, base, 0);
tcg_out_bswap32(s, TCG_TMP3, MIPS_BE ? hi : lo);
tcg_out_opc_imm(s, OPC_SW, TCG_TMP3, base, 4);
}
break;
case MO_64:
tcg_out_opc_imm(s, OPC_SW, datalo, base, LO_OFF);
tcg_out_opc_imm(s, OPC_SW, datahi, base, HI_OFF);
tcg_out_opc_imm(s, OPC_SW, MIPS_BE ? hi : lo, base, 0);
tcg_out_opc_imm(s, OPC_SW, MIPS_BE ? lo : hi, base, 4);
break;
default:
@ -1355,12 +1389,13 @@ static void tcg_out_addsub2(TCGContext *s, TCGReg rl, TCGReg rh, TCGReg al,
static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
{
TCGReg addr_regl, addr_regh QEMU_UNUSED_VAR;
TCGReg data_regl, data_regh, base;
TCGReg data_regl, data_regh;
TCGMemOpIdx oi;
TCGMemOp opc;
#if defined(CONFIG_SOFTMMU)
tcg_insn_unit *label_ptr[2];
#endif
TCGReg base = TCG_REG_A0;
data_regl = *args++;
data_regh = (is_64 ? *args++ : 0);
@ -1370,9 +1405,6 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
opc = get_memop(oi);
#if defined(CONFIG_SOFTMMU)
/* Note that we eliminated the helper's address argument,
so we can reuse that for the base. */
base = (TARGET_LONG_BITS == 32 ? TCG_REG_A1 : TCG_REG_A2);
tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 0);
tcg_out_qemu_st_direct(s, data_regl, data_regh, base, opc);
add_qemu_ldst_label(s, 0, oi, data_regl, data_regh, addr_regl, addr_regh,
@ -1381,7 +1413,6 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
if (GUEST_BASE == 0) {
base = addr_regl;
} else {
base = TCG_REG_A0;
if (GUEST_BASE == (int16_t)GUEST_BASE) {
tcg_out_opc_imm(s, OPC_ADDIU, base, addr_regl, GUEST_BASE);
} else {
@ -1640,8 +1671,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
break;
case INDEX_op_bswap32_i32:
tcg_out_opc_reg(s, OPC_WSBH, a0, 0, a1);
tcg_out_opc_sa(s, OPC_ROTR, a0, a0, 16);
tcg_out_bswap32(s, a0, a1);
break;
case INDEX_op_deposit_i32:
@ -1761,14 +1791,14 @@ static const TCGTargetOpDef mips_op_defs[] = {
{ INDEX_op_brcond2_i32, { "rZ", "rZ", "rZ", "rZ" } },
#if TARGET_LONG_BITS == 32
{ INDEX_op_qemu_ld_i32, { "L", "lZ" } },
{ INDEX_op_qemu_ld_i32, { "r", "LZ" } },
{ INDEX_op_qemu_st_i32, { "SZ", "SZ" } },
{ INDEX_op_qemu_ld_i64, { "L", "L", "lZ" } },
{ INDEX_op_qemu_ld_i64, { "r", "r", "LZ" } },
{ INDEX_op_qemu_st_i64, { "SZ", "SZ", "SZ" } },
#else
{ INDEX_op_qemu_ld_i32, { "L", "lZ", "lZ" } },
{ INDEX_op_qemu_ld_i32, { "r", "LZ", "LZ" } },
{ INDEX_op_qemu_st_i32, { "SZ", "SZ", "SZ" } },
{ INDEX_op_qemu_ld_i64, { "L", "L", "lZ", "lZ" } },
{ INDEX_op_qemu_ld_i64, { "r", "r", "LZ", "LZ" } },
{ INDEX_op_qemu_st_i64, { "SZ", "SZ", "SZ", "SZ" } },
#endif
{ INDEX_op_mb, { } },
@ -1868,6 +1898,16 @@ static void tcg_target_detect_isa(void)
sigaction(SIGILL, &sa_old, NULL);
}
static tcg_insn_unit *align_code_ptr(TCGContext *s)
{
uintptr_t p = (uintptr_t)s->code_ptr;
if (p & 15) {
p = (p + 15) & -16;
s->code_ptr = (void *)p;
}
return s->code_ptr;
}
/* Generate global QEMU prologue and epilogue code */
static void tcg_target_qemu_prologue(TCGContext *s)
{
@ -1902,7 +1942,39 @@ static void tcg_target_qemu_prologue(TCGContext *s)
}
tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_RA, 0);
/* delay slot */
tcg_out_addi(s, TCG_REG_SP, frame_size);
if (use_mips32r2_instructions) {
return;
}
/* Bswap subroutine: Input in TCG_TMP0, output in TCG_TMP3;
clobbers TCG_TMP1, TCG_TMP2. */
/*
* bswap32 -- 32-bit swap (signed result for mips64). a0 = abcd.
*/
bswap32_addr = align_code_ptr(s);
/* t3 = (ssss)d000 */
tcg_out_opc_sa(s, OPC_SLL, TCG_TMP3, TCG_TMP0, 24);
/* t1 = 000a */
tcg_out_opc_sa(s, OPC_SRL, TCG_TMP1, TCG_TMP0, 24);
/* t2 = 00c0 */
tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP2, TCG_TMP0, 0xff00);
/* t3 = d00a */
tcg_out_opc_reg(s, OPC_OR, TCG_TMP3, TCG_TMP3, TCG_TMP1);
/* t1 = 0abc */
tcg_out_opc_sa(s, OPC_SRL, TCG_TMP1, TCG_TMP0, 8);
/* t2 = 0c00 */
tcg_out_opc_sa(s, OPC_SLL, TCG_TMP2, TCG_TMP2, 8);
/* t1 = 00b0 */
tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP1, 0xff00);
/* t3 = dc0a */
tcg_out_opc_reg(s, OPC_OR, TCG_TMP3, TCG_TMP3, TCG_TMP2);
tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_RA, 0);
/* t3 = dcba -- delay slot */
tcg_out_opc_reg(s, OPC_OR, TCG_TMP3, TCG_TMP3, TCG_TMP1);
}
static void tcg_target_init(TCGContext *s)
@ -1933,6 +2005,8 @@ static void tcg_target_init(TCGContext *s)
tcg_regset_set_reg(s->reserved_regs, TCG_REG_K1); /* kernel use only */
tcg_regset_set_reg(s->reserved_regs, TCG_TMP0); /* internal use */
tcg_regset_set_reg(s->reserved_regs, TCG_TMP1); /* internal use */
tcg_regset_set_reg(s->reserved_regs, TCG_TMP2); /* internal use */
tcg_regset_set_reg(s->reserved_regs, TCG_TMP3); /* internal use */
tcg_regset_set_reg(s->reserved_regs, TCG_REG_RA); /* return address */
tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP); /* stack pointer */
tcg_regset_set_reg(s->reserved_regs, TCG_REG_GP); /* global pointer */