target/arm: Implement v8.1M low-overhead-loop instructions

v8.1M's "low-overhead-loop" extension has three instructions
for looping:
* DLS (start of a do-loop)
* WLS (start of a while-loop)
* LE (end of a loop)

The loop-start instructions are both simple operations to start a
loop whose iteration count (if any) is in LR. The loop-end
instruction handles "decrement iteration count and jump back to loop
start"; it also caches the information about the branch back to the
start of the loop to improve performance of the branch on subsequent
iterations.

As with the branch-future instructions, the architecture permits an
implementation to discard the LO_BRANCH_INFO cache at any time, and
QEMU takes the IMPDEF option to never set it in the first place
(equivalent to discarding it immediately), because for us a "real"
implementation would be unnecessary complexity.

(This implementation only provides the simple looping constructs; the
vector extension MVE (Helium) adds some extra variants to handle
looping across vectors. We'll add those later when we implement
MVE.)

Backports commit b7226369721896ab9ef71544e4fe95b40710e05a
This commit is contained in:
Peter Maydell 2021-03-01 20:29:02 -05:00 committed by Lioncash
parent be197f9857
commit 3ae5543825
2 changed files with 101 additions and 2 deletions

View file

@ -659,4 +659,12 @@ BL 1111 0. .......... 11.1 ............ @branch24
BF 1111 0 boff:4 10 ----- 1110 - ---------- 1 # BF
BF 1111 0 boff:4 11 ----- 1110 0 0000000000 1 # BFX, BFLX
]
[
# LE and WLS immediate
%lob_imm 1:10 11:1 !function=times_2
DLS 1111 0 0000 100 rn:4 1110 0000 0000 0001
WLS 1111 0 0000 100 rn:4 1100 . .......... 1 imm=%lob_imm
LE 1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
]
}

View file

@ -2562,17 +2562,23 @@ static void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
s->base.is_jmp = DISAS_NORETURN;
}
/*
 * Jump to dest, specifying which TB exit number (tbno) to use if we
 * end up going through gen_goto_tb().
 */
static inline void gen_jmp_tb(DisasContext *s, uint32_t dest, int tbno)
{
    if (unlikely(is_singlestepping(s))) {
        /* An indirect jump so that we still trigger the debug exception. */
        gen_set_pc_im(s, dest);
        s->base.is_jmp = DISAS_JUMP;
    } else {
        /* Exactly one direct exit, through the slot the caller selected. */
        gen_goto_tb(s, tbno, dest);
    }
}
/* Jump to dest; shorthand for gen_jmp_tb() with TB number 0. */
static inline void gen_jmp(DisasContext *s, uint32_t dest)
{
    const int default_tbno = 0;

    gen_jmp_tb(s, dest, default_tbno);
}
static inline void gen_mulxy(DisasContext *s, TCGv_i32 t0, TCGv_i32 t1, int x, int y)
{
TCGContext *tcg_ctx = s->uc->tcg_ctx;
@ -8247,6 +8253,91 @@ static bool trans_BF(DisasContext *s, arg_BF *a)
return true;
}
/*
 * DLS: M-profile low-overhead do-loop start.
 *
 * Copies the iteration count from Rn into LR (r14). Returns false,
 * without generating any code, when the LOB feature is not present or
 * when Rn is r13 or r15; returns true once the copy has been emitted.
 */
static bool trans_DLS(DisasContext *s, arg_DLS *a)
{
    if (!dc_isar_feature(aa32_lob, s)) {
        return false;
    }

    switch (a->rn) {
    case 13:
    case 15:
        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
        return false;
    default:
        break;
    }

    /* No while-loop test and no tail predication: LR just gets the count. */
    store_reg(s, 14, load_reg(s, a->rn));
    return true;
}
/*
 * WLS: M-profile low-overhead while-loop start.
 *
 * Generated code: if Rn is zero, jump forward to PC + imm; otherwise
 * copy Rn into LR (r14) and continue at the next instruction.
 * Returns false without generating code when the LOB feature is absent,
 * when Rn is r13 or r15, or when the insn is inside an IT block.
 */
static bool trans_WLS(DisasContext *s, arg_WLS *a)
{
    TCGContext *tcg_ctx = s->uc->tcg_ctx;
    TCGLabel *count_is_zero;
    const bool bad_rn = (a->rn == 13) || (a->rn == 15);

    if (!dc_isar_feature(aa32_lob, s)) {
        return false;
    }
    if (bad_rn) {
        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
        return false;
    }
    if (s->condexec_mask != 0) {
        /*
         * WLS in an IT block is CONSTRAINED UNPREDICTABLE;
         * we choose to UNDEF, because otherwise our use of
         * gen_goto_tb(1) would clash with the use of TB exit 1
         * in the dc->condjmp condition-failed codepath in
         * arm_tr_tb_stop() and we'd get an assertion.
         */
        return false;
    }

    count_is_zero = gen_new_label(tcg_ctx);
    tcg_gen_brcondi_i32(tcg_ctx, TCG_COND_EQ, tcg_ctx->cpu_R[a->rn], 0,
                        count_is_zero);

    /* Non-zero count: LR = Rn, then fall into the loop body via TB exit 1. */
    store_reg(s, 14, load_reg(s, a->rn));
    gen_jmp_tb(s, s->base.pc_next, 1);

    /* Zero count: branch forward by imm, skipping the loop. */
    gen_set_label(tcg_ctx, count_is_zero);
    gen_jmp(s, read_pc(s) + a->imm);
    return true;
}
/*
 * LE: M-profile low-overhead loop end.
 *
 * The architecture lets an implementation drop the LO_BRANCH_INFO cache
 * whenever it likes. We use the IMPDEF freedom to never populate it at
 * all (the same as discarding it straight away): for QEMU a "real"
 * cache would add complexity without making execution any faster.
 *
 * Returns false without generating code when the LOB feature is absent.
 */
static bool trans_LE(DisasContext *s, arg_LE *a)
{
    TCGContext *tcg_ctx = s->uc->tcg_ctx;

    if (!dc_isar_feature(aa32_lob, s)) {
        return false;
    }

    if (a->f == 0) {
        TCGv_i32 remaining;

        /*
         * Counted loop (not loop-forever). LR <= 1 means this was the
         * last iteration: branch to s->condlabel and do nothing more.
         */
        arm_gen_condlabel(s);
        tcg_gen_brcondi_i32(tcg_ctx, TCG_COND_LEU, tcg_ctx->cpu_R[14], 1,
                            s->condlabel);

        /* Otherwise LR -= 1 */
        remaining = load_reg(s, 14);
        tcg_gen_addi_i32(tcg_ctx, remaining, remaining, -1);
        store_reg(s, 14, remaining);
    }

    /* Branch backwards by imm to the start of the loop */
    gen_jmp(s, read_pc(s) - a->imm);
    return true;
}
static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half)
{
TCGContext *tcg_ctx = s->uc->tcg_ctx;