tcg/i386: Implement INDEX_op_rotl{i,s,v}_vec

For immediates, we must continue the special casing of 8-bit
elements. The other element sizes and shift types are trivially
implemented with shifts.

Backports commit 885b1706df6f0211a22e120fac910fb3abf3e733 from qemu
Richard Henderson authored 2020-06-14 22:09:20 -04:00; committed by Lioncash
parent cc3187b1e4
commit 6b91e9bae1

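For every element size except MO_8, the patch below turns a rotate into a left shift, a complementary right shift, and an OR, exactly as the commit message says. As an illustration only (not part of the patch; rotl_imm and rotl_var are hypothetical names), the same idea on a scalar 32-bit element mirrors what expand_vec_rotli and expand_vec_rotls do per vector lane:

#include <stdint.h>
#include <stdio.h>

/* Rotate left by a known immediate: shift up, shift down by the
   complementary count, OR the two halves (cf. expand_vec_rotli). */
static uint32_t rotl_imm(uint32_t x, unsigned imm)
{
    /* assumes 0 < imm < 32 so both shifts stay well defined */
    return (x << imm) | (x >> (32 - imm));
}

/* Rotate left by a runtime amount: negate and mask the count to get the
   complementary right-shift count, the same trick expand_vec_rotls uses
   with tcg_gen_neg_i32 + tcg_gen_andi_i32. */
static uint32_t rotl_var(uint32_t x, uint32_t sh)
{
    uint32_t rsh = -sh & 31;
    return (x << (sh & 31)) | (x >> rsh);
}

int main(void)
{
    /* both print 00000018: 0x80000001 rotated left by 4 */
    printf("%08x %08x\n", rotl_imm(0x80000001u, 4), rotl_var(0x80000001u, 4));
    return 0;
}

8-bit elements cannot take this path directly, since x86 has no 8-bit vector shifts; that is why expand_vec_rotli defers MO_8 to the widen/shift/repack handling in expand_vec_shi.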

@@ -3239,6 +3239,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_shls_vec:
case INDEX_op_shrs_vec:
case INDEX_op_sars_vec:
+case INDEX_op_rotls_vec:
case INDEX_op_cmp_vec:
case INDEX_op_x86_shufps_vec:
case INDEX_op_x86_blend_vec:
@@ -3277,6 +3278,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
case INDEX_op_xor_vec:
case INDEX_op_andc_vec:
return 1;
+case INDEX_op_rotli_vec:
case INDEX_op_cmp_vec:
case INDEX_op_cmpsel_vec:
return -1;
@@ -3303,12 +3305,17 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
return vece >= MO_16;
case INDEX_op_sars_vec:
return vece >= MO_16 && vece <= MO_32;
+case INDEX_op_rotls_vec:
+return vece >= MO_16 ? -1 : 0;
case INDEX_op_shlv_vec:
case INDEX_op_shrv_vec:
return have_avx2 && vece >= MO_32;
case INDEX_op_sarv_vec:
return have_avx2 && vece == MO_32;
+case INDEX_op_rotlv_vec:
+case INDEX_op_rotrv_vec:
+return have_avx2 && vece >= MO_32 ? -1 : 0;
case INDEX_op_mul_vec:
if (vece == MO_8) {
@@ -3337,7 +3344,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
}
}
-static void expand_vec_shi(TCGContext *s, TCGType type, unsigned vece, bool shr,
+static void expand_vec_shi(TCGContext *s, TCGType type, unsigned vece, TCGOpcode opc,
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
{
TCGv_vec t1, t2;
@@ -3347,26 +3354,31 @@ static void expand_vec_shi(TCGContext *s, TCGType type, unsigned vece, bool shr,
t1 = tcg_temp_new_vec(s, type);
t2 = tcg_temp_new_vec(s, type);
-/* Unpack to W, shift, and repack. Tricky bits:
-(1) Use punpck*bw x,x to produce DDCCBBAA,
-i.e. duplicate in other half of the 16-bit lane.
-(2) For right-shift, add 8 so that the high half of
-the lane becomes zero. For left-shift, we must
-shift up and down again.
-(3) Step 2 leaves high half zero such that PACKUSWB
-(pack with unsigned saturation) does not modify
-the quantity. */
+/*
+* Unpack to W, shift, and repack. Tricky bits:
+* (1) Use punpck*bw x,x to produce DDCCBBAA,
+* i.e. duplicate in other half of the 16-bit lane.
+* (2) For right-shift, add 8 so that the high half of the lane
+* becomes zero. For left-shift, and left-rotate, we must
+* shift up and down again.
+* (3) Step 2 leaves high half zero such that PACKUSWB
+* (pack with unsigned saturation) does not modify
+* the quantity.
+*/
vec_gen_3(s, INDEX_op_x86_punpckl_vec, type, MO_8,
tcgv_vec_arg(s, t1), tcgv_vec_arg(s, v1), tcgv_vec_arg(s, v1));
vec_gen_3(s, INDEX_op_x86_punpckh_vec, type, MO_8,
tcgv_vec_arg(s, t2), tcgv_vec_arg(s, v1), tcgv_vec_arg(s, v1));
-if (shr) {
-tcg_gen_shri_vec(s, MO_16, t1, t1, imm + 8);
-tcg_gen_shri_vec(s, MO_16, t2, t2, imm + 8);
+if (opc != INDEX_op_rotli_vec) {
+imm += 8;
+}
+if (opc == INDEX_op_shri_vec) {
+tcg_gen_shri_vec(s, MO_16, t1, t1, imm);
+tcg_gen_shri_vec(s, MO_16, t2, t2, imm);
} else {
-tcg_gen_shli_vec(s, MO_16, t1, t1, imm + 8);
-tcg_gen_shli_vec(s, MO_16, t2, t2, imm + 8);
+tcg_gen_shli_vec(s, MO_16, t1, t1, imm);
+tcg_gen_shli_vec(s, MO_16, t2, t2, imm);
tcg_gen_shri_vec(s, MO_16, t1, t1, 8);
tcg_gen_shri_vec(s, MO_16, t2, t2, 8);
}
@@ -3433,6 +3445,61 @@ static void expand_vec_sari(TCGContext *s, TCGType type, unsigned vece,
}
}
+static void expand_vec_rotli(TCGContext *s, TCGType type, unsigned vece,
+TCGv_vec v0, TCGv_vec v1, TCGArg imm)
+{
+TCGv_vec t;
+if (vece == MO_8) {
+expand_vec_shi(s, type, vece, INDEX_op_rotli_vec, v0, v1, imm);
+return;
+}
+t = tcg_temp_new_vec(s, type);
+tcg_gen_shli_vec(s, vece, t, v1, imm);
+tcg_gen_shri_vec(s, vece, v0, v1, (8 << vece) - imm);
+tcg_gen_or_vec(s, vece, v0, v0, t);
+tcg_temp_free_vec(s, t);
+}
+static void expand_vec_rotls(TCGContext *s, TCGType type, unsigned vece,
+TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
+{
+TCGv_i32 rsh;
+TCGv_vec t;
+tcg_debug_assert(vece != MO_8);
+t = tcg_temp_new_vec(s, type);
+rsh = tcg_temp_new_i32(s);
+tcg_gen_neg_i32(s, rsh, lsh);
+tcg_gen_andi_i32(s, rsh, rsh, (8 << vece) - 1);
+tcg_gen_shls_vec(s, vece, t, v1, lsh);
+tcg_gen_shrs_vec(s, vece, v0, v1, rsh);
+tcg_gen_or_vec(s, vece, v0, v0, t);
+tcg_temp_free_vec(s, t);
+tcg_temp_free_i32(s, rsh);
+}
+static void expand_vec_rotv(TCGContext *s, TCGType type, unsigned vece, TCGv_vec v0,
+TCGv_vec v1, TCGv_vec sh, bool right)
+{
+TCGv_vec t = tcg_temp_new_vec(s, type);
+tcg_gen_dupi_vec(s, vece, t, 8 << vece);
+tcg_gen_sub_vec(s, vece, t, t, sh);
+if (right) {
+tcg_gen_shlv_vec(s, vece, t, v1, t);
+tcg_gen_shrv_vec(s, vece, v0, v1, sh);
+} else {
+tcg_gen_shrv_vec(s, vece, t, v1, t);
+tcg_gen_shlv_vec(s, vece, v0, v1, sh);
+}
+tcg_gen_or_vec(s, vece, v0, v0, t);
+tcg_temp_free_vec(s, t);
+}
static void expand_vec_mul(TCGContext *s, TCGType type, unsigned vece,
TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
{
@@ -3642,13 +3709,30 @@ void tcg_expand_vec_op(TCGContext *s, TCGOpcode opc, TCGType type, unsigned vece
switch (opc) {
case INDEX_op_shli_vec:
case INDEX_op_shri_vec:
-expand_vec_shi(s, type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
+expand_vec_shi(s, type, vece, opc, v0, v1, a2);
break;
case INDEX_op_sari_vec:
expand_vec_sari(s, type, vece, v0, v1, a2);
break;
+case INDEX_op_rotli_vec:
+expand_vec_rotli(s, type, vece, v0, v1, a2);
+break;
+case INDEX_op_rotls_vec:
+expand_vec_rotls(s, type, vece, v0, v1, temp_tcgv_i32(s, arg_temp(a2)));
+break;
+case INDEX_op_rotlv_vec:
+v2 = temp_tcgv_vec(s, arg_temp(a2));
+expand_vec_rotv(s, type, vece, v0, v1, v2, false);
+break;
+case INDEX_op_rotrv_vec:
+v2 = temp_tcgv_vec(s, arg_temp(a2));
+expand_vec_rotv(s, type, vece, v0, v1, v2, true);
+break;
case INDEX_op_mul_vec:
v2 = temp_tcgv_vec(s, arg_temp(a2));
expand_vec_mul(s, type, vece, v0, v1, v2);