From f02f71f38fc3c4a836c0b431ae5f705c7729bdf1 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Thu, 7 May 2020 10:19:23 -0400
Subject: [PATCH] tcg: Improve vector tail clearing

Better handling of non-power-of-2 tails as seen with Arm 8-byte vector operations.

Backports commit f47db80cc073c0a7a22136c8296b5eca20c0e199 from qemu
---
 qemu/tcg/tcg-op-gvec.c | 82 ++++++++++++++++++++++++++++++++----------
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/qemu/tcg/tcg-op-gvec.c b/qemu/tcg/tcg-op-gvec.c
index 15c01eed..6e1171b6 100644
--- a/qemu/tcg/tcg-op-gvec.c
+++ b/qemu/tcg/tcg-op-gvec.c
@@ -327,11 +327,34 @@ void tcg_gen_gvec_5_ptr(TCGContext *s, uint32_t dofs, uint32_t aofs, uint32_t bo
    in units of LNSZ. This limits the expansion of inline code. */
 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 {
-    if (oprsz % lnsz == 0) {
-        uint32_t lnct = oprsz / lnsz;
-        return lnct >= 1 && lnct <= MAX_UNROLL;
+    uint32_t q, r;
+
+    if (oprsz < lnsz) {
+        return false;
     }
-    return false;
+
+    q = oprsz / lnsz;
+    r = oprsz % lnsz;
+    tcg_debug_assert((r & 7) == 0);
+
+    if (lnsz < 16) {
+        /* For sizes below 16, accept no remainder. */
+        if (r != 0) {
+            return false;
+        }
+    } else {
+        /*
+         * Recall that ARM SVE allows vector sizes that are not a
+         * power of 2, but always a multiple of 16. The intent is
+         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+         * In addition, expand_clr needs to handle a multiple of 8.
+         * Thus we can handle the tail with one more operation per
+         * diminishing power of 2.
+         */
+        q += ctpop32(r);
+    }
+
+    return q <= MAX_UNROLL;
 }
 
 static void expand_clr(TCGContext *s, uint32_t dofs, uint32_t maxsz);
@@ -403,22 +426,31 @@ static void gen_dup_i64(TCGContext *s, unsigned vece, TCGv_i64 out, TCGv_i64 in)
 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                   uint32_t size, bool prefer_i64)
 {
-    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
-        /*
-         * Recall that ARM SVE allows vector sizes that are not a
-         * power of 2, but always a multiple of 16. The intent is
-         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
-         * It is hard to imagine a case in which v256 is supported
-         * but v128 is not, but check anyway.
-         */
-        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
-            && (size % 32 == 0
-                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
-            return TCG_TYPE_V256;
-        }
+    /*
+     * Recall that ARM SVE allows vector sizes that are not a
+     * power of 2, but always a multiple of 16. The intent is
+     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+     * It is hard to imagine a case in which v256 is supported
+     * but v128 is not, but check anyway.
+     * In addition, expand_clr needs to handle a multiple of 8.
+     */
+    if (TCG_TARGET_HAS_v256 &&
+        check_size_impl(size, 32) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
+        (!(size & 16) ||
+         (TCG_TARGET_HAS_v128 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
+        return TCG_TYPE_V256;
     }
-    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
-        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
+    if (TCG_TARGET_HAS_v128 &&
+        check_size_impl(size, 16) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
         return TCG_TYPE_V128;
     }
     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
@@ -433,6 +465,18 @@ static void do_dup_store(TCGContext *s, TCGType type, uint32_t dofs, uint32_t op
 {
     uint32_t i = 0;
 
+    tcg_debug_assert(oprsz >= 8);
+
+    /*
+     * This may be expand_clr for the tail of an operation, e.g.
+     * oprsz == 8 && maxsz == 64. The first 8 bytes of this store
+     * are misaligned wrt the maximum vector size, so do that first.
+     */
+    if (dofs & 8) {
+        tcg_gen_stl_vec(s, t_vec, s->cpu_env, dofs + i, TCG_TYPE_V64);
+        i += 8;
+    }
+
     switch (type) {
     case TCG_TYPE_V256:
         /*