diff --git a/qemu/tcg/tcg-op-gvec.c b/qemu/tcg/tcg-op-gvec.c
index 6e1171b6..8f9c7a96 100644
--- a/qemu/tcg/tcg-op-gvec.c
+++ b/qemu/tcg/tcg-op-gvec.c
@@ -664,17 +664,22 @@ static void expand_clr(TCGContext *s, uint32_t dofs, uint32_t maxsz)
 
 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 static void expand_2_i32(TCGContext *s, uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-                         void (*fni)(TCGContext *, TCGv_i32, TCGv_i32))
+                         bool load_dest, void (*fni)(TCGContext *, TCGv_i32, TCGv_i32))
 {
     TCGv_i32 t0 = tcg_temp_new_i32(s);
+    TCGv_i32 t1 = tcg_temp_new_i32(s); /* stored to dofs after fni */
     uint32_t i;
 
     for (i = 0; i < oprsz; i += 4) {
         tcg_gen_ld_i32(s, t0, s->cpu_env, aofs + i);
-        fni(s, t0, t0);
-        tcg_gen_st_i32(s, t0, s->cpu_env, dofs + i);
+        if (load_dest) { /* preload dest so fni can use it as a 2nd source */
+            tcg_gen_ld_i32(s, t1, s->cpu_env, dofs + i);
+        }
+        fni(s, t1, t0); /* t1: dest temp, t0: element loaded from aofs */
+        tcg_gen_st_i32(s, t1, s->cpu_env, dofs + i);
     }
     tcg_temp_free_i32(s, t0);
+    tcg_temp_free_i32(s, t1);
 }
 
 static void expand_2i_i32(TCGContext *s, uint32_t dofs, uint32_t aofs, uint32_t oprsz,
@@ -794,17 +799,22 @@ static void expand_4_i32(TCGContext *s, uint32_t dofs, uint32_t aofs, uint32_t b
 
 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 static void expand_2_i64(TCGContext *s, uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-                         void (*fni)(TCGContext *, TCGv_i64, TCGv_i64))
+                         bool load_dest, void (*fni)(TCGContext *, TCGv_i64, TCGv_i64))
 {
     TCGv_i64 t0 = tcg_temp_new_i64(s);
+    TCGv_i64 t1 = tcg_temp_new_i64(s); /* stored to dofs after fni */
     uint32_t i;
 
     for (i = 0; i < oprsz; i += 8) {
         tcg_gen_ld_i64(s, t0, s->cpu_env, aofs + i);
-        fni(s, t0, t0);
-        tcg_gen_st_i64(s, t0, s->cpu_env, dofs + i);
+        if (load_dest) { /* preload dest so fni can use it as a 2nd source */
+            tcg_gen_ld_i64(s, t1, s->cpu_env, dofs + i);
+        }
+        fni(s, t1, t0); /* t1: dest temp, t0: element loaded from aofs */
+        tcg_gen_st_i64(s, t1, s->cpu_env, dofs + i);
     }
     tcg_temp_free_i64(s, t0);
+    tcg_temp_free_i64(s, t1);
 }
 
 static void expand_2i_i64(TCGContext *s, uint32_t dofs, uint32_t aofs, uint32_t oprsz,
@@ -925,17 +935,23 @@ static void expand_4_i64(TCGContext *s, uint32_t dofs, uint32_t aofs, uint32_t b
 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
 static void expand_2_vec(TCGContext *s, unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
+                         bool load_dest,
                          void (*fni)(TCGContext *, unsigned, TCGv_vec, TCGv_vec))
 {
     TCGv_vec t0 = tcg_temp_new_vec(s, type);
+    TCGv_vec t1 = tcg_temp_new_vec(s, type); /* stored to dofs after fni */
     uint32_t i;
 
     for (i = 0; i < oprsz; i += tysz) {
         tcg_gen_ld_vec(s, t0, s->cpu_env, aofs + i);
-        fni(s, vece, t0, t0);
-        tcg_gen_st_vec(s, t0, s->cpu_env, dofs + i);
+        if (load_dest) { /* preload dest so fni can use it as a 2nd source */
+            tcg_gen_ld_vec(s, t1, s->cpu_env, dofs + i);
+        }
+        fni(s, vece, t1, t0); /* t1: dest temp, t0: vector loaded from aofs */
+        tcg_gen_st_vec(s, t1, s->cpu_env, dofs + i);
     }
     tcg_temp_free_vec(s, t0);
+    tcg_temp_free_vec(s, t1);
 }
 
 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
@@ -1089,7 +1105,8 @@ void tcg_gen_gvec_2(TCGContext *s, uint32_t dofs, uint32_t aofs,
          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
          */
         some = QEMU_ALIGN_DOWN(oprsz, 32);
-        expand_2_vec(s, g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
+        expand_2_vec(s, g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
+                     g->load_dest, g->fniv);
         if (some == oprsz) {
             break;
         }
@@ -1099,17 +1116,19 @@ void tcg_gen_gvec_2(TCGContext *s, uint32_t dofs, uint32_t aofs,
         maxsz -= some;
         /* fallthru */
     case TCG_TYPE_V128:
-        expand_2_vec(s, g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
+        expand_2_vec(s, g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
+                     g->load_dest, g->fniv);
         break;
     case TCG_TYPE_V64:
-        expand_2_vec(s, g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
+        expand_2_vec(s, g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
+                     g->load_dest, g->fniv);
         break;
 
     case 0:
         if (g->fni8 && check_size_impl(oprsz, 8)) {
-            expand_2_i64(s, dofs, aofs, oprsz, g->fni8);
+            expand_2_i64(s, dofs, aofs, oprsz, g->load_dest, g->fni8);
         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
-            expand_2_i32(s, dofs, aofs, oprsz, g->fni4);
+            expand_2_i32(s, dofs, aofs, oprsz, g->load_dest, g->fni4);
         } else {
             assert(g->fno != NULL);
             tcg_gen_gvec_2_ool(s, dofs, aofs, oprsz, maxsz, g->data, g->fno);
diff --git a/qemu/tcg/tcg-op-gvec.h b/qemu/tcg/tcg-op-gvec.h
index 24b5ee30..ec8db68b 100644
--- a/qemu/tcg/tcg-op-gvec.h
+++ b/qemu/tcg/tcg-op-gvec.h
@@ -109,6 +109,8 @@ typedef struct {
     uint8_t vece;
     /* Prefer i64 to v64.  */
     bool prefer_i64;
+    /* Load dest as a 2nd source operand.  */
+    bool load_dest;
 } GVecGen2;
 
 typedef struct {