diff --git a/qemu/target/arm/cpu.h b/qemu/target/arm/cpu.h
index 1f6aba3e..b4ed4067 100644
--- a/qemu/target/arm/cpu.h
+++ b/qemu/target/arm/cpu.h
@@ -3334,6 +3334,11 @@ static inline bool isar_feature_aa32_fp_d32(const ARMISARegisters *id)
     return FIELD_EX64(id->mvfr0, MVFR0, SIMDREG) >= 2;
 }
 
+static inline bool isar_feature_aa32_fpshvec(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->mvfr0, MVFR0, FPSHVEC) > 0;
+}
+
 /*
  * We always set the FP and SIMD FP16 fields to indicate identical
  * levels of support (assuming SIMD is implemented at all), so
diff --git a/qemu/target/arm/translate-vfp.inc.c b/qemu/target/arm/translate-vfp.inc.c
index d9beb0f3..1c680668 100644
--- a/qemu/target/arm/translate-vfp.inc.c
+++ b/qemu/target/arm/translate-vfp.inc.c
@@ -1115,3 +1115,210 @@ static bool trans_VLDM_VSTM_dp(DisasContext *s, arg_VLDM_VSTM_dp *a)
 
     return true;
 }
+
+/*
+ * Types for callbacks for do_vfp_3op_sp() and do_vfp_3op_dp().
+ * The callback should emit code to write a value to vd. If
+ * do_vfp_3op_{sp,dp}() was passed reads_vd then the TCGv vd
+ * will contain the old value of the relevant VFP register;
+ * otherwise it must be written to only.
+ */
+typedef void VFPGen3OpSPFn(TCGContext *, TCGv_i32 vd,
+                           TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst);
+typedef void VFPGen3OpDPFn(TCGContext *, TCGv_i64 vd,
+                           TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst);
+
+/*
+ * Perform a 3-operand VFP data processing instruction. fn is the
+ * callback to do the actual operation; this function deals with the
+ * code to handle looping around for VFP vector processing.
+ */
+static bool do_vfp_3op_sp(DisasContext *s, VFPGen3OpSPFn *fn,
+                          int vd, int vn, int vm, bool reads_vd)
+{
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+    uint32_t delta_m = 0;
+    uint32_t delta_d = 0;
+    uint32_t bank_mask = 0;
+    int veclen = s->vec_len;
+    TCGv_i32 f0, f1, fd;
+    TCGv_ptr fpst;
+
+    if (!dc_isar_feature(aa32_fpshvec, s) &&
+        (veclen != 0 || s->vec_stride != 0)) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    if (veclen > 0) {
+        bank_mask = 0x18;
+
+        /* Figure out what type of vector operation this is. */
+        if ((vd & bank_mask) == 0) {
+            /* scalar */
+            veclen = 0;
+        } else {
+            delta_d = s->vec_stride + 1;
+
+            if ((vm & bank_mask) == 0) {
+                /* mixed scalar/vector */
+                delta_m = 0;
+            } else {
+                /* vector */
+                delta_m = delta_d;
+            }
+        }
+    }
+
+    f0 = tcg_temp_new_i32(tcg_ctx);
+    f1 = tcg_temp_new_i32(tcg_ctx);
+    fd = tcg_temp_new_i32(tcg_ctx);
+    fpst = get_fpstatus_ptr(s, 0);
+
+    neon_load_reg32(s, f0, vn);
+    neon_load_reg32(s, f1, vm);
+
+    for (;;) {
+        if (reads_vd) {
+            neon_load_reg32(s, fd, vd);
+        }
+        fn(tcg_ctx, fd, f0, f1, fpst);
+        neon_store_reg32(s, fd, vd);
+
+        if (veclen == 0) {
+            break;
+        }
+
+        /* Set up the operands for the next iteration */
+        veclen--;
+        vd = ((vd + delta_d) & (bank_mask - 1)) | (vd & bank_mask);
+        vn = ((vn + delta_d) & (bank_mask - 1)) | (vn & bank_mask);
+        neon_load_reg32(s, f0, vn);
+        if (delta_m) {
+            vm = ((vm + delta_m) & (bank_mask - 1)) | (vm & bank_mask);
+            neon_load_reg32(s, f1, vm);
+        }
+    }
+
+    tcg_temp_free_i32(tcg_ctx, f0);
+    tcg_temp_free_i32(tcg_ctx, f1);
+    tcg_temp_free_i32(tcg_ctx, fd);
+    tcg_temp_free_ptr(tcg_ctx, fpst);
+
+    return true;
+}
+
+static bool do_vfp_3op_dp(DisasContext *s, VFPGen3OpDPFn *fn,
+                          int vd, int vn, int vm, bool reads_vd)
+{
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+    uint32_t delta_m = 0;
+    uint32_t delta_d = 0;
+    uint32_t bank_mask = 0;
+    int veclen = s->vec_len;
+    TCGv_i64 f0, f1, fd;
+    TCGv_ptr fpst;
+
+    /* UNDEF accesses to D16-D31 if they don't exist */
+    if (!dc_isar_feature(aa32_fp_d32, s) && ((vd | vn | vm) & 0x10)) {
+        return false;
+    }
+
+    if (!dc_isar_feature(aa32_fpshvec, s) &&
+        (veclen != 0 || s->vec_stride != 0)) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    if (veclen > 0) {
+        bank_mask = 0xc;
+
+        /* Figure out what type of vector operation this is. */
+        if ((vd & bank_mask) == 0) {
+            /* scalar */
+            veclen = 0;
+        } else {
+            delta_d = (s->vec_stride >> 1) + 1;
+
+            if ((vm & bank_mask) == 0) {
+                /* mixed scalar/vector */
+                delta_m = 0;
+            } else {
+                /* vector */
+                delta_m = delta_d;
+            }
+        }
+    }
+
+    f0 = tcg_temp_new_i64(tcg_ctx);
+    f1 = tcg_temp_new_i64(tcg_ctx);
+    fd = tcg_temp_new_i64(tcg_ctx);
+    fpst = get_fpstatus_ptr(s, 0);
+
+    neon_load_reg64(s, f0, vn);
+    neon_load_reg64(s, f1, vm);
+
+    for (;;) {
+        if (reads_vd) {
+            neon_load_reg64(s, fd, vd);
+        }
+        fn(tcg_ctx, fd, f0, f1, fpst);
+        neon_store_reg64(s, fd, vd);
+
+        if (veclen == 0) {
+            break;
+        }
+        /* Set up the operands for the next iteration */
+        veclen--;
+        vd = ((vd + delta_d) & (bank_mask - 1)) | (vd & bank_mask);
+        vn = ((vn + delta_d) & (bank_mask - 1)) | (vn & bank_mask);
+        neon_load_reg64(s, f0, vn);
+        if (delta_m) {
+            vm = ((vm + delta_m) & (bank_mask - 1)) | (vm & bank_mask);
+            neon_load_reg64(s, f1, vm);
+        }
+    }
+
+    tcg_temp_free_i64(tcg_ctx, f0);
+    tcg_temp_free_i64(tcg_ctx, f1);
+    tcg_temp_free_i64(tcg_ctx, fd);
+    tcg_temp_free_ptr(tcg_ctx, fpst);
+
+    return true;
+}
+
+static void gen_VMLA_sp(TCGContext *tcg_ctx, TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm, TCGv_ptr fpst)
+{
+    /* Note that order of inputs to the add matters for NaNs */
+    TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx);
+
+    gen_helper_vfp_muls(tcg_ctx, tmp, vn, vm, fpst);
+    gen_helper_vfp_adds(tcg_ctx, vd, vd, tmp, fpst);
+    tcg_temp_free_i32(tcg_ctx, tmp);
+}
+
+static bool trans_VMLA_sp(DisasContext *s, arg_VMLA_sp *a)
+{
+    return do_vfp_3op_sp(s, gen_VMLA_sp, a->vd, a->vn, a->vm, true);
+}
+
+static void gen_VMLA_dp(TCGContext *tcg_ctx, TCGv_i64 vd, TCGv_i64 vn, TCGv_i64 vm, TCGv_ptr fpst)
+{
+    /* Note that order of inputs to the add matters for NaNs */
+    TCGv_i64 tmp = tcg_temp_new_i64(tcg_ctx);
+
+    gen_helper_vfp_muld(tcg_ctx, tmp, vn, vm, fpst);
+    gen_helper_vfp_addd(tcg_ctx, vd, vd, tmp, fpst);
+    tcg_temp_free_i64(tcg_ctx, tmp);
+}
+
+static bool trans_VMLA_dp(DisasContext *s, arg_VMLA_sp *a)
+{
+    return do_vfp_3op_dp(s, gen_VMLA_dp, a->vd, a->vn, a->vm, true);
+}
diff --git a/qemu/target/arm/translate.c b/qemu/target/arm/translate.c
index 56119caf..c4a40caf 100644
--- a/qemu/target/arm/translate.c
+++ b/qemu/target/arm/translate.c
@@ -3237,6 +3237,14 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
             op = ((insn >> 20) & 8) | ((insn >> 19) & 6) | ((insn >> 6) & 1);
             rn = VFP_SREG_N(insn);
 
+            switch (op) {
+            case 0:
+                /* Already handled by decodetree */
+                return 1;
+            default:
+                break;
+            }
+
             if (op == 15) {
                 /* rn is opcode, encoded as per VFP_SREG_N. */
                 switch (rn) {
@@ -3416,12 +3424,6 @@
             for (;;) {
                 /* Perform the calculation. */
                 switch (op) {
-                case 0: /* VMLA: fd + (fn * fm) */
-                    /* Note that order of inputs to the add matters for NaNs */
-                    gen_vfp_F1_mul(s, dp);
-                    gen_mov_F0_vreg(s, dp, rd);
-                    gen_vfp_add(s, dp);
-                    break;
                 case 1: /* VMLS: fd + -(fn * fm) */
                     gen_vfp_mul(s, dp);
                     gen_vfp_F1_neg(s, dp);
diff --git a/qemu/target/arm/vfp.decode b/qemu/target/arm/vfp.decode
index 68c9ffcf..9530e17a 100644
--- a/qemu/target/arm/vfp.decode
+++ b/qemu/target/arm/vfp.decode
@@ -96,3 +96,9 @@ VLDM_VSTM_sp ---- 1101 0.1 l:1 rn:4 .... 1010 imm:8 \
              vd=%vd_sp p=1 u=0 w=1
 VLDM_VSTM_dp ---- 1101 0.1 l:1 rn:4 .... 1011 imm:8 \
              vd=%vd_dp p=1 u=0 w=1
+
+# 3-register VFP data-processing; bits [23,21:20,6] identify the operation.
+VMLA_sp      ---- 1110 0.00 .... .... 1010 .0.0 .... \
+             vm=%vm_sp vn=%vn_sp vd=%vd_sp
+VMLA_dp      ---- 1110 0.00 .... .... 1011 .0.0 .... \
+             vm=%vm_dp vn=%vn_dp vd=%vd_dp
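
Reviewer note (not part of the patch): the register-stepping expression in do_vfp_3op_sp()/do_vfp_3op_dp() is easier to see with concrete numbers. The standalone sketch below mirrors only the index arithmetic for the single-precision case; the start register, step and length are made-up illustrative values, and no TCG code is involved.

#include <stdio.h>

int main(void)
{
    unsigned bank_mask = 0x18;  /* single precision: banks of 8 registers */
    unsigned delta_d = 1;       /* advance one register per iteration */
    unsigned vd = 21;           /* start at s21, inside the s16..s23 bank */
    int veclen = 3;             /* three further iterations after the first */

    for (;;) {
        printf("operate on s%u\n", vd);
        if (veclen == 0) {
            break;
        }
        veclen--;
        /* Same expression as the patch: stepping past s23 wraps back
         * into the s16..s23 bank instead of running on into s24. */
        vd = ((vd + delta_d) & (bank_mask - 1)) | (vd & bank_mask);
    }
    return 0;   /* prints s21, s22, s23, s16 */
}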
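
Also for reference: VMLA accumulates into the destination, which is why trans_VMLA_sp()/trans_VMLA_dp() pass reads_vd = true, and it is a multiply followed by a separate add (two helper calls) rather than a fused multiply-add. A minimal plain-C sketch of the dataflow only, ignoring the NaN-propagation and FP status handling that the vfp helpers and fpst perform:

#include <stdio.h>

/* vd = vd + (vn * vm): the destination is read as well as written.
 * This only shows the dataflow; the patch does the real work via
 * gen_helper_vfp_muls() and gen_helper_vfp_adds(). */
static float vmla_sp(float vd, float vn, float vm)
{
    return vd + (vn * vm);
}

int main(void)
{
    printf("%f\n", vmla_sp(1.0f, 2.0f, 3.0f));  /* prints 7.000000 */
    return 0;
}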