diff --git a/qemu/target/arm/neon-dp.decode b/qemu/target/arm/neon-dp.decode
index 105cf2b2..7b069abe 100644
--- a/qemu/target/arm/neon-dp.decode
+++ b/qemu/target/arm/neon-dp.decode
@@ -488,5 +488,8 @@ Vimm_1r      1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm
 
     VQDMULH_2sc  1111 001 . 1 . .. .... .... 1100 . 1 . 0 .... @2scalar
     VQRDMULH_2sc 1111 001 . 1 . .. .... .... 1101 . 1 . 0 .... @2scalar
+
+    VQRDMLAH_2sc 1111 001 . 1 . .. .... .... 1110 . 1 . 0 .... @2scalar
+    VQRDMLSH_2sc 1111 001 . 1 . .. .... .... 1111 . 1 . 0 .... @2scalar
   ]
 }
diff --git a/qemu/target/arm/translate-neon.inc.c b/qemu/target/arm/translate-neon.inc.c
index daf3b5ea..daf63fe7 100644
--- a/qemu/target/arm/translate-neon.inc.c
+++ b/qemu/target/arm/translate-neon.inc.c
@@ -2605,3 +2605,78 @@ static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
 
     return do_2scalar(s, a, opfn[a->size], NULL);
 }
+
+static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
+                            NeonGenThreeOpEnvFn *opfn)
+{
+    /*
+     * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
+     * performs a kind of fused op-then-accumulate using a helper
+     * function that takes all of rd, rn and the scalar at once.
+     */
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+    TCGv_i32 scalar;
+    int pass;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    if (!dc_isar_feature(aa32_rdm, s)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vn | a->vm) & 0x10)) {
+        return false;
+    }
+
+    if (!opfn) {
+        /* Bad size (including size == 3, which is a different insn group) */
+        return false;
+    }
+
+    if (a->q && ((a->vd | a->vn) & 1)) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    scalar = neon_get_scalar(s, a->size, a->vm);
+
+    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
+        TCGv_i32 rn = neon_load_reg(s, a->vn, pass);
+        TCGv_i32 rd = neon_load_reg(s, a->vd, pass);
+        opfn(tcg_ctx, rd, tcg_ctx->cpu_env, rn, scalar, rd);
+        tcg_temp_free_i32(tcg_ctx, rn);
+        neon_store_reg(s, a->vd, pass, rd);
+    }
+    tcg_temp_free_i32(tcg_ctx, scalar);
+
+    return true;
+}
+
+static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
+{
+    static NeonGenThreeOpEnvFn *opfn[] = {
+        NULL,
+        gen_helper_neon_qrdmlah_s16,
+        gen_helper_neon_qrdmlah_s32,
+        NULL,
+    };
+    return do_vqrdmlah_2sc(s, a, opfn[a->size]);
+}
+
+static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
+{
+    static NeonGenThreeOpEnvFn *opfn[] = {
+        NULL,
+        gen_helper_neon_qrdmlsh_s16,
+        gen_helper_neon_qrdmlsh_s32,
+        NULL,
+    };
+    return do_vqrdmlah_2sc(s, a, opfn[a->size]);
+}
diff --git a/qemu/target/arm/translate.c b/qemu/target/arm/translate.c
index eaa388f8..f7f7303e 100644
--- a/qemu/target/arm/translate.c
+++ b/qemu/target/arm/translate.c
@@ -5297,6 +5297,8 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
             case 9: /* Floating point VMUL scalar */
             case 12: /* VQDMULH scalar */
             case 13: /* VQRDMULH scalar */
+            case 14: /* VQRDMLAH scalar */
+            case 15: /* VQRDMLSH scalar */
                 return 1; /* handled by decodetree */
 
             case 3: /* VQDMLAL scalar */
@@ -5356,42 +5358,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                 neon_store_reg64(s, s->V0, rd + pass);
             }
             break;
-            case 14: /* VQRDMLAH scalar */
-            case 15: /* VQRDMLSH scalar */
-                {
-                    NeonGenThreeOpEnvFn *fn;
-
-                    if (!dc_isar_feature(aa32_rdm, s)) {
-                        return 1;
-                    }
-                    if (u && ((rd | rn) & 1)) {
-                        return 1;
-                    }
-                    if (op == 14) {
-                        if (size == 1) {
-                            fn = gen_helper_neon_qrdmlah_s16;
-                        } else {
-                            fn = gen_helper_neon_qrdmlah_s32;
-                        }
-                    } else {
-                        if (size == 1) {
-                            fn = gen_helper_neon_qrdmlsh_s16;
-                        } else {
-                            fn = gen_helper_neon_qrdmlsh_s32;
-                        }
-                    }
-
-                    tmp2 = neon_get_scalar(s, size, rm);
-                    for (pass = 0; pass < (u ? 4 : 2); pass++) {
-                        tmp = neon_load_reg(s, rn, pass);
-                        tmp3 = neon_load_reg(s, rd, pass);
-                        fn(tcg_ctx, tmp, tcg_ctx->cpu_env, tmp, tmp2, tmp3);
-                        tcg_temp_free_i32(tcg_ctx, tmp3);
-                        neon_store_reg(s, rd, pass, tmp);
-                    }
-                    tcg_temp_free_i32(tcg_ctx, tmp2);
-                }
-                break;
             default:
                 g_assert_not_reached();
             }
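
Reviewer note, not part of the patch: the gen_helper_neon_qrdmlah/qrdmlsh helpers that the new trans functions dispatch to implement the v8.1 RDM rounding-doubling multiply-accumulate. As a rough cross-check aid, here is an illustrative, self-contained scalar model of a single 16-bit VQRDMLAH lane. The function name qrdmlah16_model is invented for this sketch, it is not QEMU code, and it ignores the QC saturation flag that the real helpers update in CPU state.

#include <stdint.h>

/*
 * Illustrative only: one 16-bit lane of VQRDMLAH, i.e.
 *   sat16( ((rd << 16) + 2*rn*rm + (1 << 15)) >> 16 )
 * VQRDMLSH subtracts the doubled product instead of adding it.
 */
int16_t qrdmlah16_model(int16_t rd, int16_t rn, int16_t rm)
{
    /* Widen, scale the accumulator, add the doubled product and the
     * rounding constant, then shift back down to the element size. */
    int64_t acc = ((int64_t)rd << 16) + 2 * (int64_t)rn * rm + (1 << 15);

    acc >>= 16;
    if (acc > INT16_MAX) {
        return INT16_MAX;   /* saturate high */
    }
    if (acc < INT16_MIN) {
        return INT16_MIN;   /* saturate low */
    }
    return (int16_t)acc;
}

For example, with rd = 0 and rn = rm = 0x4000 (0.5 in Q15), the model returns 0x2000 (0.25), matching the expected rounding-doubling product with no accumulation.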