Add SSE optimizations for the S/Umax_V, S/Umin_V, S/Uaddw_V, S/Usubw_V, Fabs_S/V and Fneg_S/V instructions; for the Fcvtl_V and Fcvtn_V instructions; and for the Fcmp_S instruction. Add/improve other SSE optimizations. Add tests. (#496)

* Update CpuTest.cs
* Update CpuTestSimd.cs
* Update CpuTestSimdReg.cs
* Update InstEmitSimdCmp.cs
* Update SoftFloat.cs
* Update InstEmitAluHelper.cs
* Update InstEmitSimdArithmetic.cs
* Update InstEmitSimdHelper.cs
* Update VectorHelper.cs
* Update InstEmitSimdCvt.cs
* Update InstEmitSimdArithmetic.cs
* Update CpuTestSimd.cs
* Update InstEmitSimdArithmetic.cs
* Update OpCodeTable.cs
* Update InstEmitSimdArithmetic.cs
* Update InstEmitSimdCmp.cs
* Update InstEmitSimdCvt.cs
* Update CpuTestSimd.cs
* Update CpuTestSimdReg.cs
* Create CpuTestSimdFcond.cs
* Update OpCodeTable.cs
* Update InstEmitSimdMove.cs
* Update CpuTestSimdIns.cs
* Create CpuTestSimdExt.cs
* Nit.
* Update PackageReference.
2025-07-07 15:00:40 +00:00 · 2018-11-18 03:41:16 +01:00 · 2018-11-18 03:41:16 +01:00 · 7e98b0f6b2
parent 5357291c36
commit 7e98b0f6b2
9 changed files with 1214 additions and 312 deletions
--- a/Instructions/InstEmitAluHelper.cs
+++ b/Instructions/InstEmitAluHelper.cs
@ -190,22 +190,31 @@ namespace ChocolArm64.Instructions
            }
        }

-        public static void EmitSetNzcv(ILEmitterCtx context, int nzcv)
+        public static void EmitSetNzcv(ILEmitterCtx context)
        {
-            context.EmitLdc_I4((nzcv >> 0) & 1);
-
+            context.Emit(OpCodes.Dup);
+            context.Emit(OpCodes.Ldc_I4_1);
+            context.Emit(OpCodes.And);
            context.EmitStflg((int)PState.VBit);

-            context.EmitLdc_I4((nzcv >> 1) & 1);
-
+            context.Emit(OpCodes.Ldc_I4_1);
+            context.Emit(OpCodes.Shr);
+            context.Emit(OpCodes.Dup);
+            context.Emit(OpCodes.Ldc_I4_1);
+            context.Emit(OpCodes.And);
            context.EmitStflg((int)PState.CBit);

-            context.EmitLdc_I4((nzcv >> 2) & 1);
-
+            context.Emit(OpCodes.Ldc_I4_1);
+            context.Emit(OpCodes.Shr);
+            context.Emit(OpCodes.Dup);
+            context.Emit(OpCodes.Ldc_I4_1);
+            context.Emit(OpCodes.And);
            context.EmitStflg((int)PState.ZBit);

-            context.EmitLdc_I4((nzcv >> 3) & 1);
-
+            context.Emit(OpCodes.Ldc_I4_1);
+            context.Emit(OpCodes.Shr);
+            context.Emit(OpCodes.Ldc_I4_1);
+            context.Emit(OpCodes.And);
            context.EmitStflg((int)PState.NBit);
        }
    }
--- a/Instructions/InstEmitSimdArithmetic.cs
+++ b/Instructions/InstEmitSimdArithmetic.cs
@ -186,18 +186,101 @@ namespace ChocolArm64.Instructions

        public static void Fabs_S(ILEmitterCtx context)
        {
-            EmitScalarUnaryOpF(context, () =>
+            if (Optimizations.UseSse2)
            {
-                EmitUnaryMathCall(context, nameof(Math.Abs));
-            });
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                if (op.Size == 0)
+                {
+                    Type[] typesSsv    = new Type[] { typeof(float) };
+                    Type[] typesAndNot = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdc_R4(-0f);
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
+
+                    context.EmitLdvec(op.Rn);
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesAndNot));
+
+                    context.EmitStvec(op.Rd);
+
+                    EmitVectorZero32_128(context, op.Rd);
+                }
+                else /* if (op.Size == 1) */
+                {
+                    Type[] typesSsv    = new Type[] { typeof(double) };
+                    Type[] typesAndNot = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    context.EmitLdc_R8(-0d);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
+
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitScalarUnaryOpF(context, () =>
+                {
+                    EmitUnaryMathCall(context, nameof(Math.Abs));
+                });
+            }
        }

        public static void Fabs_V(ILEmitterCtx context)
        {
-            EmitVectorUnaryOpF(context, () =>
+            if (Optimizations.UseSse2)
            {
-                EmitUnaryMathCall(context, nameof(Math.Abs));
-            });
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    Type[] typesSav    = new Type[] { typeof(float) };
+                    Type[] typesAndNot = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdc_R4(-0f);
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
+
+                    context.EmitLdvec(op.Rn);
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AndNot), typesAndNot));
+
+                    context.EmitStvec(op.Rd);
+
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        EmitVectorZeroUpper(context, op.Rd);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Type[] typesSav    = new Type[] { typeof(double) };
+                    Type[] typesAndNot = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    context.EmitLdc_R8(-0d);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorUnaryOpF(context, () =>
+                {
+                    EmitUnaryMathCall(context, nameof(Math.Abs));
+                });
+            }
        }

        public static void Fadd_S(ILEmitterCtx context)
@ -283,7 +366,7 @@ namespace ChocolArm64.Instructions
            }
        }

-        public static void Fmadd_S(ILEmitterCtx context)
+        public static void Fmadd_S(ILEmitterCtx context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
@ -450,22 +533,118 @@ namespace ChocolArm64.Instructions
            });
        }

-        public static void Fmla_V(ILEmitterCtx context)
+        public static void Fmla_V(ILEmitterCtx context) // Fused.
        {
-            EmitVectorTernaryOpF(context, () =>
+            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
-                context.Emit(OpCodes.Mul);
-                context.Emit(OpCodes.Add);
-            });
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    Type[] typesMulAdd = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdvec(op.Rd);
+                    context.EmitLdvec(op.Rn);
+                    context.EmitLdvec(op.Rm);
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulAdd));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Add),      typesMulAdd));
+
+                    context.EmitStvec(op.Rd);
+
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        EmitVectorZeroUpper(context, op.Rd);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Type[] typesMulAdd = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    EmitLdvecWithCastToDouble(context, op.Rd);
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+                    EmitLdvecWithCastToDouble(context, op.Rm);
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulAdd));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add),      typesMulAdd));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorTernaryOpF(context, () =>
+                {
+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd));
+                });
+            }
        }

-        public static void Fmla_Ve(ILEmitterCtx context)
+        public static void Fmla_Ve(ILEmitterCtx context) // Fused.
        {
-            EmitVectorTernaryOpByElemF(context, () =>
+            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
-                context.Emit(OpCodes.Mul);
-                context.Emit(OpCodes.Add);
-            });
+                OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    Type[] typesSfl    = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>), typeof(byte) };
+                    Type[] typesMulAdd = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdvec(op.Rd);
+
+                    context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
+                    context.Emit(OpCodes.Dup);
+
+                    context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulAdd));
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Add), typesMulAdd));
+
+                    context.EmitStvec(op.Rd);
+
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        EmitVectorZeroUpper(context, op.Rd);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Type[] typesSfl    = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>), typeof(byte) };
+                    Type[] typesMulAdd = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    EmitLdvecWithCastToDouble(context, op.Rd);
+
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+
+                    EmitLdvecWithCastToDouble(context, op.Rm);
+                    context.Emit(OpCodes.Dup);
+
+                    context.EmitLdc_I4(op.Index | op.Index << 1);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulAdd));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesMulAdd));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorTernaryOpByElemF(context, () =>
+                {
+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd));
+                });
+            }
        }

        public static void Fmls_Se(ILEmitterCtx context)
@ -477,25 +656,121 @@ namespace ChocolArm64.Instructions
            });
        }

-        public static void Fmls_V(ILEmitterCtx context)
+        public static void Fmls_V(ILEmitterCtx context) // Fused.
        {
-            EmitVectorTernaryOpF(context, () =>
+            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
-                context.Emit(OpCodes.Mul);
-                context.Emit(OpCodes.Sub);
-            });
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdvec(op.Rd);
+                    context.EmitLdvec(op.Rn);
+                    context.EmitLdvec(op.Rm);
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulSub));
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesMulSub));
+
+                    context.EmitStvec(op.Rd);
+
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        EmitVectorZeroUpper(context, op.Rd);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    EmitLdvecWithCastToDouble(context, op.Rd);
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+                    EmitLdvecWithCastToDouble(context, op.Rm);
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulSub));
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorTernaryOpF(context, () =>
+                {
+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub));
+                });
+            }
        }

-        public static void Fmls_Ve(ILEmitterCtx context)
+        public static void Fmls_Ve(ILEmitterCtx context) // Fused.
        {
-            EmitVectorTernaryOpByElemF(context, () =>
+            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
-                context.Emit(OpCodes.Mul);
-                context.Emit(OpCodes.Sub);
-            });
+                OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    Type[] typesSfl    = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>), typeof(byte) };
+                    Type[] typesMulSub = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdvec(op.Rd);
+
+                    context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
+                    context.Emit(OpCodes.Dup);
+
+                    context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMulSub));
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Subtract), typesMulSub));
+
+                    context.EmitStvec(op.Rd);
+
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        EmitVectorZeroUpper(context, op.Rd);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Type[] typesSfl    = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>), typeof(byte) };
+                    Type[] typesMulSub = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    EmitLdvecWithCastToDouble(context, op.Rd);
+
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+
+                    EmitLdvecWithCastToDouble(context, op.Rm);
+                    context.Emit(OpCodes.Dup);
+
+                    context.EmitLdc_I4(op.Index | op.Index << 1);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMulSub));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorTernaryOpByElemF(context, () =>
+                {
+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub));
+                });
+            }
        }

-        public static void Fmsub_S(ILEmitterCtx context)
+        public static void Fmsub_S(ILEmitterCtx context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
@ -580,7 +855,59 @@ namespace ChocolArm64.Instructions

        public static void Fmul_Ve(ILEmitterCtx context)
        {
-            EmitVectorBinaryOpByElemF(context, () => context.Emit(OpCodes.Mul));
+            if (Optimizations.FastFP && Optimizations.UseSse2)
+            {
+                OpCodeSimdRegElemF64 op = (OpCodeSimdRegElemF64)context.CurrOp;
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    Type[] typesSfl = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>), typeof(byte) };
+                    Type[] typesMul = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdvec(op.Rn);
+
+                    context.EmitLdvec(op.Rm);
+                    context.Emit(OpCodes.Dup);
+
+                    context.EmitLdc_I4(op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6);
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Shuffle), typesSfl));
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Multiply), typesMul));
+
+                    context.EmitStvec(op.Rd);
+
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        EmitVectorZeroUpper(context, op.Rd);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Type[] typesSfl = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>), typeof(byte) };
+                    Type[] typesMul = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+
+                    EmitLdvecWithCastToDouble(context, op.Rm);
+                    context.Emit(OpCodes.Dup);
+
+                    context.EmitLdc_I4(op.Index | op.Index << 1);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Shuffle), typesSfl));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Multiply), typesMul));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorBinaryOpByElemF(context, () =>
+                {
+                    EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul));
+                });
+            }
        }

        public static void Fmulx_S(ILEmitterCtx context)
@ -617,12 +944,95 @@ namespace ChocolArm64.Instructions

        public static void Fneg_S(ILEmitterCtx context)
        {
-            EmitScalarUnaryOpF(context, () => context.Emit(OpCodes.Neg));
+            if (Optimizations.UseSse2)
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                if (op.Size == 0)
+                {
+                    Type[] typesSsv = new Type[] { typeof(float) };
+                    Type[] typesXor = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdc_R4(-0f);
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetScalarVector128), typesSsv));
+
+                    context.EmitLdvec(op.Rn);
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Xor), typesXor));
+
+                    context.EmitStvec(op.Rd);
+
+                    EmitVectorZero32_128(context, op.Rd);
+                }
+                else /* if (op.Size == 1) */
+                {
+                    Type[] typesSsv = new Type[] { typeof(double) };
+                    Type[] typesXor = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    context.EmitLdc_R8(-0d);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetScalarVector128), typesSsv));
+
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXor));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitScalarUnaryOpF(context, () => context.Emit(OpCodes.Neg));
+            }
        }

        public static void Fneg_V(ILEmitterCtx context)
        {
-            EmitVectorUnaryOpF(context, () => context.Emit(OpCodes.Neg));
+            if (Optimizations.UseSse2)
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                int sizeF = op.Size & 1;
+
+                if (sizeF == 0)
+                {
+                    Type[] typesSav = new Type[] { typeof(float) };
+                    Type[] typesXor = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    context.EmitLdc_R4(-0f);
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.SetAllVector128), typesSav));
+
+                    context.EmitLdvec(op.Rn);
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.Xor), typesXor));
+
+                    context.EmitStvec(op.Rd);
+
+                    if (op.RegisterSize == RegisterSize.Simd64)
+                    {
+                        EmitVectorZeroUpper(context, op.Rd);
+                    }
+                }
+                else /* if (sizeF == 1) */
+                {
+                    Type[] typesSav = new Type[] { typeof(double) };
+                    Type[] typesXor = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    context.EmitLdc_R8(-0d);
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXor));
+
+                    EmitStvecWithCastFromDouble(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorUnaryOpF(context, () => context.Emit(OpCodes.Neg));
+            }
        }

        public static void Fnmadd_S(ILEmitterCtx context)
@ -689,7 +1099,7 @@ namespace ChocolArm64.Instructions
            });
        }

-        public static void Frecps_S(ILEmitterCtx context)
+        public static void Frecps_S(ILEmitterCtx context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
@ -743,7 +1153,7 @@ namespace ChocolArm64.Instructions
            }
        }

-        public static void Frecps_V(ILEmitterCtx context)
+        public static void Frecps_V(ILEmitterCtx context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
@ -986,7 +1396,7 @@ namespace ChocolArm64.Instructions
            });
        }

-        public static void Frsqrts_S(ILEmitterCtx context)
+        public static void Frsqrts_S(ILEmitterCtx context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
@ -1048,7 +1458,7 @@ namespace ChocolArm64.Instructions
            }
        }

-        public static void Frsqrts_V(ILEmitterCtx context)
+        public static void Frsqrts_V(ILEmitterCtx context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
@ -1310,7 +1720,7 @@ namespace ChocolArm64.Instructions

                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));

-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);

                context.EmitLdc_I4(numBytes);
                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
@ -1334,7 +1744,38 @@ namespace ChocolArm64.Instructions

        public static void Saddw_V(ILEmitterCtx context)
        {
-            EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Add));
+            if (Optimizations.UseSse41)
+            {
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                Type[] typesSrl = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+                Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] };
+                Type[] typesAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
+                                               VectorIntTypesPerSizeLog2[op.Size + 1] };
+
+                string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
+                                                   nameof(Sse41.ConvertToVector128Int32),
+                                                   nameof(Sse41.ConvertToVector128Int64) };
+
+                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0;
+
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size + 1);
+
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+
+                context.EmitLdc_I4(numBytes);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
+
+                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+                EmitStvecWithSignedCast(context, op.Rd, op.Size + 1);
+            }
+            else
+            {
+                EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Add));
+            }
        }

        public static void Shadd_V(ILEmitterCtx context)
@ -1439,11 +1880,34 @@ namespace ChocolArm64.Instructions

        public static void Smax_V(ILEmitterCtx context)
        {
-            Type[] types = new Type[] { typeof(long), typeof(long) };
+            if (Optimizations.UseSse41)
+            {
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;

-            MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
+                Type[] typesMax = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };

-            EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
+                Type typeSse = op.Size == 1 ? typeof(Sse2) : typeof(Sse41);
+
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Max), typesMax));
+
+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                Type[] types = new Type[] { typeof(long), typeof(long) };
+
+                MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
+
+                EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
+            }
        }

        public static void Smaxp_V(ILEmitterCtx context)
@ -1457,11 +1921,34 @@ namespace ChocolArm64.Instructions

        public static void Smin_V(ILEmitterCtx context)
        {
-            Type[] types = new Type[] { typeof(long), typeof(long) };
+            if (Optimizations.UseSse41)
+            {
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;

-            MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
+                Type[] typesMin = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };

-            EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
+                Type typeSse = op.Size == 1 ? typeof(Sse2) : typeof(Sse41);
+
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size);
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Min), typesMin));
+
+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                Type[] types = new Type[] { typeof(long), typeof(long) };
+
+                MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
+
+                EmitVectorBinaryOpSx(context, () => context.EmitCall(mthdInfo));
+            }
        }

        public static void Sminp_V(ILEmitterCtx context)
@ -1484,7 +1971,7 @@ namespace ChocolArm64.Instructions
                Type[] typesMulAdd = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
                                                  VectorIntTypesPerSizeLog2[op.Size + 1] };

-                Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);

                string nameCvt = op.Size == 0
                    ? nameof(Sse41.ConvertToVector128Int16)
@ -1508,7 +1995,7 @@ namespace ChocolArm64.Instructions

                context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));

-                context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));

                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesMulAdd));

@ -1535,7 +2022,7 @@ namespace ChocolArm64.Instructions
                Type[] typesMulSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
                                                  VectorIntTypesPerSizeLog2[op.Size + 1] };

-                Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);

                string nameCvt = op.Size == 0
                    ? nameof(Sse41.ConvertToVector128Int16)
@ -1559,7 +2046,7 @@ namespace ChocolArm64.Instructions

                context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));

-                context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));

                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));

@ -1735,7 +2222,7 @@ namespace ChocolArm64.Instructions

                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt));

-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size);

                context.EmitLdc_I4(numBytes);
                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl));
@ -1754,7 +2241,38 @@ namespace ChocolArm64.Instructions

         // Emits IL for the signed widening subtract: Rd = Rn - widen(Rm), where Rn and Rd use
         // element size (op.Size + 1) and Rm uses element size op.Size.
         // With Optimizations.UseSse41 this is done through SSE2/SSE4.1 intrinsic calls;
         // otherwise it falls back to EmitVectorWidenRmBinaryOpSx with an IL Sub.
         public static void Ssubw_V(ILEmitterCtx context)
         {
-            EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Sub));
+            if (Optimizations.UseSse41)
+            {
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                Type[] typesSrl = new Type[] { VectorIntTypesPerSizeLog2[op.Size], typeof(byte) };
+                Type[] typesCvt = new Type[] { VectorIntTypesPerSizeLog2[op.Size] };
+                Type[] typesSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size + 1],
+                                               VectorIntTypesPerSizeLog2[op.Size + 1] };
+
+                string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
+                                                   nameof(Sse41.ConvertToVector128Int32),
+                                                   nameof(Sse41.ConvertToVector128Int64) };
+
+                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0; // Simd128 form reads the upper 8 bytes of Rm.
+
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size + 1); // Left operand: already wide.
+
+                EmitLdvecWithSignedCast(context, op.Rm, op.Size); // Right operand: narrow, widened below.
+
+                context.EmitLdc_I4(numBytes);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl)); // Byte shift moves the selected half into the low 8 bytes.
+
+                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt)); // Widen the low elements to size (op.Size + 1).
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub));
+
+                EmitStvecWithSignedCast(context, op.Rd, op.Size + 1);
+            }
+            else
+            {
+                EmitVectorWidenRmBinaryOpSx(context, () => context.Emit(OpCodes.Sub));
+            }
         }

        public static void Sub_S(ILEmitterCtx context)
@ -1901,7 +2419,38 @@ namespace ChocolArm64.Instructions

         // Emits IL for the unsigned widening add: Rd = Rn + widen(Rm), where Rn and Rd use
         // element size (op.Size + 1) and Rm uses element size op.Size.
         // With Optimizations.UseSse41 this is done through SSE2/SSE4.1 intrinsic calls;
         // otherwise it falls back to EmitVectorWidenRmBinaryOpZx with an IL Add.
         // NOTE(review): the ConvertToVector128Int* methods are looked up with
         // VectorUIntTypesPerSizeLog2 argument types - presumably the zero-extending overloads; confirm.
         public static void Uaddw_V(ILEmitterCtx context)
         {
-            EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Add));
+            if (Optimizations.UseSse41)
+            {
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+                Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] };
+                Type[] typesAdd = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1],
+                                               VectorUIntTypesPerSizeLog2[op.Size + 1] };
+
+                string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
+                                                   nameof(Sse41.ConvertToVector128Int32),
+                                                   nameof(Sse41.ConvertToVector128Int64) };
+
+                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0; // Simd128 form reads the upper 8 bytes of Rm.
+
+                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size + 1); // Left operand: already wide.
+
+                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size); // Right operand: narrow, widened below.
+
+                context.EmitLdc_I4(numBytes);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl)); // Byte shift moves the selected half into the low 8 bytes.
+
+                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt)); // Widen the low elements to size (op.Size + 1).
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesAdd));
+
+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1);
+            }
+            else
+            {
+                EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Add));
+            }
         }

        public static void Uhadd_V(ILEmitterCtx context)
@ -1992,11 +2541,34 @@ namespace ChocolArm64.Instructions

         // Emits IL for the unsigned maximum of Rn and Rm, stored in Rd.
         // With Optimizations.UseSse41 a single Max intrinsic is called (looked up on Sse2 when
         // op.Size == 0, on Sse41 otherwise); the fallback applies Math.Max(ulong, ulong)
         // through EmitVectorBinaryOpZx.
         public static void Umax_V(ILEmitterCtx context)
         {
-            Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
+            if (Optimizations.UseSse41)
+            {
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;

-            MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
+                Type[] typesMax = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };

-            EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41); // Class on which the Max overload for this element size is looked up.
+
+                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Max), typesMax));
+
+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+                if (op.RegisterSize == RegisterSize.Simd64) // 64-bit form: EmitVectorZeroUpper presumably clears the upper half of Rd.
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
+
+                MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Max), types);
+
+                EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
+            }
         }

        public static void Umaxp_V(ILEmitterCtx context)
@ -2010,11 +2582,34 @@ namespace ChocolArm64.Instructions

         // Emits IL for the unsigned minimum of Rn and Rm, stored in Rd.
         // With Optimizations.UseSse41 a single Min intrinsic is called (looked up on Sse2 when
         // op.Size == 0, on Sse41 otherwise); the fallback applies Math.Min(ulong, ulong)
         // through EmitVectorBinaryOpZx.
         public static void Umin_V(ILEmitterCtx context)
         {
-            Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
+            if (Optimizations.UseSse41)
+            {
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;

-            MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
+                Type[] typesMin = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], VectorUIntTypesPerSizeLog2[op.Size] };

-            EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41); // Class on which the Min overload for this element size is looked up.
+
+                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.Min), typesMin));
+
+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+
+                if (op.RegisterSize == RegisterSize.Simd64) // 64-bit form: EmitVectorZeroUpper presumably clears the upper half of Rd.
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                Type[] types = new Type[] { typeof(ulong), typeof(ulong) };
+
+                MethodInfo mthdInfo = typeof(Math).GetMethod(nameof(Math.Min), types);
+
+                EmitVectorBinaryOpZx(context, () => context.EmitCall(mthdInfo));
+            }
         }

        public static void Uminp_V(ILEmitterCtx context)
@ -2037,7 +2632,7 @@ namespace ChocolArm64.Instructions
                Type[] typesMulAdd = new Type[] { VectorIntTypesPerSizeLog2 [op.Size + 1],
                                                  VectorIntTypesPerSizeLog2 [op.Size + 1] };

-                Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);

                string nameCvt = op.Size == 0
                    ? nameof(Sse41.ConvertToVector128Int16)
@ -2061,7 +2656,7 @@ namespace ChocolArm64.Instructions

                context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));

-                context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.MultiplyLow), typesMulAdd));

                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Add), typesMulAdd));

@ -2088,7 +2683,7 @@ namespace ChocolArm64.Instructions
                Type[] typesMulSub = new Type[] { VectorIntTypesPerSizeLog2 [op.Size + 1],
                                                  VectorIntTypesPerSizeLog2 [op.Size + 1] };

-                Type typeMul = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);
+                Type typeSse = op.Size == 0 ? typeof(Sse2) : typeof(Sse41);

                string nameCvt = op.Size == 0
                    ? nameof(Sse41.ConvertToVector128Int16)
@ -2112,7 +2707,7 @@ namespace ChocolArm64.Instructions

                context.EmitCall(typeof(Sse41).GetMethod(nameCvt, typesCvt));

-                context.EmitCall(typeMul.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));
+                context.EmitCall(typeSse.GetMethod(nameof(Sse2.MultiplyLow), typesMulSub));

                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesMulSub));

@ -2251,7 +2846,38 @@ namespace ChocolArm64.Instructions

         // Emits IL for the unsigned widening subtract: Rd = Rn - widen(Rm), where Rn and Rd use
         // element size (op.Size + 1) and Rm uses element size op.Size.
         // With Optimizations.UseSse41 this is done through SSE2/SSE4.1 intrinsic calls;
         // otherwise it falls back to EmitVectorWidenRmBinaryOpZx with an IL Sub.
         // NOTE(review): the ConvertToVector128Int* methods are looked up with
         // VectorUIntTypesPerSizeLog2 argument types - presumably the zero-extending overloads; confirm.
         public static void Usubw_V(ILEmitterCtx context)
         {
-            EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Sub));
+            if (Optimizations.UseSse41)
+            {
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                Type[] typesSrl = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
+                Type[] typesCvt = new Type[] { VectorUIntTypesPerSizeLog2[op.Size] };
+                Type[] typesSub = new Type[] { VectorUIntTypesPerSizeLog2[op.Size + 1],
+                                               VectorUIntTypesPerSizeLog2[op.Size + 1] };
+
+                string[] namesCvt = new string[] { nameof(Sse41.ConvertToVector128Int16),
+                                                   nameof(Sse41.ConvertToVector128Int32),
+                                                   nameof(Sse41.ConvertToVector128Int64) };
+
+                int numBytes = op.RegisterSize == RegisterSize.Simd128 ? 8 : 0; // Simd128 form reads the upper 8 bytes of Rm.
+
+                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size + 1); // Left operand: already wide.
+
+                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size); // Right operand: narrow, widened below.
+
+                context.EmitLdc_I4(numBytes);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesSrl)); // Byte shift moves the selected half into the low 8 bytes.
+
+                context.EmitCall(typeof(Sse41).GetMethod(namesCvt[op.Size], typesCvt)); // Widen the low elements to size (op.Size + 1).
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub));
+
+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size + 1);
+            }
+            else
+            {
+                EmitVectorWidenRmBinaryOpZx(context, () => context.Emit(OpCodes.Sub));
+            }
         }

        private static void EmitAbs(ILEmitterCtx context)
--- a/Instructions/InstEmitSimdCmp.cs
+++ b/Instructions/InstEmitSimdCmp.cs
@ -3,6 +3,7 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;

 using static ChocolArm64.Instructions.InstEmitAluHelper;
@ -137,26 +138,43 @@ namespace ChocolArm64.Instructions

            context.EmitCondBranch(lblTrue, op.Cond);

-            EmitSetNzcv(context, op.Nzcv);
+            context.EmitLdc_I4(op.Nzcv);
+            EmitSetNzcv(context);

            context.Emit(OpCodes.Br, lblEnd);

            context.MarkLabel(lblTrue);

-            Fcmp_S(context);
+            EmitFcmpE(context, signalNaNs: false);

            context.MarkLabel(lblEnd);
        }

         // Conditional floating-point compare that uses EmitFcmpE with signalNaNs: true.
         // EmitCondBranch is given lblTrue and op.Cond: on the lblTrue path the compare sets the
         // flags; on the fall-through path the flags are taken from the immediate op.Nzcv.
         public static void Fccmpe_S(ILEmitterCtx context)
         {
-            Fccmp_S(context);
+            OpCodeSimdFcond64 op = (OpCodeSimdFcond64)context.CurrOp;
+
+            ILLabel lblTrue = new ILLabel();
+            ILLabel lblEnd  = new ILLabel();
+
+            context.EmitCondBranch(lblTrue, op.Cond);
+
+            context.EmitLdc_I4(op.Nzcv); // EmitSetNzcv consumes this packed value from the stack.
+            EmitSetNzcv(context);
+
+            context.Emit(OpCodes.Br, lblEnd); // Skip the compare on the fall-through path.
+
+            context.MarkLabel(lblTrue);
+
+            EmitFcmpE(context, signalNaNs: true);
+
+            context.MarkLabel(lblEnd);
         }

        public static void Fcmeq_S(ILEmitterCtx context)
        {
            if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
-                                                 && Optimizations.UseSse2)
+                                                  && Optimizations.UseSse2)
            {
                EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar));
            }
@ -169,7 +187,7 @@ namespace ChocolArm64.Instructions
        public static void Fcmeq_V(ILEmitterCtx context)
        {
            if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
-                                                 && Optimizations.UseSse2)
+                                                  && Optimizations.UseSse2)
            {
                EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareEqual));
            }
@ -182,7 +200,7 @@ namespace ChocolArm64.Instructions
        public static void Fcmge_S(ILEmitterCtx context)
        {
            if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
-                                                 && Optimizations.UseSse2)
+                                                  && Optimizations.UseSse2)
            {
                EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar));
            }
@ -195,7 +213,7 @@ namespace ChocolArm64.Instructions
        public static void Fcmge_V(ILEmitterCtx context)
        {
            if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
-                                                 && Optimizations.UseSse2)
+                                                  && Optimizations.UseSse2)
            {
                EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual));
            }
@ -208,7 +226,7 @@ namespace ChocolArm64.Instructions
        public static void Fcmgt_S(ILEmitterCtx context)
        {
            if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
-                                                 && Optimizations.UseSse2)
+                                                  && Optimizations.UseSse2)
            {
                EmitScalarSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar));
            }
@ -221,7 +239,7 @@ namespace ChocolArm64.Instructions
        public static void Fcmgt_V(ILEmitterCtx context)
        {
            if (context.CurrOp is OpCodeSimdReg64 && Optimizations.UseSse
-                                                 && Optimizations.UseSse2)
+                                                  && Optimizations.UseSse2)
            {
                EmitVectorSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan));
            }
@ -252,31 +270,157 @@ namespace ChocolArm64.Instructions
        }

         // Emits the scalar floating-point compare with signalNaNs: false (see EmitFcmpE).
         public static void Fcmp_S(ILEmitterCtx context)
+        {
+            EmitFcmpE(context, signalNaNs: false);
+        }
+
+        public static void Fcmpe_S(ILEmitterCtx context)
+        {
+            EmitFcmpE(context, signalNaNs: true); // Same emitter as Fcmp_S, but with signalNaNs: true.
+        }
+
+        private static void EmitFcmpE(ILEmitterCtx context, bool signalNaNs)
         {
             // Emits IL that compares the scalar in Rn against Rm (or against zero when
             // cmpWithZero is set) and stores the outcome in the N, Z, C and V flags.
             // NOTE(review): signalNaNs is only read by the SoftFloat fallback at the end of this
             // method; the FastFP SSE path emits the same code for both values.
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;

             bool cmpWithZero = !(op is OpCodeSimdFcond64) ? op.Bit3 : false; // Fcond forms never compare with zero.

-            //Handle NaN case.
-            //If any number is NaN, then NZCV = 0011.
-            if (cmpWithZero)
+            if (Optimizations.FastFP && Optimizations.UseSse2)
             {
-                EmitNaNCheck(context, op.Rn);
+                if (op.Size == 0)
+                {
+                    Type[] typesCmp = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+
+                    ILLabel lblNaN = new ILLabel();
+                    ILLabel lblEnd = new ILLabel();
+
+                    context.EmitLdvec(op.Rn);
+
+                    context.Emit(OpCodes.Dup);
+                    context.EmitStvectmp(); // tmp = Rn
+
+                    if (cmpWithZero)
+                    {
+                        VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+                    }
+                    else
+                    {
+                        context.EmitLdvec(op.Rm);
+                    }
+
+                    context.Emit(OpCodes.Dup);
+                    context.EmitStvectmp2(); // tmp2 = Rm (or zero)
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareOrderedScalar), typesCmp));
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareEqualOrderedScalar), typesCmp));
+
+                    context.Emit(OpCodes.Brtrue_S, lblNaN); // Ordered mask equal to zero -> an operand is NaN.
+
+                    context.EmitLdc_I4(0); // V = 0
+
+                    context.EmitLdvectmp();
+                    context.EmitLdvectmp2();
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareGreaterThanOrEqualOrderedScalar), typesCmp)); // C = Rn >= Rm
+
+                    context.EmitLdvectmp();
+                    context.EmitLdvectmp2();
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareEqualOrderedScalar), typesCmp)); // Z = Rn == Rm
+
+                    context.EmitLdvectmp();
+                    context.EmitLdvectmp2();
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.CompareLessThanOrderedScalar), typesCmp)); // N = Rn < Rm
+
+                    context.EmitStflg((int)PState.NBit); // Flags are stored in reverse push order: N, Z, C, then V.
+                    context.EmitStflg((int)PState.ZBit);
+                    context.EmitStflg((int)PState.CBit);
+                    context.EmitStflg((int)PState.VBit);
+
+                    context.Emit(OpCodes.Br_S, lblEnd);
+
+                    context.MarkLabel(lblNaN);
+
+                    context.EmitLdc_I4(1); // NaN case: pushes 1, 1, 0, 0 so that N = 0, Z = 0, C = 1, V = 1.
+                    context.Emit(OpCodes.Dup);
+                    context.EmitLdc_I4(0);
+                    context.Emit(OpCodes.Dup);
+
+                    context.EmitStflg((int)PState.NBit);
+                    context.EmitStflg((int)PState.ZBit);
+                    context.EmitStflg((int)PState.CBit);
+                    context.EmitStflg((int)PState.VBit);
+
+                    context.MarkLabel(lblEnd);
+                }
+                else /* if (op.Size == 1) */
+                {
+                    Type[] typesCmp = new Type[] { typeof(Vector128<double>), typeof(Vector128<double>) };
+
+                    ILLabel lblNaN = new ILLabel();
+                    ILLabel lblEnd = new ILLabel();
+
+                    EmitLdvecWithCastToDouble(context, op.Rn);
+
+                    context.Emit(OpCodes.Dup);
+                    context.EmitStvectmp(); // tmp = Rn
+
+                    if (cmpWithZero)
+                    {
+                        VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
+                    }
+                    else
+                    {
+                        EmitLdvecWithCastToDouble(context, op.Rm);
+                    }
+
+                    context.Emit(OpCodes.Dup);
+                    context.EmitStvectmp2(); // tmp2 = Rm (or zero)
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareOrderedScalar), typesCmp));
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorDoubleZero));
+
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqualOrderedScalar), typesCmp));
+
+                    context.Emit(OpCodes.Brtrue_S, lblNaN); // Ordered mask equal to zero -> an operand is NaN.
+
+                    context.EmitLdc_I4(0); // V = 0
+
+                    context.EmitLdvectmp();
+                    context.EmitLdvectmp2();
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareGreaterThanOrEqualOrderedScalar), typesCmp)); // C = Rn >= Rm
+
+                    context.EmitLdvectmp();
+                    context.EmitLdvectmp2();
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareEqualOrderedScalar), typesCmp)); // Z = Rn == Rm
+
+                    context.EmitLdvectmp();
+                    context.EmitLdvectmp2();
+                    context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareLessThanOrderedScalar), typesCmp)); // N = Rn < Rm
+
+                    context.EmitStflg((int)PState.NBit); // Flags are stored in reverse push order: N, Z, C, then V.
+                    context.EmitStflg((int)PState.ZBit);
+                    context.EmitStflg((int)PState.CBit);
+                    context.EmitStflg((int)PState.VBit);
+
+                    context.Emit(OpCodes.Br_S, lblEnd);
+
+                    context.MarkLabel(lblNaN);
+
+                    context.EmitLdc_I4(1); // NaN case: pushes 1, 1, 0, 0 so that N = 0, Z = 0, C = 1, V = 1.
+                    context.Emit(OpCodes.Dup);
+                    context.EmitLdc_I4(0);
+                    context.Emit(OpCodes.Dup);
+
+                    context.EmitStflg((int)PState.NBit);
+                    context.EmitStflg((int)PState.ZBit);
+                    context.EmitStflg((int)PState.CBit);
+                    context.EmitStflg((int)PState.VBit);
+
+                    context.MarkLabel(lblEnd);
+                }
             }
             else
-            {
-                EmitNaNCheck(context, op.Rn);
-                EmitNaNCheck(context, op.Rm);
-
-                context.Emit(OpCodes.Or);
-            }
-
-            ILLabel lblNaN = new ILLabel();
-            ILLabel lblEnd = new ILLabel();
-
-            context.Emit(OpCodes.Brtrue_S, lblNaN);
-
-            void EmitLoadOpers()
             {
                 EmitVectorExtractF(context, op.Rn, 0, op.Size);

@ -286,7 +430,7 @@ namespace ChocolArm64.Instructions
                     {
                         context.EmitLdc_R4(0f);
                     }
-                    else /* if (Op.Size == 1) */
+                    else // if (op.Size == 1)
                     {
                         context.EmitLdc_R8(0d);
                     }
@ -295,67 +439,12 @@ namespace ChocolArm64.Instructions
                 {
                     EmitVectorExtractF(context, op.Rm, 0, op.Size);
                 }
-            }

-            //Z = Rn == Rm
-            EmitLoadOpers();
+                context.EmitLdc_I4(!signalNaNs ? 0 : 1); // signalNaNs is pushed as an int (0 or 1) ahead of the FPCompare call.

-            context.Emit(OpCodes.Ceq);
-            context.Emit(OpCodes.Dup);
+                EmitSoftFloatCall(context, nameof(SoftFloat32.FPCompare));

-            context.EmitStflg((int)PState.ZBit);
-
-            //C = Rn >= Rm
-            EmitLoadOpers();
-
-            context.Emit(OpCodes.Cgt);
-            context.Emit(OpCodes.Or);
-
-            context.EmitStflg((int)PState.CBit);
-
-            //N = Rn < Rm
-            EmitLoadOpers();
-
-            context.Emit(OpCodes.Clt);
-
-            context.EmitStflg((int)PState.NBit);
-
-            //V = 0
-            context.EmitLdc_I4(0);
-
-            context.EmitStflg((int)PState.VBit);
-
-            context.Emit(OpCodes.Br_S, lblEnd);
-
-            context.MarkLabel(lblNaN);
-
-            EmitSetNzcv(context, 0b0011);
-
-            context.MarkLabel(lblEnd);
-        }
-
-        public static void Fcmpe_S(ILEmitterCtx context)
-        {
-            Fcmp_S(context);
-        }
-
-        private static void EmitNaNCheck(ILEmitterCtx context, int reg)
-        {
-            IOpCodeSimd64 op = (IOpCodeSimd64)context.CurrOp;
-
-            EmitVectorExtractF(context, reg, 0, op.Size);
-
-            if (op.Size == 0)
-            {
-                context.EmitCall(typeof(float), nameof(float.IsNaN));
-            }
-            else if (op.Size == 1)
-            {
-                context.EmitCall(typeof(double), nameof(double.IsNaN));
-            }
-            else
-            {
-                throw new InvalidOperationException();
+                EmitSetNzcv(context); // Consumes the value left on the stack (presumably FPCompare's packed NZCV result).
             }
         }

@ -486,7 +575,7 @@ namespace ChocolArm64.Instructions
            {
                context.EmitLdc_R4(0f);
            }
-            else /* if (SizeF == 1) */
+            else /* if (sizeF == 1) */
            {
                context.EmitLdc_R8(0d);
            }
--- a/Instructions/InstEmitSimdCvt.cs
+++ b/Instructions/InstEmitSimdCvt.cs
@ -76,33 +76,54 @@ namespace ChocolArm64.Instructions

            int sizeF = op.Size & 1;

-            int elems = 4 >> sizeF;
-
-            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
-
-            for (int index = 0; index < elems; index++)
+            if (Optimizations.UseSse2 && sizeF == 1)
            {
-                if (sizeF == 0)
-                {
-                    EmitVectorExtractZx(context, op.Rn, part + index, 1);
-                    context.Emit(OpCodes.Conv_U2);
+                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+                Type[] typesCvt = new Type[] { typeof(Vector128<float>) };

-                    context.EmitLdarg(TranslatedSub.StateArgIdx);
+                string nameMov = op.RegisterSize == RegisterSize.Simd128
+                    ? nameof(Sse.MoveHighToLow)
+                    : nameof(Sse.MoveLowToHigh);

-                    context.EmitCall(typeof(SoftFloat16_32), nameof(SoftFloat16_32.FPConvert));
-                }
-                else /* if (sizeF == 1) */
-                {
-                    EmitVectorExtractF(context, op.Rn, part + index, 0);
+                context.EmitLdvec(op.Rn);
+                context.Emit(OpCodes.Dup);

-                    context.Emit(OpCodes.Conv_R8);
-                }
+                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));

-                EmitVectorInsertTmpF(context, index, sizeF);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt));
+
+                EmitStvecWithCastFromDouble(context, op.Rd);
            }
+            else
+            {
+                int elems = 4 >> sizeF;

-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
+                int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+                for (int index = 0; index < elems; index++)
+                {
+                    if (sizeF == 0)
+                    {
+                        EmitVectorExtractZx(context, op.Rn, part + index, 1);
+                        context.Emit(OpCodes.Conv_U2);
+
+                        context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+                        context.EmitCall(typeof(SoftFloat16_32), nameof(SoftFloat16_32.FPConvert));
+                    }
+                    else /* if (sizeF == 1) */
+                    {
+                        EmitVectorExtractF(context, op.Rn, part + index, 0);
+
+                        context.Emit(OpCodes.Conv_R8);
+                    }
+
+                    EmitVectorInsertTmpF(context, index, sizeF);
+                }
+
+                context.EmitLdvectmp();
+                context.EmitStvec(op.Rd);
+            }
        }

        public static void Fcvtms_Gp(ILEmitterCtx context)
@ -121,43 +142,70 @@ namespace ChocolArm64.Instructions

            int sizeF = op.Size & 1;

-            int elems = 4 >> sizeF;
-
-            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
-
-            if (part != 0)
+            if (Optimizations.UseSse2 && sizeF == 1)
            {
+                Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
+                Type[] typesCvt = new Type[] { typeof(Vector128<double>) };
+
+                string nameMov = op.RegisterSize == RegisterSize.Simd128
+                    ? nameof(Sse.MoveLowToHigh)
+                    : nameof(Sse.MoveHighToLow);
+
                context.EmitLdvec(op.Rd);
-                context.EmitStvectmp();
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
+
+                EmitLdvecWithCastToDouble(context, op.Rn);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
+                context.Emit(OpCodes.Dup);
+
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
+
+                context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
+
+                context.EmitStvec(op.Rd);
            }
-
-            for (int index = 0; index < elems; index++)
+            else
            {
-                EmitVectorExtractF(context, op.Rn, index, sizeF);
+                int elems = 4 >> sizeF;

-                if (sizeF == 0)
+                int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
+
+                if (part != 0)
                {
-                    context.EmitLdarg(TranslatedSub.StateArgIdx);
-
-                    context.EmitCall(typeof(SoftFloat32_16), nameof(SoftFloat32_16.FPConvert));
-
-                    context.Emit(OpCodes.Conv_U8);
-                    EmitVectorInsertTmp(context, part + index, 1);
+                    context.EmitLdvec(op.Rd);
+                    context.EmitStvectmp();
                }
-                else /* if (sizeF == 1) */
+
+                for (int index = 0; index < elems; index++)
                {
-                    context.Emit(OpCodes.Conv_R4);
+                    EmitVectorExtractF(context, op.Rn, index, sizeF);

-                    EmitVectorInsertTmpF(context, part + index, 0);
+                    if (sizeF == 0)
+                    {
+                        context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+                        context.EmitCall(typeof(SoftFloat32_16), nameof(SoftFloat32_16.FPConvert));
+
+                        context.Emit(OpCodes.Conv_U8);
+                        EmitVectorInsertTmp(context, part + index, 1);
+                    }
+                    else /* if (sizeF == 1) */
+                    {
+                        context.Emit(OpCodes.Conv_R4);
+
+                        EmitVectorInsertTmpF(context, part + index, 0);
+                    }
                }
-            }

-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
+                context.EmitLdvectmp();
+                context.EmitStvec(op.Rd);

-            if (part == 0)
-            {
-                EmitVectorZeroUpper(context, op.Rd);
+                if (part == 0)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
            }
        }

@ -260,7 +308,29 @@ namespace ChocolArm64.Instructions

         // Emits IL converting the signed integer elements of Rn to floating point, stored in Rd.
         // When Optimizations.UseSse2 is set and sizeF == 0, Rn is loaded as a Vector128<int>
         // and converted with Sse2.ConvertToVector128Single; every other case goes through
         // EmitVectorCvtf(context, signed: true).
         public static void Scvtf_V(ILEmitterCtx context)
         {
-            EmitVectorCvtf(context, signed: true);
+            OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+            int sizeF = op.Size & 1;
+
+            if (Optimizations.UseSse2 && sizeF == 0)
+            {
+                Type[] typesCvt = new Type[] { typeof(Vector128<int>) };
+
+                EmitLdvecWithSignedCast(context, op.Rn, 2); // Size log2 of 2 matches the Vector128<int> argument used in typesCvt.
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
+
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64) // 64-bit form: EmitVectorZeroUpper presumably clears the upper half of Rd.
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else
+            {
+                EmitVectorCvtf(context, signed: true);
+            }
         }

        public static void Ucvtf_Gp(ILEmitterCtx context)
@ -441,16 +511,6 @@ namespace ChocolArm64.Instructions
            context.EmitStintzr(op.Rd);
        }

-        private static void EmitVectorScvtf(ILEmitterCtx context)
-        {
-            EmitVectorCvtf(context, true);
-        }
-
-        private static void EmitVectorUcvtf(ILEmitterCtx context)
-        {
-            EmitVectorCvtf(context, false);
-        }
-
        private static void EmitVectorCvtf(ILEmitterCtx context, bool signed)
        {
            OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
--- a/Instructions/InstEmitSimdHelper.cs
+++ b/Instructions/InstEmitSimdHelper.cs
@ -219,7 +219,7 @@ namespace ChocolArm64.Instructions
                type     = typeof(Sse);
                baseType = typeof(Vector128<float>);
            }
-            else /* if (SizeF == 1) */
+            else /* if (sizeF == 1) */
            {
                type     = typeof(Sse2);
                baseType = typeof(Vector128<double>);
@ -249,7 +249,7 @@ namespace ChocolArm64.Instructions
                {
                    EmitVectorZero32_128(context, op.Rd);
                }
-                else /* if (SizeF == 1) */
+                else /* if (sizeF == 1) */
                {
                    EmitVectorZeroUpper(context, op.Rd);
                }
@ -272,7 +272,7 @@ namespace ChocolArm64.Instructions
            {
                mthdInfo = typeof(MathF).GetMethod(name, new Type[] { typeof(float) });
            }
-            else /* if (SizeF == 1) */
+            else /* if (sizeF == 1) */
            {
                mthdInfo = typeof(Math).GetMethod(name, new Type[] { typeof(double) });
            }
@ -292,7 +292,7 @@ namespace ChocolArm64.Instructions
            {
                mthdInfo = typeof(MathF).GetMethod(name, new Type[] { typeof(float), typeof(float) });
            }
-            else /* if (SizeF == 1) */
+            else /* if (sizeF == 1) */
            {
                mthdInfo = typeof(Math).GetMethod(name, new Type[] { typeof(double), typeof(double) });
            }
@ -312,7 +312,7 @@ namespace ChocolArm64.Instructions
            {
                mthdInfo = typeof(MathF).GetMethod(nameof(MathF.Round), new Type[] { typeof(float), typeof(MidpointRounding) });
            }
-            else /* if (SizeF == 1) */
+            else /* if (sizeF == 1) */
            {
                mthdInfo = typeof(Math).GetMethod(nameof(Math.Round), new Type[] { typeof(double), typeof(MidpointRounding) });
            }
@ -334,7 +334,7 @@ namespace ChocolArm64.Instructions
            {
                mthdInfo = typeof(SoftFloat).GetMethod(name, new Type[] { typeof(float) });
            }
-            else /* if (SizeF == 1) */
+            else /* if (sizeF == 1) */
            {
                mthdInfo = typeof(SoftFloat).GetMethod(name, new Type[] { typeof(double) });
            }
@ -961,7 +961,7 @@ namespace ChocolArm64.Instructions
                {
                    EmitSatQ(context, op.Size, true, true);
                }
-                else /* if (Op.Size == 3) */
+                else /* if (op.Size == 3) */
                {
                    EmitUnarySignedSatQAbsOrNeg(context);
                }
@ -1022,7 +1022,7 @@ namespace ChocolArm64.Instructions
            {
                for (int index = 0; index < elems; index++)
                {
-                    EmitVectorExtract(context,                   op.Rn, index, op.Size, signed);
+                    EmitVectorExtract(context,                   op.Rn,  index, op.Size, signed);
                    EmitVectorExtract(context, ((OpCodeSimdReg64)op).Rm, index, op.Size, signed);

                    if (op.Size <= 2)
@ -1031,13 +1031,13 @@ namespace ChocolArm64.Instructions

                        EmitSatQ(context, op.Size, true, signed);
                    }
-                    else /* if (Op.Size == 3) */
+                    else /* if (op.Size == 3) */
                    {
                        if (add)
                        {
                            EmitBinarySatQAdd(context, signed);
                        }
-                        else /* if (Sub) */
+                        else /* if (sub) */
                        {
                            EmitBinarySatQSub(context, signed);
                        }
@ -1059,7 +1059,7 @@ namespace ChocolArm64.Instructions

                        EmitSatQ(context, op.Size, true, signed);
                    }
-                    else /* if (Op.Size == 3) */
+                    else /* if (op.Size == 3) */
                    {
                        EmitBinarySatQAccumulate(context, signed);
                    }
@ -1071,7 +1071,7 @@ namespace ChocolArm64.Instructions
            {
                for (int index = 0; index < elems; index++)
                {
-                    EmitVectorExtract(context,                   op.Rn, index, op.Size, signed);
+                    EmitVectorExtract(context,                   op.Rn,  index, op.Size, signed);
                    EmitVectorExtract(context, ((OpCodeSimdReg64)op).Rm, index, op.Size, signed);

                    emit();
@ -1304,52 +1304,64 @@ namespace ChocolArm64.Instructions
            }
        }

-        public static void EmitVectorZeroAll(ILEmitterCtx context, int rd)
+        public static void EmitVectorZeroAll(ILEmitterCtx context, int reg)
        {
-            if (Optimizations.UseSse2)
+            if (Optimizations.UseSse)
            {
                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));

-                context.EmitStvec(rd);
+                context.EmitStvec(reg);
            }
            else
            {
-                EmitVectorZeroLower(context, rd);
-                EmitVectorZeroUpper(context, rd);
+                EmitVectorZeroLower(context, reg);
+                EmitVectorZeroUpper(context, reg);
            }
        }

-        public static void EmitVectorZeroLower(ILEmitterCtx context, int rd)
+        public static void EmitVectorZeroLower(ILEmitterCtx context, int reg)
        {
-            EmitVectorInsert(context, rd, 0, 3, 0);
+            EmitVectorInsert(context, reg, 0, 3, 0);
        }

        public static void EmitVectorZeroLowerTmp(ILEmitterCtx context)
        {
-            EmitVectorInsertTmp(context, 0, 3, 0);
+            if (Optimizations.UseSse)
+            {
+                context.EmitLdvectmp();
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveHighToLow)));
+
+                context.EmitStvectmp();
+            }
+            else
+            {
+                EmitVectorInsertTmp(context, 0, 3, 0);
+            }
        }

        public static void EmitVectorZeroUpper(ILEmitterCtx context, int reg)
        {
-            if (Optimizations.UseSse2)
+            if (Optimizations.UseSse)
            {
-                //TODO: Use MoveScalar once it is fixed, as of the
-                //time of writing it just crashes the JIT.
+                //TODO: Use Sse2.MoveScalar once it is fixed,
+                //as of the time of writing it just crashes the JIT (SDK 2.1.500).
+
+                /*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) };
+
                EmitLdvecWithUnsignedCast(context, reg, 3);

-                Type[] types = new Type[] { typeof(Vector128<ulong>), typeof(byte) };
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MoveScalar), typesMov));

-                //Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MoveScalar), Types));
+                EmitStvecWithUnsignedCast(context, reg, 3);*/

-                context.EmitLdc_I4(8);
+                context.EmitLdvec(reg);
+                VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));

-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical128BitLane), types));
+                context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));

-                context.EmitLdc_I4(8);
-
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), types));
-
-                EmitStvecWithUnsignedCast(context, reg, 3);
+                context.EmitStvec(reg);
            }
            else
            {
@ -1359,9 +1371,15 @@ namespace ChocolArm64.Instructions

        public static void EmitVectorZero32_128(ILEmitterCtx context, int reg)
        {
+            if (!Sse.IsSupported)
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+            VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
            context.EmitLdvec(reg);

-            VectorHelper.EmitCall(context, nameof(VectorHelper.VectorZero32_128));
+            context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveScalar)));

            context.EmitStvec(reg);
        }
--- a/Instructions/InstEmitSimdMove.cs
+++ b/Instructions/InstEmitSimdMove.cs
@ -3,6 +3,7 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;

 using static ChocolArm64.Instructions.InstEmitSimdHelper;
@ -17,6 +18,8 @@ namespace ChocolArm64.Instructions

            if (Optimizations.UseSse2)
            {
+                Type[] typesSav = new Type[] { UIntTypesPerSizeLog2[op.Size] };
+
                context.EmitLdintzr(op.Rn);

                switch (op.Size)
@ -26,16 +29,9 @@ namespace ChocolArm64.Instructions
                    case 2: context.Emit(OpCodes.Conv_U4); break;
                }

-                Type[] types = new Type[] { UIntTypesPerSizeLog2[op.Size] };
-
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), types));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));

                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
-
-                if (op.RegisterSize == RegisterSize.Simd64)
-                {
-                    EmitVectorZeroUpper(context, op.Rd);
-                }
            }
            else
            {
@ -48,11 +44,11 @@ namespace ChocolArm64.Instructions

                    EmitVectorInsert(context, op.Rd, index, op.Size);
                }
+            }

-                if (op.RegisterSize == RegisterSize.Simd64)
-                {
-                    EmitVectorZeroUpper(context, op.Rd);
-                }
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
            }
        }

@ -69,14 +65,34 @@ namespace ChocolArm64.Instructions
        {
            OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;

-            int bytes = op.GetBitsCount() >> 3;
-            int elems = bytes >> op.Size;
-
-            for (int index = 0; index < elems; index++)
+            if (Optimizations.UseSse2)
            {
+                Type[] typesSav = new Type[] { UIntTypesPerSizeLog2[op.Size] };
+
                EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size);

-                EmitVectorInsert(context, op.Rd, index, op.Size);
+                switch (op.Size)
+                {
+                    case 0: context.Emit(OpCodes.Conv_U1); break;
+                    case 1: context.Emit(OpCodes.Conv_U2); break;
+                    case 2: context.Emit(OpCodes.Conv_U4); break;
+                }
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav));
+
+                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+            }
+            else
+            {
+                int bytes = op.GetBitsCount() >> 3;
+                int elems = bytes >> op.Size;
+
+                for (int index = 0; index < elems; index++)
+                {
+                    EmitVectorExtractZx(context, op.Rn, op.DstIndex, op.Size);
+
+                    EmitVectorInsert(context, op.Rd, index, op.Size);
+                }
            }

            if (op.RegisterSize == RegisterSize.Simd64)
@ -89,32 +105,65 @@ namespace ChocolArm64.Instructions
        {
            OpCodeSimdExt64 op = (OpCodeSimdExt64)context.CurrOp;

-            context.EmitLdvec(op.Rd);
-            context.EmitStvectmp();
-
-            int bytes = op.GetBitsCount() >> 3;
-
-            int position = op.Imm4;
-
-            for (int index = 0; index < bytes; index++)
+            if (Optimizations.UseSse2)
            {
-                int reg = op.Imm4 + index < bytes ? op.Rn : op.Rm;
+                Type[] typesShs = new Type[] { typeof(Vector128<byte>), typeof(byte) };
+                Type[] typesOr  = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };

-                if (position == bytes)
+                EmitLdvecWithUnsignedCast(context, op.Rn, 0);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
                {
-                    position = 0;
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
                }

-                EmitVectorExtractZx(context, reg, position++, 0);
-                EmitVectorInsertTmp(context, index, 0);
+                context.EmitLdc_I4(op.Imm4);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), typesShs));
+
+                EmitLdvecWithUnsignedCast(context, op.Rm, 0);
+
+                context.EmitLdc_I4((op.RegisterSize == RegisterSize.Simd64 ? 8 : 16) - op.Imm4);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical128BitLane), typesShs));
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
+
+                    context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
+                }
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesOr));
+
+                EmitStvecWithUnsignedCast(context, op.Rd, 0);
            }
-
-            context.EmitLdvectmp();
-            context.EmitStvec(op.Rd);
-
-            if (op.RegisterSize == RegisterSize.Simd64)
+            else
            {
-                EmitVectorZeroUpper(context, op.Rd);
+                int bytes = op.GetBitsCount() >> 3;
+
+                int position = op.Imm4;
+
+                for (int index = 0; index < bytes; index++)
+                {
+                    int reg = op.Imm4 + index < bytes ? op.Rn : op.Rm;
+
+                    if (position == bytes)
+                    {
+                        position = 0;
+                    }
+
+                    EmitVectorExtractZx(context, reg, position++, 0);
+                    EmitVectorInsertTmp(context, index, 0);
+                }
+
+                context.EmitLdvectmp();
+                context.EmitStvec(op.Rd);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
            }
        }

--- a/Instructions/SoftFloat.cs
+++ b/Instructions/SoftFloat.cs
@ -789,6 +789,43 @@ namespace ChocolArm64.Instructions
            return result;
        }

+        /// <summary>
+        /// Compares two singles after FPUnpack and returns a 4-bit code (presumably the N:Z:C:V nibble; confirm against the caller). Raises InvalidOp for a signalling NaN operand, or for any NaN operand when <paramref name="signalNaNs"/> is set.
+        /// </summary>
+        public static int FPCompare(float value1, float value2, bool signalNaNs, CpuThreadState state)
+        {
+            Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPCompare: state.Fpcr = 0x{state.Fpcr:X8}");
+
+            value1 = value1.FPUnpack(out FpType type1, out bool _, out _, state); // only the operand class is used; the sign output is discarded
+            value2 = value2.FPUnpack(out FpType type2, out bool _, out _, state);
+
+            int result;
+
+            if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN)
+            {
+                result = 0b0011; // unordered: at least one operand is a NaN
+
+                if (type1 == FpType.SNaN || type2 == FpType.SNaN || signalNaNs)
+                {
+                    FPProcessException(FpExc.InvalidOp, state);
+                }
+            }
+            else if (value1 == value2)
+            {
+                result = 0b0110; // equal
+            }
+            else if (value1 < value2)
+            {
+                result = 0b1000; // less than
+            }
+            else
+            {
+                result = 0b0010; // greater than
+            }
+
+            return result;
+        }
+
        public static float FPDiv(float value1, float value2, CpuThreadState state)
        {
            Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat32.FPDiv: state.Fpcr = 0x{state.Fpcr:X8}");
@ -1584,6 +1621,43 @@ namespace ChocolArm64.Instructions
            return result;
        }

+        /// <summary>
+        /// Compares two doubles after FPUnpack and returns a 4-bit code (presumably the N:Z:C:V nibble; confirm against the caller). Raises InvalidOp for a signalling NaN operand, or for any NaN operand when <paramref name="signalNaNs"/> is set.
+        /// </summary>
+        public static int FPCompare(double value1, double value2, bool signalNaNs, CpuThreadState state)
+        {
+            Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPCompare: state.Fpcr = 0x{state.Fpcr:X8}");
+
+            value1 = value1.FPUnpack(out FpType type1, out bool _, out _, state); // only the operand class is used; the sign output is discarded
+            value2 = value2.FPUnpack(out FpType type2, out bool _, out _, state);
+
+            int result;
+
+            if (type1 == FpType.SNaN || type1 == FpType.QNaN || type2 == FpType.SNaN || type2 == FpType.QNaN)
+            {
+                result = 0b0011; // unordered: at least one operand is a NaN
+
+                if (type1 == FpType.SNaN || type2 == FpType.SNaN || signalNaNs)
+                {
+                    FPProcessException(FpExc.InvalidOp, state);
+                }
+            }
+            else if (value1 == value2)
+            {
+                result = 0b0110; // equal
+            }
+            else if (value1 < value2)
+            {
+                result = 0b1000; // less than
+            }
+            else
+            {
+                result = 0b0010; // greater than
+            }
+
+            return result;
+        }
+
        public static double FPDiv(double value1, double value2, CpuThreadState state)
        {
            Debug.WriteLineIf(state.Fpcr != 0, $"SoftFloat64.FPDiv: state.Fpcr = 0x{state.Fpcr:X8}");
--- a/Instructions/VectorHelper.cs
+++ b/Instructions/VectorHelper.cs
@ -9,18 +9,6 @@ namespace ChocolArm64.Instructions
 {
    static class VectorHelper
    {
-        private static readonly Vector128<float> Zero32128Mask;
-
-        static VectorHelper()
-        {
-            if (!Sse2.IsSupported)
-            {
-                throw new PlatformNotSupportedException();
-            }
-
-            Zero32128Mask = Sse.StaticCast<uint, float>(Sse2.SetVector128(0, 0, 0, 0xffffffff));
-        }
-
        public static void EmitCall(ILEmitterCtx context, string name64, string name128)
        {
            bool isSimd64 = context.CurrOp.RegisterSize == RegisterSize.Simd64;
@ -491,7 +479,7 @@ namespace ChocolArm64.Instructions
            {
                int intValue = BitConverter.SingleToInt32Bits(value);

-                ushort low  = (ushort)(intValue >> 0);
+                ushort low  = (ushort)(intValue >>  0);
                ushort high = (ushort)(intValue >> 16);

                Vector128<ushort> shortVector = Sse.StaticCast<float, ushort>(vector);
@ -578,17 +566,6 @@ namespace ChocolArm64.Instructions
            throw new PlatformNotSupportedException();
        }

-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static Vector128<float> VectorZero32_128(Vector128<float> vector)
-        {
-            if (Sse.IsSupported)
-            {
-                return Sse.And(vector, Zero32128Mask);
-            }
-
-            throw new PlatformNotSupportedException();
-        }
-
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static Vector128<sbyte> VectorSingleToSByte(Vector128<float> vector)
        {
--- a/OpCodeTable.cs
+++ b/OpCodeTable.cs
@ -216,9 +216,9 @@ namespace ChocolArm64
            SetA64("01011110111xxxxx100011xxxxxxxxxx", InstEmit.Cmtst_S,       typeof(OpCodeSimdReg64));
            SetA64("0>001110<<1xxxxx100011xxxxxxxxxx", InstEmit.Cmtst_V,       typeof(OpCodeSimdReg64));
            SetA64("0x00111000100000010110xxxxxxxxxx", InstEmit.Cnt_V,         typeof(OpCodeSimd64));
-            SetA64("0x001110000xxxxx000011xxxxxxxxxx", InstEmit.Dup_Gp,        typeof(OpCodeSimdIns64));
+            SetA64("0>001110000x<>>>000011xxxxxxxxxx", InstEmit.Dup_Gp,        typeof(OpCodeSimdIns64));
            SetA64("01011110000xxxxx000001xxxxxxxxxx", InstEmit.Dup_S,         typeof(OpCodeSimdIns64));
-            SetA64("0x001110000xxxxx000001xxxxxxxxxx", InstEmit.Dup_V,         typeof(OpCodeSimdIns64));
+            SetA64("0>001110000x<>>>000001xxxxxxxxxx", InstEmit.Dup_V,         typeof(OpCodeSimdIns64));
            SetA64("0x101110001xxxxx000111xxxxxxxxxx", InstEmit.Eor_V,         typeof(OpCodeSimdReg64));
            SetA64("0>101110000xxxxx0<xxx0xxxxxxxxxx", InstEmit.Ext_V,         typeof(OpCodeSimdExt64));
            SetA64("011111101x1xxxxx110101xxxxxxxxxx", InstEmit.Fabd_S,        typeof(OpCodeSimdReg64));
@ -384,9 +384,9 @@ namespace ChocolArm64
            SetA64("0x001110<<1xxxxx000000xxxxxxxxxx", InstEmit.Saddl_V,       typeof(OpCodeSimdReg64));
            SetA64("0x001110<<100000001010xxxxxxxxxx", InstEmit.Saddlp_V,      typeof(OpCodeSimd64));
            SetA64("0x001110<<1xxxxx000100xxxxxxxxxx", InstEmit.Saddw_V,       typeof(OpCodeSimdReg64));
-            SetA64("x0011110xx100010000000xxxxxxxxxx", InstEmit.Scvtf_Gp,      typeof(OpCodeSimdCvt64));
+            SetA64("x00111100x100010000000xxxxxxxxxx", InstEmit.Scvtf_Gp,      typeof(OpCodeSimdCvt64));
            SetA64("010111100x100001110110xxxxxxxxxx", InstEmit.Scvtf_S,       typeof(OpCodeSimd64));
-            SetA64("0x0011100x100001110110xxxxxxxxxx", InstEmit.Scvtf_V,       typeof(OpCodeSimd64));
+            SetA64("0>0011100<100001110110xxxxxxxxxx", InstEmit.Scvtf_V,       typeof(OpCodeSimd64));
            SetA64("01011110000xxxxx000000xxxxxxxxxx", InstEmit.Sha1c_V,       typeof(OpCodeSimdReg64));
            SetA64("0101111000101000000010xxxxxxxxxx", InstEmit.Sha1h_V,       typeof(OpCodeSimd64));
            SetA64("01011110000xxxxx001000xxxxxxxxxx", InstEmit.Sha1m_V,       typeof(OpCodeSimdReg64));
@ -486,9 +486,9 @@ namespace ChocolArm64
            SetA64("001011100x110000001110xxxxxxxxxx", InstEmit.Uaddlv_V,      typeof(OpCodeSimd64));
            SetA64("01101110<<110000001110xxxxxxxxxx", InstEmit.Uaddlv_V,      typeof(OpCodeSimd64));
            SetA64("0x101110<<1xxxxx000100xxxxxxxxxx", InstEmit.Uaddw_V,       typeof(OpCodeSimdReg64));
-            SetA64("x0011110xx100011000000xxxxxxxxxx", InstEmit.Ucvtf_Gp,      typeof(OpCodeSimdCvt64));
+            SetA64("x00111100x100011000000xxxxxxxxxx", InstEmit.Ucvtf_Gp,      typeof(OpCodeSimdCvt64));
            SetA64("011111100x100001110110xxxxxxxxxx", InstEmit.Ucvtf_S,       typeof(OpCodeSimd64));
-            SetA64("0x1011100x100001110110xxxxxxxxxx", InstEmit.Ucvtf_V,       typeof(OpCodeSimd64));
+            SetA64("0>1011100<100001110110xxxxxxxxxx", InstEmit.Ucvtf_V,       typeof(OpCodeSimd64));
            SetA64("0x101110<<1xxxxx000001xxxxxxxxxx", InstEmit.Uhadd_V,       typeof(OpCodeSimdReg64));
            SetA64("0x101110<<1xxxxx001001xxxxxxxxxx", InstEmit.Uhsub_V,       typeof(OpCodeSimdReg64));
            SetA64("0x101110<<1xxxxx011001xxxxxxxxxx", InstEmit.Umax_V,        typeof(OpCodeSimdReg64));