mirror of
https://github.com/Ryujinx/Ryujinx.git
synced 2024-11-07 22:38:37 +00:00
Add Smlal_Ve, Smlsl_Ve, Smull_Ve, Umlal_Ve, Umlsl_Ve, Umull_Ve Inst.; add Tests. Add Sse Opt. for Trn1/2_V and Uzp1/2_V Inst. Nits. (#566)
* Update OpCodeTable.cs * Update InstEmitSimdArithmetic.cs * Update InstEmitSimdHelper.cs * Update CpuTestSimdRegElem.cs * Update InstEmitSimdMove.cs * Update InstEmitSimdCvt.cs * Update SoftFallback.cs * Update InstEmitSimdHelper.cs * Update SoftFloat.cs * Update CryptoHelper.cs * Update InstEmitSimdArithmetic.cs * Update InstEmitSimdCmp.cs * Address PR feedback. * Address PR feedback.
This commit is contained in:
parent
36b9ab0e48
commit
8f7fcede7f
|
@ -9,7 +9,7 @@ namespace ChocolArm64.Instructions
|
|||
static class CryptoHelper
|
||||
{
|
||||
#region "LookUp Tables"
|
||||
private static byte[] _sBox =
|
||||
private static readonly byte[] _sBox = new byte[]
|
||||
{
|
||||
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
|
||||
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
|
||||
|
@ -29,7 +29,7 @@ namespace ChocolArm64.Instructions
|
|||
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
|
||||
};
|
||||
|
||||
private static byte[] _invSBox =
|
||||
private static readonly byte[] _invSBox = new byte[]
|
||||
{
|
||||
0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
|
||||
0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
|
||||
|
@ -49,7 +49,7 @@ namespace ChocolArm64.Instructions
|
|||
0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
|
||||
};
|
||||
|
||||
private static byte[] _gfMul02 =
|
||||
private static readonly byte[] _gfMul02 = new byte[]
|
||||
{
|
||||
0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
|
||||
0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
|
||||
|
@ -69,7 +69,7 @@ namespace ChocolArm64.Instructions
|
|||
0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5
|
||||
};
|
||||
|
||||
private static byte[] _gfMul03 =
|
||||
private static readonly byte[] _gfMul03 = new byte[]
|
||||
{
|
||||
0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11,
|
||||
0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21,
|
||||
|
@ -89,7 +89,7 @@ namespace ChocolArm64.Instructions
|
|||
0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a
|
||||
};
|
||||
|
||||
private static byte[] _gfMul09 =
|
||||
private static readonly byte[] _gfMul09 = new byte[]
|
||||
{
|
||||
0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77,
|
||||
0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7,
|
||||
|
@ -109,7 +109,7 @@ namespace ChocolArm64.Instructions
|
|||
0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46
|
||||
};
|
||||
|
||||
private static byte[] _gfMul0B =
|
||||
private static readonly byte[] _gfMul0B = new byte[]
|
||||
{
|
||||
0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69,
|
||||
0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9,
|
||||
|
@ -129,7 +129,7 @@ namespace ChocolArm64.Instructions
|
|||
0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3
|
||||
};
|
||||
|
||||
private static byte[] _gfMul0D =
|
||||
private static readonly byte[] _gfMul0D = new byte[]
|
||||
{
|
||||
0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b,
|
||||
0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b,
|
||||
|
@ -149,7 +149,7 @@ namespace ChocolArm64.Instructions
|
|||
0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97
|
||||
};
|
||||
|
||||
private static byte[] _gfMul0E =
|
||||
private static readonly byte[] _gfMul0E = new byte[]
|
||||
{
|
||||
0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a,
|
||||
0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba,
|
||||
|
@ -169,9 +169,15 @@ namespace ChocolArm64.Instructions
|
|||
0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d
|
||||
};
|
||||
|
||||
private static byte[] _srPerm = { 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3 };
|
||||
private static readonly byte[] _srPerm = new byte[]
|
||||
{
|
||||
0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
|
||||
};
|
||||
|
||||
private static byte[] _isrPerm = { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };
|
||||
private static readonly byte[] _isrPerm = new byte[]
|
||||
{
|
||||
0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
|
||||
};
|
||||
#endregion
|
||||
|
||||
public static Vector128<float> AesInvMixColumns(Vector128<float> op)
|
||||
|
@ -179,7 +185,7 @@ namespace ChocolArm64.Instructions
|
|||
byte[] inState = new byte[16];
|
||||
byte[] outState = new byte[16];
|
||||
|
||||
FromVectorToByteArray(inState, ref op);
|
||||
FromVectorToByteArray(op, inState);
|
||||
|
||||
for (int columns = 0; columns <= 3; columns++)
|
||||
{
|
||||
|
@ -206,7 +212,7 @@ namespace ChocolArm64.Instructions
|
|||
byte[] inState = new byte[16];
|
||||
byte[] outState = new byte[16];
|
||||
|
||||
FromVectorToByteArray(inState, ref op);
|
||||
FromVectorToByteArray(op, inState);
|
||||
|
||||
for (int idx = 0; idx <= 15; idx++)
|
||||
{
|
||||
|
@ -223,7 +229,7 @@ namespace ChocolArm64.Instructions
|
|||
byte[] inState = new byte[16];
|
||||
byte[] outState = new byte[16];
|
||||
|
||||
FromVectorToByteArray(inState, ref op);
|
||||
FromVectorToByteArray(op, inState);
|
||||
|
||||
for (int idx = 0; idx <= 15; idx++)
|
||||
{
|
||||
|
@ -240,7 +246,7 @@ namespace ChocolArm64.Instructions
|
|||
byte[] inState = new byte[16];
|
||||
byte[] outState = new byte[16];
|
||||
|
||||
FromVectorToByteArray(inState, ref op);
|
||||
FromVectorToByteArray(op, inState);
|
||||
|
||||
for (int columns = 0; columns <= 3; columns++)
|
||||
{
|
||||
|
@ -267,7 +273,7 @@ namespace ChocolArm64.Instructions
|
|||
byte[] inState = new byte[16];
|
||||
byte[] outState = new byte[16];
|
||||
|
||||
FromVectorToByteArray(inState, ref op);
|
||||
FromVectorToByteArray(op, inState);
|
||||
|
||||
for (int idx = 0; idx <= 15; idx++)
|
||||
{
|
||||
|
@ -284,7 +290,7 @@ namespace ChocolArm64.Instructions
|
|||
byte[] inState = new byte[16];
|
||||
byte[] outState = new byte[16];
|
||||
|
||||
FromVectorToByteArray(inState, ref op);
|
||||
FromVectorToByteArray(op, inState);
|
||||
|
||||
for (int idx = 0; idx <= 15; idx++)
|
||||
{
|
||||
|
@ -296,33 +302,30 @@ namespace ChocolArm64.Instructions
|
|||
return op;
|
||||
}
|
||||
|
||||
private static void FromVectorToByteArray(byte[] state, ref Vector128<float> op)
|
||||
{
|
||||
ulong uLongLow = VectorHelper.VectorExtractIntZx((op), (byte)0, 3);
|
||||
ulong uLongHigh = VectorHelper.VectorExtractIntZx((op), (byte)1, 3);
|
||||
|
||||
for (int idx = 0; idx <= 7; idx++)
|
||||
{
|
||||
state[idx + 0] = (byte)(uLongLow & 0xFFUL);
|
||||
state[idx + 8] = (byte)(uLongHigh & 0xFFUL);
|
||||
|
||||
uLongLow >>= 8;
|
||||
uLongHigh >>= 8;
|
||||
}
|
||||
}
|
||||
|
||||
private static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
|
||||
private unsafe static void FromVectorToByteArray(Vector128<float> op, byte[] state)
|
||||
{
|
||||
if (!Sse2.IsSupported)
|
||||
{
|
||||
throw new PlatformNotSupportedException();
|
||||
}
|
||||
|
||||
op = Sse.StaticCast<byte, float>(Sse2.SetVector128(
|
||||
state[15], state[14], state[13], state[12],
|
||||
state[11], state[10], state[9], state[8],
|
||||
state[7], state[6], state[5], state[4],
|
||||
state[3], state[2], state[1], state[0]));
|
||||
fixed (byte* ptr = &state[0])
|
||||
{
|
||||
Sse2.Store(ptr, Sse.StaticCast<float, byte>(op));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Loads a 128-bit vector from the start of <paramref name="state"/> and stores it,
/// reinterpreted as Vector128&lt;float&gt;, into <paramref name="op"/>.
/// </summary>
/// <param name="state">Source byte array; pinned for the duration of the load.</param>
/// <param name="op">Receives the loaded vector.</param>
/// <exception cref="PlatformNotSupportedException">Thrown when Sse2 is not supported.</exception>
private unsafe static void FromByteArrayToVector(byte[] state, ref Vector128<float> op)
|
||||
{
|
||||
if (!Sse2.IsSupported)
|
||||
{
|
||||
throw new PlatformNotSupportedException();
|
||||
}
|
||||
|
||||
// NOTE(review): &state[0] requires a non-empty array, and a 128-bit load reads 16 bytes;
// presumably every caller passes a 16-byte state buffer - confirm against the call sites.
fixed (byte* ptr = &state[0])
|
||||
{
|
||||
op = Sse.StaticCast<byte, float>(Sse2.LoadVector128(ptr));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -392,8 +392,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fadd_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitScalarSseOrSse2OpF(context, nameof(Sse.AddScalar));
|
||||
}
|
||||
|
@ -408,8 +407,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fadd_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorSseOrSse2OpF(context, nameof(Sse.Add));
|
||||
}
|
||||
|
@ -470,8 +468,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Faddp_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Add));
|
||||
}
|
||||
|
@ -486,8 +483,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fdiv_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitScalarSseOrSse2OpF(context, nameof(Sse.DivideScalar));
|
||||
}
|
||||
|
@ -502,8 +498,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fdiv_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorSseOrSse2OpF(context, nameof(Sse.Divide));
|
||||
}
|
||||
|
@ -564,8 +559,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fmax_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitScalarSseOrSse2OpF(context, nameof(Sse.MaxScalar));
|
||||
}
|
||||
|
@ -580,8 +574,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fmax_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorSseOrSse2OpF(context, nameof(Sse.Max));
|
||||
}
|
||||
|
@ -612,8 +605,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fmaxp_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Max));
|
||||
}
|
||||
|
@ -628,8 +620,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fmin_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitScalarSseOrSse2OpF(context, nameof(Sse.MinScalar));
|
||||
}
|
||||
|
@ -644,8 +635,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fmin_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorSseOrSse2OpF(context, nameof(Sse.Min));
|
||||
}
|
||||
|
@ -676,8 +666,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fminp_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorPairwiseSseOrSse2OpF(context, nameof(Sse.Min));
|
||||
}
|
||||
|
@ -984,8 +973,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fmul_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitScalarSseOrSse2OpF(context, nameof(Sse.MultiplyScalar));
|
||||
}
|
||||
|
@ -1005,8 +993,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fmul_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorSseOrSse2OpF(context, nameof(Sse.Multiply));
|
||||
}
|
||||
|
@ -1753,8 +1740,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fsqrt_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitScalarSseOrSse2OpF(context, nameof(Sse.SqrtScalar));
|
||||
}
|
||||
|
@ -1769,8 +1755,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fsqrt_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorSseOrSse2OpF(context, nameof(Sse.Sqrt));
|
||||
}
|
||||
|
@ -1785,8 +1770,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fsub_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitScalarSseOrSse2OpF(context, nameof(Sse.SubtractScalar));
|
||||
}
|
||||
|
@ -1801,8 +1785,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fsub_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitVectorSseOrSse2OpF(context, nameof(Sse.Subtract));
|
||||
}
|
||||
|
@ -2268,6 +2251,15 @@ namespace ChocolArm64.Instructions
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the Smlal_Ve instruction through EmitVectorWidenTernaryOpByElemSx,
/// supplying a per-element callback that emits a Mul followed by an Add.
/// </summary>
/// <param name="context">Emitter context for the instruction being translated.</param>
public static void Smlal_Ve(ILEmitterCtx context)
|
||||
{
|
||||
EmitVectorWidenTernaryOpByElemSx(context, () =>
|
||||
{
|
||||
// Mul then Add: presumably accumulator + (Rn element * selected Rm element),
// given the order in which the widen-by-element helper loads its operands.
context.Emit(OpCodes.Mul);
|
||||
context.Emit(OpCodes.Add);
|
||||
});
|
||||
}
|
||||
|
||||
public static void Smlsl_V(ILEmitterCtx context)
|
||||
{
|
||||
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
|
||||
|
@ -2319,11 +2311,25 @@ namespace ChocolArm64.Instructions
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the Smlsl_Ve instruction through EmitVectorWidenTernaryOpByElemSx,
/// supplying a per-element callback that emits a Mul followed by a Sub.
/// </summary>
/// <param name="context">Emitter context for the instruction being translated.</param>
public static void Smlsl_Ve(ILEmitterCtx context)
|
||||
{
|
||||
EmitVectorWidenTernaryOpByElemSx(context, () =>
|
||||
{
|
||||
// Mul then Sub: presumably accumulator - (Rn element * selected Rm element),
// given the order in which the widen-by-element helper loads its operands.
context.Emit(OpCodes.Mul);
|
||||
context.Emit(OpCodes.Sub);
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the Smull_V instruction by delegating to EmitVectorWidenRnRmBinaryOpSx
/// with a callback that emits a single Mul.
/// </summary>
/// <param name="context">Emitter context for the instruction being translated.</param>
public static void Smull_V(ILEmitterCtx context)
|
||||
{
|
||||
EmitVectorWidenRnRmBinaryOpSx(context, () => context.Emit(OpCodes.Mul));
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the Smull_Ve instruction by delegating to EmitVectorWidenBinaryOpByElemSx
/// with a callback that emits a single Mul.
/// </summary>
/// <param name="context">Emitter context for the instruction being translated.</param>
public static void Smull_Ve(ILEmitterCtx context)
|
||||
{
|
||||
EmitVectorWidenBinaryOpByElemSx(context, () => context.Emit(OpCodes.Mul));
|
||||
}
|
||||
|
||||
public static void Sqabs_S(ILEmitterCtx context)
|
||||
{
|
||||
EmitScalarSaturatingUnaryOpSx(context, () => EmitAbs(context));
|
||||
|
@ -2929,6 +2935,15 @@ namespace ChocolArm64.Instructions
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the Umlal_Ve instruction through EmitVectorWidenTernaryOpByElemZx,
/// supplying a per-element callback that emits a Mul followed by an Add.
/// </summary>
/// <param name="context">Emitter context for the instruction being translated.</param>
public static void Umlal_Ve(ILEmitterCtx context)
|
||||
{
|
||||
EmitVectorWidenTernaryOpByElemZx(context, () =>
|
||||
{
|
||||
// Mul then Add: presumably accumulator + (Rn element * selected Rm element),
// given the order in which the widen-by-element helper loads its operands.
context.Emit(OpCodes.Mul);
|
||||
context.Emit(OpCodes.Add);
|
||||
});
|
||||
}
|
||||
|
||||
public static void Umlsl_V(ILEmitterCtx context)
|
||||
{
|
||||
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
|
||||
|
@ -2980,11 +2995,25 @@ namespace ChocolArm64.Instructions
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the Umlsl_Ve instruction through EmitVectorWidenTernaryOpByElemZx,
/// supplying a per-element callback that emits a Mul followed by a Sub.
/// </summary>
/// <param name="context">Emitter context for the instruction being translated.</param>
public static void Umlsl_Ve(ILEmitterCtx context)
|
||||
{
|
||||
EmitVectorWidenTernaryOpByElemZx(context, () =>
|
||||
{
|
||||
// Mul then Sub: presumably accumulator - (Rn element * selected Rm element),
// given the order in which the widen-by-element helper loads its operands.
context.Emit(OpCodes.Mul);
|
||||
context.Emit(OpCodes.Sub);
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the Umull_V instruction by delegating to EmitVectorWidenRnRmBinaryOpZx
/// with a callback that emits a single Mul.
/// </summary>
/// <param name="context">Emitter context for the instruction being translated.</param>
public static void Umull_V(ILEmitterCtx context)
|
||||
{
|
||||
EmitVectorWidenRnRmBinaryOpZx(context, () => context.Emit(OpCodes.Mul));
|
||||
}
|
||||
|
||||
/// <summary>
/// Emits the Umull_Ve instruction by delegating to EmitVectorWidenBinaryOpByElemZx
/// with a callback that emits a single Mul.
/// </summary>
/// <param name="context">Emitter context for the instruction being translated.</param>
public static void Umull_Ve(ILEmitterCtx context)
|
||||
{
|
||||
EmitVectorWidenBinaryOpByElemZx(context, () => context.Emit(OpCodes.Mul));
|
||||
}
|
||||
|
||||
public static void Uqadd_S(ILEmitterCtx context)
|
||||
{
|
||||
EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add);
|
||||
|
|
|
@ -173,8 +173,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmeq_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqualScalar), scalar: true);
|
||||
}
|
||||
|
@ -186,8 +185,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmeq_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareEqual), scalar: false);
|
||||
}
|
||||
|
@ -199,8 +197,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmge_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true);
|
||||
}
|
||||
|
@ -212,8 +209,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmge_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false);
|
||||
}
|
||||
|
@ -225,8 +221,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmgt_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true);
|
||||
}
|
||||
|
@ -238,8 +233,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmgt_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false);
|
||||
}
|
||||
|
@ -251,8 +245,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmle_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqualScalar), scalar: true, isLeOrLt: true);
|
||||
}
|
||||
|
@ -264,8 +257,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmle_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanOrEqual), scalar: false, isLeOrLt: true);
|
||||
}
|
||||
|
@ -277,8 +269,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmlt_S(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThanScalar), scalar: true, isLeOrLt: true);
|
||||
}
|
||||
|
@ -290,8 +281,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static void Fcmlt_V(ILEmitterCtx context)
|
||||
{
|
||||
if (Optimizations.FastFP && Optimizations.UseSse
|
||||
&& Optimizations.UseSse2)
|
||||
if (Optimizations.FastFP && Optimizations.UseSse2)
|
||||
{
|
||||
EmitCmpSseOrSse2OpF(context, nameof(Sse.CompareGreaterThan), scalar: false, isLeOrLt: true);
|
||||
}
|
||||
|
|
|
@ -78,7 +78,6 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
if (Optimizations.UseSse2 && sizeF == 1)
|
||||
{
|
||||
Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
|
||||
Type[] typesCvt = new Type[] { typeof(Vector128<float>) };
|
||||
|
||||
string nameMov = op.RegisterSize == RegisterSize.Simd128
|
||||
|
@ -88,7 +87,7 @@ namespace ChocolArm64.Instructions
|
|||
context.EmitLdvec(op.Rn);
|
||||
context.Emit(OpCodes.Dup);
|
||||
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameMov));
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Double), typesCvt));
|
||||
|
||||
|
@ -144,7 +143,6 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
if (Optimizations.UseSse2 && sizeF == 1)
|
||||
{
|
||||
Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
|
||||
Type[] typesCvt = new Type[] { typeof(Vector128<double>) };
|
||||
|
||||
string nameMov = op.RegisterSize == RegisterSize.Simd128
|
||||
|
@ -154,15 +152,15 @@ namespace ChocolArm64.Instructions
|
|||
context.EmitLdvec(op.Rd);
|
||||
VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
|
||||
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
|
||||
|
||||
EmitLdvecWithCastToDouble(context, op.Rn);
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertToVector128Single), typesCvt));
|
||||
context.Emit(OpCodes.Dup);
|
||||
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
|
||||
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameMov));
|
||||
|
||||
context.EmitStvec(op.Rd);
|
||||
}
|
||||
|
|
|
@ -642,21 +642,21 @@ namespace ChocolArm64.Instructions
|
|||
{
|
||||
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
|
||||
|
||||
EmitVectorOpByElem(context, emit, op.Index, false, true);
|
||||
EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: true);
|
||||
}
|
||||
|
||||
public static void EmitVectorBinaryOpByElemZx(ILEmitterCtx context, Action emit)
|
||||
{
|
||||
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
|
||||
|
||||
EmitVectorOpByElem(context, emit, op.Index, false, false);
|
||||
EmitVectorOpByElem(context, emit, op.Index, ternary: false, signed: false);
|
||||
}
|
||||
|
||||
public static void EmitVectorTernaryOpByElemZx(ILEmitterCtx context, Action emit)
|
||||
{
|
||||
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
|
||||
|
||||
EmitVectorOpByElem(context, emit, op.Index, true, false);
|
||||
EmitVectorOpByElem(context, emit, op.Index, ternary: true, signed: false);
|
||||
}
|
||||
|
||||
public static void EmitVectorOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
|
||||
|
@ -809,6 +809,64 @@ namespace ChocolArm64.Instructions
|
|||
context.EmitStvec(op.Rd);
|
||||
}
|
||||
|
||||
/// <summary>
/// Widening by-element operation, binary form, signed: forwards to
/// EmitVectorWidenOpByElem with ternary: false and signed: true, using the
/// element index taken from the current opcode.
/// </summary>
/// <param name="context">Emitter context; its CurrOp is cast to OpCodeSimdRegElem64.</param>
/// <param name="emit">Callback invoked once per element to emit the operation.</param>
public static void EmitVectorWidenBinaryOpByElemSx(ILEmitterCtx context, Action emit)
|
||||
{
|
||||
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
|
||||
|
||||
EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: true);
|
||||
}
|
||||
|
||||
/// <summary>
/// Widening by-element operation, binary form, unsigned: forwards to
/// EmitVectorWidenOpByElem with ternary: false and signed: false, using the
/// element index taken from the current opcode.
/// </summary>
/// <param name="context">Emitter context; its CurrOp is cast to OpCodeSimdRegElem64.</param>
/// <param name="emit">Callback invoked once per element to emit the operation.</param>
public static void EmitVectorWidenBinaryOpByElemZx(ILEmitterCtx context, Action emit)
|
||||
{
|
||||
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
|
||||
|
||||
EmitVectorWidenOpByElem(context, emit, op.Index, ternary: false, signed: false);
|
||||
}
|
||||
|
||||
/// <summary>
/// Widening by-element operation, ternary form, signed: forwards to
/// EmitVectorWidenOpByElem with ternary: true and signed: true, using the
/// element index taken from the current opcode.
/// </summary>
/// <param name="context">Emitter context; its CurrOp is cast to OpCodeSimdRegElem64.</param>
/// <param name="emit">Callback invoked once per element to emit the operation.</param>
public static void EmitVectorWidenTernaryOpByElemSx(ILEmitterCtx context, Action emit)
|
||||
{
|
||||
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
|
||||
|
||||
EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: true);
|
||||
}
|
||||
|
||||
/// <summary>
/// Widening by-element operation, ternary form, unsigned: forwards to
/// EmitVectorWidenOpByElem with ternary: true and signed: false, using the
/// element index taken from the current opcode.
/// </summary>
/// <param name="context">Emitter context; its CurrOp is cast to OpCodeSimdRegElem64.</param>
/// <param name="emit">Callback invoked once per element to emit the operation.</param>
public static void EmitVectorWidenTernaryOpByElemZx(ILEmitterCtx context, Action emit)
|
||||
{
|
||||
OpCodeSimdRegElem64 op = (OpCodeSimdRegElem64)context.CurrOp;
|
||||
|
||||
EmitVectorWidenOpByElem(context, emit, op.Index, ternary: true, signed: false);
|
||||
}
|
||||
|
||||
/// <summary>
/// Shared emitter for widening "by element" vector operations. One element of Rm is
/// extracted once; then, for each of the (8 >> op.Size) elements, the Rn element and
/// that Rm element are loaded, <paramref name="emit"/> is invoked, and the result is
/// inserted into the temporary vector at the next larger element size (op.Size + 1).
/// The temporary vector is finally stored to Rd.
/// </summary>
/// <param name="context">Emitter context; its CurrOp is cast to OpCodeSimdReg64.</param>
/// <param name="emit">Callback that emits the per-element operation.</param>
/// <param name="elem">Index of the Rm element used for every iteration.</param>
/// <param name="ternary">When true, the Rd element (at op.Size + 1) is extracted before the Rn element.</param>
/// <param name="signed">Passed through to every EmitVectorExtract call.</param>
public static void EmitVectorWidenOpByElem(ILEmitterCtx context, Action emit, int elem, bool ternary, bool signed)
|
||||
{
|
||||
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
|
||||
|
||||
// Number of source elements processed per instruction.
int elems = 8 >> op.Size;
|
||||
|
||||
// Offset added to the Rn element index: elems for the Simd128 register size, 0 otherwise.
// NOTE(review): presumably this selects the upper half of Rn for the 128-bit form - confirm.
int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;
|
||||
|
||||
// Extract the selected Rm element once and keep it in the scalar temporary.
EmitVectorExtract(context, op.Rm, elem, op.Size, signed);
|
||||
context.EmitSttmp();
|
||||
|
||||
for (int index = 0; index < elems; index++)
|
||||
{
|
||||
if (ternary)
|
||||
{
|
||||
// Destination element is read at the widened size, before the two source operands.
EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed);
|
||||
}
|
||||
|
||||
EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);
|
||||
context.EmitLdtmp();
|
||||
|
||||
emit();
|
||||
|
||||
// Result goes into the temporary vector at the widened element size.
EmitVectorInsertTmp(context, index, op.Size + 1);
|
||||
}
|
||||
|
||||
context.EmitLdvectmp();
|
||||
context.EmitStvec(op.Rd);
|
||||
}
|
||||
|
||||
public static void EmitVectorPairwiseOpSx(ILEmitterCtx context, Action emit)
|
||||
{
|
||||
EmitVectorPairwiseOp(context, emit, true);
|
||||
|
@ -1416,7 +1474,7 @@ namespace ChocolArm64.Instructions
|
|||
if (Optimizations.UseSse)
|
||||
{
|
||||
//TODO: Use Sse2.MoveScalar once it is fixed,
|
||||
//as of the time of writing it just crashes the JIT (SDK 2.1.500).
|
||||
//as of the time of writing it just crashes the JIT (SDK 2.1.503).
|
||||
|
||||
/*Type[] typesMov = new Type[] { typeof(Vector128<ulong>) };
|
||||
|
||||
|
|
|
@ -12,6 +12,34 @@ namespace ChocolArm64.Instructions
|
|||
{
|
||||
static partial class InstEmit
|
||||
{
|
||||
#region "Masks"
|
||||
// Three 64-bit constants, indexed by op.Size where this table is used in this listing.
// Each constant packs eight byte values, one per byte position (shift 0 = lowest byte).
// Low-to-high byte values per entry:
//   [0]: 0, 2, 4, 6, 8, 10, 12, 14
//   [1]: 0, 1, 4, 5, 8, 9, 12, 13
//   [2]: 0, 1, 2, 3, 8, 9, 10, 11
// The values are passed to Sse2.SetVector128 and the result to Ssse3.Shuffle,
// so they presumably act as byte-selection indices for the shuffle.
private static readonly long[] _masksE0_TrnUzpXtn = new long[]
|
||||
{
|
||||
14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0,
|
||||
13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
|
||||
11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
|
||||
};
|
||||
|
||||
// Three 64-bit constants, indexed by op.Size where this table is used in this listing.
// Each constant packs eight byte values, one per byte position (shift 0 = lowest byte).
// Low-to-high byte values per entry:
//   [0]: 1, 3, 5, 7, 9, 11, 13, 15
//   [1]: 2, 3, 6, 7, 10, 11, 14, 15
//   [2]: 4, 5, 6, 7, 12, 13, 14, 15
// The values are passed to Sse2.SetVector128 and the result to Ssse3.Shuffle,
// so they presumably act as byte-selection indices for the shuffle.
private static readonly long[] _masksE1_TrnUzp = new long[]
|
||||
{
|
||||
15L << 56 | 13L << 48 | 11L << 40 | 09L << 32 | 07L << 24 | 05L << 16 | 03L << 8 | 01L << 0,
|
||||
15L << 56 | 14L << 48 | 11L << 40 | 10L << 32 | 07L << 24 | 06L << 16 | 03L << 8 | 02L << 0,
|
||||
15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0
|
||||
};
|
||||
|
||||
// Two 64-bit constants, each packing eight byte values, one per byte position
// (shift 0 = lowest byte). Low-to-high byte values per entry:
//   [0]: 0, 4, 8, 12, 1, 5, 9, 13
//   [1]: 0, 1, 8, 9, 2, 3, 10, 11
// NOTE(review): the code that reads this table is not visible in this excerpt;
// judging by the name it is presumably used by the Uzp emitters - confirm.
private static readonly long[] _masksE0_Uzp = new long[]
|
||||
{
|
||||
13L << 56 | 09L << 48 | 05L << 40 | 01L << 32 | 12L << 24 | 08L << 16 | 04L << 8 | 00L << 0,
|
||||
11L << 56 | 10L << 48 | 03L << 40 | 02L << 32 | 09L << 24 | 08L << 16 | 01L << 8 | 00L << 0
|
||||
};
|
||||
|
||||
// Two 64-bit constants, each packing eight byte values, one per byte position
// (shift 0 = lowest byte). Low-to-high byte values per entry:
//   [0]: 2, 6, 10, 14, 3, 7, 11, 15
//   [1]: 4, 5, 12, 13, 6, 7, 14, 15
// NOTE(review): the code that reads this table is not visible in this excerpt;
// judging by the name it is presumably used by the Uzp emitters - confirm.
private static readonly long[] _masksE1_Uzp = new long[]
|
||||
{
|
||||
15L << 56 | 11L << 48 | 07L << 40 | 03L << 32 | 14L << 24 | 10L << 16 | 06L << 8 | 02L << 0,
|
||||
15L << 56 | 14L << 48 | 07L << 40 | 06L << 32 | 13L << 24 | 12L << 16 | 05L << 8 | 04L << 0
|
||||
};
|
||||
#endregion
|
||||
|
||||
public static void Dup_Gp(ILEmitterCtx context)
|
||||
{
|
||||
OpCodeSimdIns64 op = (OpCodeSimdIns64)context.CurrOp;
|
||||
|
@ -379,15 +407,6 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
if (Optimizations.UseSsse3)
|
||||
{
|
||||
long[] masks = new long[]
|
||||
{
|
||||
14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0,
|
||||
13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0,
|
||||
11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0
|
||||
};
|
||||
|
||||
Type[] typesMov = new Type[] { typeof(Vector128<float>), typeof(Vector128<float>) };
|
||||
Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
|
||||
Type[] typesSve = new Type[] { typeof(long), typeof(long) };
|
||||
|
||||
string nameMov = op.RegisterSize == RegisterSize.Simd128
|
||||
|
@ -397,18 +416,18 @@ namespace ChocolArm64.Instructions
|
|||
context.EmitLdvec(op.Rd);
|
||||
VectorHelper.EmitCall(context, nameof(VectorHelper.VectorSingleZero));
|
||||
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh), typesMov));
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MoveLowToHigh)));
|
||||
|
||||
EmitLdvecWithSignedCast(context, op.Rn, 0);
|
||||
EmitLdvecWithSignedCast(context, op.Rn, 0); // value
|
||||
|
||||
context.EmitLdc_I8(masks[op.Size]);
|
||||
context.Emit(OpCodes.Dup);
|
||||
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // mask
|
||||
context.Emit(OpCodes.Dup); // mask
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
|
||||
|
||||
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl));
|
||||
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
|
||||
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameMov, typesMov));
|
||||
context.EmitCall(typeof(Sse).GetMethod(nameMov));
|
||||
|
||||
context.EmitStvec(op.Rd);
|
||||
}
|
||||
|
@ -465,22 +484,61 @@ namespace ChocolArm64.Instructions
|
|||
{
|
||||
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
|
||||
|
||||
int words = op.GetBitsCount() >> 4;
|
||||
int pairs = words >> op.Size;
|
||||
|
||||
for (int index = 0; index < pairs; index++)
|
||||
if (Optimizations.UseSsse3)
|
||||
{
|
||||
int idx = index << 1;
|
||||
Type[] typesSve = new Type[] { typeof(long), typeof(long) };
|
||||
|
||||
EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
|
||||
EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
|
||||
string nameUpk = part == 0
|
||||
? nameof(Sse2.UnpackLow)
|
||||
: nameof(Sse2.UnpackHigh);
|
||||
|
||||
EmitVectorInsertTmp(context, idx + 1, op.Size);
|
||||
EmitVectorInsertTmp(context, idx, op.Size);
|
||||
EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value
|
||||
|
||||
if (op.Size < 3)
|
||||
{
|
||||
context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1
|
||||
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
|
||||
|
||||
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
|
||||
}
|
||||
|
||||
EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value
|
||||
|
||||
if (op.Size < 3)
|
||||
{
|
||||
context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1
|
||||
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
|
||||
|
||||
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
|
||||
}
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size)));
|
||||
|
||||
EmitStvecWithSignedCast(context, op.Rd, op.Size);
|
||||
}
|
||||
else
|
||||
{
|
||||
int words = op.GetBitsCount() >> 4;
|
||||
int pairs = words >> op.Size;
|
||||
|
||||
context.EmitLdvectmp();
|
||||
context.EmitStvec(op.Rd);
|
||||
for (int index = 0; index < pairs; index++)
|
||||
{
|
||||
int idx = index << 1;
|
||||
|
||||
EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
|
||||
EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
|
||||
|
||||
EmitVectorInsertTmp(context, idx + 1, op.Size);
|
||||
EmitVectorInsertTmp(context, idx, op.Size);
|
||||
}
|
||||
|
||||
context.EmitLdvectmp();
|
||||
context.EmitStvec(op.Rd);
|
||||
}
|
||||
|
||||
if (op.RegisterSize == RegisterSize.Simd64)
|
||||
{
|
||||
|
@ -492,26 +550,91 @@ namespace ChocolArm64.Instructions
|
|||
{
|
||||
OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
|
||||
|
||||
int words = op.GetBitsCount() >> 4;
|
||||
int pairs = words >> op.Size;
|
||||
|
||||
for (int index = 0; index < pairs; index++)
|
||||
if (Optimizations.UseSsse3)
|
||||
{
|
||||
int idx = index << 1;
|
||||
Type[] typesSve = new Type[] { typeof(long), typeof(long) };
|
||||
|
||||
EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
|
||||
EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
|
||||
string nameUpk = part == 0
|
||||
? nameof(Sse2.UnpackLow)
|
||||
: nameof(Sse2.UnpackHigh);
|
||||
|
||||
EmitVectorInsertTmp(context, pairs + index, op.Size);
|
||||
EmitVectorInsertTmp(context, index, op.Size);
|
||||
if (op.RegisterSize == RegisterSize.Simd128)
|
||||
{
|
||||
EmitLdvecWithSignedCast(context, op.Rn, op.Size); // value
|
||||
|
||||
if (op.Size < 3)
|
||||
{
|
||||
context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1
|
||||
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
|
||||
|
||||
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
|
||||
}
|
||||
|
||||
EmitLdvecWithSignedCast(context, op.Rm, op.Size); // value
|
||||
|
||||
if (op.Size < 3)
|
||||
{
|
||||
context.EmitLdc_I8(_masksE1_TrnUzp [op.Size]); // maskE1
|
||||
context.EmitLdc_I8(_masksE0_TrnUzpXtn[op.Size]); // maskE0
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
|
||||
|
||||
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
|
||||
}
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
|
||||
|
||||
EmitStvecWithSignedCast(context, op.Rd, op.Size);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitLdvecWithSignedCast(context, op.Rn, op.Size);
|
||||
EmitLdvecWithSignedCast(context, op.Rm, op.Size);
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size))); // value
|
||||
|
||||
if (op.Size < 2)
|
||||
{
|
||||
context.EmitLdc_I8(_masksE1_Uzp[op.Size]); // maskE1
|
||||
context.EmitLdc_I8(_masksE0_Uzp[op.Size]); // maskE0
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve));
|
||||
|
||||
context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), GetTypesSflUpk(0)));
|
||||
}
|
||||
|
||||
VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero));
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
|
||||
|
||||
EmitStvecWithSignedCast(context, op.Rd, op.Size);
|
||||
}
|
||||
}
|
||||
|
||||
context.EmitLdvectmp();
|
||||
context.EmitStvec(op.Rd);
|
||||
|
||||
if (op.RegisterSize == RegisterSize.Simd64)
|
||||
else
|
||||
{
|
||||
EmitVectorZeroUpper(context, op.Rd);
|
||||
int words = op.GetBitsCount() >> 4;
|
||||
int pairs = words >> op.Size;
|
||||
|
||||
for (int index = 0; index < pairs; index++)
|
||||
{
|
||||
int idx = index << 1;
|
||||
|
||||
EmitVectorExtractZx(context, op.Rn, idx + part, op.Size);
|
||||
EmitVectorExtractZx(context, op.Rm, idx + part, op.Size);
|
||||
|
||||
EmitVectorInsertTmp(context, pairs + index, op.Size);
|
||||
EmitVectorInsertTmp(context, index, op.Size);
|
||||
}
|
||||
|
||||
context.EmitLdvectmp();
|
||||
context.EmitStvec(op.Rd);
|
||||
|
||||
if (op.RegisterSize == RegisterSize.Simd64)
|
||||
{
|
||||
EmitVectorZeroUpper(context, op.Rd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -521,36 +644,26 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
if (Optimizations.UseSse2)
|
||||
{
|
||||
EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
|
||||
EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
|
||||
|
||||
Type[] types = new Type[]
|
||||
{
|
||||
VectorUIntTypesPerSizeLog2[op.Size],
|
||||
VectorUIntTypesPerSizeLog2[op.Size]
|
||||
};
|
||||
|
||||
string name = part == 0 || (part != 0 && op.RegisterSize == RegisterSize.Simd64)
|
||||
string nameUpk = part == 0
|
||||
? nameof(Sse2.UnpackLow)
|
||||
: nameof(Sse2.UnpackHigh);
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(name, types));
|
||||
EmitLdvecWithSignedCast(context, op.Rn, op.Size);
|
||||
EmitLdvecWithSignedCast(context, op.Rm, op.Size);
|
||||
|
||||
if (op.RegisterSize == RegisterSize.Simd64 && part != 0)
|
||||
if (op.RegisterSize == RegisterSize.Simd128)
|
||||
{
|
||||
context.EmitLdc_I4(8);
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(op.Size)));
|
||||
}
|
||||
else
|
||||
{
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.UnpackLow), GetTypesSflUpk(op.Size)));
|
||||
VectorHelper.EmitCall(context, nameof(VectorHelper.VectorInt64Zero));
|
||||
|
||||
Type[] shTypes = new Type[] { VectorUIntTypesPerSizeLog2[op.Size], typeof(byte) };
|
||||
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), shTypes));
|
||||
context.EmitCall(typeof(Sse2).GetMethod(nameUpk, GetTypesSflUpk(3)));
|
||||
}
|
||||
|
||||
EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
|
||||
|
||||
if (op.RegisterSize == RegisterSize.Simd64 && part == 0)
|
||||
{
|
||||
EmitVectorZeroUpper(context, op.Rd);
|
||||
}
|
||||
EmitStvecWithSignedCast(context, op.Rd, op.Size);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -579,5 +692,10 @@ namespace ChocolArm64.Instructions
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the two-entry parameter-type list used when looking up two-operand
// vector intrinsics by reflection (the callers in this file pass the result to
// GetMethod for Sse2 UnpackLow/UnpackHigh and Ssse3.Shuffle).
// "size" is the index into VectorIntTypesPerSizeLog2; both entries of the
// returned array are that same table element.
private static Type[] GetTypesSflUpk(int size)
{
    Type operandType = VectorIntTypesPerSizeLog2[size];

    return new Type[] { operandType, operandType };
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -664,7 +664,7 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
for (int bit = highBit; bit >= 0; bit--)
|
||||
{
|
||||
if (((value >> bit) & 0b1) != 0)
|
||||
if (((int)(value >> bit) & 0b1) != 0)
|
||||
{
|
||||
return (ulong)(highBit - bit);
|
||||
}
|
||||
|
@ -688,7 +688,7 @@ namespace ChocolArm64.Instructions
|
|||
do
|
||||
{
|
||||
nibbleIdx -= 4;
|
||||
preCount = ClzNibbleTbl[(value >> nibbleIdx) & 0b1111];
|
||||
preCount = ClzNibbleTbl[(int)(value >> nibbleIdx) & 0b1111];
|
||||
count += preCount;
|
||||
}
|
||||
while (preCount == 4);
|
||||
|
@ -698,11 +698,6 @@ namespace ChocolArm64.Instructions
|
|||
|
||||
public static ulong CountSetBits8(ulong value) // "size" is 8 (SIMD&FP Inst.).
|
||||
{
|
||||
if (value == 0xfful)
|
||||
{
|
||||
return 8ul;
|
||||
}
|
||||
|
||||
value = ((value >> 1) & 0x55ul) + (value & 0x55ul);
|
||||
value = ((value >> 2) & 0x33ul) + (value & 0x33ul);
|
||||
|
||||
|
|
|
@ -1545,9 +1545,9 @@ namespace ChocolArm64.Instructions
|
|||
return -value;
|
||||
}
|
||||
|
||||
private static float ZerosOrOnes(bool zeros)
|
||||
private static float ZerosOrOnes(bool ones)
|
||||
{
|
||||
return BitConverter.Int32BitsToSingle(!zeros ? 0 : -1);
|
||||
return BitConverter.Int32BitsToSingle(ones ? -1 : 0);
|
||||
}
|
||||
|
||||
private static float FPUnpack(
|
||||
|
@ -2629,9 +2629,9 @@ namespace ChocolArm64.Instructions
|
|||
return -value;
|
||||
}
|
||||
|
||||
private static double ZerosOrOnes(bool zeros)
|
||||
private static double ZerosOrOnes(bool ones)
|
||||
{
|
||||
return BitConverter.Int64BitsToDouble(!zeros ? 0L : -1L);
|
||||
return BitConverter.Int64BitsToDouble(ones ? -1L : 0L);
|
||||
}
|
||||
|
||||
private static double FPUnpack(
|
||||
|
|
|
@ -445,9 +445,12 @@ namespace ChocolArm64
|
|||
SetA64("0x001110<<1xxxxx011011xxxxxxxxxx", InstEmit.Smin_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x001110<<1xxxxx101011xxxxxxxxxx", InstEmit.Sminp_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x001110<<1xxxxx100000xxxxxxxxxx", InstEmit.Smlal_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x001111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Smlal_Ve, typeof(OpCodeSimdRegElem64));
|
||||
SetA64("0x001110<<1xxxxx101000xxxxxxxxxx", InstEmit.Smlsl_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x001111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Smlsl_Ve, typeof(OpCodeSimdRegElem64));
|
||||
SetA64("0x001110000xxxxx001011xxxxxxxxxx", InstEmit.Smov_S, typeof(OpCodeSimdIns64));
|
||||
SetA64("0x001110<<1xxxxx110000xxxxxxxxxx", InstEmit.Smull_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x001111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Smull_Ve, typeof(OpCodeSimdRegElem64));
|
||||
SetA64("01011110xx100000011110xxxxxxxxxx", InstEmit.Sqabs_S, typeof(OpCodeSimd64));
|
||||
SetA64("0>001110<<100000011110xxxxxxxxxx", InstEmit.Sqabs_V, typeof(OpCodeSimd64));
|
||||
SetA64("01011110xx1xxxxx000011xxxxxxxxxx", InstEmit.Sqadd_S, typeof(OpCodeSimdReg64));
|
||||
|
@ -534,9 +537,12 @@ namespace ChocolArm64
|
|||
SetA64("0x101110<<1xxxxx011011xxxxxxxxxx", InstEmit.Umin_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x101110<<1xxxxx101011xxxxxxxxxx", InstEmit.Uminp_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x101110<<1xxxxx100000xxxxxxxxxx", InstEmit.Umlal_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x101111xxxxxxxx0010x0xxxxxxxxxx", InstEmit.Umlal_Ve, typeof(OpCodeSimdRegElem64));
|
||||
SetA64("0x101110<<1xxxxx101000xxxxxxxxxx", InstEmit.Umlsl_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x101111xxxxxxxx0110x0xxxxxxxxxx", InstEmit.Umlsl_Ve, typeof(OpCodeSimdRegElem64));
|
||||
SetA64("0x001110000xxxxx001111xxxxxxxxxx", InstEmit.Umov_S, typeof(OpCodeSimdIns64));
|
||||
SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0x101111xxxxxxxx1010x0xxxxxxxxxx", InstEmit.Umull_Ve, typeof(OpCodeSimdRegElem64));
|
||||
SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S, typeof(OpCodeSimdReg64));
|
||||
SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V, typeof(OpCodeSimdReg64));
|
||||
SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V, typeof(OpCodeSimdReg64));
|
||||
|
|
|
@ -45,6 +45,32 @@ namespace Ryujinx.Tests.Cpu
|
|||
0x0F808000u // MUL V0.2S, V0.2S, V0.S[0]
|
||||
};
|
||||
}
|
||||
|
||||
// Base encodings (all register and index fields zero) for the by-element
// widening multiply tests with halfword source elements. The matching test
// ORs the register numbers, element index bits and Q bit into these values.
private static uint[] _SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_()
{
    uint[] baseOpcodes =
    {
        0x0F402000u, // SMLAL V0.4S, V0.4H, V0.H[0]
        0x0F406000u, // SMLSL V0.4S, V0.4H, V0.H[0]
        0x0F40A000u, // SMULL V0.4S, V0.4H, V0.H[0]
        0x2F402000u, // UMLAL V0.4S, V0.4H, V0.H[0]
        0x2F406000u, // UMLSL V0.4S, V0.4H, V0.H[0]
        0x2F40A000u  // UMULL V0.4S, V0.4H, V0.H[0]
    };

    return baseOpcodes;
}
|
||||
|
||||
// Base encodings (all register and index fields zero) for the by-element
// widening multiply tests with word source elements. The matching test ORs
// the register numbers, element index bits and Q bit into these values.
private static uint[] _SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_()
{
    uint[] baseOpcodes =
    {
        0x0F802000u, // SMLAL V0.2D, V0.2S, V0.S[0]
        0x0F806000u, // SMLSL V0.2D, V0.2S, V0.S[0]
        0x0F80A000u, // SMULL V0.2D, V0.2S, V0.S[0]
        0x2F802000u, // UMLAL V0.2D, V0.2S, V0.S[0]
        0x2F806000u, // UMLSL V0.2D, V0.2S, V0.S[0]
        0x2F80A000u  // UMULL V0.2D, V0.2S, V0.S[0]
    };

    return baseOpcodes;
}
|
||||
#endregion
|
||||
|
||||
private const int RndCnt = 2;
|
||||
|
@ -103,6 +129,61 @@ namespace Ryujinx.Tests.Cpu
|
|||
|
||||
CompareAgainstUnicorn();
|
||||
}
|
||||
|
||||
// Runs each SMLAL/SMLSL/SMULL/UMLAL/UMLSL/UMULL (by element, halfword source)
// encoding for every element index 0..7 and both Q settings, then compares
// the result against Unicorn.
[Test, Pairwise]
public void SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_4H4S_8H4S_")] uint opcodes,
                                           [Values(0u)]     uint rd,
                                           [Values(1u, 0u)] uint rn,
                                           [Values(2u, 0u)] uint rm,
                                           [ValueSource("_4H_")] [Random(RndCnt)] ulong z,
                                           [ValueSource("_4H_")] [Random(RndCnt)] ulong a,
                                           [ValueSource("_4H_")] [Random(RndCnt)] ulong b,
                                           [Values(0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u)] uint index,
                                           [Values(0b0u, 0b1u)] uint q) // <4H4S, 8H4S>
{
    // The 3-bit element index is split into H:L:M, placed at bits 11, 21 and 20.
    uint hBit = (index >> 2) & 1;
    uint lBit = (index >> 1) & 1;
    uint mBit = index & 1;

    // Only four bits of rm are encoded here; bit 20 carries M instead.
    uint regFields   = ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);
    uint indexFields = (lBit << 21) | (mBit << 20) | (hBit << 11);
    uint qField      = (q & 1) << 30;

    opcodes |= regFields;
    opcodes |= indexFields;
    opcodes |= qField;

    // "a" goes into the E0 argument when q == 0 and into the E1 argument when q == 1;
    // the other argument is zero.
    ulong nE0 = q == 0u ? a : 0ul;
    ulong nE1 = q == 1u ? a : 0ul;

    // hBit is 0 or 1, so this equals b * hBit: E1 gets b only when the index has H set.
    ulong mE1 = hBit != 0u ? b : 0ul;

    Vector128<float> v0 = MakeVectorE0E1(z, z);
    Vector128<float> v1 = MakeVectorE0E1(nE0, nE1);
    Vector128<float> v2 = MakeVectorE0E1(b, mE1);

    SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);

    CompareAgainstUnicorn();
}
|
||||
|
||||
// Runs each SMLAL/SMLSL/SMULL/UMLAL/UMLSL/UMULL (by element, word source)
// encoding for every element index 0..3 and both Q settings, then compares
// the result against Unicorn.
[Test, Pairwise]
public void SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D([ValueSource("_SU_Mlal_Mlsl_Mull_Ve_2S2D_4S2D_")] uint opcodes,
                                           [Values(0u)]     uint rd,
                                           [Values(1u, 0u)] uint rn,
                                           [Values(2u, 0u)] uint rm,
                                           [ValueSource("_2S_")] [Random(RndCnt)] ulong z,
                                           [ValueSource("_2S_")] [Random(RndCnt)] ulong a,
                                           [ValueSource("_2S_")] [Random(RndCnt)] ulong b,
                                           [Values(0u, 1u, 2u, 3u)] uint index,
                                           [Values(0b0u, 0b1u)] uint q) // <2S2D, 4S2D>
{
    // The 2-bit element index is split into H:L, placed at bits 11 and 21.
    uint hBit = (index >> 1) & 1;
    uint lBit = index & 1;

    uint regFields   = ((rm & 15) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0);
    uint indexFields = (lBit << 21) | (hBit << 11);
    uint qField      = (q & 1) << 30;

    opcodes |= regFields;
    opcodes |= indexFields;
    opcodes |= qField;

    // "a" goes into the E0 argument when q == 0 and into the E1 argument when q == 1;
    // the other argument is zero.
    ulong nE0 = q == 0u ? a : 0ul;
    ulong nE1 = q == 1u ? a : 0ul;

    // hBit is 0 or 1, so this equals b * hBit: E1 gets b only when the index has H set.
    ulong mE1 = hBit != 0u ? b : 0ul;

    Vector128<float> v0 = MakeVectorE0E1(z, z);
    Vector128<float> v1 = MakeVectorE0E1(nE0, nE1);
    Vector128<float> v2 = MakeVectorE0E1(b, mE1);

    SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);

    CompareAgainstUnicorn();
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue