diff --git a/ChocolArm64/Instructions/InstEmitSimdMove.cs b/ChocolArm64/Instructions/InstEmitSimdMove.cs index 20647ce0..cdd35171 100644 --- a/ChocolArm64/Instructions/InstEmitSimdMove.cs +++ b/ChocolArm64/Instructions/InstEmitSimdMove.cs @@ -355,35 +355,94 @@ namespace ChocolArm64.Instructions { OpCodeSimdTbl64 op = (OpCodeSimdTbl64)context.CurrOp; - context.EmitLdvec(op.Rm); - - for (int index = 0; index < op.Size; index++) + if (Optimizations.UseSsse3) { - context.EmitLdvec((op.Rn + index) & 0x1f); - } + Type[] typesCmpSflSub = new Type[] { typeof(Vector128), typeof(Vector128) }; + Type[] typesOr = new Type[] { typeof(Vector128 ), typeof(Vector128 ) }; + Type[] typesSav = new Type[] { typeof(long) }; - switch (op.Size) + context.EmitLdvec(op.Rn); + context.EmitLdvec(op.Rm); + + context.EmitLdc_I8(0x0F0F0F0F0F0F0F0FL); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); + + context.EmitStvectmp2(); + context.EmitLdvectmp2(); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareGreaterThan), typesCmpSflSub)); + + context.EmitLdvec(op.Rm); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesOr)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesCmpSflSub)); + + for (int index = 1; index < op.Size; index++) + { + context.EmitLdvec((op.Rn + index) & 0x1F); + context.EmitLdvec(op.Rm); + + context.EmitLdc_I8(0x1010101010101010L * index); + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesCmpSflSub)); + + context.EmitStvectmp(); + context.EmitLdvectmp(); + + context.EmitLdvectmp2(); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.CompareGreaterThan), typesCmpSflSub)); + + context.EmitLdvectmp(); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesOr)); + + context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesCmpSflSub)); + + context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), typesOr)); + } + + context.EmitStvec(op.Rd); + + if (op.RegisterSize == RegisterSize.Simd64) + { + EmitVectorZeroUpper(context, op.Rd); + } + } + else { - case 1: VectorHelper.EmitCall(context, - nameof(VectorHelper.Tbl1_V64), - nameof(VectorHelper.Tbl1_V128)); break; + context.EmitLdvec(op.Rm); - case 2: VectorHelper.EmitCall(context, - nameof(VectorHelper.Tbl2_V64), - nameof(VectorHelper.Tbl2_V128)); break; + for (int index = 0; index < op.Size; index++) + { + context.EmitLdvec((op.Rn + index) & 0x1F); + } - case 3: VectorHelper.EmitCall(context, - nameof(VectorHelper.Tbl3_V64), - nameof(VectorHelper.Tbl3_V128)); break; + switch (op.Size) + { + case 1: VectorHelper.EmitCall(context, + nameof(VectorHelper.Tbl1_V64), + nameof(VectorHelper.Tbl1_V128)); break; - case 4: VectorHelper.EmitCall(context, - nameof(VectorHelper.Tbl4_V64), - nameof(VectorHelper.Tbl4_V128)); break; + case 2: VectorHelper.EmitCall(context, + nameof(VectorHelper.Tbl2_V64), + nameof(VectorHelper.Tbl2_V128)); break; - default: throw new InvalidOperationException(); + case 3: VectorHelper.EmitCall(context, + nameof(VectorHelper.Tbl3_V64), + nameof(VectorHelper.Tbl3_V128)); break; + + case 4: VectorHelper.EmitCall(context, + nameof(VectorHelper.Tbl4_V64), + nameof(VectorHelper.Tbl4_V128)); break; + + default: throw new InvalidOperationException(); + } + + context.EmitStvec(op.Rd); } - - context.EmitStvec(op.Rd); } public static void Trn1_V(ILEmitterCtx context) diff --git a/ChocolArm64/Translation/ILEmitterCtx.cs b/ChocolArm64/Translation/ILEmitterCtx.cs index f39bd371..8804521c 100644 --- a/ChocolArm64/Translation/ILEmitterCtx.cs +++ b/ChocolArm64/Translation/ILEmitterCtx.cs @@ -61,7 +61,9 @@ namespace ChocolArm64.Translation //Vectors are part of another "set" of locals. private const int VecGpTmp1Index = ReservedLocalsCount + 0; - private const int UserVecTempStart = ReservedLocalsCount + 1; + private const int VecGpTmp2Index = ReservedLocalsCount + 1; + private const int VecGpTmp3Index = ReservedLocalsCount + 2; + private const int UserVecTempStart = ReservedLocalsCount + 3; private static int _userIntTempCount; private static int _userVecTempCount; @@ -629,6 +631,12 @@ namespace ChocolArm64.Translation public void EmitLdvectmp() => EmitLdvec(VecGpTmp1Index); public void EmitStvectmp() => EmitStvec(VecGpTmp1Index); + public void EmitLdvectmp2() => EmitLdvec(VecGpTmp2Index); + public void EmitStvectmp2() => EmitStvec(VecGpTmp2Index); + + public void EmitLdvectmp3() => EmitLdvec(VecGpTmp3Index); + public void EmitStvectmp3() => EmitStvec(VecGpTmp3Index); + public void EmitLdint(int index) => Ldloc(index, VarType.Int); public void EmitStint(int index) => Stloc(index, VarType.Int); diff --git a/Ryujinx.Tests/Cpu/CpuTest.cs b/Ryujinx.Tests/Cpu/CpuTest.cs index 47feb573..b147cf44 100644 --- a/Ryujinx.Tests/Cpu/CpuTest.cs +++ b/Ryujinx.Tests/Cpu/CpuTest.cs @@ -93,10 +93,14 @@ namespace Ryujinx.Tests.Cpu } protected void SetThreadState(ulong x0 = 0, ulong x1 = 0, ulong x2 = 0, ulong x3 = 0, ulong x31 = 0, - Vector128 v0 = default(Vector128), - Vector128 v1 = default(Vector128), - Vector128 v2 = default(Vector128), - Vector128 v3 = default(Vector128), + Vector128 v0 = default(Vector128), + Vector128 v1 = default(Vector128), + Vector128 v2 = default(Vector128), + Vector128 v3 = default(Vector128), + Vector128 v4 = default(Vector128), + Vector128 v5 = default(Vector128), + Vector128 v30 = default(Vector128), + Vector128 v31 = default(Vector128), bool overflow = false, bool carry = false, bool zero = false, bool negative = false, int fpcr = 0x0, int fpsr = 0x0) { @@ -107,10 +111,14 @@ namespace Ryujinx.Tests.Cpu _thread.ThreadState.X31 = x31; - _thread.ThreadState.V0 = v0; - _thread.ThreadState.V1 = v1; - _thread.ThreadState.V2 = v2; - _thread.ThreadState.V3 = v3; + _thread.ThreadState.V0 = v0; + _thread.ThreadState.V1 = v1; + _thread.ThreadState.V2 = v2; + _thread.ThreadState.V3 = v3; + _thread.ThreadState.V4 = v4; + _thread.ThreadState.V5 = v5; + _thread.ThreadState.V30 = v30; + _thread.ThreadState.V31 = v31; _thread.ThreadState.Overflow = overflow; _thread.ThreadState.Carry = carry; @@ -129,10 +137,14 @@ namespace Ryujinx.Tests.Cpu _unicornEmu.SP = x31; - _unicornEmu.Q[0] = v0; - _unicornEmu.Q[1] = v1; - _unicornEmu.Q[2] = v2; - _unicornEmu.Q[3] = v3; + _unicornEmu.Q[0] = v0; + _unicornEmu.Q[1] = v1; + _unicornEmu.Q[2] = v2; + _unicornEmu.Q[3] = v3; + _unicornEmu.Q[4] = v4; + _unicornEmu.Q[5] = v5; + _unicornEmu.Q[30] = v30; + _unicornEmu.Q[31] = v31; _unicornEmu.OverflowFlag = overflow; _unicornEmu.CarryFlag = carry; @@ -165,17 +177,21 @@ namespace Ryujinx.Tests.Cpu protected CpuThreadState SingleOpcode(uint opcode, ulong x0 = 0, ulong x1 = 0, ulong x2 = 0, ulong x3 = 0, ulong x31 = 0, - Vector128 v0 = default(Vector128), - Vector128 v1 = default(Vector128), - Vector128 v2 = default(Vector128), - Vector128 v3 = default(Vector128), + Vector128 v0 = default(Vector128), + Vector128 v1 = default(Vector128), + Vector128 v2 = default(Vector128), + Vector128 v3 = default(Vector128), + Vector128 v4 = default(Vector128), + Vector128 v5 = default(Vector128), + Vector128 v30 = default(Vector128), + Vector128 v31 = default(Vector128), bool overflow = false, bool carry = false, bool zero = false, bool negative = false, int fpcr = 0x0, int fpsr = 0x0) { Opcode(opcode); Opcode(0xD4200000); // BRK #0 Opcode(0xD65F03C0); // RET - SetThreadState(x0, x1, x2, x3, x31, v0, v1, v2, v3, overflow, carry, zero, negative, fpcr, fpsr); + SetThreadState(x0, x1, x2, x3, x31, v0, v1, v2, v3, v4, v5, v30, v31, overflow, carry, zero, negative, fpcr, fpsr); ExecuteOpcodes(); return GetThreadState(); diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdTbl.cs b/Ryujinx.Tests/Cpu/CpuTestSimdTbl.cs new file mode 100644 index 00000000..69195af2 --- /dev/null +++ b/Ryujinx.Tests/Cpu/CpuTestSimdTbl.cs @@ -0,0 +1,315 @@ +#define SimdTbl + +using NUnit.Framework; + +using System.Collections.Generic; +using System.Runtime.Intrinsics; + +namespace Ryujinx.Tests.Cpu +{ + [Category("SimdTbl")] + public sealed class CpuTestSimdTbl : CpuTest + { +#if SimdTbl + +#region "Helper methods" + private static ulong GenIdxsForTbls(int regs) + { + const byte idxInRngMin = (byte)0; + byte idxInRngMax = (byte)((16 * regs) - 1); + byte idxOutRngMin = (byte) (16 * regs); + const byte idxOutRngMax = (byte)255; + + ulong idxs = 0ul; + + for (int cnt = 1; cnt <= 8; cnt++) + { + ulong idxInRng = (ulong)TestContext.CurrentContext.Random.NextByte(idxInRngMin, idxInRngMax); + ulong idxOutRng = (ulong)TestContext.CurrentContext.Random.NextByte(idxOutRngMin, idxOutRngMax); + + ulong idx = TestContext.CurrentContext.Random.NextBool() ? idxInRng : idxOutRng; + + idxs = (idxs << 8) | idx; + } + + return idxs; + } +#endregion + +#region "ValueSource (Types)" + private static ulong[] _8B_() + { + return new ulong[] { 0x0000000000000000ul, 0x7F7F7F7F7F7F7F7Ful, + 0x8080808080808080ul, 0xFFFFFFFFFFFFFFFFul }; + } + + private static IEnumerable _GenIdxsForTbl1_() + { + yield return 0x0000000000000000ul; + yield return 0x7F7F7F7F7F7F7F7Ful; + yield return 0x8080808080808080ul; + yield return 0xFFFFFFFFFFFFFFFFul; + + for (int cnt = 1; cnt <= RndCntIdxs; cnt++) + { + yield return GenIdxsForTbls(regs: 1); + } + } + + private static IEnumerable _GenIdxsForTbl2_() + { + yield return 0x0000000000000000ul; + yield return 0x7F7F7F7F7F7F7F7Ful; + yield return 0x8080808080808080ul; + yield return 0xFFFFFFFFFFFFFFFFul; + + for (int cnt = 1; cnt <= RndCntIdxs; cnt++) + { + yield return GenIdxsForTbls(regs: 2); + } + } + + private static IEnumerable _GenIdxsForTbl3_() + { + yield return 0x0000000000000000ul; + yield return 0x7F7F7F7F7F7F7F7Ful; + yield return 0x8080808080808080ul; + yield return 0xFFFFFFFFFFFFFFFFul; + + for (int cnt = 1; cnt <= RndCntIdxs; cnt++) + { + yield return GenIdxsForTbls(regs: 3); + } + } + + private static IEnumerable _GenIdxsForTbl4_() + { + yield return 0x0000000000000000ul; + yield return 0x7F7F7F7F7F7F7F7Ful; + yield return 0x8080808080808080ul; + yield return 0xFFFFFFFFFFFFFFFFul; + + for (int cnt = 1; cnt <= RndCntIdxs; cnt++) + { + yield return GenIdxsForTbls(regs: 4); + } + } +#endregion + +#region "ValueSource (Opcodes)" + private static uint[] _SingleRegTbl_V_8B_16B_() + { + return new uint[] + { + 0x0E000000u, // TBL V0.8B, { V0.16B }, V0.8B + }; + } + + private static uint[] _TwoRegTbl_V_8B_16B_() + { + return new uint[] + { + 0x0E002000u, // TBL V0.8B, { V0.16B, V1.16B }, V0.8B + }; + } + + private static uint[] _ThreeRegTbl_V_8B_16B_() + { + return new uint[] + { + 0x0E004000u, // TBL V0.8B, { V0.16B, V1.16B, V2.16B }, V0.8B + }; + } + + private static uint[] _FourRegTbl_V_8B_16B_() + { + return new uint[] + { + 0x0E006000u, // TBL V0.8B, { V0.16B, V1.16B, V2.16B, V3.16B }, V0.8B + }; + } +#endregion + + private const int RndCntTbls = 2; + private const int RndCntIdxs = 2; + + [Test, Pairwise, Description("TBL ., { .16B }, .")] + public void SingleRegTbl_V_8B_16B([ValueSource("_SingleRegTbl_V_8B_16B_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u)] uint rn, + [Values(2u)] uint rm, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table0, + [ValueSource("_GenIdxsForTbl1_")] ulong indexes, + [Values(0b0u, 0b1u)] uint q) // <8B, 16B> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(table0, table0); + Vector128 v2 = MakeVectorE0E1(indexes, q == 1u ? indexes : 0ul); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise, Description("TBL ., { .16B, .16B }, .")] + public void TwoRegTbl_V_8B_16B([ValueSource("_TwoRegTbl_V_8B_16B_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u)] uint rn, + [Values(3u)] uint rm, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table0, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table1, + [ValueSource("_GenIdxsForTbl2_")] ulong indexes, + [Values(0b0u, 0b1u)] uint q) // <8B, 16B> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(table0, table0); + Vector128 v2 = MakeVectorE0E1(table1, table1); + Vector128 v3 = MakeVectorE0E1(indexes, q == 1u ? indexes : 0ul); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, v3: v3); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise, Description("TBL ., { .16B, .16B }, .")] + public void Mod_TwoRegTbl_V_8B_16B([ValueSource("_TwoRegTbl_V_8B_16B_")] uint opcodes, + [Values(30u, 1u)] uint rd, + [Values(31u)] uint rn, + [Values(1u, 30u)] uint rm, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table0, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table1, + [ValueSource("_GenIdxsForTbl2_")] ulong indexes, + [Values(0b0u, 0b1u)] uint q) // <8B, 16B> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v30 = MakeVectorE0E1(z, z); + Vector128 v31 = MakeVectorE0E1(table0, table0); + Vector128 v0 = MakeVectorE0E1(table1, table1); + Vector128 v1 = MakeVectorE0E1(indexes, indexes); + + SingleOpcode(opcodes, v0: v0, v1: v1, v30: v30, v31: v31); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise, Description("TBL ., { .16B, .16B, .16B }, .")] + public void ThreeRegTbl_V_8B_16B([ValueSource("_ThreeRegTbl_V_8B_16B_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u)] uint rn, + [Values(4u)] uint rm, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table0, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table1, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table2, + [ValueSource("_GenIdxsForTbl3_")] ulong indexes, + [Values(0b0u, 0b1u)] uint q) // <8B, 16B> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(table0, table0); + Vector128 v2 = MakeVectorE0E1(table1, table1); + Vector128 v3 = MakeVectorE0E1(table2, table2); + Vector128 v4 = MakeVectorE0E1(indexes, q == 1u ? indexes : 0ul); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, v3: v3, v4: v4); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise, Description("TBL ., { .16B, .16B, .16B }, .")] + public void Mod_ThreeRegTbl_V_8B_16B([ValueSource("_ThreeRegTbl_V_8B_16B_")] uint opcodes, + [Values(30u, 2u)] uint rd, + [Values(31u)] uint rn, + [Values(2u, 30u)] uint rm, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table0, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table1, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table2, + [ValueSource("_GenIdxsForTbl3_")] ulong indexes, + [Values(0b0u, 0b1u)] uint q) // <8B, 16B> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v30 = MakeVectorE0E1(z, z); + Vector128 v31 = MakeVectorE0E1(table0, table0); + Vector128 v0 = MakeVectorE0E1(table1, table1); + Vector128 v1 = MakeVectorE0E1(table2, table2); + Vector128 v2 = MakeVectorE0E1(indexes, indexes); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, v30: v30, v31: v31); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise, Description("TBL ., { .16B, .16B, .16B, .16B }, .")] + public void FourRegTbl_V_8B_16B([ValueSource("_FourRegTbl_V_8B_16B_")] uint opcodes, + [Values(0u)] uint rd, + [Values(1u)] uint rn, + [Values(5u)] uint rm, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table0, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table1, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table2, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table3, + [ValueSource("_GenIdxsForTbl4_")] ulong indexes, + [Values(0b0u, 0b1u)] uint q) // <8B, 16B> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v0 = MakeVectorE0E1(z, z); + Vector128 v1 = MakeVectorE0E1(table0, table0); + Vector128 v2 = MakeVectorE0E1(table1, table1); + Vector128 v3 = MakeVectorE0E1(table2, table2); + Vector128 v4 = MakeVectorE0E1(table3, table3); + Vector128 v5 = MakeVectorE0E1(indexes, q == 1u ? indexes : 0ul); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, v3: v3, v4: v4, v5: v5); + + CompareAgainstUnicorn(); + } + + [Test, Pairwise, Description("TBL ., { .16B, .16B, .16B, .16B }, .")] + public void Mod_FourRegTbl_V_8B_16B([ValueSource("_FourRegTbl_V_8B_16B_")] uint opcodes, + [Values(30u, 3u)] uint rd, + [Values(31u)] uint rn, + [Values(3u, 30u)] uint rm, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table0, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table1, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table2, + [ValueSource("_8B_")] [Random(RndCntTbls)] ulong table3, + [ValueSource("_GenIdxsForTbl4_")] ulong indexes, + [Values(0b0u, 0b1u)] uint q) // <8B, 16B> + { + opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); + opcodes |= ((q & 1) << 30); + + ulong z = TestContext.CurrentContext.Random.NextULong(); + Vector128 v30 = MakeVectorE0E1(z, z); + Vector128 v31 = MakeVectorE0E1(table0, table0); + Vector128 v0 = MakeVectorE0E1(table1, table1); + Vector128 v1 = MakeVectorE0E1(table2, table2); + Vector128 v2 = MakeVectorE0E1(table3, table3); + Vector128 v3 = MakeVectorE0E1(indexes, indexes); + + SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2, v3: v3, v30: v30, v31: v31); + + CompareAgainstUnicorn(); + } +#endif + } +}