mirror of
https://github.com/Ryujinx/Ryujinx.git
synced 2024-10-01 12:30:00 +02:00
Faster crc32 implementation (#1294)
* Add Pclmulqdq intrinsic
* Implement crc32 in terms of pclmulqdq
* Address PR comments
This commit is contained in:
parent
bcb7761eac
commit
f8cd072b62
6 changed files with 160 additions and 26 deletions
|
@ -165,6 +165,7 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(X86Instruction.Pavgb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe0, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Pavgw, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe3, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Pblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3810, InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Pclmulqdq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a44, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Pcmpeqb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f74, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Pcmpeqd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f76, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Pcmpeqq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3829, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
|
@ -633,6 +634,13 @@ namespace ARMeilleure.CodeGen.X86
|
|||
WriteInstruction(dest, source, type, X86Instruction.Or);
|
||||
}
|
||||
|
||||
public void Pclmulqdq(Operand dest, Operand source, byte imm)
|
||||
{
|
||||
WriteInstruction(dest, null, source, X86Instruction.Pclmulqdq);
|
||||
|
||||
WriteByte(imm);
|
||||
}
|
||||
|
||||
public void Pcmpeqw(Operand dest, Operand src1, Operand src2)
|
||||
{
|
||||
WriteInstruction(dest, src1, src2, X86Instruction.Pcmpeqw);
|
||||
|
|
|
@ -82,6 +82,7 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(Intrinsic.X86Pavgb, new IntrinsicInfo(X86Instruction.Pavgb, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Pavgw, new IntrinsicInfo(X86Instruction.Pavgw, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Pblendvb, new IntrinsicInfo(X86Instruction.Pblendvb, IntrinsicType.Ternary));
|
||||
Add(Intrinsic.X86Pclmulqdq, new IntrinsicInfo(X86Instruction.Pclmulqdq, IntrinsicType.TernaryImm));
|
||||
Add(Intrinsic.X86Pcmpeqb, new IntrinsicInfo(X86Instruction.Pcmpeqb, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Pcmpeqd, new IntrinsicInfo(X86Instruction.Pcmpeqd, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Pcmpeqq, new IntrinsicInfo(X86Instruction.Pcmpeqq, IntrinsicType.Binary));
|
||||
|
|
|
@ -98,6 +98,7 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Pavgb,
|
||||
Pavgw,
|
||||
Pblendvb,
|
||||
Pclmulqdq,
|
||||
Pcmpeqb,
|
||||
Pcmpeqd,
|
||||
Pcmpeqq,
|
||||
|
|
|
@ -1,53 +1,174 @@
|
|||
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
|
||||
|
||||
using ARMeilleure.Decoders;
|
||||
using ARMeilleure.IntermediateRepresentation;
|
||||
using ARMeilleure.Translation;
|
||||
using System;
|
||||
|
||||
using static ARMeilleure.Instructions.InstEmitHelper;
|
||||
using static ARMeilleure.Instructions.InstEmitSimdHelper;
|
||||
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
|
||||
|
||||
namespace ARMeilleure.Instructions
|
||||
{
|
||||
static partial class InstEmit
|
||||
{
|
||||
public static void Crc32b(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, false, 8);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b));
|
||||
}
|
||||
}
|
||||
|
||||
public static void Crc32h(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, false, 16);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h));
|
||||
}
|
||||
}
|
||||
|
||||
public static void Crc32w(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, false, 32);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w));
|
||||
}
|
||||
}
|
||||
|
||||
public static void Crc32x(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized64(context, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x));
|
||||
}
|
||||
}
|
||||
|
||||
public static void Crc32cb(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, true, 8);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb));
|
||||
}
|
||||
}
|
||||
|
||||
public static void Crc32ch(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, true, 16);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch));
|
||||
}
|
||||
}
|
||||
|
||||
public static void Crc32cw(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, true, 32);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw));
|
||||
}
|
||||
}
|
||||
|
||||
public static void Crc32cx(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized64(context, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx));
|
||||
}
|
||||
}
|
||||
|
||||
private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize)
|
||||
{
|
||||
OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
|
||||
|
||||
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
|
||||
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
|
||||
|
||||
Operand crc = GetIntOrZR(context, op.Rn);
|
||||
Operand data = GetIntOrZR(context, op.Rm);
|
||||
|
||||
crc = context.VectorInsert(context.VectorZero(), crc, 0);
|
||||
|
||||
switch (bitsize)
|
||||
{
|
||||
case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break;
|
||||
case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break;
|
||||
case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break;
|
||||
}
|
||||
|
||||
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
if (bitsize < 32)
|
||||
{
|
||||
crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
|
||||
}
|
||||
|
||||
SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
|
||||
}
|
||||
|
||||
private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli)
|
||||
{
|
||||
OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
|
||||
|
||||
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
|
||||
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
|
||||
|
||||
Operand crc = GetIntOrZR(context, op.Rn);
|
||||
Operand data = GetIntOrZR(context, op.Rm);
|
||||
|
||||
crc = context.VectorInsert(context.VectorZero(), crc, 0);
|
||||
data = context.VectorInsert(context.VectorZero(), data, 0);
|
||||
|
||||
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
|
||||
Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
|
||||
}
|
||||
|
||||
private static void EmitCrc32Call(ArmEmitterContext context, Delegate dlg)
|
||||
{
|
||||
|
|
|
@ -71,6 +71,7 @@ namespace ARMeilleure.IntermediateRepresentation
|
|||
X86Pavgb,
|
||||
X86Pavgw,
|
||||
X86Pblendvb,
|
||||
X86Pclmulqdq,
|
||||
X86Pcmpeqb,
|
||||
X86Pcmpeqd,
|
||||
X86Pcmpeqq,
|
||||
|
|
|
@ -17,6 +17,7 @@ namespace ARMeilleure
|
|||
public static bool UsePopCntIfAvailable { get; set; } = true;
|
||||
public static bool UseAvxIfAvailable { get; set; } = true;
|
||||
public static bool UseAesniIfAvailable { get; set; } = true;
|
||||
public static bool UsePclmulqdqIfAvailable { get; set; } = true;
|
||||
|
||||
public static bool ForceLegacySse
|
||||
{
|
||||
|
@ -33,5 +34,6 @@ namespace ARMeilleure
|
|||
internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
|
||||
internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
|
||||
internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni;
|
||||
internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue