Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JitArm64_LoadStore: Optimize dcbx using CSSC instruction #13252

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 96 additions & 34 deletions Source/Core/Common/Arm64Emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "Common/Align.h"
#include "Common/Assert.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/MathUtil.h"
#include "Common/SmallVector.h"
Expand Down Expand Up @@ -204,24 +205,9 @@ static const u32 Data1SrcEnc[][2] = {
{0, 3}, // REV64
{0, 4}, // CLZ
{0, 5}, // CLS
};

// Data-Processing (2 source)
static const u32 Data2SrcEnc[] = {
0x02, // UDIV
0x03, // SDIV
0x08, // LSLV
0x09, // LSRV
0x0A, // ASRV
0x0B, // RORV
0x10, // CRC32B
0x11, // CRC32H
0x12, // CRC32W
0x14, // CRC32CB
0x15, // CRC32CH
0x16, // CRC32CW
0x13, // CRC32X (64bit Only)
0x17, // XRC32CX (64bit Only)
{0, 6}, // CTZ
{0, 7}, // CNT
{0, 8}, // ABS
};

// Data-Processing (3 source)
Expand Down Expand Up @@ -409,14 +395,22 @@ void ARM64XEmitter::EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn)
(Data1SrcEnc[instenc][1] << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
}

void ARM64XEmitter::EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
void ARM64XEmitter::EncodeData2SrcInst(Data2SrcEnc instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
bool b64Bit = Is64Bit(Rd);

Write32((b64Bit << 31) | (0x0D6 << 21) | (DecodeReg(Rm) << 16) | (Data2SrcEnc[instenc] << 10) |
Write32((b64Bit << 31) | (0x0D6 << 21) | (DecodeReg(Rm) << 16) | (u32(instenc) << 10) |
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
}

// Writes one CSSC min/max instruction that takes an 8-bit immediate operand.
// Fields of the emitted word, from the top: the 64-bit flag (derived from the width of Rd) at
// bit 31, the fixed pattern 0x47 at bit 22, opc at bit 18, imm at bit 10, Rn at bit 5 and Rd at
// bit 0.
void ARM64XEmitter::EncodeDataCSSCImmInst(DataCSSCImm8Enc opc, ARM64Reg Rd, ARM64Reg Rn, u8 imm)
{
  const u32 sf = Is64Bit(Rd) ? 1u : 0u;
  const u32 opcode = static_cast<u32>(opc);
  const u32 imm8 = imm;

  Write32((sf << 31) | (0x47u << 22) | (opcode << 18) | (imm8 << 10) | (DecodeReg(Rn) << 5) |
          DecodeReg(Rd));
}

void ARM64XEmitter::EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
ARM64Reg Ra)
{
Expand Down Expand Up @@ -1180,59 +1174,59 @@ void ARM64XEmitter::CLS(ARM64Reg Rd, ARM64Reg Rn)
// Data-Processing 2 source
void ARM64XEmitter::UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(0, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::UDIV, Rd, Rn, Rm);
}
void ARM64XEmitter::SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(1, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::SDIV, Rd, Rn, Rm);
}
void ARM64XEmitter::LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(2, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::LSLV, Rd, Rn, Rm);
}
void ARM64XEmitter::LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(3, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::LSRV, Rd, Rn, Rm);
}
void ARM64XEmitter::ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(4, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::ASRV, Rd, Rn, Rm);
}
void ARM64XEmitter::RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(5, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::RORV, Rd, Rn, Rm);
}
void ARM64XEmitter::CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(6, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::CRC32B, Rd, Rn, Rm);
}
void ARM64XEmitter::CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(7, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::CRC32H, Rd, Rn, Rm);
}
void ARM64XEmitter::CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(8, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::CRC32W, Rd, Rn, Rm);
}
void ARM64XEmitter::CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(9, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::CRC32CB, Rd, Rn, Rm);
}
void ARM64XEmitter::CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(10, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::CRC32CH, Rd, Rn, Rm);
}
void ARM64XEmitter::CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(11, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::CRC32CW, Rd, Rn, Rm);
}
void ARM64XEmitter::CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(12, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::CRC32X, Rd, Rn, Rm);
}
void ARM64XEmitter::CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EncodeData2SrcInst(13, Rd, Rn, Rm);
EncodeData2SrcInst(Data2SrcEnc::CRC32CX, Rd, Rn, Rm);
}

// Data-Processing 3 source
Expand Down Expand Up @@ -1775,6 +1769,74 @@ void ARM64XEmitter::ADRP(ARM64Reg Rd, s64 imm)
EncodeAddressInst(1, Rd, static_cast<s32>(imm >> 12));
}

// Common Short Sequence Compression (CSSC) instructions
void ARM64XEmitter::ABS(ARM64Reg Rd, ARM64Reg Rn)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeData1SrcInst(8, Rd, Rn);
}
void ARM64XEmitter::CNT(ARM64Reg Rd, ARM64Reg Rn)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeData1SrcInst(7, Rd, Rn);
}
void ARM64XEmitter::CTZ(ARM64Reg Rd, ARM64Reg Rn)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeData1SrcInst(6, Rd, Rn);
}
void ARM64XEmitter::SMIN(ARM64Reg Rd, ARM64Reg Rn, s8 imm)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeDataCSSCImmInst(DataCSSCImm8Enc::SMIN, Rd, Rn, static_cast<u8>(imm));
}
void ARM64XEmitter::SMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeData2SrcInst(Data2SrcEnc::SMIN, Rd, Rn, Rm);
}
void ARM64XEmitter::SMAX(ARM64Reg Rd, ARM64Reg Rn, s8 imm)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeDataCSSCImmInst(DataCSSCImm8Enc::SMAX, Rd, Rn, static_cast<u8>(imm));
}
void ARM64XEmitter::SMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeData2SrcInst(Data2SrcEnc::SMAX, Rd, Rn, Rm);
}
void ARM64XEmitter::UMIN(ARM64Reg Rd, ARM64Reg Rn, u8 imm)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeDataCSSCImmInst(DataCSSCImm8Enc::UMIN, Rd, Rn, static_cast<u8>(imm));
}
void ARM64XEmitter::UMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeData2SrcInst(Data2SrcEnc::UMIN, Rd, Rn, Rm);
}
void ARM64XEmitter::UMAX(ARM64Reg Rd, ARM64Reg Rn, u8 imm)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeDataCSSCImmInst(DataCSSCImm8Enc::UMAX, Rd, Rn, static_cast<u8>(imm));
}
void ARM64XEmitter::UMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
if (!cpu_info.bCSSC)
PanicAlertFmt("Trying to use CSSC on a system that doesn't support it. Bad programmer.");
EncodeData2SrcInst(Data2SrcEnc::UMAX, Rd, Rn, Rm);
}

// This is using a hand-rolled algorithm. The goal is zero memory allocations, not necessarily
// the best JIT-time time complexity. (The number of moves is usually very small.)
void ARM64XEmitter::ParallelMoves(RegisterMove* begin, RegisterMove* end,
Expand Down
48 changes: 47 additions & 1 deletion Source/Core/Common/Arm64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,38 @@ class ARM64XEmitter
// Must be cleared with SetCodePtr() afterwards.
bool m_write_failed = false;

// Data-Processing (2 source)
// Each enumerator holds the raw opcode value; EncodeData2SrcInst shifts it left by 10 when it
// assembles the instruction word. Entries are listed in ascending opcode order.
enum class Data2SrcEnc : u32
{
  // Division
  UDIV = 0x02,
  SDIV = 0x03,

  // Variable shifts and rotate
  LSLV = 0x08,
  LSRV = 0x09,
  ASRV = 0x0A,
  RORV = 0x0B,

  // CRC32
  CRC32B = 0x10,
  CRC32H = 0x11,
  CRC32W = 0x12,
  CRC32X = 0x13,  // 64-bit only
  CRC32CB = 0x14,
  CRC32CH = 0x15,
  CRC32CW = 0x16,
  CRC32CX = 0x17,  // 64-bit only

  // CSSC
  SMAX = 0x18,
  UMAX = 0x19,
  SMIN = 0x1A,
  UMIN = 0x1B,
};

// Opcode selector for the CSSC min/max instructions that take an 8-bit immediate.
// EncodeDataCSSCImmInst shifts the value left by 18 when it assembles the instruction word.
enum class DataCSSCImm8Enc : u8
{
  SMAX = 0x0,
  UMAX = 0x1,
  SMIN = 0x2,
  UMIN = 0x3,
};

void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags);
void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr);
void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr);
Expand All @@ -638,7 +670,7 @@ class ARM64XEmitter
void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn);
void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EncodeData2SrcInst(Data2SrcEnc instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm);
Expand All @@ -655,6 +687,7 @@ class ARM64XEmitter
s32 imm);
void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
void EncodeDataCSSCImmInst(DataCSSCImm8Enc opc, ARM64Reg Rd, ARM64Reg Rn, u8 imm);

[[nodiscard]] FixupBranch WriteFixupBranch();

Expand Down Expand Up @@ -1021,6 +1054,19 @@ class ARM64XEmitter
void ADR(ARM64Reg Rd, s32 imm);
void ADRP(ARM64Reg Rd, s64 imm);

// Common Short Sequence Compression (CSSC) instructions
// Every emitter in this group raises a panic alert when cpu_info.bCSSC is false, so callers
// should test that flag first and fall back to an equivalent sequence when it is not set.
// Single-source forms: Rd receives the result computed from Rn.
void ABS(ARM64Reg Rd, ARM64Reg Rn);
void CNT(ARM64Reg Rd, ARM64Reg Rn);
void CTZ(ARM64Reg Rd, ARM64Reg Rn);
// Minimum/maximum. Each operation has an immediate form and a register form; the signed
// variants (SMIN/SMAX) take an s8 immediate and the unsigned variants (UMIN/UMAX) a u8 one.
void SMIN(ARM64Reg Rd, ARM64Reg Rn, s8 imm);
void SMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void SMAX(ARM64Reg Rd, ARM64Reg Rn, s8 imm);
void SMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void UMIN(ARM64Reg Rd, ARM64Reg Rn, u8 imm);
void UMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void UMAX(ARM64Reg Rd, ARM64Reg Rn, u8 imm);
void UMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

// Wrapper around ADR/ADRP/MOVZ/MOVN/MOVK
void MOVI2R(ARM64Reg Rd, u64 imm);
bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
Expand Down
9 changes: 8 additions & 1 deletion Source/Core/Common/ArmCPUDetect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,14 @@ void CPUInfo::Detect()
bSHA1 = hwcap & HWCAP_SHA1;
bSHA2 = hwcap & HWCAP_SHA2;

#if defined(AT_HWCAP2) && defined(HWCAP2_AFP)
#if defined(AT_HWCAP2)
  // AT_HWCAP2 feature bits are not confined to the low 32 bits: the Linux arm64 headers define
  // HWCAP2_CSSC as bit 34. Keeping the value in a 32-bit variable would drop that bit and leave
  // bCSSC permanently false, so hold it in a 64-bit variable.
  // NOTE(review): this relies on ReadHwCap() returning the full-width auxv value - confirm.
  const u64 hwcap2 = ReadHwCap(AT_HWCAP2);
#if defined(HWCAP2_AFP)
  bAFP = (hwcap2 & HWCAP2_AFP) != 0;
#endif
#if defined(HWCAP2_CSSC)
  bCSSC = (hwcap2 & HWCAP2_CSSC) != 0;
#endif
#endif

u64 midr = 0;
Expand Down Expand Up @@ -332,6 +337,8 @@ std::string CPUInfo::Summarize()
sum.push_back("SHA1");
if (bSHA2)
sum.push_back("SHA2");
if (bCSSC)
sum.push_back("CSSC");

return fmt::to_string(fmt::join(sum, ","));
}
3 changes: 2 additions & 1 deletion Source/Core/Common/CPUDetect.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ struct CPUInfo
bool bSHA2 = false;

// ARMv8 specific
bool bAFP = false; // Alternate floating-point behavior
bool bAFP = false; // Alternate floating-point behavior
bool bCSSC = false; // Common Short Sequence Compression

// Call Detect()
explicit CPUInfo();
Expand Down
12 changes: 10 additions & 2 deletions Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include "Common/Arm64Emitter.h"
#include "Common/BitSet.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/ScopeGuard.h"

Expand Down Expand Up @@ -805,8 +806,15 @@ void JitArm64::dcbx(UGeckoInstruction inst)
SDIV(WB, reg_downcount, reg_cycle_count); // WB = downcount / cycle_count
SUB(WA, loop_counter, 1); // WA = CTR - 1
// ^ Note that this CTR-1 implicitly handles the CTR == 0 case correctly.
CMP(WB, WA);
CSEL(WA, WB, WA, CCFlags::CC_LO); // WA = min(WB, WA)
if (cpu_info.bCSSC)
{
UMIN(WA, WB, WA);
}
else
{
CMP(WB, WA);
CSEL(WA, WB, WA, CCFlags::CC_LO); // WA = min(WB, WA)
}

// WA now holds the amount of loops to execute minus 1, which is the amount we need to adjust
// downcount, CTR, and Rb by to exit the loop construct with the right values in those
Expand Down