Skip to content

Commit 9fffa33

Browse files
authored
Merge pull request #18234 from unknownbrackets/x86-ir-transfer
x86jit: Perform vector transfers instead of flushing to memory
2 parents d3cd065 + 38e5b33 commit 9fffa33

10 files changed

+343
-31
lines changed

Common/x64Emitter.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -1697,7 +1697,6 @@ void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, ar
16971697

16981698
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
16991699

1700-
// THESE TWO ARE UNTESTED.
17011700
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
17021701
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
17031702

@@ -1892,6 +1891,9 @@ void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest
18921891
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
18931892
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
18941893

1894+
// SSE4.1 INSERTPS, emitted through WriteSSE41Op with prefix 0x66 and opcode 0x3A21.
// The immediate byte that follows is assembled as:
//   bits 7:6 = srcsubreg, bits 5:4 = dstsubreg, bits 3:0 = zmask.
// The fields are not range-checked here, so callers must pass srcsubreg/dstsubreg in 0..3
// and zmask in 0..15, otherwise they overlap in the packed byte.
void XEmitter::INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) {
	const int laneSelect = (srcsubreg << 6) | (dstsubreg << 4);
	// NOTE(review): the trailing 1 presumably accounts for the one immediate byte written below - confirm against WriteSSE41Op.
	WriteSSE41Op(0x66, 0x3A21, dest, arg, 1);
	Write8(laneSelect | zmask);
}
1895+
// SSE4.1 EXTRACTPS, emitted through WriteSSE41Op with prefix 0x66 and opcode 0x3A17.
// Note the operand order handed to WriteSSE41Op: the XMM source (arg) goes in the register slot,
// and the destination (dest) goes in the OpArg slot.
void XEmitter::EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg) {
	WriteSSE41Op(0x66, 0x3A17, arg, dest, 1);
	// The immediate byte is the lane index, written as-is.
	Write8(subreg);
}
1896+
18951897
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
18961898
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
18971899
void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
@@ -2084,7 +2086,7 @@ void XEmitter::VCVTTPD2DQ(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits,
20842086
void XEmitter::VCVTTSS2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF3, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
20852087
void XEmitter::VCVTTSD2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF2, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
20862088
void XEmitter::VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A17, regOp1, arg, 1); Write8(subreg); }
2087-
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8(subreg); }
2089+
// AVX form of INSERTPS, emitted through WriteAVXOp with prefix 0x66 and opcode 0x3A21.
// The immediate byte that follows is assembled as:
//   bits 7:6 = srcsubreg, bits 5:4 = dstsubreg, bits 3:0 = zmask.
// The fields are not range-checked here, so callers must keep them within their bit widths.
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) {
	const int laneSelect = (srcsubreg << 6) | (dstsubreg << 4);
	WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1);
	Write8(laneSelect | zmask);
}
20882090
void XEmitter::VLDDQU(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0xF2, sseLDDQU, regOp1, arg); }
20892091
void XEmitter::VMOVAPS(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x00, sseMOVAPfromRM, regOp1, arg); }
20902092
void XEmitter::VMOVAPD(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x66, sseMOVAPfromRM, regOp1, arg); }

Common/x64Emitter.h

+7-5
Original file line numberDiff line numberDiff line change
@@ -684,12 +684,14 @@ class XEmitter
684684

685685
// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
686686
void DPPD(X64Reg dest, OpArg src, u8 arg);
687-
688-
// These are probably useful for VFPU emulation.
689-
void INSERTPS(X64Reg dest, OpArg src, u8 arg);
690-
void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
691687
#endif
692688

689+
// SSE4: Insert and extract for floats.
690+
// Note: insert from memory or an XMM.
691+
void INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
692+
// Extract to memory or GPR.
693+
void EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg);
694+
693695
// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
694696
void HADDPS(X64Reg dest, OpArg src);
695697

@@ -1040,7 +1042,7 @@ class XEmitter
10401042
// Can only extract from the low 128 bits.
10411043
void VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg);
10421044
// Can only insert into the low 128 bits, zeros upper bits. Inserts from XMM.
1043-
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg);
1045+
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
10441046
void VLDDQU(int bits, X64Reg regOp1, OpArg arg);
10451047
void VMOVAPS(int bits, X64Reg regOp1, OpArg arg);
10461048
void VMOVAPD(int bits, X64Reg regOp1, OpArg arg);

Core/MIPS/ARM64/Arm64IRRegCache.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ void Arm64IRRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
347347
}
348348
}
349349

350-
bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
350+
bool Arm64IRRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
351351
// No special flags, skip the check for a little speed.
352352
return true;
353353
}

Core/MIPS/ARM64/Arm64IRRegCache.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class Arm64IRRegCache : public IRNativeRegCacheBase {
8686
const int *GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const override;
8787
void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) override;
8888

89-
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) override;
89+
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) override;
9090
void LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
9191
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
9292
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;

Core/MIPS/IR/IRRegCache.cpp

+50-17
Original file line numberDiff line numberDiff line change
@@ -406,12 +406,12 @@ IRNativeReg IRNativeRegCacheBase::FindFreeReg(MIPSLoc type, MIPSMap flags) const
406406

407407
bool IRNativeRegCacheBase::IsGPRClobbered(IRReg gpr) const {
408408
_dbg_assert_(IsValidGPR(gpr));
409-
return IsRegClobbered(MIPSLoc::REG, MIPSMap::INIT, gpr);
409+
return IsRegClobbered(MIPSLoc::REG, gpr);
410410
}
411411

412412
bool IRNativeRegCacheBase::IsFPRClobbered(IRReg fpr) const {
413413
_dbg_assert_(IsValidFPR(fpr));
414-
return IsRegClobbered(MIPSLoc::FREG, MIPSMap::INIT, fpr + 32);
414+
return IsRegClobbered(MIPSLoc::FREG, fpr + 32);
415415
}
416416

417417
IRUsage IRNativeRegCacheBase::GetNextRegUsage(const IRSituation &info, MIPSLoc type, IRReg r) const {
@@ -423,7 +423,7 @@ IRUsage IRNativeRegCacheBase::GetNextRegUsage(const IRSituation &info, MIPSLoc t
423423
return IRUsage::UNKNOWN;
424424
}
425425

426-
bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r) const {
426+
bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, IRReg r) const {
427427
static const int UNUSED_LOOKAHEAD_OPS = 30;
428428

429429
IRSituation info;
@@ -450,6 +450,21 @@ bool IRNativeRegCacheBase::IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r)
450450
return false;
451451
}
452452

453+
// Asks GetNextRegUsage() how IR register `first` is used next, with lookaheadCount set to
// UNUSED_LOOKAHEAD_OPS and the search index set to the op after the current one.
// Returns true only when the reported usage is IRUsage::READ; any other result yields false.
bool IRNativeRegCacheBase::IsRegRead(MIPSLoc type, IRReg first) const {
454+
static const int UNUSED_LOOKAHEAD_OPS = 30;
455+
456+
IRSituation info;
457+
info.lookaheadCount = UNUSED_LOOKAHEAD_OPS;
458+
// We look starting one ahead, unlike spilling.
459+
info.currentIndex = irIndex_ + 1;
460+
info.instructions = irBlock_->GetInstructions();
461+
info.numInstructions = irBlock_->GetNumInstructions();
462+
463+
// Note: this intentionally doesn't look at the full reg, only the lane.
464+
IRUsage usage = GetNextRegUsage(info, type, first);
465+
return usage == IRUsage::READ;
466+
}
467+
453468
IRNativeReg IRNativeRegCacheBase::FindBestToSpill(MIPSLoc type, MIPSMap flags, bool unusedOnly, bool *clobbered) const {
454469
int allocCount = 0, base = 0;
455470
const int *allocOrder = GetAllocationOrder(type, flags, allocCount, base);
@@ -501,7 +516,7 @@ IRNativeReg IRNativeRegCacheBase::FindBestToSpill(MIPSLoc type, MIPSMap flags, b
501516
return -1;
502517
}
503518

504-
bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
519+
bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
505520
int allocCount = 0, base = 0;
506521
const int *allocOrder = GetAllocationOrder(type, flags, allocCount, base);
507522

@@ -514,6 +529,11 @@ bool IRNativeRegCacheBase::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type,
514529
return false;
515530
}
516531

532+
// Default implementation: performs no transfer and returns false.
// The MapNativeReg() call sites in this file fall back to FlushNativeReg() when this returns false.
// NOTE(review): one call site passes dest = -1, presumably meaning "let the backend pick the
// destination", since it then re-reads mr[first].nReg - confirm against the overriding backend.
bool IRNativeRegCacheBase::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
533+
// To be overridden if the backend supports transfers.
534+
return false;
535+
}
536+
517537
void IRNativeRegCacheBase::DiscardNativeReg(IRNativeReg nreg) {
518538
_assert_msg_(nreg >= 0 && nreg < config_.totalNativeRegs, "DiscardNativeReg on invalid register %d", nreg);
519539
if (nr[nreg].mipsReg != IRREG_INVALID) {
@@ -930,21 +950,28 @@ IRNativeReg IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRReg first, int la
930950
case MIPSLoc::REG:
931951
if (type != MIPSLoc::REG) {
932952
nreg = AllocateReg(type, flags);
933-
} else if (!IsNativeRegCompatible(nreg, type, flags)) {
953+
} else if (!IsNativeRegCompatible(nreg, type, flags, lanes)) {
934954
// If it's not compatible, we'll need to reallocate.
935-
// TODO: Could do a transfer and avoid memory flush.
936-
FlushNativeReg(nreg);
937-
nreg = AllocateReg(type, flags);
955+
if (TransferNativeReg(nreg, -1, type, first, lanes, flags)) {
956+
nreg = mr[first].nReg;
957+
} else {
958+
FlushNativeReg(nreg);
959+
nreg = AllocateReg(type, flags);
960+
}
938961
}
939962
break;
940963

941964
case MIPSLoc::FREG:
942965
case MIPSLoc::VREG:
943966
if (type != mr[first].loc) {
944967
nreg = AllocateReg(type, flags);
945-
} else if (!IsNativeRegCompatible(nreg, type, flags)) {
946-
FlushNativeReg(nreg);
947-
nreg = AllocateReg(type, flags);
968+
} else if (!IsNativeRegCompatible(nreg, type, flags, lanes)) {
969+
if (TransferNativeReg(nreg, -1, type, first, lanes, flags)) {
970+
nreg = mr[first].nReg;
971+
} else {
972+
FlushNativeReg(nreg);
973+
nreg = AllocateReg(type, flags);
974+
}
948975
}
949976
break;
950977

@@ -981,10 +1008,13 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
9811008
_assert_msg_(!mreg.isStatic, "Cannot MapNativeReg a static reg mismatch");
9821009
if ((flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
9831010
// If we need init, we have to flush mismatches.
984-
// TODO: Do a shuffle if interior only?
985-
// TODO: We may also be motivated to have multiple read-only "views" or an IRReg.
986-
// For example Vec4Scale v0..v3, v0..v3, v3
987-
FlushNativeReg(mreg.nReg);
1011+
if (!TransferNativeReg(mreg.nReg, nreg, type, first, lanes, flags)) {
1012+
// TODO: We may also be motivated to have multiple read-only "views" of an IRReg.
1013+
// For example Vec4Scale v0..v3, v0..v3, v3
1014+
FlushNativeReg(mreg.nReg);
1015+
}
1016+
// The mismatch has been "resolved" now.
1017+
mismatch = false;
9881018
} else if (oldlanes != 1) {
9891019
// Even if we don't care about the current contents, we can't discard outside.
9901020
bool extendsBefore = oldlane > i;
@@ -1017,6 +1047,9 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
10171047
DiscardNativeReg(mreg.nReg);
10181048
else
10191049
FlushNativeReg(mreg.nReg);
1050+
1051+
// That took care of the mismatch, either by clobber or flush.
1052+
mismatch = false;
10201053
}
10211054
}
10221055
}
@@ -1027,8 +1060,8 @@ void IRNativeRegCacheBase::MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg fi
10271060
if ((flags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
10281061
// We better not be trying to map to a different nreg if it's in one now.
10291062
// This might happen on some sort of transfer...
1030-
// TODO: Make a direct transfer, i.e. FREG -> VREG?
1031-
FlushNativeReg(mreg.nReg);
1063+
if (!TransferNativeReg(mreg.nReg, nreg, type, first, lanes, flags))
1064+
FlushNativeReg(mreg.nReg);
10321065
} else {
10331066
DiscardNativeReg(mreg.nReg);
10341067
}

Core/MIPS/IR/IRRegCache.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -209,13 +209,14 @@ class IRNativeRegCacheBase {
209209
IRNativeReg AllocateReg(MIPSLoc type, MIPSMap flags);
210210
IRNativeReg FindFreeReg(MIPSLoc type, MIPSMap flags) const;
211211
IRNativeReg FindBestToSpill(MIPSLoc type, MIPSMap flags, bool unusedOnly, bool *clobbered) const;
212-
virtual bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags);
212+
virtual bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes);
213213
virtual void DiscardNativeReg(IRNativeReg nreg);
214214
virtual void FlushNativeReg(IRNativeReg nreg);
215215
virtual void DiscardReg(IRReg mreg);
216216
virtual void FlushReg(IRReg mreg);
217217
virtual void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state);
218218
virtual void MapNativeReg(MIPSLoc type, IRNativeReg nreg, IRReg first, int lanes, MIPSMap flags);
219+
virtual bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags);
219220
virtual IRNativeReg MapNativeReg(MIPSLoc type, IRReg first, int lanes, MIPSMap flags);
220221
IRNativeReg MapNativeRegAsPointer(IRReg gpr);
221222

@@ -238,7 +239,8 @@ class IRNativeRegCacheBase {
238239
void SetSpillLockIRIndex(IRReg reg, int index);
239240
int GetMipsRegOffset(IRReg r);
240241

241-
bool IsRegClobbered(MIPSLoc type, MIPSMap flags, IRReg r) const;
242+
bool IsRegClobbered(MIPSLoc type, IRReg r) const;
243+
bool IsRegRead(MIPSLoc type, IRReg r) const;
242244
IRUsage GetNextRegUsage(const IRSituation &info, MIPSLoc type, IRReg r) const;
243245

244246
bool IsValidGPR(IRReg r) const;

Core/MIPS/RiscV/RiscVRegCache.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -303,11 +303,11 @@ void RiscVRegCache::AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) {
303303
}
304304
}
305305

306-
bool RiscVRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) {
306+
bool RiscVRegCache::IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) {
307307
// No special flags except VREG, skip the check for a little speed.
308308
if (type != MIPSLoc::VREG)
309309
return true;
310-
return IRNativeRegCacheBase::IsNativeRegCompatible(nreg, type, flags);
310+
return IRNativeRegCacheBase::IsNativeRegCompatible(nreg, type, flags, lanes);
311311
}
312312

313313
void RiscVRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) {

Core/MIPS/RiscV/RiscVRegCache.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class RiscVRegCache : public IRNativeRegCacheBase {
7676
const int *GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &count, int &base) const override;
7777
void AdjustNativeRegAsPtr(IRNativeReg nreg, bool state) override;
7878

79-
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags) override;
79+
bool IsNativeRegCompatible(IRNativeReg nreg, MIPSLoc type, MIPSMap flags, int lanes) override;
8080
void LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
8181
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
8282
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;

0 commit comments

Comments
 (0)