Skip to content

HLE: Slice the very slow memset/memcpy variants #18560

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 54 additions & 16 deletions Core/HLE/ReplaceTables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,37 +182,54 @@ static int Replace_memcpy_jak() {
u32 destPtr = PARAM(0);
u32 srcPtr = PARAM(1);
u32 bytes = PARAM(2);
bool skip = false;

if (bytes == 0) {
RETURN(destPtr);
return 5;
}

bool skip = false;
bool sliced = false;
static constexpr uint32_t SLICE_SIZE = 32768;

currentMIPS->InvalidateICache(srcPtr, bytes);
if ((skipGPUReplacements & (int)GPUReplacementSkip::MEMCPY) == 0) {
if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(srcPtr)) {
skip = gpu->PerformMemoryCopy(destPtr, srcPtr, bytes);
}
}
if (!skip && bytes > SLICE_SIZE && bytes != 512 * 272 * 4) {
// This is a very slow func. To avoid thread blocking, do a slice at a time.
// Avoiding exactly 512 * 272 * 4 to detect videos, though.
bytes = SLICE_SIZE;
sliced = true;
}
if (!skip && bytes != 0) {
u8 *dst = Memory::GetPointerWriteRange(destPtr, bytes);
const u8 *src = Memory::GetPointerRange(srcPtr, bytes);

if (!dst || !src) {
} else {
if (dst && src) {
// Jak style overlap.
for (u32 i = 0; i < bytes; i++) {
dst[i] = src[i];
}
}
}

// Jak relies on more registers coming out right than the ABI specifies.
// See the disassembly of the function for the explanations for these...
currentMIPS->r[MIPS_REG_T0] = 0;
currentMIPS->r[MIPS_REG_A0] = -1;
currentMIPS->r[MIPS_REG_A2] = 0;
currentMIPS->r[MIPS_REG_A3] = destPtr + bytes;
RETURN(destPtr);
if (sliced) {
currentMIPS->r[MIPS_REG_A0] += SLICE_SIZE;
currentMIPS->r[MIPS_REG_A1] += SLICE_SIZE;
currentMIPS->r[MIPS_REG_A2] -= SLICE_SIZE;
} else {
// Jak relies on more registers coming out right than the ABI specifies.
// See the disassembly of the function for the explanations for these...
currentMIPS->r[MIPS_REG_T0] = 0;
currentMIPS->r[MIPS_REG_A0] = -1;
currentMIPS->r[MIPS_REG_A2] = 0;
// Even after slicing, this ends up correct.
currentMIPS->r[MIPS_REG_A3] = destPtr + bytes;
RETURN(destPtr);
}

if (MemBlockInfoDetailed(bytes)) {
// It's pretty common that games will copy video data.
Expand All @@ -231,6 +248,10 @@ static int Replace_memcpy_jak() {
}
}

if (sliced) {
// Negative causes the function to be run again for the next slice.
return 5 + bytes * -8 + 2;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs a comment that a negative value is used to signal Comp_ReplacementFunc that the function call is to be repeated.

}
return 5 + bytes * 8 + 2; // approximation. This is a slow memcpy - a byte copy loop..
}

Expand Down Expand Up @@ -364,24 +385,41 @@ static int Replace_memset_jak() {
}

bool skip = false;
bool sliced = false;
static constexpr uint32_t SLICE_SIZE = 32768;
if (Memory::IsVRAMAddress(destPtr) && (skipGPUReplacements & (int)GPUReplacementSkip::MEMSET) == 0) {
skip = gpu->PerformMemorySet(destPtr, value, bytes);
}
if (!skip && bytes > SLICE_SIZE) {
// This is a very slow func. To avoid thread blocking, do a slice at a time.
bytes = SLICE_SIZE;
sliced = true;
}
if (!skip && bytes != 0) {
u8 *dst = Memory::GetPointerWriteRange(destPtr, bytes);
if (dst) {
memset(dst, value, bytes);
}
}

NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, "ReplaceMemset");

if (sliced) {
currentMIPS->r[MIPS_REG_A0] += SLICE_SIZE;
currentMIPS->r[MIPS_REG_A2] -= SLICE_SIZE;

// This is approximate, and must be a negative value.
// Negative causes the function to be run again for the next slice.
return 5 + (int)SLICE_SIZE * -6 + 2;
}

// Even after slicing, this ends up correct.
currentMIPS->r[MIPS_REG_T0] = destPtr + bytes;
currentMIPS->r[MIPS_REG_A2] = -1;
currentMIPS->r[MIPS_REG_A3] = -1;
RETURN(destPtr);

NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, "ReplaceMemset");

return 5 + bytes * 6 + 2; // approximation (hm, inspecting the disasm this should be 5 + 6 * bytes + 2, but this is what works..)
return 5 + bytes * 6 + 2; // approximation
}

static uint32_t SafeStringLen(const uint32_t ptr, uint32_t maxLen = 0x07FFFFFF) {
Expand Down Expand Up @@ -1449,12 +1487,12 @@ static const ReplacementTableEntry entries[] = {
{ "ceilf", &Replace_ceilf, 0, REPFLAG_DISABLED },

{ "memcpy", &Replace_memcpy, 0, 0 },
{ "memcpy_jak", &Replace_memcpy_jak, 0, 0 },
{ "memcpy_jak", &Replace_memcpy_jak, 0, REPFLAG_SLICED },
{ "memcpy16", &Replace_memcpy16, 0, 0 },
{ "memcpy_swizzled", &Replace_memcpy_swizzled, 0, 0 },
{ "memmove", &Replace_memmove, 0, 0 },
{ "memset", &Replace_memset, 0, 0 },
{ "memset_jak", &Replace_memset_jak, 0, 0 },
{ "memset_jak", &Replace_memset_jak, 0, REPFLAG_SLICED },
{ "strlen", &Replace_strlen, 0, REPFLAG_DISABLED },
{ "strcpy", &Replace_strcpy, 0, REPFLAG_DISABLED },
{ "strncpy", &Replace_strncpy, 0, REPFLAG_DISABLED },
Expand Down Expand Up @@ -1738,7 +1776,7 @@ bool CanReplaceJalTo(u32 dest, const ReplacementTableEntry **entry, u32 *funcSiz
return false;
}

if ((*entry)->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT | REPFLAG_DISABLED)) {
if ((*entry)->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT | REPFLAG_DISABLED | REPFLAG_SLICED)) {
// If it's a hook (or disabled, or sliced), we can't replace the jal, we have to go inside the func.
return false;
}
Expand Down
2 changes: 2 additions & 0 deletions Core/HLE/ReplaceTables.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ enum {
REPFLAG_HOOKENTER = 0x04,
// Only hooks jr ra, so only use on funcs that have that.
REPFLAG_HOOKEXIT = 0x08,
// Function may take a lot of time and execute in slices (executed multiple times.)
REPFLAG_SLICED = 0x10,
};

// Kind of similar to HLE functions but with different data.
Expand Down
11 changes: 11 additions & 0 deletions Core/MIPS/ARM/ArmJit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,18 @@ void ArmJit::Comp_ReplacementFunc(MIPSOpcode op)
} else {
ApplyRoundingMode();
RestoreDowncount();

CMPI2R(R0, 0, SCRATCHREG2);
FixupBranch positive = B_CC(CC_GE);

RSB(R0, R0, Operand2(0));
MovFromPC(R1);
FixupBranch done = B();

SetJumpTarget(positive);
LDR(R1, CTXREG, MIPS_REG_RA * 4);

SetJumpTarget(done);
WriteDownCountR(R0);
WriteExitDestInR(R1);
js.compiling = false;
Expand Down
11 changes: 10 additions & 1 deletion Core/MIPS/ARM64/Arm64IRCompSystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,16 @@ void Arm64JitBackend::CompIR_System(IRInst inst) {
QuickCallFunction(SCRATCH2_64, GetReplacementFunc(inst.constant)->replaceFunc);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
SUB(DOWNCOUNTREG, DOWNCOUNTREG, W0);

// Take the absolute value of the result and subtract it from the downcount.
CMP(W0, 0);
CSNEG(SCRATCH1, W0, W0, CC_PL);
SUB(DOWNCOUNTREG, DOWNCOUNTREG, SCRATCH1);

// W0 might be the mapped reg, but there's only one.
// Set dest reg to the sign of the result.
regs_.Map(inst);
ASR(regs_.R(inst.dest), W0, 31);
break;

case IROp::Break:
Expand Down
11 changes: 11 additions & 0 deletions Core/MIPS/ARM64/Arm64Jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,18 @@ void Arm64Jit::Comp_ReplacementFunc(MIPSOpcode op)
} else {
ApplyRoundingMode();
LoadStaticRegisters();

CMPI2R(W0, 0);
FixupBranch positive = B(CC_GE);

NEG(W0, W0);
MovFromPC(W1);
FixupBranch done = B();

SetJumpTarget(positive);
LDR(INDEX_UNSIGNED, W1, CTXREG, MIPS_REG_RA * 4);

SetJumpTarget(done);
WriteDownCountR(W0);
WriteExitDestInR(W1);
js.compiling = false;
Expand Down
5 changes: 4 additions & 1 deletion Core/MIPS/IR/IRFrontend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,18 @@ void IRFrontend::Comp_ReplacementFunc(MIPSOpcode op) {
FlushAll();
RestoreRoundingMode();
ir.Write(IROp::SetPCConst, 0, ir.AddConstant(GetCompilerPC()));
ir.Write(IROp::CallReplacement, 0, ir.AddConstant(index));
ir.Write(IROp::CallReplacement, IRTEMP_0, ir.AddConstant(index));

if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) {
// Compile the original instruction at this address. We ignore cycles for hooks.
ApplyRoundingMode();
MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this);
} else {
ApplyRoundingMode();
// If IRTEMP_0 is non-zero (it is set to -1 when the result was negative), the replacement needs to run again (sliced.)
// This is necessary for replacements that take a lot of cycles.
ir.Write(IROp::Downcount, 0, ir.AddConstant(js.downcountAmount));
ir.Write(IROp::ExitToConstIfNeq, ir.AddConstant(GetCompilerPC()), IRTEMP_0, MIPS_REG_ZERO);
ir.Write(IROp::ExitToReg, 0, MIPS_REG_RA, 0);
js.compiling = false;
}
Expand Down
2 changes: 1 addition & 1 deletion Core/MIPS/IR/IRInst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ static const IRMeta irMeta[] = {
{ IROp::Break, "Break", "", IRFLAG_EXIT },
{ IROp::SetPC, "SetPC", "_G" },
{ IROp::SetPCConst, "SetPC", "_C" },
{ IROp::CallReplacement, "CallRepl", "_C", IRFLAG_BARRIER },
{ IROp::CallReplacement, "CallRepl", "GC", IRFLAG_BARRIER },
{ IROp::Breakpoint, "Breakpoint", "_C", IRFLAG_BARRIER },
{ IROp::MemoryCheck, "MemoryCheck", "IGC", IRFLAG_BARRIER },

Expand Down
3 changes: 2 additions & 1 deletion Core/MIPS/IR/IRInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1089,7 +1089,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
int funcIndex = inst->constant;
const ReplacementTableEntry *f = GetReplacementFunc(funcIndex);
int cycles = f->replaceFunc();
mips->downcount -= cycles;
mips->r[inst->dest] = cycles < 0 ? -1 : 0;
mips->downcount -= cycles < 0 ? -cycles : cycles;
break;
}

Expand Down
6 changes: 5 additions & 1 deletion Core/MIPS/MIPSInt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1038,13 +1038,17 @@ namespace MIPSInt
int index = op.encoding & 0xFFFFFF;
const ReplacementTableEntry *entry = GetReplacementFunc(index);
if (entry && entry->replaceFunc && (entry->flags & REPFLAG_DISABLED) == 0) {
entry->replaceFunc();
int cycles = entry->replaceFunc();

if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) {
// Interpret the original instruction under the hook.
MIPSInterpret(Memory::Read_Instruction(PC, true));
} else if (cycles < 0) {
// Leave PC unchanged, call the replacement again (assumes args are modified.)
currentMIPS->downcount += cycles;
} else {
PC = currentMIPS->r[MIPS_REG_RA];
currentMIPS->downcount -= cycles;
}
} else {
if (!entry || !entry->replaceFunc) {
Expand Down
7 changes: 7 additions & 0 deletions Core/MIPS/RiscV/RiscVCompSystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,13 @@ void RiscVJitBackend::CompIR_System(IRInst inst) {
QuickCallFunction(GetReplacementFunc(inst.constant)->replaceFunc, SCRATCH2);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();

regs_.Map(inst);
SRAIW(regs_.R(inst.dest), X10, 31);

// Absolute value trick: if neg, abs(x) == (x ^ -1) + 1.
XOR(X10, X10, regs_.R(inst.dest));
SUBW(X10, X10, regs_.R(inst.dest));
SUB(DOWNCOUNTREG, DOWNCOUNTREG, X10);
break;

Expand Down
10 changes: 10 additions & 0 deletions Core/MIPS/x86/Jit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -658,8 +658,18 @@ void Jit::Comp_ReplacementFunc(MIPSOpcode op) {
ApplyRoundingMode();
MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this);
} else {
CMP(32, R(EAX), Imm32(0));
FixupBranch positive = J_CC(CC_GE);

MOV(32, R(ECX), MIPSSTATE_VAR(pc));
ADD(32, MIPSSTATE_VAR(downcount), R(EAX));
FixupBranch done = J();

SetJumpTarget(positive);
MOV(32, R(ECX), MIPSSTATE_VAR(r[MIPS_REG_RA]));
SUB(32, MIPSSTATE_VAR(downcount), R(EAX));

SetJumpTarget(done);
ApplyRoundingMode();
// Need to set flags again, ApplyRoundingMode destroyed them (and EAX.)
SUB(32, MIPSSTATE_VAR(downcount), Imm8(0));
Expand Down
17 changes: 15 additions & 2 deletions Core/MIPS/x86/X64IRCompSystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,21 @@ void X64JitBackend::CompIR_System(IRInst inst) {
ABI_CallFunction(GetReplacementFunc(inst.constant)->replaceFunc);
WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT);
LoadStaticRegisters();
//SUB(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG), R(EAX));
SUB(32, MDisp(CTXREG, downcountOffset), R(EAX));

// Since we flushed above, and we're mapping write, EAX should be safe.
regs_.Map(inst);
MOV(32, regs_.R(inst.dest), R(EAX));
NEG(32, R(EAX));
// Set it back if the negate made it negative. That's the absolute value.
CMOVcc(32, EAX, regs_.R(inst.dest), CC_S);

// Now set the dest to the sign bit status.
SAR(32, regs_.R(inst.dest), Imm8(31));

if (jo.downcountInRegister)
SUB(32, R(DOWNCOUNTREG), R(EAX));
else
SUB(32, MDisp(CTXREG, downcountOffset), R(EAX));
break;

case IROp::Break:
Expand Down