Improve x86 arrayset #7763
base: master
Changes from all commits
ab2a852
8e5212a
32a90a2
761c586
27ae963
@@ -227,6 +227,68 @@ TR::Register* OMR::X86::TreeEvaluator::SIMDstoreEvaluator(TR::Node* node, TR::Co
    return NULL;
    }

+TR::Register* OMR::X86::TreeEvaluator::broadcastHelper(TR::Node *node, TR::Register *vectorReg, TR::VectorLength vl, TR::DataType et, TR::CodeGenerator *cg)
+   {
+   bool broadcast64 = et.isInt64() || et.isDouble();
+
+   if (!cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2) ||
+       vl != TR::VectorLength128)
+      {
+      // Expand byte & word to 32-bits
+      switch (et)
+         {
+         case TR::Int8:
+            generateRegRegInstruction(TR::InstOpCode::PUNPCKLBWRegReg, node, vectorReg, vectorReg, cg);
+            // fallthrough
+         case TR::Int16:
+            generateRegRegImmInstruction(TR::InstOpCode::PSHUFLWRegRegImm1, node, vectorReg, vectorReg, 0x0, cg);
+            et = TR::Int32;
+            break;
+         default:
+            break;
+         }
+      }
+
+   switch (vl)
+      {
+      case TR::VectorLength128:
+         switch (et)
+            {
+            case TR::Int8:
+               TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2), "8-bit to 128-bit vsplats requires AVX2");
So I expect Int8 and Int16 would crash if AVX2 is not supported. The original implementation was a little biased towards older microarchitectures by using the punpcklbw/pshuflw sequence to expand byte and word to 32 bits before using pshufd to broadcast to all lanes. I think it's best to write this in two sections: one for AVX2+ hardware, and keep the older logic for SSE/AVX1. Some rough pseudocode below.

if (avx2) {
    switch (et) {
    case Int8:
        opcode = VPBROADCASTB; break;
    case Int16:
        opcode = VPBROADCASTW; break;
    case Int32:
    case Float:
        opcode = VPBROADCASTD; break;
    case Int64:
    case Double:
        opcode = VPBROADCASTQ; break;
    default:
        // assert (unexpected type)
    }
    encoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl);
    TR_ASSERT_FATAL(encoding != OMR::X86::Bad, "Broadcast instruction is not supported");
    generateRegRegInstruction(opcode, node, vectorReg, vectorReg, cg, encoding);
} else {
    // Expand byte and word to 32-bits.
    // assert if VL > 128; operation not supported
    // PSHUFD
}

Try to run omr tests locally and set …

No, it works because Int8 and Int16 don't reach there. As written now, if AVX2 isn't supported we'll do the 8->16->32 expansion first, and the default PSHUFD case then broadcasts the widened 32-bit element.

OK, I see you set … Either way is OK. I will let you decide.
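As a side note (not part of the patch), the pre-AVX2 widening sequence discussed above can be sketched with SSE2 intrinsics; the helper names below are made up for illustration. PUNPCKLBW duplicates the low byte into a 16-bit word, PSHUFLW replicates that word across the low 64 bits, and PSHUFD with immediate 0 then broadcasts the low dword to every lane; 64-bit splats instead use PSHUFD with immediate 0x44 to repeat the low qword.

#include <emmintrin.h>  // SSE2

// Sketch only: equivalent of the PUNPCKLBW/PSHUFLW/PSHUFD sequence emitted
// when VPBROADCASTB/VPBROADCASTW are unavailable (pre-AVX2 hardware).
static __m128i splatByteSSE2(__m128i v) {   // low byte of v holds the value
    v = _mm_unpacklo_epi8(v, v);            // PUNPCKLBW: byte -> word
    v = _mm_shufflelo_epi16(v, 0x00);       // PSHUFLW 0: word across low 64 bits
    return _mm_shuffle_epi32(v, 0x00);      // PSHUFD 0: dword across all lanes
}

static __m128i splatQwordSSE2(__m128i v) {  // low 64 bits of v hold the value
    return _mm_shuffle_epi32(v, 0x44);      // PSHUFD 0x44: repeat the low qword
}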
+               generateRegRegInstruction(TR::InstOpCode::VPBROADCASTBRegReg, node, vectorReg, vectorReg, cg);
+               break;
+            case TR::Int16:
+               TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2), "16-bit to 128-bit vsplats requires AVX2");
+               generateRegRegInstruction(TR::InstOpCode::VPBROADCASTWRegReg, node, vectorReg, vectorReg, cg);
+               break;
+            default:
+               generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, vectorReg, vectorReg, broadcast64 ? 0x44 : 0, cg);
+               break;
+            }
+         break;
+      case TR::VectorLength256:
+         {
+         TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2), "256-bit vsplats requires AVX2");
+         TR::InstOpCode opcode = broadcast64 ? TR::InstOpCode::VBROADCASTSDYmmYmm : TR::InstOpCode::VBROADCASTSSRegReg;
+         generateRegRegInstruction(opcode.getMnemonic(), node, vectorReg, vectorReg, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength256));
+         break;
+         }
+      case TR::VectorLength512:
+         {
+         TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX512F), "512-bit vsplats requires AVX-512");
+         TR::InstOpCode opcode = broadcast64 ? TR::InstOpCode::VBROADCASTSDZmmXmm : TR::InstOpCode::VBROADCASTSSRegReg;
+         generateRegRegInstruction(opcode.getMnemonic(), node, vectorReg, vectorReg, cg, OMR::X86::EVEX_L512);
+         break;
+         }
+      default:
+         TR_ASSERT_FATAL(0, "Unsupported vector length");
+         break;
+      }
+
+   return vectorReg;
+   }
+
 TR::Register* OMR::X86::TreeEvaluator::SIMDsplatsEvaluator(TR::Node* node, TR::CodeGenerator* cg)
    {
    TR::Node* childNode = node->getChild(0);

@@ -269,40 +331,7 @@ TR::Register* OMR::X86::TreeEvaluator::SIMDsplatsEvaluator(TR::Node* node, TR::C
          break;
       }

-   // Expand byte & word to 32-bits
-   switch (et)
-      {
-      case TR::Int8:
-         generateRegRegInstruction(TR::InstOpCode::PUNPCKLBWRegReg, node, resultReg, resultReg, cg);
-      case TR::Int16:
-         generateRegRegImmInstruction(TR::InstOpCode::PSHUFLWRegRegImm1, node, resultReg, resultReg, 0x0, cg);
-      default:
-         break;
-      }
-
-   switch (vl)
-      {
-      case TR::VectorLength128:
-         generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, resultReg, resultReg, broadcast64 ? 0x44 : 0, cg);
-         break;
-      case TR::VectorLength256:
-         {
-         TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2), "256-bit vsplats requires AVX2");
-         TR::InstOpCode opcode = broadcast64 ? TR::InstOpCode::VBROADCASTSDYmmYmm : TR::InstOpCode::VBROADCASTSSRegReg;
-         generateRegRegInstruction(opcode.getMnemonic(), node, resultReg, resultReg, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength256));
-         break;
-         }
-      case TR::VectorLength512:
-         {
-         TR_ASSERT_FATAL(cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX512F), "512-bit vsplats requires AVX-512");
-         TR::InstOpCode opcode = broadcast64 ? TR::InstOpCode::VBROADCASTSDZmmXmm : TR::InstOpCode::VBROADCASTSSRegReg;
-         generateRegRegInstruction(opcode.getMnemonic(), node, resultReg, resultReg, cg, OMR::X86::EVEX_L512);
-         break;
-         }
-      default:
-         TR_ASSERT_FATAL(0, "Unsupported vector length");
-         break;
-      }
+   TR::TreeEvaluator::broadcastHelper(node, resultReg, vl, et, cg);

    node->setRegister(resultReg);
    cg->decReferenceCount(childNode);
movaps requires alignment (to the size of the vector length); is the memory reference always aligned?
Yes, there's a preceding movups that will set any unaligned bytes, and then the pointer is bumped to the next 16-byte aligned address. (The previous patch left the pointer at the same address if it was already aligned, so the same 16 bytes would be set with movups and then again by the first movaps in the loop. I've fixed that to move the pointer to the next aligned address regardless of whether it was previously aligned, to improve that.)
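To make the pointer adjustment concrete, here is a minimal C++ sketch of the head-store-and-realign scheme described above (function and variable names are illustrative, not from the patch): an unaligned 16-byte store covers the front of the range, and the write pointer is then moved to the next 16-byte boundary unconditionally, so a pointer that was already aligned no longer gets the same 16 bytes stored twice.

#include <cstdint>
#include <cstring>

// Illustrative only: models the movups prologue followed by the bump to the
// next 16-byte boundary; the aligned movaps loop would start at the returned
// address. Bumping unconditionally means an already-aligned pointer advances
// by a full 16 bytes instead of being re-stored by the first movaps.
static uint8_t *setHeadAndRealign(uint8_t *dst, const uint8_t pattern[16])
   {
   std::memcpy(dst, pattern, 16);                      // stands in for the unaligned movups
   uintptr_t addr = reinterpret_cast<uintptr_t>(dst);
   addr = (addr & ~static_cast<uintptr_t>(15)) + 16;   // next 16-byte boundary, even if already aligned
   return reinterpret_cast<uint8_t *>(addr);           // aligned stores continue from here
   }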