Skip to content

Commit 7d26f3f

Browse files
authored
Merge pull request #21282 from BradleyWood/0.51-shc
(0.51) x86: Implement String.hashCode with vectorizedHashCode()
2 parents 8c56a80 + 1bae589 commit 7d26f3f

File tree

1 file changed

+45
-202
lines changed

1 file changed

+45
-202
lines changed

runtime/compiler/x/codegen/J9TreeEvaluator.cpp

Lines changed: 45 additions & 202 deletions
Original file line numberDiff line numberDiff line change
@@ -9230,207 +9230,20 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
92309230
return result;
92319231
}
92329232

9233-
// Convert the serial String.hashCode computation into an equivalent vectorized form implemented with SSE instructions
9234-
//
9235-
// Conversion process example:
9236-
//
9237-
// str[8] = example string representing 8 characters (compressed or decompressed)
9238-
//
9239-
// The serial method for creating the hash:
9240-
// hash = 0, offset = 0, count = 8
9241-
// for (int i = offset; i < offset+count; ++i) {
9242-
// hash = (hash << 5) - hash + str[i];
9243-
// }
9244-
//
9245-
// Note that ((hash << 5) - hash) is equivalent to hash * 31
9246-
//
9247-
// Expanding out the for loop:
9248-
// hash = ((((((((0*31+str[0])*31+str[1])*31+str[2])*31+str[3])*31+str[4])*31+str[5])*31+str[6])*31+str[7])
9249-
//
9250-
// Simplified:
9251-
// hash = (31^7)*str[0] + (31^6)*str[1] + (31^5)*str[2] + (31^4)*str[3]
9252-
// + (31^3)*str[4] + (31^2)*str[5] + (31^1)*str[6] + (31^0)*str[7]
9253-
//
9254-
// Rearranged:
9255-
// hash = (31^7)*str[0] + (31^3)*str[4]
9256-
// + (31^6)*str[1] + (31^2)*str[5]
9257-
// + (31^5)*str[2] + (31^1)*str[6]
9258-
// + (31^4)*str[3] + (31^0)*str[7]
9259-
//
9260-
// Factor out [31^3, 31^2, 31^1, 31^0]:
9261-
// hash = 31^3*((31^4)*str[0] + str[4]) Vector[0]
9262-
// + 31^2*((31^4)*str[1] + str[5]) Vector[1]
9263-
// + 31^1*((31^4)*str[2] + str[6]) Vector[2]
9264-
// + 31^0*((31^4)*str[3] + str[7]) Vector[3]
9265-
//
9266-
// Keep factoring out any 31^4 if possible (this example has no such case). If the string was 12 characters long then:
9267-
// 31^3*((31^8)*str[0] + (31^4)*str[4] + (31^0)*str[8]) would become 31^3*((31^4)*((31^4)*str[0] + str[4]) + (31^0)*str[8])
9268-
//
9269-
// Vectorization is done by simultaneously calculating the four sums that hash is made of (each -> is a successive step):
9270-
// Vector[0] = str[0] -> multiply 31^4 -> add str[4] -> multiply 31^3
9271-
// Vector[1] = str[1] -> multiply 31^4 -> add str[5] -> multiply 31^2
9272-
// Vector[2] = str[2] -> multiply 31^4 -> add str[6] -> multiply 31^1
9273-
// Vector[3] = str[3] -> multiply 31^4 -> add str[7] -> multiply 1
9274-
//
9275-
// Adding these four vectorized values together produces the required hash.
9276-
// If the number of characters in the string is not a multiple of 4, then the remainder of the hash is calculated serially.
9277-
//
9278-
// Implementation overview:
9279-
//
9280-
// start_label
9281-
// if size < threshold, goto serial_label, current threshold is 4
9282-
// xmm0 = load 16 bytes align constant [923521, 923521, 923521, 923521]
9283-
// xmm1 = 0
9284-
// SSEloop
9285-
// xmm2 = decompressed: load 8 byte value in lower 8 bytes.
9286-
// compressed: load 4 byte value in lower 4 bytes
9287-
// xmm1 = xmm1 * xmm0
9288-
// if(isCompressed)
9289-
// pmovzxbd xmm2, xmm2
9290-
// else
9291-
// pmovzxwd xmm2, xmm2
9292-
// xmm1 = xmm1 + xmm2
9293-
// i = i + 4;
9294-
// cmp i, end -3
9295-
// jge SSEloop
9296-
// xmm0 = load 16 bytes align [31^3, 31^2, 31, 1]
9297-
// xmm1 = xmm1 * xmm0 value contains [a0, a1, a2, a3]
9298-
// xmm0 = xmm1
9299-
// xmm0 = xmm0 >> 64 bits
9300-
// xmm1 = xmm1 + xmm0 reduce add [a0+a2, a1+a3, .., ...]
9301-
// xmm0 = xmm1
9302-
// xmm0 = xmm0 >> 32 bits
9303-
// xmm1 = xmm1 + xmm0 reduce add [a0+a2 + a1+a3, .., .., ..]
9304-
// movd GPR1, xmm1
9305-
//
9306-
// serial_label
9307-
//
9308-
// cmp i end
9309-
// jle end
9310-
// serial_loop
9311-
// GPR2 = GPR1
9312-
// GPR1 = GPR1 << 5
9313-
// GPR1 = GPR1 - GPR2
9314-
// GPR2 = load c[i]
9315-
// add GPR1, GPR2
9316-
// dec i
9317-
// cmp i, end
9318-
// jl serial_loop
9319-
//
9320-
// end_label
93219233
static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR::CodeGenerator* cg)
93229234
{
9323-
TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
9324-
9325-
const int size = 4;
9326-
auto shift = isCompressed ? 0 : 1;
9327-
9328-
auto address = cg->evaluate(node->getChild(0));
9329-
auto length = cg->evaluate(node->getChild(2));
9330-
auto index = cg->allocateRegister();
9331-
auto hash = cg->allocateRegister();
9332-
auto tmp = cg->allocateRegister();
9333-
auto hashXMM = cg->allocateRegister(TR_VRF);
9334-
auto tmpXMM = cg->allocateRegister(TR_VRF);
9335-
auto multiplierXMM = cg->allocateRegister(TR_VRF);
9336-
9337-
auto begLabel = generateLabelSymbol(cg);
9338-
auto endLabel = generateLabelSymbol(cg);
9339-
auto loopLabel = generateLabelSymbol(cg);
9340-
begLabel->setStartInternalControlFlow();
9341-
endLabel->setEndInternalControlFlow();
9342-
auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
9343-
deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
9344-
deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
9345-
deps->addPreCondition(length, TR::RealRegister::NoReg, cg);
9346-
deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
9347-
deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
9348-
deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
9349-
deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
9350-
deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
9351-
deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9352-
deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
9353-
deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
9354-
deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
9355-
9356-
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
9357-
generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
9358-
generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
9359-
9360-
// Prepend zeros
9361-
{
9362-
TR::Compilation *comp = cg->comp();
9363-
9364-
static uint64_t MASKDECOMPRESSED[] = { 0x0000000000000000ULL, 0xffffffffffffffffULL };
9365-
static uint64_t MASKCOMPRESSED[] = { 0xffffffff00000000ULL, 0x0000000000000000ULL };
9366-
generateRegMemInstruction(isCompressed ? TR::InstOpCode::MOVDRegMem : TR::InstOpCode::MOVQRegMem, node, hashXMM, generateX86MemoryReference(address, index, shift, -(size << shift) + TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
9367-
generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, tmp, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, isCompressed ? MASKCOMPRESSED : MASKDECOMPRESSED), cg), cg);
9368-
9369-
auto mr = generateX86MemoryReference(tmp, index, shift, 0, cg);
9370-
if (comp->target().cpu.supportsAVX())
9371-
{
9372-
generateRegMemInstruction(TR::InstOpCode::PANDRegMem, node, hashXMM, mr, cg);
9373-
}
9374-
else
9375-
{
9376-
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXMM, mr, cg);
9377-
generateRegRegInstruction(TR::InstOpCode::PANDRegReg, node, hashXMM, tmpXMM, cg);
9378-
}
9379-
generateRegRegInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegReg : TR::InstOpCode::PMOVZXWDRegReg, node, hashXMM, hashXMM, cg);
9380-
}
9381-
9382-
// Reduction Loop
9383-
{
9384-
static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
9385-
generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9386-
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9387-
generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9388-
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9389-
generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
9390-
generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
9391-
generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
9392-
generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
9393-
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9394-
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9395-
generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
9396-
generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
9397-
}
9398-
9399-
// Finalization
9400-
{
9401-
static uint32_t multiplier[] = { 31*31*31, 31*31, 31, 1 };
9402-
generateRegMemInstruction(TR::InstOpCode::PMULLDRegMem, node, hashXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9403-
generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpXMM, hashXMM, 0x0e, cg);
9404-
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9405-
generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpXMM, hashXMM, 0x01, cg);
9406-
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9407-
}
9408-
9409-
generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
9235+
TR::Register *hashResult = TR::TreeEvaluator::vectorizedHashCodeHelper(node, isCompressed ? TR::Int8 : TR::Int16, NULL, false, cg);
9236+
node->setRegister(hashResult);
94109237

9411-
cg->stopUsingRegister(index);
9412-
cg->stopUsingRegister(tmp);
9413-
cg->stopUsingRegister(hashXMM);
9414-
cg->stopUsingRegister(tmpXMM);
9415-
cg->stopUsingRegister(multiplierXMM);
9416-
9417-
node->setRegister(hash);
9418-
cg->decReferenceCount(node->getChild(0));
9419-
cg->recursivelyDecReferenceCount(node->getChild(1));
9420-
cg->decReferenceCount(node->getChild(2));
9421-
return hash;
9238+
return hashResult;
94229239
}
94239240

94249241
TR::Register* J9::X86::TreeEvaluator::inlineVectorizedHashCode(TR::Node* node, TR::CodeGenerator* cg)
94259242
{
9426-
TR::Node *fromIndexNode = node->getChild(1);
94279243
TR::Node *initialValueNode = node->getChild(3);
94289244
TR::Node *elementTypeNode = node->getChild(4);
94299245
TR::Register* registerHash = NULL;
94309246

9431-
if (!(fromIndexNode->getOpCodeValue() == TR::iconst && fromIndexNode->getInt() == 0))
9432-
return NULL; // only supporting offset of const 0
9433-
94349247
switch (elementTypeNode->getConstValue())
94359248
{
94369249
case 4: // T_BOOLEAN
@@ -9569,15 +9382,23 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
95699382
TR::Register *tmpVRF = cg->allocateRegister(TR_VRF);
95709383
TR::Register *multiplierVRF = cg->allocateRegister(TR_VRF);
95719384

9572-
TR::Register *hashRegsVRF[] = {cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF) };
9573-
TR::Register *multiplier31PowNRegsVRF[] = {cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF) };
9385+
TR::Register *hashRegsVRF[4];
9386+
TR::Register *multiplier31PowNRegsVRF[4];
95749387

95759388
deps->addPostCondition(tmp, TR::RealRegister::NoReg, cg);
95769389
deps->addPostCondition(tmpVRF, TR::RealRegister::NoReg, cg);
95779390
deps->addPostCondition(multiplierVRF, TR::RealRegister::NoReg, cg);
95789391

9579-
for (int32_t i = 0; i < 4; i++) deps->addPostCondition(multiplier31PowNRegsVRF[i], TR::RealRegister::NoReg, cg);
9580-
for (int32_t i = 0; i < 4; i++) deps->addPostCondition(hashRegsVRF[i], TR::RealRegister::NoReg, cg);
9392+
for (int32_t i = 0; i < unrollCount; i++)
9393+
{
9394+
hashRegsVRF[i] = cg->allocateRegister(TR_VRF);
9395+
multiplier31PowNRegsVRF[i] = cg->allocateRegister(TR_VRF);
9396+
9397+
deps->addPostCondition(hashRegsVRF[i], TR::RealRegister::NoReg, cg);
9398+
deps->addPostCondition(multiplier31PowNRegsVRF[i], TR::RealRegister::NoReg, cg);
9399+
}
9400+
9401+
deps->stopAddingConditions();
95819402

95829403
TR::LabelSymbol *begLabel = generateLabelSymbol(cg);
95839404
TR::LabelSymbol *endLabel = generateLabelSymbol(cg);
@@ -9687,8 +9508,8 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
96879508
cg->stopUsingRegister(tmpVRF);
96889509
cg->stopUsingRegister(multiplierVRF);
96899510

9690-
for (int32_t i = 0; i < 4; i++) cg->stopUsingRegister(multiplier31PowNRegsVRF[i]);
9691-
for (int32_t i = 0; i < 4; i++) cg->stopUsingRegister(hashRegsVRF[i]);
9511+
for (int32_t i = 0; i < unrollCount; i++) cg->stopUsingRegister(multiplier31PowNRegsVRF[i]);
9512+
for (int32_t i = 0; i < unrollCount; i++) cg->stopUsingRegister(hashRegsVRF[i]);
96929513

96939514
return result;
96949515
}
@@ -9743,7 +9564,6 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
97439564
{
97449565
int32_t shift = dt - TR::Int8; /* i8 -> 0, i16 -> 1, i32 -> 2 */
97459566

9746-
TR_ASSERT_FATAL(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "vector hashcode offset can only be const zero.");
97479567
TR_ASSERT_FATAL(shift >= 0 && shift <= 2, "Unsupported datatype for vectorized hashcode");
97489568

97499569
TR::Compilation *comp = cg->comp();
@@ -9754,7 +9574,12 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
97549574
else if (comp->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2))
97559575
vl = TR::VectorLength256;
97569576

9757-
TR::Register *address = cg->evaluate(node->getChild(0));
9577+
TR::Node *addressNode = node->getChild(0);
9578+
9579+
bool nonZeroOffset = node->getChild(1)->getOpCodeValue() != TR::iconst || node->getChild(1)->getInt() != 0;
9580+
bool addressIs64bits = TR::TreeEvaluator::getNodeIs64Bit(addressNode, cg);
9581+
9582+
TR::Register *address = nonZeroOffset ? TR::TreeEvaluator::intOrLongClobberEvaluate(addressNode, addressIs64bits, cg) : cg->evaluate(addressNode);
97589583
TR::Register *length = cg->evaluate(node->getChild(2));
97599584
TR::Register *initHash = nodeHash ? cg->intClobberEvaluate(nodeHash) : cg->allocateRegister(TR_GPR);
97609585
TR::Register *index = cg->allocateRegister();
@@ -9769,6 +9594,14 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
97699594
deps->addPostCondition(tmp, TR::RealRegister::NoReg, cg);
97709595
deps->addPostCondition(initHash, TR::RealRegister::NoReg, cg);
97719596
deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9597+
deps->stopAddingConditions();
9598+
9599+
if (nonZeroOffset)
9600+
{
9601+
TR::Register *offset = cg->evaluate(node->getChild(1));
9602+
TR::MemoryReference *memRef = generateX86MemoryReference(address, offset, shift, 0, cg);
9603+
generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, address, memRef, cg);
9604+
}
97729605

97739606
if (!nodeHash)
97749607
{
@@ -9781,14 +9614,19 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
97819614

97829615
// Generate Main Loop; 4x Unrolled seems to yield the best performance for large arrays
97839616
static char *unrollVar = feGetEnv("TR_setInlineVectorHashCodeUnrollCount");
9617+
9618+
#ifdef TR_TARGET_64BIT
97849619
int32_t unrollCount = unrollVar ? atoi(unrollVar) : 4;
9620+
#else
9621+
int32_t unrollCount = 1;
9622+
#endif
97859623

97869624
vectorizedHashCodeLoopHelper(node, dt, vl, isSigned, result, initHash, index, length, address, unrollCount, cg);
97879625

97889626
static bool disableSecondLoop = feGetEnv("TR_disableVectorHashCodeSecondLoop") != NULL;
97899627

9790-
// Generate a second vectorized loop;
9791-
if (!disableSecondLoop)
9628+
// Generate a second vectorized loop (128-bit vectors, no unrolling) unless it is disabled or the first loop already used that same vector length and unroll count
9629+
if (!disableSecondLoop && (unrollCount != 1 || vl != TR::VectorLength128))
97929630
{
97939631
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, initHash, result, cg);
97949632
vectorizedHashCodeLoopHelper(node, dt, TR::VectorLength128, isSigned, result, initHash, index, length, address, 1, cg);
@@ -9828,6 +9666,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
98289666
generateLabelInstruction(TR::InstOpCode::label, node, residueEndLoopLabel, deps, cg);
98299667
}
98309668

9669+
if (nonZeroOffset)
9670+
{
9671+
cg->stopUsingRegister(address);
9672+
}
9673+
98319674
cg->stopUsingRegister(initHash);
98329675
cg->stopUsingRegister(index);
98339676
cg->stopUsingRegister(tmp);
@@ -12193,14 +12036,14 @@ J9::X86::TreeEvaluator::directCallEvaluator(TR::Node *node, TR::CodeGenerator *c
1219312036
return TR::TreeEvaluator::encodeUTF16Evaluator(node, cg);
1219412037

1219512038
case TR::java_lang_String_hashCodeImplDecompressed:
12196-
if (cg->getSupportsInlineStringHashCode())
12039+
if (cg->getSupportsInlineStringHashCode() && !node->getBlock()->isCold())
1219712040
returnRegister = inlineStringHashCode(node, false, cg);
1219812041

1219912042
callInlined = (returnRegister != NULL);
1220012043
break;
1220112044

1220212045
case TR::java_lang_String_hashCodeImplCompressed:
12203-
if (cg->getSupportsInlineStringHashCode())
12046+
if (cg->getSupportsInlineStringHashCode() && !node->getBlock()->isCold())
1220412047
returnRegister = inlineStringHashCode(node, true, cg);
1220512048

1220612049
callInlined = (returnRegister != NULL);

0 commit comments

Comments
 (0)