@@ -9230,207 +9230,20 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
9230
9230
return result;
9231
9231
}
9232
9232
9233
- // Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
9234
- //
9235
- // Conversion process example:
9236
- //
9237
- // str[8] = example string representing 8 characters (compressed or decompressed)
9238
- //
9239
- // The serial method for creating the hash:
9240
- // hash = 0, offset = 0, count = 8
9241
- // for (int i = offset; i < offset+count; ++i) {
9242
- // hash = (hash << 5) - hash + str[i];
9243
- // }
9244
- //
9245
- // Note that ((hash << 5) - hash) is equivalent to hash * 31
9246
- //
9247
- // Expanding out the for loop:
9248
- // hash = ((((((((0*31+str[0])*31+str[1])*31+str[2])*31+str[3])*31+str[4])*31+str[5])*31+str[6])*31+str[7])
9249
- //
9250
- // Simplified:
9251
- // hash = (31^7)*str[0] + (31^6)*str[1] + (31^5)*str[2] + (31^4)*str[3]
9252
- // + (31^3)*str[4] + (31^2)*str[5] + (31^1)*str[6] + (31^0)*str[7]
9253
- //
9254
- // Rearranged:
9255
- // hash = (31^7)*str[0] + (31^3)*str[4]
9256
- // + (31^6)*str[1] + (31^2)*str[5]
9257
- // + (31^5)*str[2] + (31^1)*str[6]
9258
- // + (31^4)*str[3] + (31^0)*str[7]
9259
- //
9260
- // Factor out [31^3, 31^2, 31^1, 31^0]:
9261
- // hash = 31^3*((31^4)*str[0] + str[4]) Vector[0]
9262
- // + 31^2*((31^4)*str[1] + str[5]) Vector[1]
9263
- // + 31^1*((31^4)*str[2] + str[6]) Vector[2]
9264
- // + 31^0*((31^4)*str[3] + str[7]) Vector[3]
9265
- //
9266
- // Keep factoring out any 31^4 if possible (this example has no such case). If the string was 12 characters long then:
9267
- // 31^3*((31^8)*str[0] + (31^4)*str[4] + (31^0)*str[8]) would become 31^3*(31^4((31^4)*str[0] + str[4]) + (31^0)*str[8])
9268
- //
9269
- // Vectorization is done by simultaneously calculating the four sums that hash is made of (each -> is a successive step):
9270
- // Vector[0] = str[0] -> multiply 31^4 -> add str[4] -> multiply 31^3
9271
- // Vector[1] = str[1] -> multiply 31^4 -> add str[5] -> multiply 31^2
9272
- // Vector[2] = str[2] -> multiply 31^4 -> add str[6] -> multiply 31^1
9273
- // Vector[3] = str[3] -> multiply 31^4 -> add str[7] -> multiply 1
9274
- //
9275
- // Adding these four vectorized values together produces the required hash.
9276
- // If the number of characters in the string is not a multiple of 4, then the remainder of the hash is calculated serially.
9277
- //
9278
- // Implementation overview:
9279
- //
9280
- // start_label
9281
- // if size < threshold, goto serial_label, current threshold is 4
9282
- // xmm0 = load 16 bytes align constant [923521, 923521, 923521, 923521]
9283
- // xmm1 = 0
9284
- // SSEloop
9285
- // xmm2 = decompressed: load 8 byte value in lower 8 bytes.
9286
- // compressed: load 4 byte value in lower 4 bytes
9287
- // xmm1 = xmm1 * xmm0
9288
- // if(isCompressed)
9289
- // movzxbd xmm2, xmm2
9290
- // else
9291
- // movzxwd xmm2, xmm2
9292
- // xmm1 = xmm1 + xmm2
9293
- // i = i + 4;
9294
- // cmp i, end -3
9295
- // jge SSEloop
9296
- // xmm0 = load 16 bytes align [31^3, 31^2, 31, 1]
9297
- // xmm1 = xmm1 * xmm0 value contains [a0, a1, a2, a3]
9298
- // xmm0 = xmm1
9299
- // xmm0 = xmm0 >> 64 bits
9300
- // xmm1 = xmm1 + xmm0 reduce add [a0+a2, a1+a3, .., ...]
9301
- // xmm0 = xmm1
9302
- // xmm0 = xmm0 >> 32 bits
9303
- // xmm1 = xmm1 + xmm0 reduce add [a0+a2 + a1+a3, .., .., ..]
9304
- // movd xmm1, GPR1
9305
- //
9306
- // serial_label
9307
- //
9308
- // cmp i end
9309
- // jle end
9310
- // serial_loop
9311
- // GPR2 = GPR1
9312
- // GPR1 = GPR1 << 5
9313
- // GPR1 = GPR1 - GPR2
9314
- // GPR2 = load c[i]
9315
- // add GPR1, GPR2
9316
- // dec i
9317
- // cmp i, end
9318
- // jl serial_loop
9319
- //
9320
- // end_label
9321
9233
static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR::CodeGenerator* cg)
9322
9234
{
9323
- TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
9324
-
9325
- const int size = 4;
9326
- auto shift = isCompressed ? 0 : 1;
9327
-
9328
- auto address = cg->evaluate(node->getChild(0));
9329
- auto length = cg->evaluate(node->getChild(2));
9330
- auto index = cg->allocateRegister();
9331
- auto hash = cg->allocateRegister();
9332
- auto tmp = cg->allocateRegister();
9333
- auto hashXMM = cg->allocateRegister(TR_VRF);
9334
- auto tmpXMM = cg->allocateRegister(TR_VRF);
9335
- auto multiplierXMM = cg->allocateRegister(TR_VRF);
9336
-
9337
- auto begLabel = generateLabelSymbol(cg);
9338
- auto endLabel = generateLabelSymbol(cg);
9339
- auto loopLabel = generateLabelSymbol(cg);
9340
- begLabel->setStartInternalControlFlow();
9341
- endLabel->setEndInternalControlFlow();
9342
- auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
9343
- deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
9344
- deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
9345
- deps->addPreCondition(length, TR::RealRegister::NoReg, cg);
9346
- deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
9347
- deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
9348
- deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
9349
- deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
9350
- deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
9351
- deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9352
- deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
9353
- deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
9354
- deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
9355
-
9356
- generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
9357
- generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
9358
- generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
9359
-
9360
- // Prepend zeros
9361
- {
9362
- TR::Compilation *comp = cg->comp();
9363
-
9364
- static uint64_t MASKDECOMPRESSED[] = { 0x0000000000000000ULL, 0xffffffffffffffffULL };
9365
- static uint64_t MASKCOMPRESSED[] = { 0xffffffff00000000ULL, 0x0000000000000000ULL };
9366
- generateRegMemInstruction(isCompressed ? TR::InstOpCode::MOVDRegMem : TR::InstOpCode::MOVQRegMem, node, hashXMM, generateX86MemoryReference(address, index, shift, -(size << shift) + TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
9367
- generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, tmp, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, isCompressed ? MASKCOMPRESSED : MASKDECOMPRESSED), cg), cg);
9368
-
9369
- auto mr = generateX86MemoryReference(tmp, index, shift, 0, cg);
9370
- if (comp->target().cpu.supportsAVX())
9371
- {
9372
- generateRegMemInstruction(TR::InstOpCode::PANDRegMem, node, hashXMM, mr, cg);
9373
- }
9374
- else
9375
- {
9376
- generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXMM, mr, cg);
9377
- generateRegRegInstruction(TR::InstOpCode::PANDRegReg, node, hashXMM, tmpXMM, cg);
9378
- }
9379
- generateRegRegInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegReg : TR::InstOpCode::PMOVZXWDRegReg, node, hashXMM, hashXMM, cg);
9380
- }
9381
-
9382
- // Reduction Loop
9383
- {
9384
- static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
9385
- generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9386
- generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9387
- generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9388
- generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9389
- generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
9390
- generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
9391
- generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
9392
- generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
9393
- generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9394
- generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9395
- generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
9396
- generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
9397
- }
9398
-
9399
- // Finalization
9400
- {
9401
- static uint32_t multiplier[] = { 31*31*31, 31*31, 31, 1 };
9402
- generateRegMemInstruction(TR::InstOpCode::PMULLDRegMem, node, hashXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9403
- generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpXMM, hashXMM, 0x0e, cg);
9404
- generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9405
- generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpXMM, hashXMM, 0x01, cg);
9406
- generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9407
- }
9408
-
9409
- generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
9235
+ TR::Register *hashResult = TR::TreeEvaluator::vectorizedHashCodeHelper(node, isCompressed ? TR::Int8 : TR::Int16, NULL, false, cg);
9236
+ node->setRegister(hashResult);
9410
9237
9411
- cg->stopUsingRegister(index);
9412
- cg->stopUsingRegister(tmp);
9413
- cg->stopUsingRegister(hashXMM);
9414
- cg->stopUsingRegister(tmpXMM);
9415
- cg->stopUsingRegister(multiplierXMM);
9416
-
9417
- node->setRegister(hash);
9418
- cg->decReferenceCount(node->getChild(0));
9419
- cg->recursivelyDecReferenceCount(node->getChild(1));
9420
- cg->decReferenceCount(node->getChild(2));
9421
- return hash;
9238
+ return hashResult;
9422
9239
}
9423
9240
9424
9241
TR::Register* J9::X86::TreeEvaluator::inlineVectorizedHashCode(TR::Node* node, TR::CodeGenerator* cg)
9425
9242
{
9426
- TR::Node *fromIndexNode = node->getChild(1);
9427
9243
TR::Node *initialValueNode = node->getChild(3);
9428
9244
TR::Node *elementTypeNode = node->getChild(4);
9429
9245
TR::Register* registerHash = NULL;
9430
9246
9431
- if (!(fromIndexNode->getOpCodeValue() == TR::iconst && fromIndexNode->getInt() == 0))
9432
- return NULL; // only supporting offset of const 0
9433
-
9434
9247
switch (elementTypeNode->getConstValue())
9435
9248
{
9436
9249
case 4: // T_BOOLEAN
@@ -9569,15 +9382,23 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
9569
9382
TR::Register *tmpVRF = cg->allocateRegister(TR_VRF);
9570
9383
TR::Register *multiplierVRF = cg->allocateRegister(TR_VRF);
9571
9384
9572
- TR::Register *hashRegsVRF[] = {cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF) } ;
9573
- TR::Register *multiplier31PowNRegsVRF[] = {cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF), cg->allocateRegister(TR_VRF) } ;
9385
+ TR::Register *hashRegsVRF[4] ;
9386
+ TR::Register *multiplier31PowNRegsVRF[4] ;
9574
9387
9575
9388
deps->addPostCondition(tmp, TR::RealRegister::NoReg, cg);
9576
9389
deps->addPostCondition(tmpVRF, TR::RealRegister::NoReg, cg);
9577
9390
deps->addPostCondition(multiplierVRF, TR::RealRegister::NoReg, cg);
9578
9391
9579
- for (int32_t i = 0; i < 4; i++) deps->addPostCondition(multiplier31PowNRegsVRF[i], TR::RealRegister::NoReg, cg);
9580
- for (int32_t i = 0; i < 4; i++) deps->addPostCondition(hashRegsVRF[i], TR::RealRegister::NoReg, cg);
9392
+ for (int32_t i = 0; i < unrollCount; i++)
9393
+ {
9394
+ hashRegsVRF[i] = cg->allocateRegister(TR_VRF);
9395
+ multiplier31PowNRegsVRF[i] = cg->allocateRegister(TR_VRF);
9396
+
9397
+ deps->addPostCondition(hashRegsVRF[i], TR::RealRegister::NoReg, cg);
9398
+ deps->addPostCondition(multiplier31PowNRegsVRF[i], TR::RealRegister::NoReg, cg);
9399
+ }
9400
+
9401
+ deps->stopAddingConditions();
9581
9402
9582
9403
TR::LabelSymbol *begLabel = generateLabelSymbol(cg);
9583
9404
TR::LabelSymbol *endLabel = generateLabelSymbol(cg);
@@ -9687,8 +9508,8 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
9687
9508
cg->stopUsingRegister(tmpVRF);
9688
9509
cg->stopUsingRegister(multiplierVRF);
9689
9510
9690
- for (int32_t i = 0; i < 4 ; i++) cg->stopUsingRegister(multiplier31PowNRegsVRF[i]);
9691
- for (int32_t i = 0; i < 4 ; i++) cg->stopUsingRegister(hashRegsVRF[i]);
9511
+ for (int32_t i = 0; i < unrollCount ; i++) cg->stopUsingRegister(multiplier31PowNRegsVRF[i]);
9512
+ for (int32_t i = 0; i < unrollCount ; i++) cg->stopUsingRegister(hashRegsVRF[i]);
9692
9513
9693
9514
return result;
9694
9515
}
@@ -9743,7 +9564,6 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
9743
9564
{
9744
9565
int32_t shift = dt - TR::Int8; /* i8 -> 0, i16 -> 1, i32 -> 2 */
9745
9566
9746
- TR_ASSERT_FATAL(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "vector hashcode offset can only be const zero.");
9747
9567
TR_ASSERT_FATAL(shift >= 0 && shift <= 2, "Unsupported datatype for vectorized hashcode");
9748
9568
9749
9569
TR::Compilation *comp = cg->comp();
@@ -9754,7 +9574,12 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
9754
9574
else if (comp->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX2))
9755
9575
vl = TR::VectorLength256;
9756
9576
9757
- TR::Register *address = cg->evaluate(node->getChild(0));
9577
+ TR::Node *addressNode = node->getChild(0);
9578
+
9579
+ bool nonZeroOffset = node->getChild(1)->getOpCodeValue() != TR::iconst || node->getChild(1)->getInt() != 0;
9580
+ bool addressIs64bits = TR::TreeEvaluator::getNodeIs64Bit(addressNode, cg);
9581
+
9582
+ TR::Register *address = nonZeroOffset ? TR::TreeEvaluator::intOrLongClobberEvaluate(addressNode, addressIs64bits, cg) : cg->evaluate(addressNode);
9758
9583
TR::Register *length = cg->evaluate(node->getChild(2));
9759
9584
TR::Register *initHash = nodeHash ? cg->intClobberEvaluate(nodeHash) : cg->allocateRegister(TR_GPR);
9760
9585
TR::Register *index = cg->allocateRegister();
@@ -9769,6 +9594,14 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
9769
9594
deps->addPostCondition(tmp, TR::RealRegister::NoReg, cg);
9770
9595
deps->addPostCondition(initHash, TR::RealRegister::NoReg, cg);
9771
9596
deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9597
+ deps->stopAddingConditions();
9598
+
9599
+ if (nonZeroOffset)
9600
+ {
9601
+ TR::Register *offset = cg->evaluate(node->getChild(1));
9602
+ TR::MemoryReference *memRef = generateX86MemoryReference(address, offset, shift, 0, cg);
9603
+ generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, address, memRef, cg);
9604
+ }
9772
9605
9773
9606
if (!nodeHash)
9774
9607
{
@@ -9781,14 +9614,19 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
9781
9614
9782
9615
// Generate Main Loop; 4x Unrolled seems to yield the best performance for large arrays
9783
9616
static char *unrollVar = feGetEnv("TR_setInlineVectorHashCodeUnrollCount");
9617
+
9618
+ #ifdef TR_TARGET_64BIT
9784
9619
int32_t unrollCount = unrollVar ? atoi(unrollVar) : 4;
9620
+ #else
9621
+ int32_t unrollCount = 1;
9622
+ #endif
9785
9623
9786
9624
vectorizedHashCodeLoopHelper(node, dt, vl, isSigned, result, initHash, index, length, address, unrollCount, cg);
9787
9625
9788
9626
static bool disableSecondLoop = feGetEnv("TR_disableVectorHashCodeSecondLoop") != NULL;
9789
9627
9790
- // Generate a second vectorized loop;
9791
- if (!disableSecondLoop)
9628
+ // Generate a second vectorized loop if not disabled and Vl/unrollCount are not the same as the first loop
9629
+ if (!disableSecondLoop && (unrollCount != 1 || vl != TR::VectorLength128) )
9792
9630
{
9793
9631
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, initHash, result, cg);
9794
9632
vectorizedHashCodeLoopHelper(node, dt, TR::VectorLength128, isSigned, result, initHash, index, length, address, 1, cg);
@@ -9828,6 +9666,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeHelper(TR::Node *node, TR::DataType dt
9828
9666
generateLabelInstruction(TR::InstOpCode::label, node, residueEndLoopLabel, deps, cg);
9829
9667
}
9830
9668
9669
+ if (nonZeroOffset)
9670
+ {
9671
+ cg->stopUsingRegister(address);
9672
+ }
9673
+
9831
9674
cg->stopUsingRegister(initHash);
9832
9675
cg->stopUsingRegister(index);
9833
9676
cg->stopUsingRegister(tmp);
@@ -12193,14 +12036,14 @@ J9::X86::TreeEvaluator::directCallEvaluator(TR::Node *node, TR::CodeGenerator *c
12193
12036
return TR::TreeEvaluator::encodeUTF16Evaluator(node, cg);
12194
12037
12195
12038
case TR::java_lang_String_hashCodeImplDecompressed:
12196
- if (cg->getSupportsInlineStringHashCode())
12039
+ if (cg->getSupportsInlineStringHashCode() && !node->getBlock()->isCold() )
12197
12040
returnRegister = inlineStringHashCode(node, false, cg);
12198
12041
12199
12042
callInlined = (returnRegister != NULL);
12200
12043
break;
12201
12044
12202
12045
case TR::java_lang_String_hashCodeImplCompressed:
12203
- if (cg->getSupportsInlineStringHashCode())
12046
+ if (cg->getSupportsInlineStringHashCode() && !node->getBlock()->isCold() )
12204
12047
returnRegister = inlineStringHashCode(node, true, cg);
12205
12048
12206
12049
callInlined = (returnRegister != NULL);
0 commit comments