20 | 20 | //===----------------------------------------------------------------------===//
21 | 21 |
22 | 22 | #include "AArch64TargetMachine.h"
   | 23 | +#include "llvm/ADT/STLExtras.h"
23 | 24 | #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
   | 25 | +#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
24 | 26 | #include "llvm/CodeGen/GlobalISel/Combiner.h"
25 | 27 | #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
26 | 28 | #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -439,6 +441,22 @@ class AArch64PostLegalizerCombiner : public MachineFunctionPass {
439 | 441 | private:
440 | 442 |   bool IsOptNone;
441 | 443 |   AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;
    | 444 | +
    | 445 | +
    | 446 | +  struct StoreInfo {
    | 447 | +    GStore *St = nullptr;
    | 448 | +    // The G_PTR_ADD that's used by the store. We keep this to cache the
    | 449 | +    // MachineInstr def.
    | 450 | +    GPtrAdd *Ptr = nullptr;
    | 451 | +    // The signed offset to the Ptr instruction.
    | 452 | +    int64_t Offset = 0;
    | 453 | +    LLT StoredType;
    | 454 | +  };
    | 455 | +  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
    | 456 | +                               CSEMIRBuilder &MIB);
    | 457 | +
    | 458 | +  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
    | 459 | +                                          CSEMIRBuilder &MIB);
442 | 460 | };
443 | 461 | } // end anonymous namespace
444 | 462 |
@@ -492,7 +510,191 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
492 | 510 |                      F.hasMinSize());
493 | 511 |   AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
494 | 512 |                                         RuleConfig, ST, MDT, LI);
495 |     | -  return Impl.combineMachineInstrs();
    | 513 | +  bool Changed = Impl.combineMachineInstrs();
    | 514 | +
    | 515 | +  auto MIB = CSEMIRBuilder(MF);
    | 516 | +  MIB.setCSEInfo(CSEInfo);
    | 517 | +  Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
    | 518 | +  return Changed;
    | 519 | +}
    | 520 | +
    | 521 | +bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
    | 522 | +    SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
    | 523 | +  if (Stores.size() <= 2)
    | 524 | +    return false;
    | 525 | +
    | 526 | +  // Profitability checks:
    | 527 | +  int64_t BaseOffset = Stores[0].Offset;
    | 528 | +  unsigned NumPairsExpected = Stores.size() / 2;
    | 529 | +  unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
    | 530 | +  // Size savings will depend on whether we can fold the offset as an
    | 531 | +  // immediate of an ADD.
    | 532 | +  auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
    | 533 | +  if (!TLI.isLegalAddImmediate(BaseOffset))
    | 534 | +    TotalInstsExpected++;
    | 535 | +  int SavingsExpected = Stores.size() - TotalInstsExpected;
    | 536 | +  if (SavingsExpected <= 0)
    | 537 | +    return false;
    | 538 | +
    | 539 | +  auto &MRI = MIB.getMF().getRegInfo();
    | 540 | +
    | 541 | +  // We have a series of consecutive stores. Factor out the common base
    | 542 | +  // pointer and rewrite the offsets.
    | 543 | +  Register NewBase = Stores[0].Ptr->getReg(0);
    | 544 | +  for (auto &SInfo : Stores) {
    | 545 | +    // Compute a new pointer with the new base ptr and adjusted offset.
    | 546 | +    MIB.setInstrAndDebugLoc(*SInfo.St);
    | 547 | +    auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
    | 548 | +    auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
    | 549 | +                                  NewBase, NewOff);
    | 550 | +    if (MIB.getObserver())
    | 551 | +      MIB.getObserver()->changingInstr(*SInfo.St);
    | 552 | +    SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
    | 553 | +    if (MIB.getObserver())
    | 554 | +      MIB.getObserver()->changedInstr(*SInfo.St);
    | 555 | +  }
    | 556 | +  LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
    | 557 | +                    << " stores into a base pointer and offsets.\n");
    | 558 | +  return true;
    | 559 | +}
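
As a worked illustration of the profitability estimate above: a minimal standalone sketch, not LLVM code, where isLegalAddImm is a simplified stand-in for AArch64's TLI.isLegalAddImmediate (12-bit immediate, optionally shifted left by 12), and the store count and base offset are made up for the example.

    // Sketch of the savings estimate in tryOptimizeConsecStores (illustrative).
    #include <cstdint>
    #include <cstdio>

    // Simplified stand-in for TLI.isLegalAddImmediate on AArch64.
    static bool isLegalAddImm(int64_t Imm) {
      return (Imm >= 0 && Imm < 4096) ||
             (Imm >= 0 && (Imm & 0xfff) == 0 && (Imm >> 12) < 4096);
    }

    int main() {
      unsigned NumStores = 5;    // e.g. five consecutive <2 x s64> stores
      int64_t BaseOffset = 4128; // offset of the first store from the base pointer
      unsigned NumPairs = NumStores / 2;           // STPs we hope to form
      unsigned Insts = NumPairs + (NumStores % 2); // plus one unpaired STR
      if (!isLegalAddImm(BaseOffset))
        ++Insts; // an extra instruction to materialize the rebased pointer
      std::printf("expected savings: %d\n", (int)NumStores - (int)Insts); // prints 1
      return 0;
    }

With five stores and a base offset of 4128 (not encodable as an ADD immediate) the rewrite still nets one instruction; with only three such stores the estimate drops to zero and tryOptimizeConsecStores bails out.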
    | 560 | +
    | 561 | +static cl::opt<bool>
    | 562 | +    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
    | 563 | +                              cl::init(true), cl::Hidden,
    | 564 | +                              cl::desc("Enable consecutive memop optimization "
    | 565 | +                                       "in AArch64PostLegalizerCombiner"));
    | 566 | +
    | 567 | +bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
    | 568 | +    MachineFunction &MF, CSEMIRBuilder &MIB) {
    | 569 | +  // This combine needs to run after all reassociations/folds on pointer
    | 570 | +  // addressing have been done, specifically those that combine two G_PTR_ADDs
    | 571 | +  // with constant offsets into a single G_PTR_ADD with a combined offset.
    | 572 | +  // The goal of this optimization is to undo that combine in the case where
    | 573 | +  // doing so has prevented the formation of pair stores due to illegal
    | 574 | +  // addressing modes of STP. The reason that we do it here is because
    | 575 | +  // it's much easier to undo the transformation of a series of consecutive
    | 576 | +  // mem ops than it is to detect when doing it would be a bad idea by looking
    | 577 | +  // at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
    | 578 | +  //
    | 579 | +  // An example:
    | 580 | +  //   G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
    | 581 | +  //   %off1:_(s64) = G_CONSTANT i64 4128
    | 582 | +  //   %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
    | 583 | +  //   G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
    | 584 | +  //   %off2:_(s64) = G_CONSTANT i64 4144
    | 585 | +  //   %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
    | 586 | +  //   G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
    | 587 | +  //   %off3:_(s64) = G_CONSTANT i64 4160
    | 588 | +  //   %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
    | 589 | +  //   G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
    | 590 | +  bool Changed = false;
    | 591 | +  auto &MRI = MF.getRegInfo();
    | 592 | +
    | 593 | +  if (!EnableConsecutiveMemOpOpt)
    | 594 | +    return Changed;
    | 595 | +
    | 596 | +  SmallVector<StoreInfo, 8> Stores;
    | 597 | +  // If we see a load, then we keep track of any values defined by it.
    | 598 | +  // In the following example, STP formation will fail anyway because
    | 599 | +  // the latter store is using a load result that appears after the
    | 600 | +  // prior store. In this situation, if we factor out the offset then
    | 601 | +  // we increase code size for no benefit.
    | 602 | +  //   G_STORE %v1:_(s64), %base:_(p0) :: (store (s64))
    | 603 | +  //   %v2:_(s64) = G_LOAD %ldptr:_(p0) :: (load (s64))
    | 604 | +  //   G_STORE %v2:_(s64), %base:_(p0) :: (store (s64))
    | 605 | +  SmallVector<Register> LoadValsSinceLastStore;
    | 606 | +
    | 607 | +  auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
    | 608 | +    // Check if this store is consecutive to the last one.
    | 609 | +    if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
    | 610 | +        (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
    | 611 | +         New.Offset) ||
    | 612 | +        Last.StoredType != New.StoredType)
    | 613 | +      return false;
    | 614 | +
    | 615 | +    // Check if this store is using a load result that appears after the
    | 616 | +    // last store. If so, bail out.
    | 617 | +    if (any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
    | 618 | +          return New.St->getValueReg() == LoadVal;
    | 619 | +        }))
    | 620 | +      return false;
    | 621 | +
    | 622 | +    // Check if the current offset would be too large for STP.
    | 623 | +    // If not, then STP formation should be able to handle it, so we don't
    | 624 | +    // need to do anything.
    | 625 | +    int64_t MaxLegalOffset;
    | 626 | +    switch (New.StoredType.getSizeInBits()) {
    | 627 | +    case 32:
    | 628 | +      MaxLegalOffset = 252;
    | 629 | +      break;
    | 630 | +    case 64:
    | 631 | +      MaxLegalOffset = 504;
    | 632 | +      break;
    | 633 | +    case 128:
    | 634 | +      MaxLegalOffset = 1008;
    | 635 | +      break;
    | 636 | +    default:
    | 637 | +      llvm_unreachable("Unexpected stored type size");
    | 638 | +    }
    | 639 | +    if (New.Offset < MaxLegalOffset)
    | 640 | +      return false;
    | 641 | +
    | 642 | +    // If factoring it out still wouldn't help then don't bother.
    | 643 | +    return New.Offset - Stores[0].Offset <= MaxLegalOffset;
    | 644 | +  };
    | 645 | +
    | 646 | +  auto resetState = [&]() {
    | 647 | +    Stores.clear();
    | 648 | +    LoadValsSinceLastStore.clear();
    | 649 | +  };
    | 650 | +
    | 651 | +  for (auto &MBB : MF) {
    | 652 | +    // We're looking inside a single BB at a time since the memset pattern
    | 653 | +    // should only be in a single block.
    | 654 | +    resetState();
    | 655 | +    for (auto &MI : MBB) {
    | 656 | +      if (auto *St = dyn_cast<GStore>(&MI)) {
    | 657 | +        Register PtrBaseReg;
    | 658 | +        APInt Offset;
    | 659 | +        LLT StoredValTy = MRI.getType(St->getValueReg());
    | 660 | +        unsigned ValSize = StoredValTy.getSizeInBits();
    | 661 | +        if (ValSize < 32 || ValSize != St->getMMO().getSizeInBits())
    | 662 | +          continue;
    | 663 | +
    | 664 | +        Register PtrReg = St->getPointerReg();
    | 665 | +        if (mi_match(
    | 666 | +                PtrReg, MRI,
    | 667 | +                m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
    | 668 | +          GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(PtrReg));
    | 669 | +          StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};
    | 670 | +
    | 671 | +          if (Stores.empty()) {
    | 672 | +            Stores.push_back(New);
    | 673 | +            continue;
    | 674 | +          }
    | 675 | +
    | 676 | +          // Check if this store is a valid continuation of the sequence.
    | 677 | +          auto &Last = Stores.back();
    | 678 | +          if (storeIsValid(Last, New)) {
    | 679 | +            Stores.push_back(New);
    | 680 | +            LoadValsSinceLastStore.clear(); // Reset the load value tracking.
    | 681 | +          } else {
    | 682 | +            // The store isn't valid to consider for the prior sequence,
    | 683 | +            // so try to optimize what we have so far and start a new sequence.
    | 684 | +            Changed |= tryOptimizeConsecStores(Stores, MIB);
    | 685 | +            resetState();
    | 686 | +            Stores.push_back(New);
    | 687 | +          }
    | 688 | +        }
    | 689 | +      } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
    | 690 | +        LoadValsSinceLastStore.push_back(Ld->getDstReg());
    | 691 | +      }
    | 692 | +    }
    | 693 | +    Changed |= tryOptimizeConsecStores(Stores, MIB);
    | 694 | +    resetState();
    | 695 | +  }
    | 696 | +
    | 697 | +  return Changed;
496 | 698 | }
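
The MaxLegalOffset values in storeIsValid follow from STP's addressing mode: a signed 7-bit immediate scaled by the access size, so the largest positive offset is 63 times the store size in bytes. A small standalone check of that arithmetic (illustrative only, not part of the patch):

    #include <cstdio>

    int main() {
      // Assumes STP's scaled signed 7-bit immediate: the maximum positive
      // offset is 63 * (access size in bytes).
      const int SizesInBits[] = {32, 64, 128};
      for (int Bits : SizesInBits)
        std::printf("max STP offset for %3d-bit stores: %d\n", Bits, 63 * (Bits / 8));
      // Prints 252, 504 and 1008, matching the switch in storeIsValid.
      return 0;
    }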
497 | 699 |
498 | 700 | char AArch64PostLegalizerCombiner::ID = 0;
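
As a rough source-level illustration of the pattern the patch targets, here is a hypothetical reproducer; the type and names are invented and the exact lowering depends on how the memset is expanded, but the shape matches the MIR example in the comment above.

    // Hypothetical reproducer, not taken from the patch or its tests. Zeroing
    // Buf needs several 16-byte stores at offsets 4096..4144 from B, all past
    // the 1008-byte STP limit for 128-bit stores, so they previously could not
    // be paired.
    #include <cstring>

    struct Big {
      char Pad[4096]; // pushes Buf past any legal STP immediate offset
      char Buf[64];
    };

    void zeroTail(Big *B) {
      std::memset(B->Buf, 0, sizeof(B->Buf)); // typically a few wide consecutive stores
    }

With the new combine, such stores are rewritten against a single rebased pointer (roughly B plus 4096) with small offsets 0, 16, 32 and 48, which the later store-pairing pass can then fold into STPs; the behavior can be disabled via the aarch64-postlegalizer-consecutive-memops option added above.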