46#define DEBUG_TYPE "hardware-loops"
48#define HW_LOOPS_NAME "Hardware Loop Insertion"
54 cl::desc(
"Force hardware loops intrinsics to be inserted"));
59 cl::desc(
"Force hardware loop counter to be updated through a phi"));
63 cl::desc(
"Force allowance of nested hardware loops"));
67 cl::desc(
"Set the loop decrement value"));
71 cl::desc(
"Set the loop counter bitwidth"));
76 cl::desc(
"Force generation of loop guard intrinsic"));
78STATISTIC(NumHWLoops,
"Number of loops converted to hardware loops");
83 dbgs() <<
"HWLoops: " << DebugMsg;
94 Value *CodeRegion = L->getHeader();
98 CodeRegion =
I->getParent();
101 if (
I->getDebugLoc())
102 DL =
I->getDebugLoc();
106 R <<
"hardware-loop not created: ";
144 class HardwareLoopsImpl {
151 : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT),
DL(
DL),
TTI(
TTI),
152 TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { }
174 bool MadeChange =
false;
179 Value *InitLoopCount();
182 Value *InsertIterationSetup(
Value *LoopCountInit);
185 void InsertLoopDec();
197 void UpdateBranch(
Value *EltsRem);
204 SE(SE),
DL(
DL), ORE(ORE), Opts(Opts),
L(
Info.
L),
M(
L->getHeader()->getModule()),
205 ExitCount(
Info.ExitCount),
206 CountType(
Info.CountType),
207 ExitBranch(
Info.ExitBranch),
209 UsePHICounter(
Info.CounterInReg),
210 UseLoopGuard(
Info.PerformEntryTest) { }
221 const SCEV *ExitCount =
nullptr;
222 Type *CountType =
nullptr;
225 bool UsePHICounter =
false;
226 bool UseLoopGuard =
false;
231char HardwareLoopsLegacy::ID = 0;
233bool HardwareLoopsLegacy::runOnFunction(
Function &
F) {
239 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
240 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
241 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
242 auto &
TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
243 auto &
DL =
F.getDataLayout();
244 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
245 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
246 auto *TLI = TLIP ? &TLIP->getTLI(
F) :
nullptr;
247 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
248 bool PreserveLCSSA = mustPreserveAnalysisID(
LCSSAID);
264 HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT,
DL,
TTI, TLI, AC, ORE,
278 auto &
DL =
F.getDataLayout();
280 HardwareLoopsImpl Impl(SE, LI,
true, DT,
DL,
TTI, TLI, AC, ORE, Opts);
281 bool Changed = Impl.run(
F);
293bool HardwareLoopsImpl::run(
Function &
F) {
296 if (L->isOutermost())
297 TryConvertLoop(L, Ctx);
305 bool AnyChanged =
false;
307 AnyChanged |= TryConvertLoop(SL, Ctx);
309 reportHWLoopFailure(
"nested hardware-loops not supported",
"HWLoopNested",
314 LLVM_DEBUG(
dbgs() <<
"HWLoops: Loop " <<
L->getHeader()->getName() <<
"\n");
317 if (!HWLoopInfo.canAnalyze(LI)) {
318 reportHWLoopFailure(
"cannot analyze loop, irreducible control flow",
319 "HWLoopCannotAnalyze", ORE, L);
325 reportHWLoopFailure(
"it's not profitable to create a hardware-loop",
326 "HWLoopNotProfitable", ORE, L);
336 HWLoopInfo.LoopDecrement =
337 ConstantInt::get(HWLoopInfo.CountType, Opts.
Decrement.value());
339 MadeChange |= TryConvertLoop(HWLoopInfo);
340 return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.
ForceNested);
346 LLVM_DEBUG(
dbgs() <<
"HWLoops: Try to convert profitable loop: " << *L);
353 reportHWLoopFailure(
"loop is not a candidate",
"HWLoopNoCandidate", ORE, L);
359 "Hardware Loop must have set exit info.");
369 HardwareLoop HWLoop(HWLoopInfo, SE,
DL, ORE, Opts);
375void HardwareLoop::Create() {
378 Value *LoopCountInit = InitLoopCount();
379 if (!LoopCountInit) {
380 reportHWLoopFailure(
"could not safely create a loop count expression",
381 "HWLoopNotSafe", ORE, L);
385 Value *
Setup = InsertIterationSetup(LoopCountInit);
387 if (UsePHICounter || Opts.
ForcePhi) {
388 Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
389 Value *EltsRem = InsertPHICounter(Setup, LoopDec);
391 UpdateBranch(LoopDec);
397 for (
auto *
I :
L->blocks())
402 BasicBlock *Preheader = L->getLoopPreheader();
411 if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
416 auto ICmp = cast<ICmpInst>(BI->getCondition());
418 if (!ICmp->isEquality())
421 auto IsCompareZero = [](
ICmpInst *ICmp,
Value *Count,
unsigned OpIdx) {
422 if (
auto *Const = dyn_cast<ConstantInt>(ICmp->
getOperand(OpIdx)))
423 return Const->isZero() && ICmp->
getOperand(OpIdx ^ 1) == Count;
428 Value *CountBefZext =
429 isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) :
nullptr;
431 if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
432 !IsCompareZero(ICmp, CountBefZext, 0) &&
433 !IsCompareZero(ICmp, CountBefZext, 1))
437 if (BI->getSuccessor(SuccIdx) != Preheader)
443Value *HardwareLoop::InitLoopCount() {
444 LLVM_DEBUG(
dbgs() <<
"HWLoops: Initialising loop counter value:\n");
449 if (!ExitCount->getType()->isPointerTy() &&
450 ExitCount->getType() != CountType)
451 ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
453 ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
460 SE.getZero(ExitCount->getType()))) {
465 UseLoopGuard =
false;
473 if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->
getTerminator()))
474 UseLoopGuard =
false;
479 if (!SCEVE.isSafeToExpandAt(ExitCount, BB->
getTerminator())) {
481 << *ExitCount <<
"\n");
485 Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
496 BeginBB = UseLoopGuard ? BB :
L->getLoopPreheader();
498 <<
" - Expanded Count in " << BB->
getName() <<
"\n"
499 <<
" - Will insert set counter intrinsic into: "
500 << BeginBB->getName() <<
"\n");
504Value* HardwareLoop::InsertIterationSetup(
Value *LoopCountInit) {
506 if (BeginBB->getParent()->getAttributes().hasFnAttr(Attribute::StrictFP))
507 Builder.setIsFPConstrained(
true);
509 bool UsePhi = UsePHICounter || Opts.
ForcePhi;
511 ? (UsePhi ? Intrinsic::test_start_loop_iterations
512 : Intrinsic::test_set_loop_iterations)
513 : (UsePhi ? Intrinsic::start_loop_iterations
514 : Intrinsic::set_loop_iterations);
516 Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit);
520 assert((isa<BranchInst>(BeginBB->getTerminator()) &&
521 cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
522 "Expected conditional branch");
525 UsePhi ? Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
526 auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
527 LoopGuard->setCondition(SetCount);
528 if (LoopGuard->getSuccessor(0) !=
L->getLoopPreheader())
529 LoopGuard->swapSuccessors();
531 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop counter: " << *LoopSetup
533 if (UsePhi && UseLoopGuard)
534 LoopSetup = Builder.CreateExtractValue(LoopSetup, 0);
535 return !UsePhi ? LoopCountInit : LoopSetup;
538void HardwareLoop::InsertLoopDec() {
540 if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr(
541 Attribute::StrictFP))
542 CondBuilder.setIsFPConstrained(
true);
548 Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
549 Value *OldCond = ExitBranch->getCondition();
550 ExitBranch->setCondition(NewCond);
553 if (!
L->contains(ExitBranch->getSuccessor(0)))
554 ExitBranch->swapSuccessors();
560 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop dec: " << *NewCond <<
"\n");
565 if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr(
566 Attribute::StrictFP))
567 CondBuilder.setIsFPConstrained(
true);
573 Value *
Call = CondBuilder.CreateCall(DecFunc, Ops);
575 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop dec: " << *Call <<
"\n");
576 return cast<Instruction>(Call);
583 IRBuilder<> Builder(Header, Header->getFirstNonPHIIt());
586 Index->addIncoming(EltsRem, Latch);
591void HardwareLoop::UpdateBranch(
Value *EltsRem) {
594 CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->
getType(), 0));
595 Value *OldCond = ExitBranch->getCondition();
596 ExitBranch->setCondition(NewCond);
599 if (!
L->contains(ExitBranch->getSuccessor(0)))
600 ExitBranch->swapSuccessors();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< bool > ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false), cl::desc("Force allowance of nested hardware loops"))
static cl::opt< unsigned > CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32), cl::desc("Set the loop counter bitwidth"))
static OptimizationRemarkAnalysis createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I)
static cl::opt< bool > ForceGuardLoopEntry("force-hardware-loop-guard", cl::Hidden, cl::init(false), cl::desc("Force generation of loop guard intrinsic"))
static void debugHWLoopFailure(const StringRef DebugMsg, Instruction *I)
static cl::opt< unsigned > LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1), cl::desc("Set the loop decrement value"))
static cl::opt< bool > ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false), cl::desc("Force hardware loops intrinsics to be inserted"))
static bool CanGenerateTest(Loop *L, Value *Count)
static cl::opt< bool > ForceHardwareLoopPHI("force-hardware-loop-phi", cl::Hidden, cl::init(false), cl::desc("Force hardware loop counter to be updated through a phi"))
Defines an IR pass for the creation of hardware loops.
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Function * getParent() const
Return the enclosing method, or null if none.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Conditional or Unconditional Branch instruction.
Analysis pass which computes BranchProbabilityInfo.
Legacy analysis pass which computes BranchProbabilityInfo.
Predicate getPredicate() const
Return the predicate for this instruction.
A parsed version of the target data layout string, and methods for querying it.
Analysis pass which computes a DominatorTree.
Legacy analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
This is an important class for using LLVM in a threaded context.
Analysis pass that exposes the LoopInfo for a function.
The legacy pass manager's analysis pass to compute loop information.
Represents a single loop in the control flow graph.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserve()
Mark an analysis as preserved.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
StringRef getName() const
Return a constant reference to the value's name.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
void initializeHardwareLoopsLegacyPass(PassRegistry &)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionPass * createHardwareLoopsLegacyPass()
Create Hardware Loop pass.
Attributes of a target dependent hardware loop.
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
std::optional< bool > Force
HardwareLoopOptions & setForceNested(bool Force)
std::optional< bool > ForceGuard
std::optional< unsigned > Decrement
HardwareLoopOptions & setDecrement(unsigned Count)
HardwareLoopOptions & setForceGuard(bool Force)
HardwareLoopOptions & setForce(bool Force)
HardwareLoopOptions & setCounterBitwidth(unsigned Width)
std::optional< unsigned > Bitwidth
HardwareLoopOptions & setForcePhi(bool Force)
std::optional< bool > ForcePhi
std::optional< bool > ForceNested
bool getForceNested() const