44#define DEBUG_TYPE "hardware-loops"
46#define HW_LOOPS_NAME "Hardware Loop Insertion"
52 cl::desc(
"Force hardware loops intrinsics to be inserted"));
57 cl::desc(
"Force hardware loop counter to be updated through a phi"));
61 cl::desc(
"Force allowance of nested hardware loops"));
65 cl::desc(
"Set the loop decrement value"));
69 cl::desc(
"Set the loop counter bitwidth"));
74 cl::desc(
"Force generation of loop guard intrinsic"));
76STATISTIC(NumHWLoops,
"Number of loops converted to hardware loops");
81 dbgs() <<
"HWLoops: " << DebugMsg;
96 CodeRegion =
I->getParent();
100 DL =
I->getDebugLoc();
104 R <<
"hardware-loop not created: ";
140 bool TryConvertLoop(
Loop *L);
153 bool PreserveLCSSA =
false;
157 bool MadeChange =
false;
162 Value *InitLoopCount();
165 Value *InsertIterationSetup(
Value *LoopCountInit);
168 void InsertLoopDec();
180 void UpdateBranch(
Value *EltsRem);
186 SE(SE),
DL(
DL), ORE(ORE), L(
Info.L),
M(L->getHeader()->getModule()),
187 ExitCount(
Info.ExitCount),
188 CountType(
Info.CountType),
189 ExitBranch(
Info.ExitBranch),
191 UsePHICounter(
Info.CounterInReg),
192 UseLoopGuard(
Info.PerformEntryTest) { }
202 const SCEV *ExitCount =
nullptr;
203 Type *CountType =
nullptr;
206 bool UsePHICounter =
false;
207 bool UseLoopGuard =
false;
212char HardwareLoops::ID = 0;
214bool HardwareLoops::runOnFunction(
Function &
F) {
220 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
221 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
222 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
223 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
224 DL = &
F.getParent()->getDataLayout();
225 ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
226 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
227 LibInfo = TLIP ? &TLIP->getTLI(
F) :
nullptr;
228 PreserveLCSSA = mustPreserveAnalysisID(
LCSSAID);
229 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
233 if (L->isOutermost())
241bool HardwareLoops::TryConvertLoop(
Loop *L) {
243 bool AnyChanged =
false;
245 AnyChanged |= TryConvertLoop(SL);
247 reportHWLoopFailure(
"nested hardware-loops not supported",
"HWLoopNested",
252 LLVM_DEBUG(
dbgs() <<
"HWLoops: Loop " << L->getHeader()->getName() <<
"\n");
255 if (!HWLoopInfo.canAnalyze(*LI)) {
256 reportHWLoopFailure(
"cannot analyze loop, irreducible control flow",
257 "HWLoopCannotAnalyze", ORE, L);
263 reportHWLoopFailure(
"it's not profitable to create a hardware-loop",
264 "HWLoopNotProfitable", ORE, L);
270 HWLoopInfo.CountType =
274 HWLoopInfo.LoopDecrement =
277 MadeChange |= TryConvertLoop(HWLoopInfo);
283 Loop *L = HWLoopInfo.
L;
284 LLVM_DEBUG(
dbgs() <<
"HWLoops: Try to convert profitable loop: " << *L);
291 reportHWLoopFailure(
"loop is not a candidate",
"HWLoopNoCandidate", ORE, L);
297 "Hardware Loop must have set exit info.");
307 HardwareLoop HWLoop(HWLoopInfo, *SE, *
DL, ORE);
313void HardwareLoop::Create() {
316 Value *LoopCountInit = InitLoopCount();
317 if (!LoopCountInit) {
318 reportHWLoopFailure(
"could not safely create a loop count expression",
319 "HWLoopNotSafe", ORE, L);
323 Value *
Setup = InsertIterationSetup(LoopCountInit);
326 Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
327 Value *EltsRem = InsertPHICounter(Setup, LoopDec);
329 UpdateBranch(LoopDec);
349 if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
354 auto ICmp = cast<ICmpInst>(BI->getCondition());
356 if (!ICmp->isEquality())
359 auto IsCompareZero = [](
ICmpInst *ICmp,
Value *Count,
unsigned OpIdx) {
360 if (
auto *Const = dyn_cast<ConstantInt>(ICmp->
getOperand(OpIdx)))
361 return Const->isZero() && ICmp->
getOperand(OpIdx ^ 1) == Count;
366 Value *CountBefZext =
367 isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) :
nullptr;
369 if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
370 !IsCompareZero(ICmp, CountBefZext, 0) &&
371 !IsCompareZero(ICmp, CountBefZext, 1))
374 unsigned SuccIdx = ICmp->
getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
375 if (BI->getSuccessor(SuccIdx) != Preheader)
381Value *HardwareLoop::InitLoopCount() {
382 LLVM_DEBUG(
dbgs() <<
"HWLoops: Initialising loop counter value:\n");
387 if (!ExitCount->getType()->isPointerTy() &&
388 ExitCount->getType() != CountType)
389 ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
391 ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
397 if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
398 SE.getZero(ExitCount->getType()))) {
402 UseLoopGuard =
false;
410 if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->
getTerminator()))
411 UseLoopGuard =
false;
416 if (!SCEVE.isSafeToExpandAt(ExitCount, BB->
getTerminator())) {
418 << *ExitCount <<
"\n");
422 Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
435 <<
" - Expanded Count in " << BB->
getName() <<
"\n"
436 <<
" - Will insert set counter intrinsic into: "
437 << BeginBB->getName() <<
"\n");
441Value* HardwareLoop::InsertIterationSetup(
Value *LoopCountInit) {
446 ? (UsePhi ? Intrinsic::test_start_loop_iterations
447 : Intrinsic::test_set_loop_iterations)
448 : (UsePhi ? Intrinsic::start_loop_iterations
449 : Intrinsic::set_loop_iterations);
451 Value *LoopSetup =
Builder.CreateCall(LoopIter, LoopCountInit);
455 assert((isa<BranchInst>(BeginBB->getTerminator()) &&
456 cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
457 "Expected conditional branch");
460 UsePhi ?
Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
461 auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
462 LoopGuard->setCondition(SetCount);
464 LoopGuard->swapSuccessors();
466 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop counter: " << *LoopSetup
468 if (UsePhi && UseLoopGuard)
469 LoopSetup =
Builder.CreateExtractValue(LoopSetup, 0);
470 return !UsePhi ? LoopCountInit : LoopSetup;
473void HardwareLoop::InsertLoopDec() {
480 Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
481 Value *OldCond = ExitBranch->getCondition();
482 ExitBranch->setCondition(NewCond);
485 if (!L->
contains(ExitBranch->getSuccessor(0)))
486 ExitBranch->swapSuccessors();
492 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop dec: " << *NewCond <<
"\n");
502 Value *
Call = CondBuilder.CreateCall(DecFunc, Ops);
504 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop dec: " << *Call <<
"\n");
505 return cast<Instruction>(Call);
514 Index->addIncoming(NumElts, Preheader);
515 Index->addIncoming(EltsRem, Latch);
520void HardwareLoop::UpdateBranch(
Value *EltsRem) {
524 Value *OldCond = ExitBranch->getCondition();
525 ExitBranch->setCondition(NewCond);
528 if (!L->
contains(ExitBranch->getSuccessor(0)))
529 ExitBranch->swapSuccessors();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< bool > ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false), cl::desc("Force allowance of nested hardware loops"))
static cl::opt< unsigned > CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32), cl::desc("Set the loop counter bitwidth"))
static OptimizationRemarkAnalysis createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I)
static cl::opt< bool > ForceGuardLoopEntry("force-hardware-loop-guard", cl::Hidden, cl::init(false), cl::desc("Force generation of loop guard intrinsic"))
static void debugHWLoopFailure(const StringRef DebugMsg, Instruction *I)
static cl::opt< unsigned > LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1), cl::desc("Set the loop decrement value"))
static cl::opt< bool > ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false), cl::desc("Force hardware loops intrinsics to be inserted"))
static bool CanGenerateTest(Loop *L, Value *Count)
static cl::opt< bool > ForceHardwareLoopPHI("force-hardware-loop-phi", cl::Hidden, cl::init(false), cl::desc("Force hardware loop counter to be updated through a phi"))
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Function * getParent() const
Return the enclosing method, or null if none.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Conditional or Unconditional Branch instruction.
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
A parsed version of the target data layout string, and methods for querying it.
Legacy analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
The legacy pass manager's analysis pass to compute loop information.
Represents a single loop in the control flow graph.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
A Module instance is used to store all the information related to an LLVM module.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
StringRef - Represent a constant reference to a string, i.e.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
StringRef getName() const
Return a constant reference to the value's name.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
void initializeHardwareLoopsPass(PassRegistry &)
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionPass * createHardwareLoopsPass()
Create Hardware Loop pass.
Attributes of a target dependent hardware loop.
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)