45#define DEBUG_TYPE "hardware-loops"
47#define HW_LOOPS_NAME "Hardware Loop Insertion"
53 cl::desc(
"Force hardware loops intrinsics to be inserted"));
58 cl::desc(
"Force hardware loop counter to be updated through a phi"));
62 cl::desc(
"Force allowance of nested hardware loops"));
66 cl::desc(
"Set the loop decrement value"));
70 cl::desc(
"Set the loop counter bitwidth"));
75 cl::desc(
"Force generation of loop guard intrinsic"));
77STATISTIC(NumHWLoops,
"Number of loops converted to hardware loops");
82 dbgs() <<
"HWLoops: " << DebugMsg;
93 Value *CodeRegion = L->getHeader();
97 CodeRegion =
I->getParent();
100 if (
I->getDebugLoc())
101 DL =
I->getDebugLoc();
105 R <<
"hardware-loop not created: ";
143 class HardwareLoopsImpl {
150 : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT),
DL(
DL),
TTI(
TTI),
151 TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { }
173 bool MadeChange =
false;
178 Value *InitLoopCount();
181 Value *InsertIterationSetup(
Value *LoopCountInit);
184 void InsertLoopDec();
196 void UpdateBranch(
Value *EltsRem);
203 SE(SE),
DL(
DL), ORE(ORE), Opts(Opts),
L(
Info.
L),
M(
L->getHeader()->getModule()),
204 ExitCount(
Info.ExitCount),
205 CountType(
Info.CountType),
206 ExitBranch(
Info.ExitBranch),
208 UsePHICounter(
Info.CounterInReg),
209 UseLoopGuard(
Info.PerformEntryTest) { }
220 const SCEV *ExitCount =
nullptr;
221 Type *CountType =
nullptr;
224 bool UsePHICounter =
false;
225 bool UseLoopGuard =
false;
230char HardwareLoopsLegacy::ID = 0;
232bool HardwareLoopsLegacy::runOnFunction(
Function &
F) {
238 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
239 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
240 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
241 auto &
TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
242 auto &
DL =
F.getDataLayout();
243 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
244 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
245 auto *TLI = TLIP ? &TLIP->getTLI(
F) :
nullptr;
246 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
247 bool PreserveLCSSA = mustPreserveAnalysisID(
LCSSAID);
263 HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT,
DL,
TTI, TLI, AC, ORE,
277 auto &
DL =
F.getDataLayout();
279 HardwareLoopsImpl Impl(SE, LI,
true, DT,
DL,
TTI, TLI, AC, ORE, Opts);
280 bool Changed = Impl.run(
F);
292bool HardwareLoopsImpl::run(
Function &
F) {
295 if (L->isOutermost())
296 TryConvertLoop(L, Ctx);
304 bool AnyChanged =
false;
306 AnyChanged |= TryConvertLoop(SL, Ctx);
308 reportHWLoopFailure(
"nested hardware-loops not supported",
"HWLoopNested",
313 LLVM_DEBUG(
dbgs() <<
"HWLoops: Loop " <<
L->getHeader()->getName() <<
"\n");
316 if (!HWLoopInfo.canAnalyze(LI)) {
317 reportHWLoopFailure(
"cannot analyze loop, irreducible control flow",
318 "HWLoopCannotAnalyze", ORE, L);
324 reportHWLoopFailure(
"it's not profitable to create a hardware-loop",
325 "HWLoopNotProfitable", ORE, L);
335 HWLoopInfo.LoopDecrement =
336 ConstantInt::get(HWLoopInfo.CountType, Opts.
Decrement.value());
338 MadeChange |= TryConvertLoop(HWLoopInfo);
339 return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.
ForceNested);
345 LLVM_DEBUG(
dbgs() <<
"HWLoops: Try to convert profitable loop: " << *L);
352 reportHWLoopFailure(
"loop is not a candidate",
"HWLoopNoCandidate", ORE, L);
358 "Hardware Loop must have set exit info.");
368 HardwareLoop HWLoop(HWLoopInfo, SE,
DL, ORE, Opts);
374void HardwareLoop::Create() {
377 Value *LoopCountInit = InitLoopCount();
378 if (!LoopCountInit) {
379 reportHWLoopFailure(
"could not safely create a loop count expression",
380 "HWLoopNotSafe", ORE, L);
384 Value *
Setup = InsertIterationSetup(LoopCountInit);
386 if (UsePHICounter || Opts.
ForcePhi) {
387 Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
388 Value *EltsRem = InsertPHICounter(Setup, LoopDec);
390 UpdateBranch(LoopDec);
396 for (
auto *
I :
L->blocks())
401 BasicBlock *Preheader = L->getLoopPreheader();
410 if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
415 auto ICmp = cast<ICmpInst>(BI->getCondition());
417 if (!ICmp->isEquality())
420 auto IsCompareZero = [](
ICmpInst *ICmp,
Value *Count,
unsigned OpIdx) {
421 if (
auto *Const = dyn_cast<ConstantInt>(ICmp->
getOperand(OpIdx)))
422 return Const->isZero() && ICmp->
getOperand(OpIdx ^ 1) == Count;
427 Value *CountBefZext =
428 isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) :
nullptr;
430 if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
431 !IsCompareZero(ICmp, CountBefZext, 0) &&
432 !IsCompareZero(ICmp, CountBefZext, 1))
436 if (BI->getSuccessor(SuccIdx) != Preheader)
442Value *HardwareLoop::InitLoopCount() {
443 LLVM_DEBUG(
dbgs() <<
"HWLoops: Initialising loop counter value:\n");
448 if (!ExitCount->getType()->isPointerTy() &&
449 ExitCount->getType() != CountType)
450 ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
452 ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
459 SE.getZero(ExitCount->getType()))) {
464 UseLoopGuard =
false;
472 if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->
getTerminator()))
473 UseLoopGuard =
false;
478 if (!SCEVE.isSafeToExpandAt(ExitCount, BB->
getTerminator())) {
480 << *ExitCount <<
"\n");
484 Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
495 BeginBB = UseLoopGuard ? BB :
L->getLoopPreheader();
497 <<
" - Expanded Count in " << BB->
getName() <<
"\n"
498 <<
" - Will insert set counter intrinsic into: "
499 << BeginBB->getName() <<
"\n");
503Value* HardwareLoop::InsertIterationSetup(
Value *LoopCountInit) {
505 if (BeginBB->getParent()->getAttributes().hasFnAttr(Attribute::StrictFP))
506 Builder.setIsFPConstrained(
true);
508 bool UsePhi = UsePHICounter || Opts.
ForcePhi;
510 ? (UsePhi ? Intrinsic::test_start_loop_iterations
511 : Intrinsic::test_set_loop_iterations)
512 : (UsePhi ? Intrinsic::start_loop_iterations
513 : Intrinsic::set_loop_iterations);
514 Value *LoopSetup = Builder.CreateIntrinsic(
ID, Ty, LoopCountInit);
518 assert((isa<BranchInst>(BeginBB->getTerminator()) &&
519 cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
520 "Expected conditional branch");
523 UsePhi ? Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
524 auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
525 LoopGuard->setCondition(SetCount);
526 if (LoopGuard->getSuccessor(0) !=
L->getLoopPreheader())
527 LoopGuard->swapSuccessors();
529 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop counter: " << *LoopSetup
531 if (UsePhi && UseLoopGuard)
532 LoopSetup = Builder.CreateExtractValue(LoopSetup, 0);
533 return !UsePhi ? LoopCountInit : LoopSetup;
536void HardwareLoop::InsertLoopDec() {
538 if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr(
539 Attribute::StrictFP))
540 CondBuilder.setIsFPConstrained(
true);
543 Value *NewCond = CondBuilder.CreateIntrinsic(Intrinsic::loop_decrement,
545 Value *OldCond = ExitBranch->getCondition();
546 ExitBranch->setCondition(NewCond);
549 if (!
L->contains(ExitBranch->getSuccessor(0)))
550 ExitBranch->swapSuccessors();
556 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop dec: " << *NewCond <<
"\n");
561 if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr(
562 Attribute::StrictFP))
563 CondBuilder.setIsFPConstrained(
true);
566 Value *
Call = CondBuilder.CreateIntrinsic(Intrinsic::loop_decrement_reg,
569 LLVM_DEBUG(
dbgs() <<
"HWLoops: Inserted loop dec: " << *Call <<
"\n");
570 return cast<Instruction>(Call);
577 IRBuilder<> Builder(Header, Header->getFirstNonPHIIt());
579 Index->addIncoming(NumElts, Preheader);
580 Index->addIncoming(EltsRem, Latch);
585void HardwareLoop::UpdateBranch(
Value *EltsRem) {
588 CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->
getType(), 0));
589 Value *OldCond = ExitBranch->getCondition();
590 ExitBranch->setCondition(NewCond);
593 if (!
L->contains(ExitBranch->getSuccessor(0)))
594 ExitBranch->swapSuccessors();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< bool > ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false), cl::desc("Force allowance of nested hardware loops"))
static cl::opt< unsigned > CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32), cl::desc("Set the loop counter bitwidth"))
static OptimizationRemarkAnalysis createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I)
static cl::opt< bool > ForceGuardLoopEntry("force-hardware-loop-guard", cl::Hidden, cl::init(false), cl::desc("Force generation of loop guard intrinsic"))
static void debugHWLoopFailure(const StringRef DebugMsg, Instruction *I)
static cl::opt< unsigned > LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1), cl::desc("Set the loop decrement value"))
static cl::opt< bool > ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false), cl::desc("Force hardware loops intrinsics to be inserted"))
static bool CanGenerateTest(Loop *L, Value *Count)
static cl::opt< bool > ForceHardwareLoopPHI("force-hardware-loop-phi", cl::Hidden, cl::init(false), cl::desc("Force hardware loop counter to be updated through a phi"))
Defines an IR pass for the creation of hardware loops.
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Function * getParent() const
Return the enclosing method, or null if none.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Conditional or Unconditional Branch instruction.
Analysis pass which computes BranchProbabilityInfo.
Legacy analysis pass which computes BranchProbabilityInfo.
Predicate getPredicate() const
Return the predicate for this instruction.
A parsed version of the target data layout string, and methods for querying it.
Analysis pass which computes a DominatorTree.
Legacy analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
This is an important class for using LLVM in a threaded context.
Analysis pass that exposes the LoopInfo for a function.
The legacy pass manager's analysis pass to compute loop information.
Represents a single loop in the control flow graph.
A Module instance is used to store all the information related to an LLVM module.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserve()
Mark an analysis as preserved.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This class represents an analyzed expression in the program.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
StringRef getName() const
Return a constant reference to the value's name.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
void initializeHardwareLoopsLegacyPass(PassRegistry &)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionPass * createHardwareLoopsLegacyPass()
Create Hardware Loop pass.
Attributes of a target dependent hardware loop.
bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
std::optional< bool > Force
HardwareLoopOptions & setForceNested(bool Force)
std::optional< bool > ForceGuard
std::optional< unsigned > Decrement
HardwareLoopOptions & setDecrement(unsigned Count)
HardwareLoopOptions & setForceGuard(bool Force)
HardwareLoopOptions & setForce(bool Force)
HardwareLoopOptions & setCounterBitwidth(unsigned Width)
std::optional< unsigned > Bitwidth
HardwareLoopOptions & setForcePhi(bool Force)
std::optional< bool > ForcePhi
std::optional< bool > ForceNested
bool getForceNested() const