Go to the documentation of this file.
45 #include "llvm/IR/IntrinsicsARM.h"
56 #define DEBUG_TYPE "mve-tail-predication"
57 #define DESC "Transform predicated vector loops to use MVE tail predication"
60 "tail-predication",
cl::desc(
"MVE tail-predication pass options"),
63 "Don't tail-predicate loops"),
65 "enabled-no-reductions",
66 "Enable tail-predication, but not for reduction loops"),
69 "Enable tail-predication, including reduction loops"),
71 "force-enabled-no-reductions",
72 "Enable tail-predication, but not for reduction loops, "
73 "and force this which might be unsafe"),
76 "Enable tail-predication, including reduction loops, "
77 "and force this which might be unsafe")));
82 class MVETailPredication :
public LoopPass {
108 bool TryConvertActiveLaneMask(
Value *TripCount);
122 void RematerializeIterCount();
133 auto &TPC = getAnalysis<TargetPassConfig>();
136 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
137 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
142 if (!
ST->hasMVEIntegerOps() || !
ST->hasV8_1MMainlineOps()) {
152 for (
auto &
I : *
BB) {
153 auto *
Call = dyn_cast<IntrinsicInst>(&
I);
158 if (
ID == Intrinsic::start_loop_iterations ||
159 ID == Intrinsic::test_start_loop_iterations)
160 return cast<IntrinsicInst>(&
I);
177 LLVM_DEBUG(
dbgs() <<
"ARM TP: Running on Loop: " << *L << *Setup <<
"\n");
179 bool Changed = TryConvertActiveLaneMask(
Setup->getArgOperand(0));
201 bool MVETailPredication::IsSafeActiveMask(
IntrinsicInst *ActiveLaneMask,
203 bool ForceTailPredication =
208 bool Changed =
false;
212 auto *
EC= SE->getSCEV(ElemCount);
213 auto *TC = SE->getSCEV(TripCount);
215 cast<FixedVectorType>(ActiveLaneMask->
getType())->getNumElements();
216 if (VectorWidth != 2 && VectorWidth != 4 && VectorWidth != 8 &&
224 if (!SE->isLoopInvariant(EC, L)) {
225 LLVM_DEBUG(
dbgs() <<
"ARM TP: element count must be loop invariant.\n");
229 if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) {
230 ConstantInt *TC = dyn_cast<ConstantInt>(TripCount);
233 "set.loop.iterations\n");
243 (ConstElemCount->
getZExtValue() + VectorWidth - 1) / VectorWidth;
249 LLVM_DEBUG(
dbgs() <<
"ARM TP: inconsistent constant tripcount values: "
250 << TC1 <<
" from set.loop.iterations, and "
251 << TC2 <<
" from get.active.lane.mask\n");
254 }
else if (!ForceTailPredication) {
269 auto *ECPlusVWMinus1 = SE->getAddExpr(EC,
273 auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW);
278 dbgs() <<
"ARM TP: Analysing overflow behaviour for:\n";
279 dbgs() <<
"ARM TP: - TripCount = "; TC->
dump();
280 dbgs() <<
"ARM TP: - ElemCount = ";
EC->dump();
281 dbgs() <<
"ARM TP: - VecWidth = " << VectorWidth <<
"\n";
282 dbgs() <<
"ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump();
298 SE->getMinusSCEV(SE->getBackedgeTakenCount(L),
299 SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW),
300 SE->getNegativeSCEV(VW)),
306 Sub = SE->applyLoopGuards(Sub, L);
308 if (!Sub->isZero()) {
309 LLVM_DEBUG(
dbgs() <<
"ARM TP: possible overflow in sub expression.\n");
320 auto *IVExpr = SE->getSCEV(
IV);
321 auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
324 LLVM_DEBUG(
dbgs() <<
"ARM TP: induction not an add expr: "; IVExpr->dump());
328 if (AddExpr->getLoop() != L) {
332 auto *
Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0));
333 if (!Base || !
Base->isZero()) {
337 auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
339 LLVM_DEBUG(
dbgs() <<
"ARM TP: induction step is not a constant: ";
340 AddExpr->getOperand(1)->
dump());
343 auto StepValue = Step->getValue()->getSExtValue();
344 if (VectorWidth == StepValue)
348 <<
" doesn't match vector width " << VectorWidth <<
"\n");
353 void MVETailPredication::InsertVCTPIntrinsic(
IntrinsicInst *ActiveLaneMask,
358 unsigned VectorWidth =
359 cast<FixedVectorType>(ActiveLaneMask->
getType())->getNumElements();
368 Builder.SetInsertPoint(ActiveLaneMask);
372 switch (VectorWidth) {
375 case 2: VCTPID = Intrinsic::arm_mve_vctp64;
break;
376 case 4: VCTPID = Intrinsic::arm_mve_vctp32;
break;
377 case 8: VCTPID = Intrinsic::arm_mve_vctp16;
break;
378 case 16: VCTPID = Intrinsic::arm_mve_vctp8;
break;
386 Value *Remaining =
Builder.CreateSub(Processed, Factor);
389 << *Processed <<
"\n"
390 <<
"ARM TP: Inserted VCTP: " << *VCTPCall <<
"\n");
393 bool MVETailPredication::TryConvertActiveLaneMask(
Value *TripCount) {
397 if (
auto *Int = dyn_cast<IntrinsicInst>(&
I))
398 if (
Int->getIntrinsicID() == Intrinsic::get_active_lane_mask)
399 ActiveLaneMasks.push_back(Int);
401 if (ActiveLaneMasks.empty())
406 for (
auto *ActiveLaneMask : ActiveLaneMasks) {
408 << *ActiveLaneMask <<
"\n");
410 if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) {
415 InsertVCTPIntrinsic(ActiveLaneMask, TripCount);
419 for (
auto *II : ActiveLaneMasks)
427 return new MVETailPredication();
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
This is an optimization pass for GlobalISel generic memory operations.
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
const Function * getParent() const
Return the enclosing method, or null if none.
Represents a single loop in the control flow graph.
void dump() const
Support for debugging, callable in GDB: V->dump()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The main scalar evolution driver.
The instances of the Type class are immutable: once they are created, they are never changed.
The legacy pass manager's analysis pass to compute loop information.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
LLVM Basic Block Representation.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
This is the shared class of boolean and integer constants.
Represent the analysis usage information of a pass.
iterator_range< block_iterator > blocks() const
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does not have a module.
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Target-Independent Code Generator Pass Configuration Options.
This class represents an analyzed expression in the program.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
initializer< Ty > init(const Ty &Val)
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Primary interface to the complete machine description for the target machine.
A Module instance is used to store all the information related to an LLVM module.
void setPreservesCFG()
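This function should be called by the pass, iff they do not: (1) add or remove basic blocks from the function, or (2) modify terminator instructions in any way.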
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Type * getType() const
All values are typed, get the type of this value.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
bool makeLoopInvariant(Value *V, bool &Changed, Instruction *InsertPt=nullptr, MemorySSAUpdater *MSSAU=nullptr, ScalarEvolution *SE=nullptr) const
If the given value is an instruction inside of the loop and it can be hoisted, do so to make it trivially loop-invariant.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
cl::opt< TailPredication::Mode > EnableTailPredication("tail-predication", cl::desc("MVE tail-predication pass options"), cl::init(TailPredication::Enabled), cl::values(clEnumValN(TailPredication::Disabled, "disabled", "Don't tail-predicate loops"), clEnumValN(TailPredication::EnabledNoReductions, "enabled-no-reductions", "Enable tail-predication, but not for reduction loops"), clEnumValN(TailPredication::Enabled, "enabled", "Enable tail-predication, including reduction loops"), clEnumValN(TailPredication::ForceEnabledNoReductions, "force-enabled-no-reductions", "Enable tail-predication, but not for reduction loops, " "and force this which might be unsafe"), clEnumValN(TailPredication::ForceEnabled, "force-enabled", "Enable tail-predication, including reduction loops, " "and force this which might be unsafe")))
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
BlockT * getHeader() const
Pass * createMVETailPredicationPass()
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
A wrapper class for inspecting calls to intrinsic functions.
Pass interface - Implemented by all 'passes'.
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
static const uint32_t IV[8]
const char LLVMTargetMachineRef TM
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
AnalysisUsage & addRequired()
Value * getOperand(unsigned i) const
LLVM Value Representation.
@ ForceEnabledNoReductions