LLVM 23.0.0git
RISCVCodeGenPrepare.cpp
Go to the documentation of this file.
//===----- RISCVCodeGenPrepare.cpp ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a RISC-V specific version of CodeGenPrepare.
// It munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
// basic-block-at-a-time approach.
//
//===----------------------------------------------------------------------===//
15
#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <numeric>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "riscv-codegenprepare"
34#define PASS_NAME "RISC-V CodeGenPrepare"
35
36namespace {
37class RISCVCodeGenPrepare : public InstVisitor<RISCVCodeGenPrepare, bool> {
38 Function &F;
39 const DataLayout *DL;
40 const DominatorTree *DT;
41 const RISCVSubtarget *ST;
42
43public:
44 RISCVCodeGenPrepare(Function &F, const DominatorTree *DT,
45 const RISCVSubtarget *ST)
46 : F(F), DL(&F.getDataLayout()), DT(DT), ST(ST) {}
47 bool run();
48 bool visitInstruction(Instruction &I) { return false; }
49 bool visitAnd(BinaryOperator &BO);
50 bool visitIntrinsicInst(IntrinsicInst &I);
51 bool expandVPStrideLoad(IntrinsicInst &I);
52 bool expandMulReduction(IntrinsicInst &I);
53 bool widenVPMerge(Instruction *I);
54 bool visitFreezeInst(FreezeInst &BO);
55};
56} // namespace
57
58namespace {
59class RISCVCodeGenPrepareLegacyPass : public FunctionPass {
60public:
61 static char ID;
62
63 RISCVCodeGenPrepareLegacyPass() : FunctionPass(ID) {}
64
65 bool runOnFunction(Function &F) override;
66 StringRef getPassName() const override { return PASS_NAME; }
67
68 void getAnalysisUsage(AnalysisUsage &AU) const override {
69 AU.setPreservesCFG();
70 AU.addRequired<DominatorTreeWrapperPass>();
71 AU.addRequired<TargetPassConfig>();
72 }
73};
74} // namespace
75
76// Try to optimize (i64 (and (zext/sext (i32 X), C1))) if C1 has bit 31 set,
77// but bits 63:32 are zero. If we know that bit 31 of X is 0, we can fill
78// the upper 32 bits with ones.
79bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
80 if (!ST->is64Bit())
81 return false;
82
83 if (!BO.getType()->isIntegerTy(64))
84 return false;
85
86 using namespace PatternMatch;
87
88 // Left hand side should be a zext nneg.
89 Value *LHSSrc;
90 if (!match(BO.getOperand(0), m_NNegZExt(m_Value(LHSSrc))))
91 return false;
92
93 if (!LHSSrc->getType()->isIntegerTy(32))
94 return false;
95
96 // Right hand side should be a constant.
97 Value *RHS = BO.getOperand(1);
98
99 auto *CI = dyn_cast<ConstantInt>(RHS);
100 if (!CI)
101 return false;
102 uint64_t C = CI->getZExtValue();
103
104 // Look for constants that fit in 32 bits but not simm12, and can be made
105 // into simm12 by sign extending bit 31. This will allow use of ANDI.
106 // TODO: Is worth making simm32?
108 return false;
109
110 // Sign extend the constant and replace the And operand.
112 BO.setOperand(1, ConstantInt::get(RHS->getType(), C));
113
114 return true;
115}
116
// With EVL tail folding, an AnyOf reduction will generate an i1 vp.merge as
// follows:
//
// loop:
//   %phi = phi <vscale x 4 x i1> [zeroinitializer, %entry], [%freeze, %loop]
//   %cmp = icmp ...
//   %rec = call <vscale x 4 x i1> @llvm.vp.merge(%cmp, i1 true, %phi, %evl)
//   %freeze = freeze <vscale x 4 x i1> %rec    [optional]
//   ...
// middle:
//   %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %freeze)
//
// However RVV doesn't have any tail undisturbed mask instructions and so we
// need a convoluted sequence of mask instructions to lower the i1 vp.merge: see
// llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll.
//
// To avoid that, this widens the i1 vp.merge to an i8 vp.merge, which will
// generate a single vmerge.vim:
//
// loop:
//   %phi = phi <vscale x 4 x i8> [zeroinitializer, %entry], [%freeze, %loop]
//   %cmp = icmp ...
//   %rec = call <vscale x 4 x i8> @llvm.vp.merge(%cmp, i8 1, %phi, %evl)
//   %freeze = freeze <vscale x 4 x i8> %rec
//   %trunc = trunc <vscale x 4 x i8> %freeze to <vscale x 4 x i1>
//   ...
// middle:
//   %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %trunc)
//
// The trunc will normally be sunk outside of the loop, but even if there are
// users inside the loop it is still profitable.
148bool RISCVCodeGenPrepare::widenVPMerge(Instruction *Root) {
149 if (!Root->getType()->getScalarType()->isIntegerTy(1))
150 return false;
151
152 Value *Mask, *True, *PhiV, *EVL;
153 using namespace PatternMatch;
154 auto m_VPMerge = m_Intrinsic<Intrinsic::vp_merge>(
155 m_Value(Mask), m_Value(True), m_Value(PhiV), m_Value(EVL));
156 if (!match(Root, m_CombineOr(m_VPMerge, m_Freeze(m_VPMerge))))
157 return false;
158
159 auto *Phi = dyn_cast<PHINode>(PhiV);
160 if (!Phi || !Phi->hasOneUse() || Phi->getNumIncomingValues() != 2 ||
161 !match(Phi->getIncomingValue(0), m_Zero()) ||
162 Phi->getIncomingValue(1) != Root)
163 return false;
164
165 Type *WideTy =
166 VectorType::get(IntegerType::getInt8Ty(Root->getContext()),
167 cast<VectorType>(Root->getType())->getElementCount());
168
169 IRBuilder<> Builder(Phi);
170 PHINode *WidePhi = Builder.CreatePHI(WideTy, 2);
172 Phi->getIncomingBlock(0));
173 Builder.SetInsertPoint(Root);
174 Value *WideTrue = Builder.CreateZExt(True, WideTy);
175 Value *WideMerge = Builder.CreateIntrinsic(Intrinsic::vp_merge, {WideTy},
176 {Mask, WideTrue, WidePhi, EVL});
177 if (isa<FreezeInst>(Root))
178 WideMerge = Builder.CreateFreeze(WideMerge);
179 WidePhi->addIncoming(WideMerge, Phi->getIncomingBlock(1));
180 Value *Trunc = Builder.CreateTrunc(WideMerge, Root->getType());
181
182 Root->replaceAllUsesWith(Trunc);
183
184 // Break the cycle and delete the old chain.
185 Phi->setIncomingValue(1, Phi->getIncomingValue(0));
187
188 return true;
189}
190
191bool RISCVCodeGenPrepare::visitFreezeInst(FreezeInst &I) {
192 if (auto *II = dyn_cast<IntrinsicInst>(I.getOperand(0)))
193 if (II->getIntrinsicID() == Intrinsic::vp_merge)
194 return widenVPMerge(&I);
195 return false;
196}
197
// LLVM vector reduction intrinsics return a scalar result, but on RISC-V vector
// reduction instructions write the result in the first element of a vector
// register. So when a reduction in a loop uses a scalar phi, we end up with
// unnecessary scalar moves:
//
// loop:
//   vfmv.s.f v10, fa0
//   vfredosum.vs v8, v8, v10
//   vfmv.f.s fa0, v8
//
// This mainly affects ordered fadd reductions and VP reductions that have a
// scalar start value, since other types of reduction typically use element-wise
// vectorisation in the loop body. This tries to vectorize any scalar phis that
// feed into these reductions:
//
// loop:
//   %phi = phi float [ ..., %entry ], [ %acc, %loop ]
//   %acc = call float @llvm.vector.reduce.fadd.nxv2f32(float %phi,
//                                                      <vscale x 2 x float> %vec)
//
// ->
//
// loop:
//   %phi = phi <vscale x 2 x float> [ ..., %entry ], [ %acc.vec, %loop ]
//   %phi.scalar = extractelement <vscale x 2 x float> %phi, i64 0
//   %acc = call float @llvm.vector.reduce.fadd.nxv2f32(float %phi.scalar,
//                                                      <vscale x 2 x float> %vec)
//   %acc.vec = insertelement <vscale x 2 x float> poison, float %acc, i64 0
//
// Which eliminates the scalar -> vector -> scalar crossing during instruction
// selection.
229bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
230 if (expandVPStrideLoad(I))
231 return true;
232
233 if (expandMulReduction(I))
234 return true;
235
236 if (widenVPMerge(&I))
237 return true;
238
239 if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
241 return false;
242
243 auto *PHI = dyn_cast<PHINode>(I.getOperand(0));
244 if (!PHI || !PHI->hasOneUse() ||
245 !llvm::is_contained(PHI->incoming_values(), &I))
246 return false;
247
248 Type *VecTy = I.getOperand(1)->getType();
249 IRBuilder<> Builder(PHI);
250 auto *VecPHI = Builder.CreatePHI(VecTy, PHI->getNumIncomingValues());
251
252 for (auto *BB : PHI->blocks()) {
253 Builder.SetInsertPoint(BB->getTerminator());
254 Value *InsertElt = Builder.CreateInsertElement(
255 VecTy, PHI->getIncomingValueForBlock(BB), (uint64_t)0);
256 VecPHI->addIncoming(InsertElt, BB);
257 }
258
259 Builder.SetInsertPoint(&I);
260 I.setOperand(0, Builder.CreateExtractElement(VecPHI, (uint64_t)0));
261
262 PHI->eraseFromParent();
263
264 return true;
265}
266
267// Partially expand a vector_reduce_mul wider than M1 to reduce the
268// number of vsetvlis required when VLEN is exactly known, and
269// reducing register pressure in all cases.
270bool RISCVCodeGenPrepare::expandMulReduction(IntrinsicInst &II) {
271 if (II.getIntrinsicID() != Intrinsic::vector_reduce_mul)
272 return false;
273
274 if (!ST->hasVInstructions())
275 return false;
276
277 Value *TmpVec = II.getArgOperand(0);
278 auto *VecTy = dyn_cast<FixedVectorType>(TmpVec->getType());
279 if (!VecTy)
280 return false;
281
282 unsigned EltSize = VecTy->getScalarSizeInBits();
283 unsigned VF = VecTy->getNumElements();
284 unsigned MinVLen = ST->getRealMinVLen();
285 unsigned M1VF = MinVLen / EltSize;
286
287 if (!isPowerOf2_32(VF) || VF <= M1VF)
288 return false;
289
290 IRBuilder<> Builder(&II);
291
292 // Shuffle-reduce at the original vector width. This just duplicates the
293 // default lowering down to m1.
294 SmallVector<int, 32> ShuffleMask(VF);
295 for (unsigned LiveElts = VF; LiveElts > M1VF; LiveElts /= 2) {
296 unsigned Half = LiveElts / 2;
297 std::iota(ShuffleMask.begin(), ShuffleMask.begin() + Half, Half);
298 std::fill(ShuffleMask.begin() + Half, ShuffleMask.end(), -1);
299 Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf");
300 TmpVec = Builder.CreateMul(TmpVec, Shuf, "bin.rdx");
301 }
302
303 // Extract the M1-sized subvector and emit the final reduction intrinsic.
304 // This is the reason we're here - to force a vsetvli toggle once at m1.
305 auto *M1Ty = FixedVectorType::get(VecTy->getElementType(), M1VF);
306 Value *Sub =
307 Builder.CreateExtractVector(M1Ty, TmpVec, (uint64_t)0, "rdx.sub");
308 Value *Rdx =
309 Builder.CreateIntrinsic(Intrinsic::vector_reduce_mul, {M1Ty}, {Sub});
310 II.replaceAllUsesWith(Rdx);
311 II.eraseFromParent();
312 return true;
313}
314
315// Always expand zero strided loads so we match more .vx splat patterns, even if
316// we have +optimized-zero-stride-loads. RISCVDAGToDAGISel::Select will convert
317// it back to a strided load if it's optimized.
318bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
319 Value *BasePtr, *VL;
320
321 using namespace PatternMatch;
323 m_Value(BasePtr), m_Zero(), m_AllOnes(), m_Value(VL))))
324 return false;
325
326 // If SEW>XLEN then a splat will get lowered as a zero strided load anyway, so
327 // avoid expanding here.
328 if (II.getType()->getScalarSizeInBits() > ST->getXLen())
329 return false;
330
331 if (!isKnownNonZero(VL, {*DL, DT, nullptr, &II}))
332 return false;
333
334 auto *VTy = cast<VectorType>(II.getType());
335
336 IRBuilder<> Builder(&II);
337 Type *STy = VTy->getElementType();
338 Value *Val = Builder.CreateLoad(STy, BasePtr);
339 Value *Res = Builder.CreateIntrinsic(
340 Intrinsic::vp_merge, VTy,
341 {II.getOperand(2), Builder.CreateVectorSplat(VTy->getElementCount(), Val),
342 PoisonValue::get(VTy), VL});
343
344 II.replaceAllUsesWith(Res);
345 II.eraseFromParent();
346 return true;
347}
348
349bool RISCVCodeGenPrepare::run() {
350 bool MadeChange = false;
351 for (auto &BB : F)
352 for (Instruction &I : llvm::make_early_inc_range(BB))
353 MadeChange |= visit(I);
354
355 return MadeChange;
356}
357
358bool RISCVCodeGenPrepareLegacyPass::runOnFunction(Function &F) {
359 if (skipFunction(F))
360 return false;
361
362 auto &TPC = getAnalysis<TargetPassConfig>();
363 auto &TM = TPC.getTM<RISCVTargetMachine>();
364 auto ST = &TM.getSubtarget<RISCVSubtarget>(F);
365 auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
366
367 RISCVCodeGenPrepare RVCGP(F, DT, ST);
368 return RVCGP.run();
369}
370
371INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME,
372 false, false)
374INITIALIZE_PASS_END(RISCVCodeGenPrepareLegacyPass, DEBUG_TYPE, PASS_NAME, false,
375 false)
376
377char RISCVCodeGenPrepareLegacyPass::ID = 0;
378
380 return new RISCVCodeGenPrepareLegacyPass();
381}
382
385 DominatorTree *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
386 auto ST = &TM->getSubtarget<RISCVSubtarget>(F);
387 bool Changed = RISCVCodeGenPrepare(F, DT, ST).run();
388 if (!Changed)
389 return PreservedAnalyses::all();
390
393 return PA;
394}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
#define PASS_NAME
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
Target-Independent Code Generator Pass Configuration Options pass.
#define PASS_NAME
Value * RHS
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
This class represents a freeze function that returns random concrete value if an operand is either a ...
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Base class for instruction visitors.
Definition InstVisitor.h:78
A wrapper class for inspecting calls to intrinsic functions.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM)
unsigned getRealMinVLen() const
unsigned getXLen() const
bool hasVInstructions() const
Target-Independent Code Generator Pass Configuration Options.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
Changed
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
OneOps_match< OpTy, Instruction::Freeze > m_Freeze(const OpTy &Op)
Matches FreezeInst.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
NNegZExt_match< OpTy > m_NNegZExt(const OpTy &Op)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
FunctionPass * createRISCVCodeGenPrepareLegacyPass()
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Sub
Subtraction of integers.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.