File: build/source/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Warning: line 299, column 19: Called C++ object pointer is null
1 | //===------- VectorCombine.cpp - Optimize partial vector operations -------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | // This pass optimizes scalar/vector interactions using target cost models. The | |||
10 | // transforms implemented here may not fit in traditional loop-based or SLP | |||
11 | // vectorization passes. | |||
12 | // | |||
13 | //===----------------------------------------------------------------------===// | |||
14 | ||||
15 | #include "llvm/Transforms/Vectorize/VectorCombine.h" | |||
16 | #include "llvm/ADT/Statistic.h" | |||
17 | #include "llvm/Analysis/AssumptionCache.h" | |||
18 | #include "llvm/Analysis/BasicAliasAnalysis.h" | |||
19 | #include "llvm/Analysis/GlobalsModRef.h" | |||
20 | #include "llvm/Analysis/Loads.h" | |||
21 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
22 | #include "llvm/Analysis/ValueTracking.h" | |||
23 | #include "llvm/Analysis/VectorUtils.h" | |||
24 | #include "llvm/IR/Dominators.h" | |||
25 | #include "llvm/IR/Function.h" | |||
26 | #include "llvm/IR/IRBuilder.h" | |||
27 | #include "llvm/IR/PatternMatch.h" | |||
28 | #include "llvm/InitializePasses.h" | |||
29 | #include "llvm/Pass.h" | |||
30 | #include "llvm/Support/CommandLine.h" | |||
31 | #include "llvm/Transforms/Utils/Local.h" | |||
32 | #include "llvm/Transforms/Vectorize.h" | |||
33 | #include <numeric> | |||
34 | ||||
35 | #define DEBUG_TYPE "vector-combine" | |||
36 | #include "llvm/Transforms/Utils/InstructionWorklist.h" | |||
37 | ||||
38 | using namespace llvm; | |||
39 | using namespace llvm::PatternMatch; | |||
40 | ||||
41 | STATISTIC(NumVecLoad, "Number of vector loads formed"); | |||
42 | STATISTIC(NumVecCmp, "Number of vector compares formed"); | |||
43 | STATISTIC(NumVecBO, "Number of vector binops formed"); | |||
44 | STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed"); | |||
45 | STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast"); | |||
46 | STATISTIC(NumScalarBO, "Number of scalar binops formed"); | |||
47 | STATISTIC(NumScalarCmp, "Number of scalar compares formed"); | |||
48 | ||||
49 | static cl::opt<bool> DisableVectorCombine( | |||
50 | "disable-vector-combine", cl::init(false), cl::Hidden, | |||
51 | cl::desc("Disable all vector combine transforms")); | |||
52 | ||||
53 | static cl::opt<bool> DisableBinopExtractShuffle( | |||
54 | "disable-binop-extract-shuffle", cl::init(false), cl::Hidden, | |||
55 | cl::desc("Disable binop extract to shuffle transforms")); | |||
56 | ||||
57 | static cl::opt<unsigned> MaxInstrsToScan( | |||
58 | "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, | |||
59 | cl::desc("Max number of instructions to scan for vector combining.")); | |||
60 | ||||
61 | static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max(); | |||
62 | ||||
63 | namespace { | |||
64 | class VectorCombine { | |||
65 | public: | |||
66 | VectorCombine(Function &F, const TargetTransformInfo &TTI, | |||
67 | const DominatorTree &DT, AAResults &AA, AssumptionCache &AC, | |||
68 | bool TryEarlyFoldsOnly) | |||
69 | : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), | |||
70 | TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} | |||
71 | ||||
72 | bool run(); | |||
73 | ||||
74 | private: | |||
75 | Function &F; | |||
76 | IRBuilder<> Builder; | |||
77 | const TargetTransformInfo &TTI; | |||
78 | const DominatorTree &DT; | |||
79 | AAResults &AA; | |||
80 | AssumptionCache &AC; | |||
81 | ||||
82 | /// If true, only perform beneficial early IR transforms. Do not introduce new | |||
83 | /// vector operations. | |||
84 | bool TryEarlyFoldsOnly; | |||
85 | ||||
86 | InstructionWorklist Worklist; | |||
87 | ||||
88 | // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" | |||
89 | // parameter. That should be updated to specific sub-classes because the | |||
90 | // run loop was changed to dispatch on opcode. | |||
91 | bool vectorizeLoadInsert(Instruction &I); | |||
92 | bool widenSubvectorLoad(Instruction &I); | |||
93 | ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, | |||
94 | ExtractElementInst *Ext1, | |||
95 | unsigned PreferredExtractIndex) const; | |||
96 | bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, | |||
97 | const Instruction &I, | |||
98 | ExtractElementInst *&ConvertToShuffle, | |||
99 | unsigned PreferredExtractIndex); | |||
100 | void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, | |||
101 | Instruction &I); | |||
102 | void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, | |||
103 | Instruction &I); | |||
104 | bool foldExtractExtract(Instruction &I); | |||
105 | bool foldInsExtFNeg(Instruction &I); | |||
106 | bool foldBitcastShuf(Instruction &I); | |||
107 | bool scalarizeBinopOrCmp(Instruction &I); | |||
108 | bool foldExtractedCmps(Instruction &I); | |||
109 | bool foldSingleElementStore(Instruction &I); | |||
110 | bool scalarizeLoadExtract(Instruction &I); | |||
111 | bool foldShuffleOfBinops(Instruction &I); | |||
112 | bool foldShuffleFromReductions(Instruction &I); | |||
113 | bool foldSelectShuffle(Instruction &I, bool FromReduction = false); | |||
114 | ||||
115 | void replaceValue(Value &Old, Value &New) { | |||
116 | Old.replaceAllUsesWith(&New); | |||
117 | if (auto *NewI = dyn_cast<Instruction>(&New)) { | |||
118 | New.takeName(&Old); | |||
119 | Worklist.pushUsersToWorkList(*NewI); | |||
120 | Worklist.pushValue(NewI); | |||
121 | } | |||
122 | Worklist.pushValue(&Old); | |||
123 | } | |||
124 | ||||
125 | void eraseInstruction(Instruction &I) { | |||
126 | for (Value *Op : I.operands()) | |||
127 | Worklist.pushValue(Op); | |||
128 | Worklist.remove(&I); | |||
129 | I.eraseFromParent(); | |||
130 | } | |||
131 | }; | |||
132 | } // namespace | |||
133 | ||||
134 | static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) { | |||
135 | // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan. | |||
136 | // The widened load may load data from dirty regions or create data races | |||
137 | // non-existent in the source. | |||
138 | if (!Load || !Load->isSimple() || !Load->hasOneUse() || | |||
139 | Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || | |||
140 | mustSuppressSpeculation(*Load)) | |||
141 | return false; | |||
142 | ||||
143 | // We are potentially transforming byte-sized (8-bit) memory accesses, so make | |||
144 | // sure we have all of our type-based constraints in place for this target. | |||
145 | Type *ScalarTy = Load->getType()->getScalarType(); | |||
146 | uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); | |||
147 | unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); | |||
148 | if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || | |||
149 | ScalarSize % 8 != 0) | |||
150 | return false; | |||
151 | ||||
152 | return true; | |||
153 | } | |||
154 | ||||
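/// A minimal illustration of the load-widening rewrite performed below,
/// assuming a 128-bit minimum vector register, a dereferenceable source, and
/// no pointer offset (types are only an example; pointer casts are omitted):
///   %s = load float, ptr %p, align 4
///   %r = insertelement <4 x float> undef, float %s, i64 0
/// -->
///   %v = load <4 x float>, ptr %p, align 4
///   %r = shufflevector <4 x float> %v, <4 x float> poison,
///                      <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>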
155 | bool VectorCombine::vectorizeLoadInsert(Instruction &I) { | |||
156 | // Match insert into fixed vector of scalar value. | |||
157 | // TODO: Handle non-zero insert index. | |||
158 | Value *Scalar; | |||
159 | if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || | |||
160 | !Scalar->hasOneUse()) | |||
161 | return false; | |||
162 | ||||
163 | // Optionally match an extract from another vector. | |||
164 | Value *X; | |||
165 | bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt())); | |||
166 | if (!HasExtract) | |||
167 | X = Scalar; | |||
168 | ||||
169 | auto *Load = dyn_cast<LoadInst>(X); | |||
170 | if (!canWidenLoad(Load, TTI)) | |||
171 | return false; | |||
172 | ||||
173 | Type *ScalarTy = Scalar->getType(); | |||
174 | uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); | |||
175 | unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); | |||
176 | ||||
177 | // Check safety of replacing the scalar load with a larger vector load. | |||
178 | // We use minimal alignment (maximum flexibility) because we only care about | |||
179 | // the dereferenceable region. When calculating cost and creating a new op, | |||
180 | // we may use a larger value based on alignment attributes. | |||
181 | const DataLayout &DL = I.getModule()->getDataLayout(); | |||
182 | Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); | |||
183 | assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); | |||
184 | ||||
185 | unsigned MinVecNumElts = MinVectorSize / ScalarSize; | |||
186 | auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false); | |||
187 | unsigned OffsetEltIndex = 0; | |||
188 | Align Alignment = Load->getAlign(); | |||
189 | if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC, | |||
190 | &DT)) { | |||
191 | // It is not safe to load directly from the pointer, but we can still peek | |||
192 | // through gep offsets and check if it is safe to load from a base address with | |||
193 | // updated alignment. If it is, we can shuffle the element(s) into place | |||
194 | // after loading. | |||
195 | unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType()); | |||
196 | APInt Offset(OffsetBitWidth, 0); | |||
197 | SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); | |||
198 | ||||
199 | // We want to shuffle the result down from a high element of a vector, so | |||
200 | // the offset must be positive. | |||
201 | if (Offset.isNegative()) | |||
202 | return false; | |||
203 | ||||
204 | // The offset must be a multiple of the scalar element size so that the | |||
205 | // loaded element can be shuffled cleanly into place. | |||
206 | uint64_t ScalarSizeInBytes = ScalarSize / 8; | |||
207 | if (Offset.urem(ScalarSizeInBytes) != 0) | |||
208 | return false; | |||
209 | ||||
210 | // If we load MinVecNumElts, will our target element still be loaded? | |||
211 | OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue(); | |||
212 | if (OffsetEltIndex >= MinVecNumElts) | |||
213 | return false; | |||
214 | ||||
215 | if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC, | |||
216 | &DT)) | |||
217 | return false; | |||
218 | ||||
219 | // Update alignment with offset value. Note that the offset could be negated | |||
220 | // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but | |||
221 | // negation does not change the result of the alignment calculation. | |||
222 | Alignment = commonAlignment(Alignment, Offset.getZExtValue()); | |||
223 | } | |||
224 | ||||
225 | // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 | |||
226 | // Use the greater of the alignment on the load or its source pointer. | |||
227 | Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); | |||
228 | Type *LoadTy = Load->getType(); | |||
229 | unsigned AS = Load->getPointerAddressSpace(); | |||
230 | InstructionCost OldCost = | |||
231 | TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); | |||
232 | APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0); | |||
233 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
234 | OldCost += | |||
235 | TTI.getScalarizationOverhead(MinVecTy, DemandedElts, | |||
236 | /* Insert */ true, HasExtract, CostKind); | |||
237 | ||||
238 | // New pattern: load VecPtr | |||
239 | InstructionCost NewCost = | |||
240 | TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); | |||
241 | // Optionally, we are shuffling the loaded vector element(s) into place. | |||
242 | // For the mask set everything but element 0 to undef to prevent poison from | |||
243 | // propagating from the extra loaded memory. This will also optionally | |||
244 | // shrink/grow the vector from the loaded size to the output size. | |||
245 | // We assume this operation has no cost in codegen if there was no offset. | |||
246 | // Note that we could use freeze to avoid poison problems, but then we might | |||
247 | // still need a shuffle to change the vector size. | |||
248 | auto *Ty = cast<FixedVectorType>(I.getType()); | |||
249 | unsigned OutputNumElts = Ty->getNumElements(); | |||
250 | SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem); | |||
251 | assert(OffsetEltIndex < MinVecNumElts && "Address offset too big"); | |||
252 | Mask[0] = OffsetEltIndex; | |||
253 | if (OffsetEltIndex) | |||
254 | NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask); | |||
255 | ||||
256 | // We can aggressively convert to the vector form because the backend can | |||
257 | // invert this transform if it does not result in a performance win. | |||
258 | if (OldCost < NewCost || !NewCost.isValid()) | |||
259 | return false; | |||
260 | ||||
261 | // It is safe and potentially profitable to load a vector directly: | |||
262 | // inselt undef, load Scalar, 0 --> load VecPtr | |||
263 | IRBuilder<> Builder(Load); | |||
264 | Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( | |||
265 | SrcPtr, MinVecTy->getPointerTo(AS)); | |||
266 | Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); | |||
267 | VecLd = Builder.CreateShuffleVector(VecLd, Mask); | |||
268 | ||||
269 | replaceValue(I, *VecLd); | |||
270 | ++NumVecLoad; | |||
271 | return true; | |||
272 | } | |||
273 | ||||
274 | /// If we are loading a vector and then inserting it into a larger vector with | |||
275 | /// undefined elements, try to load the larger vector and eliminate the insert. | |||
276 | /// This removes a shuffle in IR and may allow combining of other loaded values. | |||
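/// A minimal illustration of the rewrite, assuming the wider load is known to
/// be dereferenceable (types are only an example; pointer casts are omitted):
///   %v = load <2 x float>, ptr %p
///   %r = shufflevector <2 x float> %v, <2 x float> poison,
///                      <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
/// -->
///   %r = load <4 x float>, ptr %p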
277 | bool VectorCombine::widenSubvectorLoad(Instruction &I) { | |||
278 | // Match subvector insert of fixed vector. | |||
279 | auto *Shuf = cast<ShuffleVectorInst>(&I); | |||
280 | if (!Shuf->isIdentityWithPadding()) | |||
281 | return false; | |||
282 | ||||
283 | // Allow a non-canonical shuffle mask that is choosing elements from op1. | |||
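// Note that any_of() below returns a bool, so OpIndex becomes 1 when any mask
// element selects from operand 1 (index >= NumOpElts) and 0 otherwise.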
284 | unsigned NumOpElts = | |||
285 | cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements(); | |||
286 | unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) { | |||
287 | return M >= (int)(NumOpElts); | |||
288 | }); | |||
289 | ||||
290 | auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex)); | |||
291 | if (!canWidenLoad(Load, TTI)) | |||
292 | return false; | |||
293 | ||||
294 | // We use minimal alignment (maximum flexibility) because we only care about | |||
295 | // the dereferenceable region. When calculating cost and creating a new op, | |||
296 | // we may use a larger value based on alignment attributes. | |||
297 | auto *Ty = cast<FixedVectorType>(I.getType()); | |||
298 | const DataLayout &DL = I.getModule()->getDataLayout(); | |||
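// Note: the analyzer warning on the next line (line 299) assumes 'Load' could
// be the null result of the dyn_cast above, but canWidenLoad() returns false
// for a null Load, so the pointer is known to be non-null when dereferenced here.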
299 | Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); | |||
300 | assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type"); | |||
301 | Align Alignment = Load->getAlign(); | |||
302 | if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT)) | |||
303 | return false; | |||
304 | ||||
305 | Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); | |||
306 | Type *LoadTy = Load->getType(); | |||
307 | unsigned AS = Load->getPointerAddressSpace(); | |||
308 | ||||
309 | // Original pattern: insert_subvector (load PtrOp) | |||
310 | // This conservatively assumes that the cost of a subvector insert into an | |||
311 | // undef value is 0. We could add that cost if the cost model accurately | |||
312 | // reflects the real cost of that operation. | |||
313 | InstructionCost OldCost = | |||
314 | TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS); | |||
315 | ||||
316 | // New pattern: load PtrOp | |||
317 | InstructionCost NewCost = | |||
318 | TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS); | |||
319 | ||||
320 | // We can aggressively convert to the vector form because the backend can | |||
321 | // invert this transform if it does not result in a performance win. | |||
322 | if (OldCost < NewCost || !NewCost.isValid()) | |||
323 | return false; | |||
324 | ||||
325 | IRBuilder<> Builder(Load); | |||
326 | Value *CastedPtr = | |||
327 | Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS)); | |||
328 | Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment); | |||
329 | replaceValue(I, *VecLd); | |||
330 | ++NumVecLoad; | |||
331 | return true; | |||
332 | } | |||
333 | ||||
334 | /// Determine which, if any, of the inputs should be replaced by a shuffle | |||
335 | /// followed by extract from a different index. | |||
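/// Returns the extract whose source vector should be shuffled (generally the
/// one with the more expensive extract index for this target), or null if no
/// shuffle is needed because the indexes already match or both costs are invalid.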
336 | ExtractElementInst *VectorCombine::getShuffleExtract( | |||
337 | ExtractElementInst *Ext0, ExtractElementInst *Ext1, | |||
338 | unsigned PreferredExtractIndex = InvalidIndex) const { | |||
339 | auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand()); | |||
340 | auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand()); | |||
341 | assert(Index0C && Index1C && "Expected constant extract indexes"); | |||
342 | ||||
343 | unsigned Index0 = Index0C->getZExtValue(); | |||
344 | unsigned Index1 = Index1C->getZExtValue(); | |||
345 | ||||
346 | // If the extract indexes are identical, no shuffle is needed. | |||
347 | if (Index0 == Index1) | |||
348 | return nullptr; | |||
349 | ||||
350 | Type *VecTy = Ext0->getVectorOperand()->getType(); | |||
351 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
352 | assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types"); | |||
353 | InstructionCost Cost0 = | |||
354 | TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); | |||
355 | InstructionCost Cost1 = | |||
356 | TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); | |||
357 | ||||
358 | // If both costs are invalid, no shuffle is needed. | |||
359 | if (!Cost0.isValid() && !Cost1.isValid()) | |||
360 | return nullptr; | |||
361 | ||||
362 | // We are extracting from 2 different indexes, so one operand must be shuffled | |||
363 | // before performing a vector operation and/or extract. The more expensive | |||
364 | // extract will be replaced by a shuffle. | |||
365 | if (Cost0 > Cost1) | |||
366 | return Ext0; | |||
367 | if (Cost1 > Cost0) | |||
368 | return Ext1; | |||
369 | ||||
370 | // If the costs are equal and there is a preferred extract index, shuffle the | |||
371 | // opposite operand. | |||
372 | if (PreferredExtractIndex == Index0) | |||
373 | return Ext1; | |||
374 | if (PreferredExtractIndex == Index1) | |||
375 | return Ext0; | |||
376 | ||||
377 | // Otherwise, replace the extract with the higher index. | |||
378 | return Index0 > Index1 ? Ext0 : Ext1; | |||
379 | } | |||
380 | ||||
381 | /// Compare the relative costs of 2 extracts followed by scalar operation vs. | |||
382 | /// vector operation(s) followed by extract. Return true if the existing | |||
383 | /// instructions are cheaper than a vector alternative. Otherwise, return false | |||
384 | /// and if one of the extracts should be transformed to a shufflevector, set | |||
385 | /// \p ConvertToShuffle to that extract instruction. | |||
386 | bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, | |||
387 | ExtractElementInst *Ext1, | |||
388 | const Instruction &I, | |||
389 | ExtractElementInst *&ConvertToShuffle, | |||
390 | unsigned PreferredExtractIndex) { | |||
391 | auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1)); | |||
392 | auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1)); | |||
393 | assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes"); | |||
394 | ||||
395 | unsigned Opcode = I.getOpcode(); | |||
396 | Type *ScalarTy = Ext0->getType(); | |||
397 | auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType()); | |||
398 | InstructionCost ScalarOpCost, VectorOpCost; | |||
399 | ||||
400 | // Get cost estimates for scalar and vector versions of the operation. | |||
401 | bool IsBinOp = Instruction::isBinaryOp(Opcode); | |||
402 | if (IsBinOp) { | |||
403 | ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); | |||
404 | VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); | |||
405 | } else { | |||
406 | assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && | |||
407 | "Expected a compare"); | |||
408 | CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); | |||
409 | ScalarOpCost = TTI.getCmpSelInstrCost( | |||
410 | Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); | |||
411 | VectorOpCost = TTI.getCmpSelInstrCost( | |||
412 | Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); | |||
413 | } | |||
414 | ||||
415 | // Get cost estimates for the extract elements. These costs will factor into | |||
416 | // both sequences. | |||
417 | unsigned Ext0Index = Ext0IndexC->getZExtValue(); | |||
418 | unsigned Ext1Index = Ext1IndexC->getZExtValue(); | |||
419 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
420 | ||||
421 | InstructionCost Extract0Cost = | |||
422 | TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index); | |||
423 | InstructionCost Extract1Cost = | |||
424 | TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index); | |||
425 | ||||
426 | // A more expensive extract will always be replaced by a splat shuffle. | |||
427 | // For example, if Ext0 is more expensive: | |||
428 | // opcode (extelt V0, Ext0), (ext V1, Ext1) --> | |||
429 | // extelt (opcode (splat V0, Ext0), V1), Ext1 | |||
430 | // TODO: Evaluate whether that always results in lowest cost. Alternatively, | |||
431 | // check the cost of creating a broadcast shuffle and shuffling both | |||
432 | // operands to element 0. | |||
433 | InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); | |||
434 | ||||
435 | // Extra uses of the extracts mean that we include those costs in the | |||
436 | // vector total because those instructions will not be eliminated. | |||
437 | InstructionCost OldCost, NewCost; | |||
438 | if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) { | |||
439 | // Handle a special case. If the 2 extracts are identical, adjust the | |||
440 | // formulas to account for that. The extra use charge allows for either the | |||
441 | // CSE'd pattern or an unoptimized form with identical values: | |||
442 | // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C | |||
443 | bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2) | |||
444 | : !Ext0->hasOneUse() || !Ext1->hasOneUse(); | |||
445 | OldCost = CheapExtractCost + ScalarOpCost; | |||
446 | NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost; | |||
447 | } else { | |||
448 | // Handle the general case. Each extract is actually a different value: | |||
449 | // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C | |||
450 | OldCost = Extract0Cost + Extract1Cost + ScalarOpCost; | |||
451 | NewCost = VectorOpCost + CheapExtractCost + | |||
452 | !Ext0->hasOneUse() * Extract0Cost + | |||
453 | !Ext1->hasOneUse() * Extract1Cost; | |||
454 | } | |||
455 | ||||
456 | ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex); | |||
457 | if (ConvertToShuffle) { | |||
458 | if (IsBinOp && DisableBinopExtractShuffle) | |||
459 | return true; | |||
460 | ||||
461 | // If we are extracting from 2 different indexes, then one operand must be | |||
462 | // shuffled before performing the vector operation. The shuffle mask is | |||
463 | // undefined except for 1 lane that is being translated to the remaining | |||
464 | // extraction lane. Therefore, it is a splat shuffle. Ex: | |||
465 | // ShufMask = { undef, undef, 0, undef } | |||
466 | // TODO: The cost model has an option for a "broadcast" shuffle | |||
467 | // (splat-from-element-0), but no option for a more general splat. | |||
468 | NewCost += | |||
469 | TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); | |||
470 | } | |||
471 | ||||
472 | // Aggressively form a vector op if the cost is equal because the transform | |||
473 | // may enable further optimization. | |||
474 | // Codegen can reverse this transform (scalarize) if it was not profitable. | |||
475 | return OldCost < NewCost; | |||
476 | } | |||
477 | ||||
478 | /// Create a shuffle that translates (shifts) 1 element from the input vector | |||
479 | /// to a new element location. | |||
480 | static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, | |||
481 | unsigned NewIndex, IRBuilder<> &Builder) { | |||
482 | // The shuffle mask is undefined except for 1 lane that is being translated | |||
483 | // to the new element index. Example for OldIndex == 2 and NewIndex == 0: | |||
484 | // ShufMask = { 2, undef, undef, undef } | |||
485 | auto *VecTy = cast<FixedVectorType>(Vec->getType()); | |||
486 | SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem); | |||
487 | ShufMask[NewIndex] = OldIndex; | |||
488 | return Builder.CreateShuffleVector(Vec, ShufMask, "shift"); | |||
489 | } | |||
490 | ||||
491 | /// Given an extract element instruction with constant index operand, shuffle | |||
492 | /// the source vector (shift the scalar element) to a NewIndex for extraction. | |||
493 | /// Return null if the input can be constant folded, so that we are not creating | |||
494 | /// unnecessary instructions. | |||
495 | static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, | |||
496 | unsigned NewIndex, | |||
497 | IRBuilder<> &Builder) { | |||
498 | // Shufflevectors can only be created for fixed-width vectors. | |||
499 | if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType())) | |||
500 | return nullptr; | |||
501 | ||||
502 | // If the extract can be constant-folded, this code is unsimplified. Defer | |||
503 | // to other passes to handle that. | |||
504 | Value *X = ExtElt->getVectorOperand(); | |||
505 | Value *C = ExtElt->getIndexOperand(); | |||
506 | assert(isa<ConstantInt>(C) && "Expected a constant index operand"); | |||
507 | if (isa<Constant>(X)) | |||
508 | return nullptr; | |||
509 | ||||
510 | Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(), | |||
511 | NewIndex, Builder); | |||
512 | return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex)); | |||
513 | } | |||
514 | ||||
515 | /// Try to reduce extract element costs by converting scalar compares to vector | |||
516 | /// compares followed by extract. | |||
517 | /// cmp (ext0 V0, C), (ext1 V1, C) | |||
518 | void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, | |||
519 | ExtractElementInst *Ext1, Instruction &I) { | |||
520 | assert(isa<CmpInst>(&I) && "Expected a compare"); | |||
521 | assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == | |||
522 | cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && | |||
523 | "Expected matching constant extract indexes"); | |||
524 | ||||
525 | // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C | |||
526 | ++NumVecCmp; | |||
527 | CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate(); | |||
528 | Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); | |||
529 | Value *VecCmp = Builder.CreateCmp(Pred, V0, V1); | |||
530 | Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand()); | |||
531 | replaceValue(I, *NewExt); | |||
532 | } | |||
533 | ||||
534 | /// Try to reduce extract element costs by converting scalar binops to vector | |||
535 | /// binops followed by extract. | |||
536 | /// bo (ext0 V0, C), (ext1 V1, C) | |||
537 | void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, | |||
538 | ExtractElementInst *Ext1, Instruction &I) { | |||
539 | assert(isa<BinaryOperator>(&I) && "Expected a binary operator"); | |||
540 | assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == | |||
541 | cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && | |||
542 | "Expected matching constant extract indexes"); | |||
543 | ||||
544 | // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C | |||
545 | ++NumVecBO; | |||
546 | Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); | |||
547 | Value *VecBO = | |||
548 | Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1); | |||
549 | ||||
550 | // All IR flags are safe to back-propagate because any potential poison | |||
551 | // created in unused vector elements is discarded by the extract. | |||
552 | if (auto *VecBOInst = dyn_cast<Instruction>(VecBO)) | |||
553 | VecBOInst->copyIRFlags(&I); | |||
554 | ||||
555 | Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand()); | |||
556 | replaceValue(I, *NewExt); | |||
557 | } | |||
558 | ||||
559 | /// Match an instruction with extracted vector operands. | |||
560 | bool VectorCombine::foldExtractExtract(Instruction &I) { | |||
561 | // It is not safe to transform things like div, urem, etc. because we may | |||
562 | // create undefined behavior when executing those on unknown vector elements. | |||
563 | if (!isSafeToSpeculativelyExecute(&I)) | |||
564 | return false; | |||
565 | ||||
566 | Instruction *I0, *I1; | |||
567 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; | |||
568 | if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) && | |||
569 | !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1)))) | |||
570 | return false; | |||
571 | ||||
572 | Value *V0, *V1; | |||
573 | uint64_t C0, C1; | |||
574 | if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) || | |||
575 | !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) || | |||
576 | V0->getType() != V1->getType()) | |||
577 | return false; | |||
578 | ||||
579 | // If the scalar value 'I' is going to be re-inserted into a vector, then try | |||
580 | // to create an extract to that same element. The extract/insert can be | |||
581 | // reduced to a "select shuffle". | |||
582 | // TODO: If we add a larger pattern match that starts from an insert, this | |||
583 | // probably becomes unnecessary. | |||
584 | auto *Ext0 = cast<ExtractElementInst>(I0); | |||
585 | auto *Ext1 = cast<ExtractElementInst>(I1); | |||
586 | uint64_t InsertIndex = InvalidIndex; | |||
587 | if (I.hasOneUse()) | |||
588 | match(I.user_back(), | |||
589 | m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex))); | |||
590 | ||||
591 | ExtractElementInst *ExtractToChange; | |||
592 | if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex)) | |||
593 | return false; | |||
594 | ||||
595 | if (ExtractToChange) { | |||
596 | unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0; | |||
597 | ExtractElementInst *NewExtract = | |||
598 | translateExtract(ExtractToChange, CheapExtractIdx, Builder); | |||
599 | if (!NewExtract) | |||
600 | return false; | |||
601 | if (ExtractToChange == Ext0) | |||
602 | Ext0 = NewExtract; | |||
603 | else | |||
604 | Ext1 = NewExtract; | |||
605 | } | |||
606 | ||||
607 | if (Pred != CmpInst::BAD_ICMP_PREDICATE) | |||
608 | foldExtExtCmp(Ext0, Ext1, I); | |||
609 | else | |||
610 | foldExtExtBinop(Ext0, Ext1, I); | |||
611 | ||||
612 | Worklist.push(Ext0); | |||
613 | Worklist.push(Ext1); | |||
614 | return true; | |||
615 | } | |||
616 | ||||
617 | /// Try to replace an extract + scalar fneg + insert with a vector fneg + | |||
618 | /// shuffle. | |||
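/// A minimal illustration of the rewrite (lane 2 of a 4-element vector is
/// chosen arbitrarily):
///   %e = extractelement <4 x float> %src, i64 2
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dest, float %n, i64 2
/// -->
///   %nv = fneg <4 x float> %src
///   %r  = shufflevector <4 x float> %dest, <4 x float> %nv,
///                       <4 x i32> <i32 0, i32 1, i32 6, i32 3>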
619 | bool VectorCombine::foldInsExtFNeg(Instruction &I) { | |||
620 | // Match an insert (op (extract)) pattern. | |||
621 | Value *DestVec; | |||
622 | uint64_t Index; | |||
623 | Instruction *FNeg; | |||
624 | if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)), | |||
625 | m_ConstantInt(Index)))) | |||
626 | return false; | |||
627 | ||||
628 | // Note: This handles the canonical fneg instruction and "fsub -0.0, X". | |||
629 | Value *SrcVec; | |||
630 | Instruction *Extract; | |||
631 | if (!match(FNeg, m_FNeg(m_CombineAnd( | |||
632 | m_Instruction(Extract), | |||
633 | m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))) | |||
634 | return false; | |||
635 | ||||
636 | // TODO: We could handle this with a length-changing shuffle. | |||
637 | auto *VecTy = cast<FixedVectorType>(I.getType()); | |||
638 | if (SrcVec->getType() != VecTy) | |||
639 | return false; | |||
640 | ||||
641 | // Ignore bogus insert/extract index. | |||
642 | unsigned NumElts = VecTy->getNumElements(); | |||
643 | if (Index >= NumElts) | |||
644 | return false; | |||
645 | ||||
646 | // We are inserting the negated element into the same lane that we extracted | |||
647 | // from. This is equivalent to a select-shuffle that chooses all but the | |||
648 | // negated element from the destination vector. | |||
649 | SmallVector<int> Mask(NumElts); | |||
650 | std::iota(Mask.begin(), Mask.end(), 0); | |||
651 | Mask[Index] = Index + NumElts; | |||
652 | ||||
653 | Type *ScalarTy = VecTy->getScalarType(); | |||
654 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
655 | InstructionCost OldCost = | |||
656 | TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) + | |||
657 | TTI.getVectorInstrCost(I, VecTy, CostKind, Index); | |||
658 | ||||
659 | // If the extract has one use, it will be eliminated, so count it in the | |||
660 | // original cost. If it has more than one use, ignore the cost because it will | |||
661 | // be the same before/after. | |||
662 | if (Extract->hasOneUse()) | |||
663 | OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index); | |||
664 | ||||
665 | InstructionCost NewCost = | |||
666 | TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) + | |||
667 | TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask); | |||
668 | ||||
669 | if (NewCost > OldCost) | |||
670 | return false; | |||
671 | ||||
672 | // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> | |||
673 | // shuffle DestVec, (fneg SrcVec), Mask | |||
674 | Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); | |||
675 | Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); | |||
676 | replaceValue(I, *Shuf); | |||
677 | return true; | |||
678 | } | |||
679 | ||||
680 | /// If this is a bitcast of a shuffle, try to bitcast the source vector to the | |||
681 | /// destination type followed by shuffle. This can enable further transforms by | |||
682 | /// moving bitcasts or shuffles together. | |||
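/// A minimal illustration of the wide-to-narrow-element case (types and mask
/// are only an example):
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
///   %b = bitcast <4 x i32> %s to <8 x i16>
/// -->
///   %c = bitcast <4 x i32> %v to <8 x i16>
///   %b = shufflevector <8 x i16> %c, <8 x i16> poison,
///                      <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>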
683 | bool VectorCombine::foldBitcastShuf(Instruction &I) { | |||
684 | Value *V; | |||
685 | ArrayRef<int> Mask; | |||
686 | if (!match(&I, m_BitCast( | |||
687 | m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask)))))) | |||
688 | return false; | |||
689 | ||||
690 | // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for | |||
691 | // scalable type is unknown; Second, we cannot reason if the narrowed shuffle | |||
692 | // mask for scalable type is a splat or not. | |||
693 | // 2) Disallow non-vector casts and length-changing shuffles. | |||
694 | // TODO: We could allow any shuffle. | |||
695 | auto *SrcTy = dyn_cast<FixedVectorType>(V->getType()); | |||
696 | if (!SrcTy || I.getOperand(0)->getType() != SrcTy) | |||
697 | return false; | |||
698 | ||||
699 | auto *DestTy = cast<FixedVectorType>(I.getType()); | |||
700 | unsigned DestNumElts = DestTy->getNumElements(); | |||
701 | unsigned SrcNumElts = SrcTy->getNumElements(); | |||
702 | SmallVector<int, 16> NewMask; | |||
703 | if (SrcNumElts <= DestNumElts) { | |||
704 | // The bitcast is from wide to narrow/equal elements. The shuffle mask can | |||
705 | // always be expanded to the equivalent form choosing narrower elements. | |||
706 | assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask"); | |||
707 | unsigned ScaleFactor = DestNumElts / SrcNumElts; | |||
708 | narrowShuffleMaskElts(ScaleFactor, Mask, NewMask); | |||
709 | } else { | |||
710 | // The bitcast is from narrow elements to wide elements. The shuffle mask | |||
711 | // must choose consecutive elements to allow casting first. | |||
712 | assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask"); | |||
713 | unsigned ScaleFactor = SrcNumElts / DestNumElts; | |||
714 | if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) | |||
715 | return false; | |||
716 | } | |||
717 | ||||
718 | // The new shuffle must not cost more than the old shuffle. The bitcast is | |||
719 | // moved ahead of the shuffle, so assume that it has the same cost as before. | |||
720 | InstructionCost DestCost = TTI.getShuffleCost( | |||
721 | TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask); | |||
722 | InstructionCost SrcCost = | |||
723 | TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask); | |||
724 | if (DestCost > SrcCost || !DestCost.isValid()) | |||
725 | return false; | |||
726 | ||||
727 | // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' | |||
728 | ++NumShufOfBitcast; | |||
729 | Value *CastV = Builder.CreateBitCast(V, DestTy); | |||
730 | Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask); | |||
731 | replaceValue(I, *Shuf); | |||
732 | return true; | |||
733 | } | |||
734 | ||||
735 | /// Match a vector binop or compare instruction with at least one inserted | |||
736 | /// scalar operand and convert to scalar binop/cmp followed by insertelement. | |||
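/// A minimal illustration with one inserted scalar and one constant operand
/// (constants are chosen arbitrarily):
///   %i = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i64 0
///   %r = add <4 x i32> %i, <i32 10, i32 20, i32 30, i32 40>
/// -->
///   %s = add i32 %x, 10
///   %r = insertelement <4 x i32> <i32 11, i32 22, i32 33, i32 44>, i32 %s, i64 0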
737 | bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { | |||
738 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; | |||
739 | Value *Ins0, *Ins1; | |||
740 | if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) && | |||
741 | !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1)))) | |||
742 | return false; | |||
743 | ||||
744 | // Do not convert the vector condition of a vector select into a scalar | |||
745 | // condition. That may cause problems for codegen because of differences in | |||
746 | // boolean formats and register-file transfers. | |||
747 | // TODO: Can we account for that in the cost model? | |||
748 | bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; | |||
749 | if (IsCmp) | |||
750 | for (User *U : I.users()) | |||
751 | if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value()))) | |||
752 | return false; | |||
753 | ||||
754 | // Match against one or both scalar values being inserted into constant | |||
755 | // vectors: | |||
756 | // vec_op VecC0, (inselt VecC1, V1, Index) | |||
757 | // vec_op (inselt VecC0, V0, Index), VecC1 | |||
758 | // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) | |||
759 | // TODO: Deal with mismatched index constants and variable indexes? | |||
760 | Constant *VecC0 = nullptr, *VecC1 = nullptr; | |||
761 | Value *V0 = nullptr, *V1 = nullptr; | |||
762 | uint64_t Index0 = 0, Index1 = 0; | |||
763 | if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0), | |||
764 | m_ConstantInt(Index0))) && | |||
765 | !match(Ins0, m_Constant(VecC0))) | |||
766 | return false; | |||
767 | if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1), | |||
768 | m_ConstantInt(Index1))) && | |||
769 | !match(Ins1, m_Constant(VecC1))) | |||
770 | return false; | |||
771 | ||||
772 | bool IsConst0 = !V0; | |||
773 | bool IsConst1 = !V1; | |||
774 | if (IsConst0 && IsConst1) | |||
775 | return false; | |||
776 | if (!IsConst0 && !IsConst1 && Index0 != Index1) | |||
777 | return false; | |||
778 | ||||
779 | // Bail for single insertion if it is a load. | |||
780 | // TODO: Handle this once getVectorInstrCost can cost for load/stores. | |||
781 | auto *I0 = dyn_cast_or_null<Instruction>(V0); | |||
782 | auto *I1 = dyn_cast_or_null<Instruction>(V1); | |||
783 | if ((IsConst0 && I1 && I1->mayReadFromMemory()) || | |||
784 | (IsConst1 && I0 && I0->mayReadFromMemory())) | |||
785 | return false; | |||
786 | ||||
787 | uint64_t Index = IsConst0 ? Index1 : Index0; | |||
788 | Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType(); | |||
789 | Type *VecTy = I.getType(); | |||
790 | assert(VecTy->isVectorTy() && | |||
791 | (IsConst0 || IsConst1 || V0->getType() == V1->getType()) && | |||
792 | (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() || | |||
793 | ScalarTy->isPointerTy()) && | |||
794 | "Unexpected types for insert element into binop or cmp"); | |||
795 | ||||
796 | unsigned Opcode = I.getOpcode(); | |||
797 | InstructionCost ScalarOpCost, VectorOpCost; | |||
798 | if (IsCmp) { | |||
799 | CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate(); | |||
800 | ScalarOpCost = TTI.getCmpSelInstrCost( | |||
801 | Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred); | |||
802 | VectorOpCost = TTI.getCmpSelInstrCost( | |||
803 | Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred); | |||
804 | } else { | |||
805 | ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy); | |||
806 | VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy); | |||
807 | } | |||
808 | ||||
809 | // Get cost estimate for the insert element. This cost will factor into | |||
810 | // both sequences. | |||
811 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
812 | InstructionCost InsertCost = TTI.getVectorInstrCost( | |||
813 | Instruction::InsertElement, VecTy, CostKind, Index); | |||
814 | InstructionCost OldCost = | |||
815 | (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; | |||
816 | InstructionCost NewCost = ScalarOpCost + InsertCost + | |||
817 | (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + | |||
818 | (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); | |||
819 | ||||
820 | // We want to scalarize unless the vector variant actually has lower cost. | |||
821 | if (OldCost < NewCost || !NewCost.isValid()) | |||
822 | return false; | |||
823 | ||||
824 | // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> | |||
825 | // inselt NewVecC, (scalar_op V0, V1), Index | |||
826 | if (IsCmp) | |||
827 | ++NumScalarCmp; | |||
828 | else | |||
829 | ++NumScalarBO; | |||
830 | ||||
831 | // For constant cases, extract the scalar element; this should constant fold. | |||
832 | if (IsConst0) | |||
833 | V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index)); | |||
834 | if (IsConst1) | |||
835 | V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index)); | |||
836 | ||||
837 | Value *Scalar = | |||
838 | IsCmp ? Builder.CreateCmp(Pred, V0, V1) | |||
839 | : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1); | |||
840 | ||||
841 | Scalar->setName(I.getName() + ".scalar"); | |||
842 | ||||
843 | // All IR flags are safe to back-propagate. There is no potential for extra | |||
844 | // poison to be created by the scalar instruction. | |||
845 | if (auto *ScalarInst = dyn_cast<Instruction>(Scalar)) | |||
846 | ScalarInst->copyIRFlags(&I); | |||
847 | ||||
848 | // Fold the vector constants in the original vectors into a new base vector. | |||
849 | Value *NewVecC = | |||
850 | IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1) | |||
851 | : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1); | |||
852 | Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index); | |||
853 | replaceValue(I, *Insert); | |||
854 | return true; | |||
855 | } | |||
856 | ||||
857 | /// Try to combine a scalar binop + 2 scalar compares of extracted elements of | |||
858 | /// a vector into vector operations followed by extract. Note: The SLP pass | |||
859 | /// may miss this pattern because of implementation problems. | |||
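/// A minimal illustration, assuming the extract at index 1 is the one chosen
/// to be rewritten as a shuffle (constants are chosen arbitrarily):
///   %e0 = extractelement <4 x i32> %x, i64 0
///   %e1 = extractelement <4 x i32> %x, i64 1
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 17
///   %r  = and i1 %c0, %c1
/// -->
///   %vc = icmp sgt <4 x i32> %x, <i32 42, i32 17, i32 undef, i32 undef>
///   %sh = shufflevector <4 x i1> %vc, <4 x i1> poison,
///                       <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
///   %va = and <4 x i1> %vc, %sh
///   %r  = extractelement <4 x i1> %va, i64 0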
860 | bool VectorCombine::foldExtractedCmps(Instruction &I) { | |||
861 | // We are looking for a scalar binop of booleans. | |||
862 | // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1) | |||
863 | if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1)) | |||
864 | return false; | |||
865 | ||||
866 | // The compare predicates should match, and each compare should have a | |||
867 | // constant operand. | |||
868 | // TODO: Relax the one-use constraints. | |||
869 | Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); | |||
870 | Instruction *I0, *I1; | |||
871 | Constant *C0, *C1; | |||
872 | CmpInst::Predicate P0, P1; | |||
873 | if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) || | |||
874 | !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) || | |||
875 | P0 != P1) | |||
876 | return false; | |||
877 | ||||
878 | // The compare operands must be extracts of the same vector with constant | |||
879 | // extract indexes. | |||
880 | // TODO: Relax the one-use constraints. | |||
881 | Value *X; | |||
882 | uint64_t Index0, Index1; | |||
883 | if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) || | |||
884 | !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))) | |||
885 | return false; | |||
886 | ||||
887 | auto *Ext0 = cast<ExtractElementInst>(I0); | |||
888 | auto *Ext1 = cast<ExtractElementInst>(I1); | |||
889 | ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1); | |||
890 | if (!ConvertToShuf) | |||
891 | return false; | |||
892 | ||||
893 | // The original scalar pattern is: | |||
894 | // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) | |||
895 | CmpInst::Predicate Pred = P0; | |||
896 | unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp | |||
897 | : Instruction::ICmp; | |||
898 | auto *VecTy = dyn_cast<FixedVectorType>(X->getType()); | |||
899 | if (!VecTy) | |||
900 | return false; | |||
901 | ||||
902 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
903 | InstructionCost OldCost = | |||
904 | TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); | |||
905 | OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); | |||
906 | OldCost += | |||
907 | TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), | |||
908 | CmpInst::makeCmpResultType(I0->getType()), Pred) * | |||
909 | 2; | |||
910 | OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); | |||
911 | ||||
912 | // The proposed vector pattern is: | |||
913 | // vcmp = cmp Pred X, VecC | |||
914 | // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0 | |||
915 | int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; | |||
916 | int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; | |||
917 | auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType())); | |||
918 | InstructionCost NewCost = TTI.getCmpSelInstrCost( | |||
919 | CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred); | |||
920 | SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem); | |||
921 | ShufMask[CheapIndex] = ExpensiveIndex; | |||
922 | NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, | |||
923 | ShufMask); | |||
924 | NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); | |||
925 | NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); | |||
926 | ||||
927 | // Aggressively form vector ops if the cost is equal because the transform | |||
928 | // may enable further optimization. | |||
929 | // Codegen can reverse this transform (scalarize) if it was not profitable. | |||
930 | if (OldCost < NewCost || !NewCost.isValid()) | |||
931 | return false; | |||
932 | ||||
933 | // Create a vector constant from the 2 scalar constants. | |||
934 | SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(), | |||
935 | UndefValue::get(VecTy->getElementType())); | |||
936 | CmpC[Index0] = C0; | |||
937 | CmpC[Index1] = C1; | |||
938 | Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC)); | |||
939 | ||||
940 | Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder); | |||
941 | Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(), | |||
942 | VCmp, Shuf); | |||
943 | Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex); | |||
944 | replaceValue(I, *NewExt); | |||
945 | ++NumVecCmpBO; | |||
946 | return true; | |||
947 | } | |||
948 | ||||
949 | // Check if the memory location is modified between two instructions in the same BB. | |||
950 | static bool isMemModifiedBetween(BasicBlock::iterator Begin, | |||
951 | BasicBlock::iterator End, | |||
952 | const MemoryLocation &Loc, AAResults &AA) { | |||
953 | unsigned NumScanned = 0; | |||
954 | return std::any_of(Begin, End, [&](const Instruction &Instr) { | |||
955 | return isModSet(AA.getModRefInfo(&Instr, Loc)) || | |||
956 | ++NumScanned > MaxInstrsToScan; | |||
957 | }); | |||
958 | } | |||
959 | ||||
960 | namespace { | |||
961 | /// Helper class to indicate whether a vector index can be safely scalarized and | |||
962 | /// if a freeze needs to be inserted. | |||
963 | class ScalarizationResult { | |||
964 | enum class StatusTy { Unsafe, Safe, SafeWithFreeze }; | |||
965 | ||||
966 | StatusTy Status; | |||
967 | Value *ToFreeze; | |||
968 | ||||
969 | ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr) | |||
970 | : Status(Status), ToFreeze(ToFreeze) {} | |||
971 | ||||
972 | public: | |||
973 | ScalarizationResult(const ScalarizationResult &Other) = default; | |||
974 | ~ScalarizationResult() { | |||
975 | assert(!ToFreeze && "freeze() not called with ToFreeze being set"); | |||
976 | } | |||
977 | ||||
978 | static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; } | |||
979 | static ScalarizationResult safe() { return {StatusTy::Safe}; } | |||
980 | static ScalarizationResult safeWithFreeze(Value *ToFreeze) { | |||
981 | return {StatusTy::SafeWithFreeze, ToFreeze}; | |||
982 | } | |||
983 | ||||
984 | /// Returns true if the index can be scalarized without requiring a freeze. | |||
985 | bool isSafe() const { return Status == StatusTy::Safe; } | |||
986 | /// Returns true if the index cannot be scalarized. | |||
987 | bool isUnsafe() const { return Status == StatusTy::Unsafe; } | |||
988 | /// Returns true if the index can be scalarized, but inserting a freeze is | |||
989 | /// required. | |||
990 | bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; } | |||
991 | ||||
992 | /// Reset the state to Unsafe and clear ToFreeze if it is set. | |||
993 | void discard() { | |||
994 | ToFreeze = nullptr; | |||
995 | Status = StatusTy::Unsafe; | |||
996 | } | |||
997 | ||||
998 | /// Freeze ToFreeze and update the use in \p UserI to use the frozen value. | |||
999 | void freeze(IRBuilder<> &Builder, Instruction &UserI) { | |||
1000 | assert(isSafeWithFreeze() && | |||
1001 | "should only be used when freezing is required"); | |||
1002 | assert(is_contained(ToFreeze->users(), &UserI) && | |||
1003 | "UserI must be a user of ToFreeze"); | |||
1004 | IRBuilder<>::InsertPointGuard Guard(Builder); | |||
1005 | Builder.SetInsertPoint(cast<Instruction>(&UserI)); | |||
1006 | Value *Frozen = | |||
1007 | Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen"); | |||
1008 | for (Use &U : make_early_inc_range((UserI.operands()))) | |||
1009 | if (U.get() == ToFreeze) | |||
1010 | U.set(Frozen); | |||
1011 | ||||
1012 | ToFreeze = nullptr; | |||
1013 | } | |||
1014 | }; | |||
1015 | } // namespace | |||
1016 | ||||
1017 | /// Check if it is legal to scalarize a memory access to \p VecTy at index \p | |||
1018 | /// Idx. \p Idx must access a valid vector element. | |||
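// Illustrative sketch (hypothetical values): for a <4 x i32> vector, an index
// such as
//   %masked = and i64 %i, 3
// is reported safe-with-freeze on %i: once %i is frozen, the 'and' clamps the
// index into the valid range [0, 4), so the scalarized access cannot go out of
// bounds even if %i was poison.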
1019 | static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy, | |||
1020 | Value *Idx, Instruction *CtxI, | |||
1021 | AssumptionCache &AC, | |||
1022 | const DominatorTree &DT) { | |||
1023 | if (auto *C = dyn_cast<ConstantInt>(Idx)) { | |||
1024 | if (C->getValue().ult(VecTy->getNumElements())) | |||
1025 | return ScalarizationResult::safe(); | |||
1026 | return ScalarizationResult::unsafe(); | |||
1027 | } | |||
1028 | ||||
1029 | unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); | |||
1030 | APInt Zero(IntWidth, 0); | |||
1031 | APInt MaxElts(IntWidth, VecTy->getNumElements()); | |||
1032 | ConstantRange ValidIndices(Zero, MaxElts); | |||
1033 | ConstantRange IdxRange(IntWidth, true); | |||
1034 | ||||
1035 | if (isGuaranteedNotToBePoison(Idx, &AC)) { | |||
1036 | if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false, | |||
1037 | true, &AC, CtxI, &DT))) | |||
1038 | return ScalarizationResult::safe(); | |||
1039 | return ScalarizationResult::unsafe(); | |||
1040 | } | |||
1041 | ||||
1042 | // If the index may be poison, check if we can insert a freeze before the | |||
1043 | // range of the index is restricted. | |||
1044 | Value *IdxBase; | |||
1045 | ConstantInt *CI; | |||
1046 | if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) { | |||
1047 | IdxRange = IdxRange.binaryAnd(CI->getValue()); | |||
1048 | } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) { | |||
1049 | IdxRange = IdxRange.urem(CI->getValue()); | |||
1050 | } | |||
1051 | ||||
1052 | if (ValidIndices.contains(IdxRange)) | |||
1053 | return ScalarizationResult::safeWithFreeze(IdxBase); | |||
1054 | return ScalarizationResult::unsafe(); | |||
1055 | } | |||
1056 | ||||
1057 | /// The memory operation on a vector of \p ScalarType had alignment of | |||
1058 | /// \p VectorAlignment. Compute the maximal, but conservatively correct, | |||
1059 | /// alignment that will be valid for the memory operation on a single scalar | |||
1060 | /// element of the same type with index \p Idx. | |||
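// Worked example (illustrative): scalarizing element 1 of a <4 x i32> access
// that had 'align 8' yields commonAlignment(8, 1 * 4) = align 4, element 2
// yields commonAlignment(8, 2 * 4) = align 8, and an unknown index
// conservatively yields commonAlignment(8, 4) = align 4.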
1061 | static Align computeAlignmentAfterScalarization(Align VectorAlignment, | |||
1062 | Type *ScalarType, Value *Idx, | |||
1063 | const DataLayout &DL) { | |||
1064 | if (auto *C = dyn_cast<ConstantInt>(Idx)) | |||
1065 | return commonAlignment(VectorAlignment, | |||
1066 | C->getZExtValue() * DL.getTypeStoreSize(ScalarType)); | |||
1067 | return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType)); | |||
1068 | } | |||
1069 | ||||
1070 | // Combine patterns like: | |||
1071 | // %0 = load <4 x i32>, <4 x i32>* %a | |||
1072 | // %1 = insertelement <4 x i32> %0, i32 %b, i32 1 | |||
1073 | // store <4 x i32> %1, <4 x i32>* %a | |||
1074 | // to: | |||
1075 | // %0 = bitcast <4 x i32>* %a to i32* | |||
1076 | // %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1 | |||
1077 | // store i32 %b, i32* %1 | |||
1078 | bool VectorCombine::foldSingleElementStore(Instruction &I) { | |||
1079 | auto *SI = cast<StoreInst>(&I); | |||
1080 | if (!SI->isSimple() || | |||
1081 | !isa<FixedVectorType>(SI->getValueOperand()->getType())) | |||
1082 | return false; | |||
1083 | ||||
1084 | // TODO: Combine more complicated patterns (multiple insert) by referencing | |||
1085 | // TargetTransformInfo. | |||
1086 | Instruction *Source; | |||
1087 | Value *NewElement; | |||
1088 | Value *Idx; | |||
1089 | if (!match(SI->getValueOperand(), | |||
1090 | m_InsertElt(m_Instruction(Source), m_Value(NewElement), | |||
1091 | m_Value(Idx)))) | |||
1092 | return false; | |||
1093 | ||||
1094 | if (auto *Load = dyn_cast<LoadInst>(Source)) { | |||
1095 | auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType()); | |||
1096 | const DataLayout &DL = I.getModule()->getDataLayout(); | |||
1097 | Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts(); | |||
1098 | // Don't optimize for atomic/volatile loads or stores. Ensure memory is not modified | |||
1099 | // between the load and the store, the vector type's size equals its store size, and the index is in bounds. | |||
1100 | if (!Load->isSimple() || Load->getParent() != SI->getParent() || | |||
1101 | !DL.typeSizeEqualsStoreSize(Load->getType()) || | |||
1102 | SrcAddr != SI->getPointerOperand()->stripPointerCasts()) | |||
1103 | return false; | |||
1104 | ||||
1105 | auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT); | |||
1106 | if (ScalarizableIdx.isUnsafe() || | |||
1107 | isMemModifiedBetween(Load->getIterator(), SI->getIterator(), | |||
1108 | MemoryLocation::get(SI), AA)) | |||
1109 | return false; | |||
1110 | ||||
1111 | if (ScalarizableIdx.isSafeWithFreeze()) | |||
1112 | ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx)); | |||
1113 | Value *GEP = Builder.CreateInBoundsGEP( | |||
1114 | SI->getValueOperand()->getType(), SI->getPointerOperand(), | |||
1115 | {ConstantInt::get(Idx->getType(), 0), Idx}); | |||
1116 | StoreInst *NSI = Builder.CreateStore(NewElement, GEP); | |||
1117 | NSI->copyMetadata(*SI); | |||
1118 | Align ScalarOpAlignment = computeAlignmentAfterScalarization( | |||
1119 | std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx, | |||
1120 | DL); | |||
1121 | NSI->setAlignment(ScalarOpAlignment); | |||
1122 | replaceValue(I, *NSI); | |||
1123 | eraseInstruction(I); | |||
1124 | return true; | |||
1125 | } | |||
1126 | ||||
1127 | return false; | |||
1128 | } | |||
1129 | ||||
1130 | /// Try to scalarize vector loads feeding extractelement instructions. | |||
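// Illustrative sketch (hypothetical values), in the same spirit as the single
// element store fold above:
//   %w = load <4 x i32>, <4 x i32>* %p
//   %e = extractelement <4 x i32> %w, i32 2
// -->
//   %gep = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 0, i32 2
//   %e   = load i32, i32* %gep
// subject to the cost model, the index being known in bounds, and no writes to
// memory between the load and the extracts.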
1131 | bool VectorCombine::scalarizeLoadExtract(Instruction &I) { | |||
1132 | Value *Ptr; | |||
1133 | if (!match(&I, m_Load(m_Value(Ptr)))) | |||
1134 | return false; | |||
1135 | ||||
1136 | auto *FixedVT = cast<FixedVectorType>(I.getType()); | |||
1137 | auto *LI = cast<LoadInst>(&I); | |||
1138 | const DataLayout &DL = I.getModule()->getDataLayout(); | |||
1139 | if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT)) | |||
1140 | return false; | |||
1141 | ||||
1142 | InstructionCost OriginalCost = | |||
1143 | TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(), | |||
1144 | LI->getPointerAddressSpace()); | |||
1145 | InstructionCost ScalarizedCost = 0; | |||
1146 | ||||
1147 | Instruction *LastCheckedInst = LI; | |||
1148 | unsigned NumInstChecked = 0; | |||
1149 | // Check if all users of the load are extracts with no memory modifications | |||
1150 | // between the load and the extract. Compute the cost of both the original | |||
1151 | // code and the scalarized version. | |||
1152 | for (User *U : LI->users()) { | |||
1153 | auto *UI = dyn_cast<ExtractElementInst>(U); | |||
1154 | if (!UI || UI->getParent() != LI->getParent()) | |||
1155 | return false; | |||
1156 | ||||
1157 | if (!isGuaranteedNotToBePoison(UI->getOperand(1), &AC, LI, &DT)) | |||
1158 | return false; | |||
1159 | ||||
1160 | // Check if any instruction between the load and the extract may modify | |||
1161 | // memory. | |||
1162 | if (LastCheckedInst->comesBefore(UI)) { | |||
1163 | for (Instruction &I : | |||
1164 | make_range(std::next(LI->getIterator()), UI->getIterator())) { | |||
1165 | // Bail out if we reached the check limit or the instruction may write | |||
1166 | // to memory. | |||
1167 | if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory()) | |||
1168 | return false; | |||
1169 | NumInstChecked++; | |||
1170 | } | |||
1171 | LastCheckedInst = UI; | |||
1172 | } | |||
1173 | ||||
1174 | auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT); | |||
1175 | if (!ScalarIdx.isSafe()) { | |||
1176 | // TODO: Freeze index if it is safe to do so. | |||
1177 | ScalarIdx.discard(); | |||
1178 | return false; | |||
1179 | } | |||
1180 | ||||
1181 | auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); | |||
1182 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
1183 | OriginalCost += | |||
1184 | TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind, | |||
1185 | Index ? Index->getZExtValue() : -1); | |||
1186 | ScalarizedCost += | |||
1187 | TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(), | |||
1188 | Align(1), LI->getPointerAddressSpace()); | |||
1189 | ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType()); | |||
1190 | } | |||
1191 | ||||
1192 | if (ScalarizedCost >= OriginalCost) | |||
1193 | return false; | |||
1194 | ||||
1195 | // Replace extracts with narrow scalar loads. | |||
1196 | for (User *U : LI->users()) { | |||
1197 | auto *EI = cast<ExtractElementInst>(U); | |||
1198 | Builder.SetInsertPoint(EI); | |||
1199 | ||||
1200 | Value *Idx = EI->getOperand(1); | |||
1201 | Value *GEP = | |||
1202 | Builder.CreateInBoundsGEP(FixedVT, Ptr, {Builder.getInt32(0), Idx}); | |||
1203 | auto *NewLoad = cast<LoadInst>(Builder.CreateLoad( | |||
1204 | FixedVT->getElementType(), GEP, EI->getName() + ".scalar")); | |||
1205 | ||||
1206 | Align ScalarOpAlignment = computeAlignmentAfterScalarization( | |||
1207 | LI->getAlign(), FixedVT->getElementType(), Idx, DL); | |||
1208 | NewLoad->setAlignment(ScalarOpAlignment); | |||
1209 | ||||
1210 | replaceValue(*EI, *NewLoad); | |||
1211 | } | |||
1212 | ||||
1213 | return true; | |||
1214 | } | |||
1215 | ||||
1216 | /// Try to convert "shuffle (binop), (binop)" with a shared binop operand into | |||
1217 | /// "binop (shuffle), (shuffle)". | |||
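// Illustrative sketch (hypothetical values; Mask and UnaryMask are
// placeholders, with UnaryMask the single-source form of Mask):
//   %a = add <4 x i32> %x, %y
//   %b = add <4 x i32> %x, %w
//   %s = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> Mask
// -->
//   %sx = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> UnaryMask
//   %sw = shufflevector <4 x i32> %y, <4 x i32> %w, <4 x i32> Mask
//   %s  = add <4 x i32> %sx, %sw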
1218 | bool VectorCombine::foldShuffleOfBinops(Instruction &I) { | |||
1219 | auto *VecTy = cast<FixedVectorType>(I.getType()); | |||
1220 | BinaryOperator *B0, *B1; | |||
1221 | ArrayRef<int> Mask; | |||
1222 | if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)), | |||
1223 | m_Mask(Mask))) || | |||
1224 | B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy) | |||
1225 | return false; | |||
1226 | ||||
1227 | // Try to replace a binop with a shuffle if the shuffle is not costly. | |||
1228 | // The new shuffle will choose from a single, common operand, so it may be | |||
1229 | // cheaper than the existing two-operand shuffle. | |||
1230 | SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size()); | |||
1231 | Instruction::BinaryOps Opcode = B0->getOpcode(); | |||
1232 | InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy); | |||
1233 | InstructionCost ShufCost = TTI.getShuffleCost( | |||
1234 | TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask); | |||
1235 | if (ShufCost > BinopCost) | |||
1236 | return false; | |||
1237 | ||||
1238 | // If we have something like "add X, Y" and "add Z, X", swap ops to match. | |||
1239 | Value *X = B0->getOperand(0), *Y = B0->getOperand(1); | |||
1240 | Value *Z = B1->getOperand(0), *W = B1->getOperand(1); | |||
1241 | if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W) | |||
1242 | std::swap(X, Y); | |||
1243 | ||||
1244 | Value *Shuf0, *Shuf1; | |||
1245 | if (X == Z) { | |||
1246 | // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W) | |||
1247 | Shuf0 = Builder.CreateShuffleVector(X, UnaryMask); | |||
1248 | Shuf1 = Builder.CreateShuffleVector(Y, W, Mask); | |||
1249 | } else if (Y == W) { | |||
1250 | // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y) | |||
1251 | Shuf0 = Builder.CreateShuffleVector(X, Z, Mask); | |||
1252 | Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask); | |||
1253 | } else { | |||
1254 | return false; | |||
1255 | } | |||
1256 | ||||
1257 | Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); | |||
1258 | // Intersect flags from the old binops. | |||
1259 | if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { | |||
1260 | NewInst->copyIRFlags(B0); | |||
1261 | NewInst->andIRFlags(B1); | |||
1262 | } | |||
1263 | replaceValue(I, *NewBO); | |||
1264 | return true; | |||
1265 | } | |||
1266 | ||||
1267 | /// Given a commutative reduction, the order of the input lanes does not alter | |||
1268 | /// the results. We can use this to remove certain shuffles feeding the | |||
1269 | /// reduction, removing the need to shuffle at all. | |||
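// Illustrative sketch (hypothetical values): if the only non-splat value
// feeding
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
// is
//   %s = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// then sorting the mask gives the identity <0, 1, 2, 3>, and the reordered
// shuffle can later be folded away without changing the reduction result.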
1270 | bool VectorCombine::foldShuffleFromReductions(Instruction &I) { | |||
1271 | auto *II = dyn_cast<IntrinsicInst>(&I); | |||
1272 | if (!II) | |||
1273 | return false; | |||
1274 | switch (II->getIntrinsicID()) { | |||
1275 | case Intrinsic::vector_reduce_add: | |||
1276 | case Intrinsic::vector_reduce_mul: | |||
1277 | case Intrinsic::vector_reduce_and: | |||
1278 | case Intrinsic::vector_reduce_or: | |||
1279 | case Intrinsic::vector_reduce_xor: | |||
1280 | case Intrinsic::vector_reduce_smin: | |||
1281 | case Intrinsic::vector_reduce_smax: | |||
1282 | case Intrinsic::vector_reduce_umin: | |||
1283 | case Intrinsic::vector_reduce_umax: | |||
1284 | break; | |||
1285 | default: | |||
1286 | return false; | |||
1287 | } | |||
1288 | ||||
1289 | // Find all the inputs when looking through operations that do not alter the | |||
1290 | // lane order (binops, for example). Currently we look for a single shuffle, | |||
1291 | // and can ignore splat values. | |||
1292 | std::queue<Value *> Worklist; | |||
1293 | SmallPtrSet<Value *, 4> Visited; | |||
1294 | ShuffleVectorInst *Shuffle = nullptr; | |||
1295 | if (auto *Op = dyn_cast<Instruction>(I.getOperand(0))) | |||
1296 | Worklist.push(Op); | |||
1297 | ||||
1298 | while (!Worklist.empty()) { | |||
1299 | Value *CV = Worklist.front(); | |||
1300 | Worklist.pop(); | |||
1301 | if (Visited.contains(CV)) | |||
1302 | continue; | |||
1303 | ||||
1304 | // Splats don't change the order, so can be safely ignored. | |||
1305 | if (isSplatValue(CV)) | |||
1306 | continue; | |||
1307 | ||||
1308 | Visited.insert(CV); | |||
1309 | ||||
1310 | if (auto *CI = dyn_cast<Instruction>(CV)) { | |||
1311 | if (CI->isBinaryOp()) { | |||
1312 | for (auto *Op : CI->operand_values()) | |||
1313 | Worklist.push(Op); | |||
1314 | continue; | |||
1315 | } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) { | |||
1316 | if (Shuffle && Shuffle != SV) | |||
1317 | return false; | |||
1318 | Shuffle = SV; | |||
1319 | continue; | |||
1320 | } | |||
1321 | } | |||
1322 | ||||
1323 | // Anything else is currently an unknown node. | |||
1324 | return false; | |||
1325 | } | |||
1326 | ||||
1327 | if (!Shuffle) | |||
1328 | return false; | |||
1329 | ||||
1330 | // Check that all uses of the binary ops and shuffles are also included in | |||
1331 | // the lane-invariant operations (Visited should contain all of the lanewise | |||
1332 | // instructions, including the shuffle that we found). | |||
1333 | for (auto *V : Visited) | |||
1334 | for (auto *U : V->users()) | |||
1335 | if (!Visited.contains(U) && U != &I) | |||
1336 | return false; | |||
1337 | ||||
1338 | FixedVectorType *VecType = | |||
1339 | dyn_cast<FixedVectorType>(II->getOperand(0)->getType()); | |||
1340 | if (!VecType) | |||
1341 | return false; | |||
1342 | FixedVectorType *ShuffleInputType = | |||
1343 | dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType()); | |||
1344 | if (!ShuffleInputType) | |||
1345 | return false; | |||
1346 | int NumInputElts = ShuffleInputType->getNumElements(); | |||
1347 | ||||
1348 | // Find the mask from sorting the lanes into order. This is most likely to | |||
1349 | // become an identity or concat mask. Undef elements are pushed to the end. | |||
1350 | SmallVector<int> ConcatMask; | |||
1351 | Shuffle->getShuffleMask(ConcatMask); | |||
1352 | sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; }); | |||
1353 | bool UsesSecondVec = | |||
1354 | any_of(ConcatMask, [&](int M) { return M >= NumInputElts; }); | |||
1355 | InstructionCost OldCost = TTI.getShuffleCost( | |||
1356 | UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, | |||
1357 | Shuffle->getShuffleMask()); | |||
1358 | InstructionCost NewCost = TTI.getShuffleCost( | |||
1359 | UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType, | |||
1360 | ConcatMask); | |||
1361 | ||||
1362 | LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle | |||
1363 | << "\n"); | |||
1364 | LLVM_DEBUG(dbgs() << "  OldCost: " << OldCost << " vs NewCost: " << NewCost | |||
1365 | << "\n"); | |||
1366 | if (NewCost < OldCost) { | |||
1367 | Builder.SetInsertPoint(Shuffle); | |||
1368 | Value *NewShuffle = Builder.CreateShuffleVector( | |||
1369 | Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask); | |||
1370 | LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n"); | |||
1371 | replaceValue(*Shuffle, *NewShuffle); | |||
1372 | } | |||
1373 | ||||
1374 | // See if we can re-use foldSelectShuffle, getting it to reduce the size of | |||
1375 | // the shuffle into a nicer order, as it can ignore the order of the shuffles. | |||
1376 | return foldSelectShuffle(*Shuffle, true); | |||
1377 | } | |||
1378 | ||||
1379 | /// This method looks for groups of shuffles acting on binops, of the form: | |||
1380 | /// %x = shuffle ... | |||
1381 | /// %y = shuffle ... | |||
1382 | /// %a = binop %x, %y | |||
1383 | /// %b = binop %x, %y | |||
1384 | /// shuffle %a, %b, selectmask | |||
1385 | /// We may, especially if the shuffle is wider than legal, be able to convert | |||
1386 | /// the shuffle to a form where only parts of a and b need to be computed. On | |||
1387 | /// architectures with no obvious "select" shuffle, this can reduce the total | |||
1388 | /// number of operations if the target reports them as cheaper. | |||
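// For example (illustrative): with <8 x i32> operands and a select mask that
// takes lanes 0-3 from %a and lanes 4-7 from %b, only four lanes of each new
// binop are used, so once the type is legalized to smaller vectors the target
// effectively performs a <4 x i32>'s worth of work per binop.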
1389 | bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { | |||
1390 | auto *SVI = cast<ShuffleVectorInst>(&I); | |||
1391 | auto *VT = cast<FixedVectorType>(I.getType()); | |||
1392 | auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0)); | |||
1393 | auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1)); | |||
1394 | if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || | |||
1395 | VT != Op0->getType()) | |||
1396 | return false; | |||
1397 | ||||
1398 | auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0)); | |||
1399 | auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1)); | |||
1400 | auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0)); | |||
1401 | auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1)); | |||
1402 | SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); | |||
1403 | auto checkSVNonOpUses = [&](Instruction *I) { | |||
1404 | if (!I || I->getOperand(0)->getType() != VT) | |||
1405 | return true; | |||
1406 | return any_of(I->users(), [&](User *U) { | |||
1407 | return U != Op0 && U != Op1 && | |||
1408 | !(isa<ShuffleVectorInst>(U) && | |||
1409 | (InputShuffles.contains(cast<Instruction>(U)) || | |||
1410 | isInstructionTriviallyDead(cast<Instruction>(U)))); | |||
1411 | }); | |||
1412 | }; | |||
1413 | if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || | |||
1414 | checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) | |||
1415 | return false; | |||
1416 | ||||
1417 | // Collect all the uses that are shuffles that we can transform together. We | |||
1418 | // may not have a single shuffle, but a group that can all be transformed | |||
1419 | // together profitably. | |||
1420 | SmallVector<ShuffleVectorInst *> Shuffles; | |||
1421 | auto collectShuffles = [&](Instruction *I) { | |||
1422 | for (auto *U : I->users()) { | |||
1423 | auto *SV = dyn_cast<ShuffleVectorInst>(U); | |||
1424 | if (!SV || SV->getType() != VT) | |||
1425 | return false; | |||
1426 | if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) || | |||
1427 | (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1)) | |||
1428 | return false; | |||
1429 | if (!llvm::is_contained(Shuffles, SV)) | |||
1430 | Shuffles.push_back(SV); | |||
1431 | } | |||
1432 | return true; | |||
1433 | }; | |||
1434 | if (!collectShuffles(Op0) || !collectShuffles(Op1)) | |||
1435 | return false; | |||
1436 | // From a reduction, we need to be processing a single shuffle, otherwise the | |||
1437 | // other uses will not be lane-invariant. | |||
1438 | if (FromReduction && Shuffles.size() > 1) | |||
1439 | return false; | |||
1440 | ||||
1441 | // Add any shuffle uses for the shuffles we have found, to include them in our | |||
1442 | // cost calculations. | |||
1443 | if (!FromReduction) { | |||
1444 | for (ShuffleVectorInst *SV : Shuffles) { | |||
1445 | for (auto *U : SV->users()) { | |||
1446 | ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U); | |||
1447 | if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT) | |||
1448 | Shuffles.push_back(SSV); | |||
1449 | } | |||
1450 | } | |||
1451 | } | |||
1452 | ||||
1453 | // For each of the output shuffles, we try to sort all the first vector | |||
1454 | // elements to the beginning, followed by the second vector elements at the | |||
1455 | // end. If the binops are legalized to smaller vectors, this may reduce the | |||
1456 | // total number of binops. We compute the ReconstructMask needed to convert | |||
1457 | // back to the original lane order. | |||
1458 | SmallVector<std::pair<int, int>> V1, V2; | |||
1459 | SmallVector<SmallVector<int>> OrigReconstructMasks; | |||
1460 | int MaxV1Elt = 0, MaxV2Elt = 0; | |||
1461 | unsigned NumElts = VT->getNumElements(); | |||
1462 | for (ShuffleVectorInst *SVN : Shuffles) { | |||
1463 | SmallVector<int> Mask; | |||
1464 | SVN->getShuffleMask(Mask); | |||
1465 | ||||
1466 | // Check the operands are the same as the original, or reversed (in which | |||
1467 | // case we need to commute the mask). | |||
1468 | Value *SVOp0 = SVN->getOperand(0); | |||
1469 | Value *SVOp1 = SVN->getOperand(1); | |||
1470 | if (isa<UndefValue>(SVOp1)) { | |||
1471 | auto *SSV = cast<ShuffleVectorInst>(SVOp0); | |||
1472 | SVOp0 = SSV->getOperand(0); | |||
1473 | SVOp1 = SSV->getOperand(1); | |||
1474 | for (unsigned I = 0, E = Mask.size(); I != E; I++) { | |||
1475 | if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size())) | |||
1476 | return false; | |||
1477 | Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]); | |||
1478 | } | |||
1479 | } | |||
1480 | if (SVOp0 == Op1 && SVOp1 == Op0) { | |||
1481 | std::swap(SVOp0, SVOp1); | |||
1482 | ShuffleVectorInst::commuteShuffleMask(Mask, NumElts); | |||
1483 | } | |||
1484 | if (SVOp0 != Op0 || SVOp1 != Op1) | |||
1485 | return false; | |||
1486 | ||||
1487 | // Calculate the reconstruction mask for this shuffle: the mask needed to | |||
1488 | // take the packed values from Op0/Op1 and reconstruct the original lane | |||
1489 | // order. | |||
1490 | SmallVector<int> ReconstructMask; | |||
1491 | for (unsigned I = 0; I < Mask.size(); I++) { | |||
1492 | if (Mask[I] < 0) { | |||
1493 | ReconstructMask.push_back(-1); | |||
1494 | } else if (Mask[I] < static_cast<int>(NumElts)) { | |||
1495 | MaxV1Elt = std::max(MaxV1Elt, Mask[I]); | |||
1496 | auto It = find_if(V1, [&](const std::pair<int, int> &A) { | |||
1497 | return Mask[I] == A.first; | |||
1498 | }); | |||
1499 | if (It != V1.end()) | |||
1500 | ReconstructMask.push_back(It - V1.begin()); | |||
1501 | else { | |||
1502 | ReconstructMask.push_back(V1.size()); | |||
1503 | V1.emplace_back(Mask[I], V1.size()); | |||
1504 | } | |||
1505 | } else { | |||
1506 | MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts); | |||
1507 | auto It = find_if(V2, [&](const std::pair<int, int> &A) { | |||
1508 | return Mask[I] - static_cast<int>(NumElts) == A.first; | |||
1509 | }); | |||
1510 | if (It != V2.end()) | |||
1511 | ReconstructMask.push_back(NumElts + It - V2.begin()); | |||
1512 | else { | |||
1513 | ReconstructMask.push_back(NumElts + V2.size()); | |||
1514 | V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size()); | |||
1515 | } | |||
1516 | } | |||
1517 | } | |||
1518 | ||||
1519 | // For reductions, we know that the output lane ordering doesn't alter the | |||
1520 | // result. An in-order mask can help simplify the shuffle away. | |||
1521 | if (FromReduction) | |||
1522 | sort(ReconstructMask); | |||
1523 | OrigReconstructMasks.push_back(std::move(ReconstructMask)); | |||
1524 | } | |||
1525 | ||||
1526 | // If the maximum elements used from V1 and V2 are not larger than the new | |||
1527 | // vectors, the vectors are already packed and performing the optimization | |||
1528 | // again will likely not help any further. This also prevents us from getting | |||
1529 | // stuck in a cycle in case the costs do not also rule it out. | |||
1530 | if (V1.empty() || V2.empty() || | |||
1531 | (MaxV1Elt == static_cast<int>(V1.size()) - 1 && | |||
1532 | MaxV2Elt == static_cast<int>(V2.size()) - 1)) | |||
1533 | return false; | |||
1534 | ||||
1535 | // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a | |||
1536 | // shuffle of another shuffle, or not a shuffle (which is treated like an | |||
1537 | // identity shuffle). | |||
1538 | auto GetBaseMaskValue = [&](Instruction *I, int M) { | |||
1539 | auto *SV = dyn_cast<ShuffleVectorInst>(I); | |||
1540 | if (!SV) | |||
1541 | return M; | |||
1542 | if (isa<UndefValue>(SV->getOperand(1))) | |||
1543 | if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) | |||
1544 | if (InputShuffles.contains(SSV)) | |||
1545 | return SSV->getMaskValue(SV->getMaskValue(M)); | |||
1546 | return SV->getMaskValue(M); | |||
1547 | }; | |||
1548 | ||||
1549 | // Attempt to sort the inputs by ascending mask values to make simpler input | |||
1550 | // shuffles and push complex shuffles down to the uses. We sort on the first | |||
1551 | // of the two input shuffle orders, to try and get at least one input into a | |||
1552 | // nice order. | |||
1553 | auto SortBase = [&](Instruction *A, std::pair<int, int> X, | |||
1554 | std::pair<int, int> Y) { | |||
1555 | int MXA = GetBaseMaskValue(A, X.first); | |||
1556 | int MYA = GetBaseMaskValue(A, Y.first); | |||
1557 | return MXA < MYA; | |||
1558 | }; | |||
1559 | stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) { | |||
1560 | return SortBase(SVI0A, A, B); | |||
1561 | }); | |||
1562 | stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) { | |||
1563 | return SortBase(SVI1A, A, B); | |||
1564 | }); | |||
1565 | // Calculate our ReconstructMasks from the OrigReconstructMasks and the | |||
1566 | // modified order of the input shuffles. | |||
1567 | SmallVector<SmallVector<int>> ReconstructMasks; | |||
1568 | for (auto Mask : OrigReconstructMasks) { | |||
1569 | SmallVector<int> ReconstructMask; | |||
1570 | for (int M : Mask) { | |||
1571 | auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { | |||
1572 | auto It = find_if(V, [M](auto A) { return A.second == M; }); | |||
1573 | assert(It != V.end() && "Expected all entries in Mask"); | |||
1574 | return std::distance(V.begin(), It); | |||
1575 | }; | |||
1576 | if (M < 0) | |||
1577 | ReconstructMask.push_back(-1); | |||
1578 | else if (M < static_cast<int>(NumElts)) { | |||
1579 | ReconstructMask.push_back(FindIndex(V1, M)); | |||
1580 | } else { | |||
1581 | ReconstructMask.push_back(NumElts + FindIndex(V2, M)); | |||
1582 | } | |||
1583 | } | |||
1584 | ReconstructMasks.push_back(std::move(ReconstructMask)); | |||
1585 | } | |||
1586 | ||||
1587 | // Calculate the masks needed for the new input shuffles, which get padded | |||
1588 | // with undef | |||
1589 | SmallVector<int> V1A, V1B, V2A, V2B; | |||
1590 | for (unsigned I = 0; I < V1.size(); I++) { | |||
1591 | V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first)); | |||
1592 | V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first)); | |||
1593 | } | |||
1594 | for (unsigned I = 0; I < V2.size(); I++) { | |||
1595 | V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first)); | |||
1596 | V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first)); | |||
1597 | } | |||
1598 | while (V1A.size() < NumElts) { | |||
1599 | V1A.push_back(UndefMaskElem); | |||
1600 | V1B.push_back(UndefMaskElem); | |||
1601 | } | |||
1602 | while (V2A.size() < NumElts) { | |||
1603 | V2A.push_back(UndefMaskElem); | |||
1604 | V2B.push_back(UndefMaskElem); | |||
1605 | } | |||
1606 | ||||
1607 | auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { | |||
1608 | auto *SV = dyn_cast<ShuffleVectorInst>(I); | |||
1609 | if (!SV) | |||
1610 | return C; | |||
1611 | return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1)) | |||
1612 | ? TTI::SK_PermuteSingleSrc | |||
1613 | : TTI::SK_PermuteTwoSrc, | |||
1614 | VT, SV->getShuffleMask()); | |||
1615 | }; | |||
1616 | auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { | |||
1617 | return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask); | |||
1618 | }; | |||
1619 | ||||
1620 | // Get the costs of the shuffles + binops before and after with the new | |||
1621 | // shuffle masks. | |||
1622 | InstructionCost CostBefore = | |||
1623 | TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) + | |||
1624 | TTI.getArithmeticInstrCost(Op1->getOpcode(), VT); | |||
1625 | CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(), | |||
1626 | InstructionCost(0), AddShuffleCost); | |||
1627 | CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(), | |||
1628 | InstructionCost(0), AddShuffleCost); | |||
1629 | ||||
1630 | // The new binops will be unused for lanes past the used shuffle lengths. | |||
1631 | // These types attempt to get the correct cost for that from the target. | |||
1632 | FixedVectorType *Op0SmallVT = | |||
1633 | FixedVectorType::get(VT->getScalarType(), V1.size()); | |||
1634 | FixedVectorType *Op1SmallVT = | |||
1635 | FixedVectorType::get(VT->getScalarType(), V2.size()); | |||
1636 | InstructionCost CostAfter = | |||
1637 | TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) + | |||
1638 | TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT); | |||
1639 | CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(), | |||
1640 | InstructionCost(0), AddShuffleMaskCost); | |||
1641 | std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); | |||
1642 | CostAfter += | |||
1643 | std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(), | |||
1644 | InstructionCost(0), AddShuffleMaskCost); | |||
1645 | ||||
1646 | LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n"); | |||
1647 | LLVM_DEBUG(dbgs() << "  CostBefore: " << CostBefore | |||
1648 | << " vs CostAfter: " << CostAfter << "\n"); | |||
1649 | if (CostBefore <= CostAfter) | |||
1650 | return false; | |||
1651 | ||||
1652 | // The cost model has passed, create the new instructions. | |||
1653 | auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { | |||
1654 | auto *SV = dyn_cast<ShuffleVectorInst>(I); | |||
1655 | if (!SV) | |||
1656 | return I; | |||
1657 | if (isa<UndefValue>(SV->getOperand(1))) | |||
1658 | if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0))) | |||
1659 | if (InputShuffles.contains(SSV)) | |||
1660 | return SSV->getOperand(Op); | |||
1661 | return SV->getOperand(Op); | |||
1662 | }; | |||
1663 | Builder.SetInsertPoint(SVI0A->getNextNode()); | |||
1664 | Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0), | |||
1665 | GetShuffleOperand(SVI0A, 1), V1A); | |||
1666 | Builder.SetInsertPoint(SVI0B->getNextNode()); | |||
1667 | Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0), | |||
1668 | GetShuffleOperand(SVI0B, 1), V1B); | |||
1669 | Builder.SetInsertPoint(SVI1A->getNextNode()); | |||
1670 | Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0), | |||
1671 | GetShuffleOperand(SVI1A, 1), V2A); | |||
1672 | Builder.SetInsertPoint(SVI1B->getNextNode()); | |||
1673 | Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0), | |||
1674 | GetShuffleOperand(SVI1B, 1), V2B); | |||
1675 | Builder.SetInsertPoint(Op0); | |||
1676 | Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(), | |||
1677 | NSV0A, NSV0B); | |||
1678 | if (auto *I = dyn_cast<Instruction>(NOp0)) | |||
1679 | I->copyIRFlags(Op0, true); | |||
1680 | Builder.SetInsertPoint(Op1); | |||
1681 | Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(), | |||
1682 | NSV1A, NSV1B); | |||
1683 | if (auto *I = dyn_cast<Instruction>(NOp1)) | |||
1684 | I->copyIRFlags(Op1, true); | |||
1685 | ||||
1686 | for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { | |||
1687 | Builder.SetInsertPoint(Shuffles[S]); | |||
1688 | Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); | |||
1689 | replaceValue(*Shuffles[S], *NSV); | |||
1690 | } | |||
1691 | ||||
1692 | Worklist.pushValue(NSV0A); | |||
1693 | Worklist.pushValue(NSV0B); | |||
1694 | Worklist.pushValue(NSV1A); | |||
1695 | Worklist.pushValue(NSV1B); | |||
1696 | for (auto *S : Shuffles) | |||
1697 | Worklist.add(S); | |||
1698 | return true; | |||
1699 | } | |||
1700 | ||||
1701 | /// This is the entry point for all transforms. Pass manager differences are | |||
1702 | /// handled in the callers of this function. | |||
1703 | bool VectorCombine::run() { | |||
1704 | if (DisableVectorCombine) | |||
1705 | return false; | |||
1706 | ||||
1707 | // Don't attempt vectorization if the target does not support vectors. | |||
1708 | if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true))) | |||
1709 | return false; | |||
1710 | ||||
1711 | bool MadeChange = false; | |||
1712 | auto FoldInst = [this, &MadeChange](Instruction &I) { | |||
1713 | Builder.SetInsertPoint(&I); | |||
1714 | bool IsFixedVectorType = isa<FixedVectorType>(I.getType()); | |||
1715 | auto Opcode = I.getOpcode(); | |||
1716 | ||||
1717 | // These folds should be beneficial regardless of when this pass is run | |||
1718 | // in the optimization pipeline. | |||
1719 | // The type checking is for run-time efficiency. We can avoid wasting time | |||
1720 | // dispatching to folding functions if there's no chance of matching. | |||
1721 | if (IsFixedVectorType) { | |||
1722 | switch (Opcode) { | |||
1723 | case Instruction::InsertElement: | |||
1724 | MadeChange |= vectorizeLoadInsert(I); | |||
1725 | break; | |||
1726 | case Instruction::ShuffleVector: | |||
1727 | MadeChange |= widenSubvectorLoad(I); | |||
1728 | break; | |||
1729 | case Instruction::Load: | |||
1730 | MadeChange |= scalarizeLoadExtract(I); | |||
1731 | break; | |||
1732 | default: | |||
1733 | break; | |||
1734 | } | |||
1735 | } | |||
1736 | ||||
1737 | // This transform works with scalable and fixed vectors | |||
1738 | // TODO: Identify and allow other scalable transforms | |||
1739 | if (isa<VectorType>(I.getType())) | |||
1740 | MadeChange |= scalarizeBinopOrCmp(I); | |||
1741 | ||||
1742 | if (Opcode == Instruction::Store) | |||
1743 | MadeChange |= foldSingleElementStore(I); | |||
1744 | ||||
1745 | ||||
1746 | // If this is an early pipeline invocation of this pass, we are done. | |||
1747 | if (TryEarlyFoldsOnly) | |||
1748 | return; | |||
1749 | ||||
1750 | // Otherwise, try folds that improve codegen but may interfere with | |||
1751 | // early IR canonicalizations. | |||
1752 | // The type checking is for run-time efficiency. We can avoid wasting time | |||
1753 | // dispatching to folding functions if there's no chance of matching. | |||
1754 | if (IsFixedVectorType) { | |||
1755 | switch (Opcode) { | |||
1756 | case Instruction::InsertElement: | |||
1757 | MadeChange |= foldInsExtFNeg(I); | |||
1758 | break; | |||
1759 | case Instruction::ShuffleVector: | |||
1760 | MadeChange |= foldShuffleOfBinops(I); | |||
1761 | MadeChange |= foldSelectShuffle(I); | |||
1762 | break; | |||
1763 | case Instruction::BitCast: | |||
1764 | MadeChange |= foldBitcastShuf(I); | |||
1765 | break; | |||
1766 | } | |||
1767 | } else { | |||
1768 | switch (Opcode) { | |||
1769 | case Instruction::Call: | |||
1770 | MadeChange |= foldShuffleFromReductions(I); | |||
1771 | break; | |||
1772 | case Instruction::ICmp: | |||
1773 | case Instruction::FCmp: | |||
1774 | MadeChange |= foldExtractExtract(I); | |||
1775 | break; | |||
1776 | default: | |||
1777 | if (Instruction::isBinaryOp(Opcode)) { | |||
1778 | MadeChange |= foldExtractExtract(I); | |||
1779 | MadeChange |= foldExtractedCmps(I); | |||
1780 | } | |||
1781 | break; | |||
1782 | } | |||
1783 | } | |||
1784 | }; | |||
1785 | ||||
1786 | for (BasicBlock &BB : F) { | |||
1787 | // Ignore unreachable basic blocks. | |||
1788 | if (!DT.isReachableFromEntry(&BB)) | |||
1789 | continue; | |||
1790 | // Use early increment range so that we can erase instructions in loop. | |||
1791 | for (Instruction &I : make_early_inc_range(BB)) { | |||
1792 | if (I.isDebugOrPseudoInst()) | |||
1793 | continue; | |||
1794 | FoldInst(I); | |||
1795 | } | |||
1796 | } | |||
1797 | ||||
1798 | while (!Worklist.isEmpty()) { | |||
1799 | Instruction *I = Worklist.removeOne(); | |||
1800 | if (!I) | |||
1801 | continue; | |||
1802 | ||||
1803 | if (isInstructionTriviallyDead(I)) { | |||
1804 | eraseInstruction(*I); | |||
1805 | continue; | |||
1806 | } | |||
1807 | ||||
1808 | FoldInst(*I); | |||
1809 | } | |||
1810 | ||||
1811 | return MadeChange; | |||
1812 | } | |||
1813 | ||||
1814 | // Pass manager boilerplate below here. | |||
1815 | ||||
1816 | namespace { | |||
1817 | class VectorCombineLegacyPass : public FunctionPass { | |||
1818 | public: | |||
1819 | static char ID; | |||
1820 | VectorCombineLegacyPass() : FunctionPass(ID) { | |||
1821 | initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry()); | |||
1822 | } | |||
1823 | ||||
1824 | void getAnalysisUsage(AnalysisUsage &AU) const override { | |||
1825 | AU.addRequired<AssumptionCacheTracker>(); | |||
1826 | AU.addRequired<DominatorTreeWrapperPass>(); | |||
1827 | AU.addRequired<TargetTransformInfoWrapperPass>(); | |||
1828 | AU.addRequired<AAResultsWrapperPass>(); | |||
1829 | AU.setPreservesCFG(); | |||
1830 | AU.addPreserved<DominatorTreeWrapperPass>(); | |||
1831 | AU.addPreserved<GlobalsAAWrapperPass>(); | |||
1832 | AU.addPreserved<AAResultsWrapperPass>(); | |||
1833 | AU.addPreserved<BasicAAWrapperPass>(); | |||
1834 | FunctionPass::getAnalysisUsage(AU); | |||
1835 | } | |||
1836 | ||||
1837 | bool runOnFunction(Function &F) override { | |||
1838 | if (skipFunction(F)) | |||
1839 | return false; | |||
1840 | auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); | |||
1841 | auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); | |||
1842 | auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); | |||
1843 | auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); | |||
1844 | VectorCombine Combiner(F, TTI, DT, AA, AC, false); | |||
1845 | return Combiner.run(); | |||
1846 | } | |||
1847 | }; | |||
1848 | } // namespace | |||
1849 | ||||
1850 | char VectorCombineLegacyPass::ID = 0; | |||
1851 | INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine", | |||
1852 | "Optimize scalar/vector ops", false, | |||
1853 | false) | |||
1854 | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) | |||
1855 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) | |||
1856 | INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine", | |||
1857 | "Optimize scalar/vector ops", false, false) | |||
1858 | Pass *llvm::createVectorCombinePass() { | |||
1859 | return new VectorCombineLegacyPass(); | |||
1860 | } | |||
1861 | ||||
1862 | PreservedAnalyses VectorCombinePass::run(Function &F, | |||
1863 | FunctionAnalysisManager &FAM) { | |||
1864 | auto &AC = FAM.getResult<AssumptionAnalysis>(F); | |||
1865 | TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); | |||
1866 | DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); | |||
1867 | AAResults &AA = FAM.getResult<AAManager>(F); | |||
1868 | VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly); | |||
1869 | if (!Combiner.run()) | |||
1870 | return PreservedAnalyses::all(); | |||
1871 | PreservedAnalyses PA; | |||
1872 | PA.preserveSet<CFGAnalyses>(); | |||
1873 | return PA; | |||
1874 | } |