LLVM 23.0.0git
LoopIdiomVectorize.cpp
Go to the documentation of this file.
1//===-------- LoopIdiomVectorize.cpp - Loop idiom vectorization -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a pass that recognizes certain loop idioms and
10// transforms them into more optimized versions of the same loop. In cases
11// where this happens, it can be a significant performance win.
12//
13// We currently support two loops:
14//
15// 1. A loop that finds the first mismatched byte in an array and returns the
16// index, i.e. something like:
17//
18// while (++i != n) {
19// if (a[i] != b[i])
20// break;
21// }
22//
23// In this example we can actually vectorize the loop despite the early exit,
24// although the loop vectorizer does not support it. It requires some extra
25// checks to deal with the possibility of faulting loads when crossing page
26// boundaries. However, even with these checks it is still profitable to do the
27// transformation.
28//
29// TODO List:
30//
31// * Add support for the inverse case where we scan for a matching element.
32// * Permit 64-bit induction variable types.
33// * Recognize loops that increment the IV *after* comparing bytes.
34// * Allow 32-bit sign-extends of the IV used by the GEP.
35//
36// 2. A loop that finds the first matching character in an array among a set of
37// possible matches, e.g.:
38//
39// for (; first != last; ++first)
40// for (s_it = s_first; s_it != s_last; ++s_it)
41// if (*first == *s_it)
42// return first;
43// return last;
44//
45// This corresponds to std::find_first_of (for arrays of bytes) from the C++
46// standard library. This function can be implemented efficiently for targets
47// that support @llvm.experimental.vector.match. For example, on AArch64 targets
48// that implement SVE2, this lower to a MATCH instruction, which enables us to
49// perform up to 16x16=256 comparisons in one go. This can lead to very
50// significant speedups.
51//
52// TODO:
53//
54// * Add support for `find_first_not_of' loops (i.e. with not-equal comparison).
55// * Make VF a configurable parameter (right now we assume 128-bit vectors).
56// * Potentially adjust the cost model to let the transformation kick-in even if
57// @llvm.experimental.vector.match doesn't have direct support in hardware.
58//
59//===----------------------------------------------------------------------===//
60//
61// NOTE: This Pass matches really specific loop patterns because it's only
62// supposed to be a temporary solution until our LoopVectorizer is powerful
63// enough to vectorize them automatically.
64//
65//===----------------------------------------------------------------------===//
66
72#include "llvm/IR/Dominators.h"
73#include "llvm/IR/IRBuilder.h"
74#include "llvm/IR/Intrinsics.h"
75#include "llvm/IR/MDBuilder.h"
79
80using namespace llvm;
81using namespace PatternMatch;
82
83#define DEBUG_TYPE "loop-idiom-vectorize"
84
85static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden,
86 cl::init(false),
87 cl::desc("Disable Loop Idiom Vectorize Pass."));
88
90 LITVecStyle("loop-idiom-vectorize-style", cl::Hidden,
91 cl::desc("The vectorization style for loop idiom transform."),
93 "Use masked vector intrinsics"),
95 "predicated", "Use VP intrinsics")),
97
98static cl::opt<bool>
99 DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden,
100 cl::init(false),
101 cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
102 "not convert byte-compare loop(s)."));
103
105 ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
106 cl::desc("The vectorization factor for byte-compare patterns."),
107 cl::init(16));
108
109static cl::opt<bool>
110 DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte",
111 cl::Hidden, cl::init(false),
112 cl::desc("Do not convert find-first-byte loop(s)."));
113
114static cl::opt<bool>
115 VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
116 cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
117
118namespace {
119class LoopIdiomVectorize {
120 LoopIdiomVectorizeStyle VectorizeStyle;
121 unsigned ByteCompareVF;
122 Loop *CurLoop = nullptr;
123 DominatorTree *DT;
124 LoopInfo *LI;
126 const DataLayout *DL;
127
128 /// Interface to emit optimization remarks.
130
131 // Blocks that will be used for inserting vectorized code.
132 BasicBlock *EndBlock = nullptr;
133 BasicBlock *VectorLoopPreheaderBlock = nullptr;
134 BasicBlock *VectorLoopStartBlock = nullptr;
135 BasicBlock *VectorLoopMismatchBlock = nullptr;
136 BasicBlock *VectorLoopIncBlock = nullptr;
137
138public:
139 LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT,
140 LoopInfo *LI, const TargetTransformInfo *TTI,
142 : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL),
143 ORE(ORE) {}
144
145 bool run(Loop *L);
146
147private:
148 /// \name Countable Loop Idiom Handling
149 /// @{
150
151 bool runOnCountableLoop();
152 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
153 SmallVectorImpl<BasicBlock *> &ExitBlocks);
154
155 bool recognizeByteCompare();
156
157 Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
158 GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
159 Instruction *Index, Value *Start, Value *MaxLen);
160
161 Value *createMaskedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
162 GetElementPtrInst *GEPA,
163 GetElementPtrInst *GEPB, Value *ExtStart,
164 Value *ExtEnd);
165 Value *createPredicatedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
166 GetElementPtrInst *GEPA,
167 GetElementPtrInst *GEPB, Value *ExtStart,
168 Value *ExtEnd);
169
170 void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
171 PHINode *IndPhi, Value *MaxLen, Instruction *Index,
172 Value *Start, bool IncIdx, BasicBlock *FoundBB,
173 BasicBlock *EndBB);
174
175 bool recognizeFindFirstByte();
176
177 Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
178 unsigned VF, Type *CharTy, Value *IndPhi,
179 BasicBlock *ExitSucc, BasicBlock *ExitFail,
180 Value *SearchStart, Value *SearchEnd,
181 Value *NeedleStart, Value *NeedleEnd);
182
183 void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
184 BasicBlock *ExitSucc, BasicBlock *ExitFail,
185 Value *SearchStart, Value *SearchEnd,
186 Value *NeedleStart, Value *NeedleEnd);
187 /// @}
188};
189} // anonymous namespace
190
193 LPMUpdater &) {
194 if (DisableAll)
195 return PreservedAnalyses::all();
196
197 const auto *DL = &L.getHeader()->getDataLayout();
198
199 LoopIdiomVectorizeStyle VecStyle = VectorizeStyle;
200 if (LITVecStyle.getNumOccurrences())
201 VecStyle = LITVecStyle;
202
203 unsigned BCVF = ByteCompareVF;
204 if (ByteCmpVF.getNumOccurrences())
205 BCVF = ByteCmpVF;
206
207 Function &F = *L.getHeader()->getParent();
208 auto &FAMP = AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR);
209 auto *ORE = FAMP.getCachedResult<OptimizationRemarkEmitterAnalysis>(F);
210
211 std::optional<OptimizationRemarkEmitter> ORELocal;
212 if (!ORE) {
213 ORELocal.emplace(&F);
214 ORE = &*ORELocal;
215 }
216
217 LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL, *ORE);
218 if (!LIV.run(&L))
219 return PreservedAnalyses::all();
220
222}
223
224//===----------------------------------------------------------------------===//
225//
226// Implementation of LoopIdiomVectorize
227//
228//===----------------------------------------------------------------------===//
229
230bool LoopIdiomVectorize::run(Loop *L) {
231 CurLoop = L;
232
233 Function &F = *L->getHeader()->getParent();
234 if (DisableAll || F.hasOptSize())
235 return false;
236
237 // Bail if vectorization is disabled on loop.
238 LoopVectorizeHints Hints(L, /*InterleaveOnlyWhenForced=*/true, ORE);
239 if (!Hints.allowVectorization(&F, L, /*VectorizeOnlyWhenForced=*/false)) {
240 LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << L->getName()
241 << " due to vectorization hints\n");
242 return false;
243 }
244
245 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
246 LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName()
247 << " due to its NoImplicitFloat attribute");
248 return false;
249 }
250
251 // If the loop could not be converted to canonical form, it must have an
252 // indirectbr in it, just give up.
253 if (!L->getLoopPreheader())
254 return false;
255
256 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
257 << CurLoop->getHeader()->getName() << "\n");
258
259 if (recognizeByteCompare())
260 return true;
261
262 if (recognizeFindFirstByte())
263 return true;
264
265 return false;
266}
267
268static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes,
269 BasicBlock *SuccBB, BasicBlock *IncBB) {
270 for (PHINode &PN : SuccBB->phis()) {
271 // Look through the incoming values to find ScalarRes, meaning this is a
272 // PHI collecting the results of the transformation.
273 bool ResPhi = false;
274 for (Value *Op : PN.incoming_values())
275 if (Op == ScalarRes) {
276 ResPhi = true;
277 break;
278 }
279
280 // Any PHI that depended upon the result of the transformation needs a new
281 // incoming value from IncBB.
282 if (ResPhi)
283 PN.addIncoming(VectorRes, IncBB);
284 else {
285 // There should be no other outside uses of other values in the
286 // original loop. Any incoming values should either:
287 // 1. Be for blocks outside the loop, which aren't interesting. Or ..
288 // 2. These are from blocks in the loop with values defined outside
289 // the loop. We should a similar incoming value from CmpBB.
290 for (BasicBlock *BB : PN.blocks())
291 if (L->contains(BB)) {
292 PN.addIncoming(PN.getIncomingValueForBlock(BB), IncBB);
293 break;
294 }
295 }
296 }
297}
298
299bool LoopIdiomVectorize::recognizeByteCompare() {
300 // Currently the transformation only works on scalable vector types, although
301 // there is no fundamental reason why it cannot be made to work for fixed
302 // width too.
303
304 // We also need to know the minimum page size for the target in order to
305 // generate runtime memory checks to ensure the vector version won't fault.
306 if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
308 return false;
309
310 BasicBlock *Header = CurLoop->getHeader();
311
312 // In LoopIdiomVectorize::run we have already checked that the loop
313 // has a preheader so we can assume it's in a canonical form.
314 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2)
315 return false;
316
317 PHINode *PN = dyn_cast<PHINode>(&Header->front());
318 if (!PN || PN->getNumIncomingValues() != 2)
319 return false;
320
321 auto LoopBlocks = CurLoop->getBlocks();
322 // The first block in the loop should contain only 4 instructions, e.g.
323 //
324 // while.cond:
325 // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ]
326 // %inc = add i32 %res.phi, 1
327 // %cmp.not = icmp eq i32 %inc, %n
328 // br i1 %cmp.not, label %while.end, label %while.body
329 //
330 if (LoopBlocks[0]->sizeWithoutDebug() > 4)
331 return false;
332
333 // The second block should contain 7 instructions, e.g.
334 //
335 // while.body:
336 // %idx = zext i32 %inc to i64
337 // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx
338 // %load.a = load i8, ptr %idx.a
339 // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx
340 // %load.b = load i8, ptr %idx.b
341 // %cmp.not.ld = icmp eq i8 %load.a, %load.b
342 // br i1 %cmp.not.ld, label %while.cond, label %while.end
343 //
344 if (LoopBlocks[1]->sizeWithoutDebug() > 7)
345 return false;
346
347 // The incoming value to the PHI node from the loop should be an add of 1.
348 Value *StartIdx = nullptr;
349 Instruction *Index = nullptr;
350 if (!CurLoop->contains(PN->getIncomingBlock(0))) {
351 StartIdx = PN->getIncomingValue(0);
353 } else {
354 StartIdx = PN->getIncomingValue(1);
356 }
357
358 // Limit to 32-bit types for now
359 if (!Index || !Index->getType()->isIntegerTy(32) ||
360 !match(Index, m_c_Add(m_Specific(PN), m_One())))
361 return false;
362
363 // If we match the pattern, PN and Index will be replaced with the result of
364 // the cttz.elts intrinsic. If any other instructions are used outside of
365 // the loop, we cannot replace it.
366 for (BasicBlock *BB : LoopBlocks)
367 for (Instruction &I : *BB)
368 if (&I != PN && &I != Index)
369 for (User *U : I.users())
370 if (!CurLoop->contains(cast<Instruction>(U)))
371 return false;
372
373 // Match the branch instruction for the header
374 Value *MaxLen;
375 BasicBlock *EndBB, *WhileBB;
376 if (!match(Header->getTerminator(),
378 m_Value(MaxLen)),
379 m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) ||
380 !CurLoop->contains(WhileBB))
381 return false;
382
383 // WhileBB should contain the pattern of load & compare instructions. Match
384 // the pattern and find the GEP instructions used by the loads.
385 BasicBlock *FoundBB;
386 BasicBlock *TrueBB;
387 Value *LoadA, *LoadB;
388 if (!match(WhileBB->getTerminator(),
390 m_Value(LoadB)),
391 m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) ||
392 !CurLoop->contains(TrueBB))
393 return false;
394
395 Value *A, *B;
396 if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B))))
397 return false;
398
399 LoadInst *LoadAI = cast<LoadInst>(LoadA);
400 LoadInst *LoadBI = cast<LoadInst>(LoadB);
401 if (!LoadAI->isSimple() || !LoadBI->isSimple())
402 return false;
403
406
407 if (!GEPA || !GEPB)
408 return false;
409
410 Value *PtrA = GEPA->getPointerOperand();
411 Value *PtrB = GEPB->getPointerOperand();
412
413 // Check we are loading i8 values from two loop invariant pointers
414 if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) ||
415 !GEPA->getResultElementType()->isIntegerTy(8) ||
416 !GEPB->getResultElementType()->isIntegerTy(8) ||
417 !LoadAI->getType()->isIntegerTy(8) ||
418 !LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB)
419 return false;
420
421 // Check that the index to the GEPs is the index we found earlier
422 if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1)
423 return false;
424
425 Value *IdxA = GEPA->getOperand(GEPA->getNumIndices());
426 Value *IdxB = GEPB->getOperand(GEPB->getNumIndices());
427 if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index))))
428 return false;
429
430 // We only ever expect the pre-incremented index value to be used inside the
431 // loop.
432 if (!PN->hasOneUse())
433 return false;
434
435 // Ensure that when the Found and End blocks are identical the PHIs have the
436 // supported format. We don't currently allow cases like this:
437 // while.cond:
438 // ...
439 // br i1 %cmp.not, label %while.end, label %while.body
440 //
441 // while.body:
442 // ...
443 // br i1 %cmp.not2, label %while.cond, label %while.end
444 //
445 // while.end:
446 // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
447 //
448 // Where the incoming values for %final_ptr are unique and from each of the
449 // loop blocks, but not actually defined in the loop. This requires extra
450 // work setting up the byte.compare block, i.e. by introducing a select to
451 // choose the correct value.
452 // TODO: We could add support for this in future.
453 if (FoundBB == EndBB) {
454 for (PHINode &EndPN : EndBB->phis()) {
455 Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header);
456 Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB);
457
458 // The value of the index when leaving the while.cond block is always the
459 // same as the end value (MaxLen) so we permit either. The value when
460 // leaving the while.body block should only be the index. Otherwise for
461 // any other values we only allow ones that are same for both blocks.
462 if (WhileCondVal != WhileBodyVal &&
463 ((WhileCondVal != Index && WhileCondVal != MaxLen) ||
464 (WhileBodyVal != Index)))
465 return false;
466 }
467 }
468
469 LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
470 << *(EndBB->getParent()) << "\n\n");
471
472 // The index is incremented before the GEP/Load pair so we need to
473 // add 1 to the start value.
474 transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true,
475 FoundBB, EndBB);
476 return true;
477}
478
479Value *LoopIdiomVectorize::createMaskedFindMismatch(
480 IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
481 GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
482 Type *I64Type = Builder.getInt64Ty();
483 Type *ResType = Builder.getInt32Ty();
484 Type *LoadType = Builder.getInt8Ty();
485 Value *PtrA = GEPA->getPointerOperand();
486 Value *PtrB = GEPB->getPointerOperand();
487
488 ScalableVectorType *PredVTy =
489 ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
490
491 Value *InitialPred = Builder.CreateIntrinsic(
492 Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
493
494 Value *VecLen = Builder.CreateVScale(I64Type);
495 VecLen =
496 Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "",
497 /*HasNUW=*/true, /*HasNSW=*/true);
498
499 Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
500 Builder.getInt1(false));
501
502 BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
503 Builder.Insert(JumpToVectorLoop);
504
505 DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
506 VectorLoopStartBlock}});
507
508 // Set up the first vector loop block by creating the PHIs, doing the vector
509 // loads and comparing the vectors.
510 Builder.SetInsertPoint(VectorLoopStartBlock);
511 PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
512 LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
513 PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
514 VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
515 Type *VectorLoadType =
516 ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF);
517 Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
518
519 Value *VectorLhsGep =
520 Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
521 Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
522 Align(1), LoopPred, Passthru);
523
524 Value *VectorRhsGep =
525 Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
526 Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
527 Align(1), LoopPred, Passthru);
528
529 Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
530 VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
531 Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
532 BranchInst *VectorEarlyExit = BranchInst::Create(
533 VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
534 Builder.Insert(VectorEarlyExit);
535
536 DTU.applyUpdates(
537 {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
538 {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
539
540 // Increment the index counter and calculate the predicate for the next
541 // iteration of the loop. We branch back to the start of the loop if there
542 // is at least one active lane.
543 Builder.SetInsertPoint(VectorLoopIncBlock);
544 Value *NewVectorIndexPhi =
545 Builder.CreateAdd(VectorIndexPhi, VecLen, "",
546 /*HasNUW=*/true, /*HasNSW=*/true);
547 VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
548 Value *NewPred =
549 Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
550 {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
551 LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
552
553 Value *PredHasActiveLanes =
554 Builder.CreateExtractElement(NewPred, uint64_t(0));
555 BranchInst *VectorLoopBranchBack =
556 BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
557 Builder.Insert(VectorLoopBranchBack);
558
559 DTU.applyUpdates(
560 {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
561 {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
562
563 // If we found a mismatch then we need to calculate which lane in the vector
564 // had a mismatch and add that on to the current loop index.
565 Builder.SetInsertPoint(VectorLoopMismatchBlock);
566 PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
567 FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
568 PHINode *LastLoopPred =
569 Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
570 LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
571 PHINode *VectorFoundIndex =
572 Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
573 VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
574
575 Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
576 Value *Ctz = Builder.CreateCountTrailingZeroElems(ResType, PredMatchCmp);
577 Ctz = Builder.CreateZExt(Ctz, I64Type);
578 Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
579 /*HasNUW=*/true, /*HasNSW=*/true);
580 return Builder.CreateTrunc(VectorLoopRes64, ResType);
581}
582
583Value *LoopIdiomVectorize::createPredicatedFindMismatch(
584 IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
585 GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
586 Type *I64Type = Builder.getInt64Ty();
587 Type *I32Type = Builder.getInt32Ty();
588 Type *ResType = I32Type;
589 Type *LoadType = Builder.getInt8Ty();
590 Value *PtrA = GEPA->getPointerOperand();
591 Value *PtrB = GEPB->getPointerOperand();
592
593 auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
594 Builder.Insert(JumpToVectorLoop);
595
596 DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
597 VectorLoopStartBlock}});
598
599 // Set up the first Vector loop block by creating the PHIs, doing the vector
600 // loads and comparing the vectors.
601 Builder.SetInsertPoint(VectorLoopStartBlock);
602 auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
603 VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
604
605 // Calculate AVL by subtracting the vector loop index from the trip count
606 Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true,
607 /*HasNSW=*/true);
608
609 auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
610 auto *VF = ConstantInt::get(I32Type, ByteCompareVF);
611
612 Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
613 {I64Type}, {AVL, VF, Builder.getTrue()});
614 Value *GepOffset = VectorIndexPhi;
615
616 Value *VectorLhsGep =
617 Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
618 VectorType *TrueMaskTy =
619 VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount());
620 Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
621 Value *VectorLhsLoad = Builder.CreateIntrinsic(
622 Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
623 {VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
624
625 Value *VectorRhsGep =
626 Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
627 Value *VectorRhsLoad = Builder.CreateIntrinsic(
628 Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
629 {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
630
631 Value *VectorMatchCmp =
632 Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad, "mismatch.cmp");
633 Value *CTZ = Builder.CreateIntrinsic(
634 Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
635 {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
636 VL});
637 Value *MismatchFound = Builder.CreateICmpNE(CTZ, VL);
638 auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
639 VectorLoopIncBlock, MismatchFound);
640 Builder.Insert(VectorEarlyExit);
641
642 DTU.applyUpdates(
643 {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
644 {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
645
646 // Increment the index counter and calculate the predicate for the next
647 // iteration of the loop. We branch back to the start of the loop if there
648 // is at least one active lane.
649 Builder.SetInsertPoint(VectorLoopIncBlock);
650 Value *VL64 = Builder.CreateZExt(VL, I64Type);
651 Value *NewVectorIndexPhi =
652 Builder.CreateAdd(VectorIndexPhi, VL64, "",
653 /*HasNUW=*/true, /*HasNSW=*/true);
654 VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
655 Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
656 auto *VectorLoopBranchBack =
657 BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
658 Builder.Insert(VectorLoopBranchBack);
659
660 DTU.applyUpdates(
661 {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
662 {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
663
664 // If we found a mismatch then we need to calculate which lane in the vector
665 // had a mismatch and add that on to the current loop index.
666 Builder.SetInsertPoint(VectorLoopMismatchBlock);
667
668 // Add LCSSA phis for CTZ and VectorIndexPhi.
669 auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
670 CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock);
671 auto *VectorIndexLCSSAPhi =
672 Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index");
673 VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
674
675 Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
676 Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "",
677 /*HasNUW=*/true, /*HasNSW=*/true);
678 return Builder.CreateTrunc(VectorLoopRes64, ResType);
679}
680
681Value *LoopIdiomVectorize::expandFindMismatch(
682 IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
683 GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
684 Value *PtrA = GEPA->getPointerOperand();
685 Value *PtrB = GEPB->getPointerOperand();
686
687 // Get the arguments and types for the intrinsic.
688 BasicBlock *Preheader = CurLoop->getLoopPreheader();
689 BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
690 LLVMContext &Ctx = PHBranch->getContext();
691 Type *LoadType = Type::getInt8Ty(Ctx);
692 Type *ResType = Builder.getInt32Ty();
693
694 // Split block in the original loop preheader.
695 EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
696
697 // Create the blocks that we're going to need:
698 // 1. A block for checking the zero-extended length exceeds 0
699 // 2. A block to check that the start and end addresses of a given array
700 // lie on the same page.
701 // 3. The vector loop preheader.
702 // 4. The first vector loop block.
703 // 5. The vector loop increment block.
704 // 6. A block we can jump to from the vector loop when a mismatch is found.
705 // 7. The first block of the scalar loop itself, containing PHIs , loads
706 // and cmp.
707 // 8. A scalar loop increment block to increment the PHIs and go back
708 // around the loop.
709
710 BasicBlock *MinItCheckBlock = BasicBlock::Create(
711 Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock);
712
713 // Update the terminator added by SplitBlock to branch to the first block
714 Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock);
715
716 BasicBlock *MemCheckBlock = BasicBlock::Create(
717 Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
718
719 VectorLoopPreheaderBlock = BasicBlock::Create(
720 Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock);
721
722 VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop",
723 EndBlock->getParent(), EndBlock);
724
725 VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc",
726 EndBlock->getParent(), EndBlock);
727
728 VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found",
729 EndBlock->getParent(), EndBlock);
730
731 BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
732 Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
733
734 BasicBlock *LoopStartBlock =
735 BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock);
736
737 BasicBlock *LoopIncBlock = BasicBlock::Create(
738 Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock);
739
740 DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
741 {DominatorTree::Delete, Preheader, EndBlock}});
742
743 // Update LoopInfo with the new vector & scalar loops.
744 auto VectorLoop = LI->AllocateLoop();
745 auto ScalarLoop = LI->AllocateLoop();
746
747 if (CurLoop->getParentLoop()) {
748 CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI);
749 CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI);
750 CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopPreheaderBlock,
751 *LI);
752 CurLoop->getParentLoop()->addChildLoop(VectorLoop);
753 CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopMismatchBlock, *LI);
754 CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI);
755 CurLoop->getParentLoop()->addChildLoop(ScalarLoop);
756 } else {
757 LI->addTopLevelLoop(VectorLoop);
758 LI->addTopLevelLoop(ScalarLoop);
759 }
760
761 // Add the new basic blocks to their associated loops.
762 VectorLoop->addBasicBlockToLoop(VectorLoopStartBlock, *LI);
763 VectorLoop->addBasicBlockToLoop(VectorLoopIncBlock, *LI);
764
765 ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI);
766 ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI);
767
768 // Set up some types and constants that we intend to reuse.
769 Type *I64Type = Builder.getInt64Ty();
770
771 // Check the zero-extended iteration count > 0
772 Builder.SetInsertPoint(MinItCheckBlock);
773 Value *ExtStart = Builder.CreateZExt(Start, I64Type);
774 Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type);
775 // This check doesn't really cost us very much.
776
777 Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen);
778 BranchInst *MinItCheckBr =
779 BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck);
780 MinItCheckBr->setMetadata(
781 LLVMContext::MD_prof,
782 MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
783 Builder.Insert(MinItCheckBr);
784
785 DTU.applyUpdates(
786 {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock},
787 {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}});
788
789 // For each of the arrays, check the start/end addresses are on the same
790 // page.
791 Builder.SetInsertPoint(MemCheckBlock);
792
793 // The early exit in the original loop means that when performing vector
794 // loads we are potentially reading ahead of the early exit. So we could
795 // fault if crossing a page boundary. Therefore, we create runtime memory
796 // checks based on the minimum page size as follows:
797 // 1. Calculate the addresses of the first memory accesses in the loop,
798 // i.e. LhsStart and RhsStart.
799 // 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd.
800 // 3. Determine which pages correspond to all the memory accesses, i.e
801 // LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage.
802 // 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then
803 // we know we won't cross any page boundaries in the loop so we can
804 // enter the vector loop! Otherwise we fall back on the scalar loop.
805 Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart);
806 Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart);
807 Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type);
808 Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type);
809 Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd);
810 Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd);
811 Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type);
812 Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type);
813
814 const uint64_t MinPageSize = TTI->getMinPageSize().value();
815 const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
816 Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt);
817 Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt);
818 Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt);
819 Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt);
820 Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage);
821 Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage);
822
823 Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp);
824 BranchInst *CombinedPageCmpCmpBr = BranchInst::Create(
825 LoopPreHeaderBlock, VectorLoopPreheaderBlock, CombinedPageCmp);
826 CombinedPageCmpCmpBr->setMetadata(
827 LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext())
828 .createBranchWeights(10, 90));
829 Builder.Insert(CombinedPageCmpCmpBr);
830
831 DTU.applyUpdates(
832 {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock},
833 {DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}});
834
835 // Set up the vector loop preheader, i.e. calculate initial loop predicate,
836 // zero-extend MaxLen to 64-bits, determine the number of vector elements
837 // processed in each iteration, etc.
838 Builder.SetInsertPoint(VectorLoopPreheaderBlock);
839
840 // At this point we know two things must be true:
841 // 1. Start <= End
842 // 2. ExtMaxLen <= MinPageSize due to the page checks.
843 // Therefore, we know that we can use a 64-bit induction variable that
844 // starts from 0 -> ExtMaxLen and it will not overflow.
845 Value *VectorLoopRes = nullptr;
846 switch (VectorizeStyle) {
848 VectorLoopRes =
849 createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
850 break;
852 VectorLoopRes = createPredicatedFindMismatch(Builder, DTU, GEPA, GEPB,
853 ExtStart, ExtEnd);
854 break;
855 }
856
857 Builder.Insert(BranchInst::Create(EndBlock));
858
859 DTU.applyUpdates(
860 {{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}});
861
862 // Generate code for scalar loop.
863 Builder.SetInsertPoint(LoopPreHeaderBlock);
864 Builder.Insert(BranchInst::Create(LoopStartBlock));
865
866 DTU.applyUpdates(
867 {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
868
869 Builder.SetInsertPoint(LoopStartBlock);
870 PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
871 IndexPhi->addIncoming(Start, LoopPreHeaderBlock);
872
873 // Otherwise compare the values
874 // Load bytes from each array and compare them.
875 Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type);
876
877 Value *LhsGep =
878 Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
879 Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep);
880
881 Value *RhsGep =
882 Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
883 Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep);
884
885 Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
886 // If we have a mismatch then exit the loop ...
887 BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
888 Builder.Insert(MatchCmpBr);
889
890 DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
891 {DominatorTree::Insert, LoopStartBlock, EndBlock}});
892
893 // Have we reached the maximum permitted length for the loop?
894 Builder.SetInsertPoint(LoopIncBlock);
895 Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
896 /*HasNUW=*/Index->hasNoUnsignedWrap(),
897 /*HasNSW=*/Index->hasNoSignedWrap());
898 IndexPhi->addIncoming(PhiInc, LoopIncBlock);
899 Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen);
900 BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
901 Builder.Insert(IVCmpBr);
902
903 DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
904 {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
905
906 // In the end block we need to insert a PHI node to deal with three cases:
907 // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
908 // 2. We exitted the scalar loop early due to a mismatch and need to return
909 // the index that we found.
910 // 3. We didn't find a mismatch in the vector loop, so we return MaxLen.
911 // 4. We exitted the vector loop early due to a mismatch and need to return
912 // the index that we found.
913 Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt());
914 PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result");
915 ResPhi->addIncoming(MaxLen, LoopIncBlock);
916 ResPhi->addIncoming(IndexPhi, LoopStartBlock);
917 ResPhi->addIncoming(MaxLen, VectorLoopIncBlock);
918 ResPhi->addIncoming(VectorLoopRes, VectorLoopMismatchBlock);
919
920 Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType);
921
922 if (VerifyLoops) {
923 ScalarLoop->verifyLoop();
924 VectorLoop->verifyLoop();
925 if (!VectorLoop->isRecursivelyLCSSAForm(*DT, *LI))
926 report_fatal_error("Loops must remain in LCSSA form!");
927 if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI))
928 report_fatal_error("Loops must remain in LCSSA form!");
929 }
930
931 return FinalRes;
932}
933
934void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
935 GetElementPtrInst *GEPB,
936 PHINode *IndPhi, Value *MaxLen,
937 Instruction *Index, Value *Start,
938 bool IncIdx, BasicBlock *FoundBB,
939 BasicBlock *EndBB) {
940
941 // Insert the byte compare code at the end of the preheader block
942 BasicBlock *Preheader = CurLoop->getLoopPreheader();
943 BasicBlock *Header = CurLoop->getHeader();
944 BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
945 IRBuilder<> Builder(PHBranch);
946 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
947 Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
948
949 // Increment the pointer if this was done before the loads in the loop.
950 if (IncIdx)
951 Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1));
952
953 Value *ByteCmpRes =
954 expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen);
955
956 // Replaces uses of index & induction Phi with intrinsic (we already
957 // checked that the the first instruction of Header is the Phi above).
958 assert(IndPhi->hasOneUse() && "Index phi node has more than one use!");
959 Index->replaceAllUsesWith(ByteCmpRes);
960
961 assert(PHBranch->isUnconditional() &&
962 "Expected preheader to terminate with an unconditional branch.");
963
964 // If no mismatch was found, we can jump to the end block. Create a
965 // new basic block for the compare instruction.
966 auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare",
967 Preheader->getParent());
968 CmpBB->moveBefore(EndBB);
969
970 // Replace the branch in the preheader with an always-true conditional branch.
971 // This ensures there is still a reference to the original loop.
972 Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header);
973 PHBranch->eraseFromParent();
974
975 BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent();
976 DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}});
977
978 // Create the branch to either the end or found block depending on the value
979 // returned by the intrinsic.
980 Builder.SetInsertPoint(CmpBB);
981 if (FoundBB != EndBB) {
982 Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
983 Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
984 DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB},
985 {DominatorTree::Insert, CmpBB, EndBB}});
986
987 } else {
988 Builder.CreateBr(FoundBB);
989 DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
990 }
991
992 // Ensure all Phis in the successors of CmpBB have an incoming value from it.
993 fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, EndBB, CmpBB);
994 if (EndBB != FoundBB)
995 fixSuccessorPhis(CurLoop, ByteCmpRes, ByteCmpRes, FoundBB, CmpBB);
996
997 // The new CmpBB block isn't part of the loop, but will need to be added to
998 // the outer loop if there is one.
999 if (!CurLoop->isOutermost())
1000 CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI);
1001
1002 if (VerifyLoops && CurLoop->getParentLoop()) {
1003 CurLoop->getParentLoop()->verifyLoop();
1004 if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
1005 report_fatal_error("Loops must remain in LCSSA form!");
1006 }
1007}
1008
1009bool LoopIdiomVectorize::recognizeFindFirstByte() {
1010 // Currently the transformation only works on scalable vector types, although
1011 // there is no fundamental reason why it cannot be made to work for fixed
1012 // vectors. We also need to know the target's minimum page size in order to
1013 // generate runtime memory checks to ensure the vector version won't fault.
1014 if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
1016 return false;
1017
1018 // We exclude loops with trip counts > minimum page size via runtime checks,
1019 // so make sure that the minimum page size is something sensible such that
1020 // induction variables cannot overflow.
1021 if (uint64_t(*TTI->getMinPageSize()) >
1022 (std::numeric_limits<uint64_t>::max() / 2))
1023 return false;
1024
1025 // Define some constants we need throughout.
1026 BasicBlock *Header = CurLoop->getHeader();
1027 LLVMContext &Ctx = Header->getContext();
1028
1029 // We are expecting the four blocks defined below: Header, MatchBB, InnerBB,
1030 // and OuterBB. For now, we will bail our for almost anything else. The Four
1031 // blocks contain one nested loop.
1032 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 ||
1033 CurLoop->getSubLoops().size() != 1)
1034 return false;
1035
1036 auto *InnerLoop = CurLoop->getSubLoops().front();
1037 Function &F = *InnerLoop->getHeader()->getParent();
1038
1039 // Bail if vectorization is disabled on inner loop.
1040 LoopVectorizeHints Hints(InnerLoop, /*InterleaveOnlyWhenForced=*/true, ORE);
1041 if (!Hints.allowVectorization(&F, InnerLoop,
1042 /*VectorizeOnlyWhenForced=*/false)) {
1043 LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on inner loop "
1044 << InnerLoop->getName()
1045 << " due to vectorization hints\n");
1046 return false;
1047 }
1048
1049 PHINode *IndPhi = dyn_cast<PHINode>(&Header->front());
1050 if (!IndPhi || IndPhi->getNumIncomingValues() != 2)
1051 return false;
1052
1053 // Check instruction counts.
1054 auto LoopBlocks = CurLoop->getBlocks();
1055 if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
1056 LoopBlocks[1]->sizeWithoutDebug() > 4 ||
1057 LoopBlocks[2]->sizeWithoutDebug() > 3 ||
1058 LoopBlocks[3]->sizeWithoutDebug() > 3)
1059 return false;
1060
1061 // Check that no instruction other than IndPhi has outside uses.
1062 for (BasicBlock *BB : LoopBlocks)
1063 for (Instruction &I : *BB)
1064 if (&I != IndPhi)
1065 for (User *U : I.users())
1066 if (!CurLoop->contains(cast<Instruction>(U)))
1067 return false;
1068
1069 // Match the branch instruction in the header. We are expecting an
1070 // unconditional branch to the inner loop.
1071 //
1072 // Header:
1073 // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ]
1074 // %15 = load i8, ptr %14, align 1
1075 // br label %MatchBB
1076 BasicBlock *MatchBB;
1077 if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
1078 !InnerLoop->contains(MatchBB))
1079 return false;
1080
1081 // MatchBB should be the entrypoint into the inner loop containing the
1082 // comparison between a search element and a needle.
1083 //
1084 // MatchBB:
1085 // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ]
1086 // %21 = load i8, ptr %20, align 1
1087 // %22 = icmp eq i8 %15, %21
1088 // br i1 %22, label %ExitSucc, label %InnerBB
1089 BasicBlock *ExitSucc, *InnerBB;
1090 Value *LoadSearch, *LoadNeedle;
1091 CmpPredicate MatchPred;
1092 if (!match(MatchBB->getTerminator(),
1093 m_Br(m_ICmp(MatchPred, m_Value(LoadSearch), m_Value(LoadNeedle)),
1094 m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
1095 MatchPred != ICmpInst::ICMP_EQ || !InnerLoop->contains(InnerBB))
1096 return false;
1097
1098 // We expect outside uses of `IndPhi' in ExitSucc (and only there).
1099 for (User *U : IndPhi->users())
1100 if (!CurLoop->contains(cast<Instruction>(U))) {
1101 auto *PN = dyn_cast<PHINode>(U);
1102 if (!PN || PN->getParent() != ExitSucc)
1103 return false;
1104 }
1105
1106 // Match the loads and check they are simple.
1107 Value *Search, *Needle;
1108 if (!match(LoadSearch, m_Load(m_Value(Search))) ||
1109 !match(LoadNeedle, m_Load(m_Value(Needle))) ||
1110 !cast<LoadInst>(LoadSearch)->isSimple() ||
1111 !cast<LoadInst>(LoadNeedle)->isSimple())
1112 return false;
1113
1114 // Check we are loading valid characters.
1115 Type *CharTy = LoadSearch->getType();
1116 if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy)
1117 return false;
1118
1119 // Pick the vectorisation factor based on CharTy, work out the cost of the
1120 // match intrinsic and decide if we should use it.
1121 // Note: For the time being we assume 128-bit vectors.
1122 unsigned VF = 128 / CharTy->getIntegerBitWidth();
1124 ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF),
1126 IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2],
1127 Args);
1128 if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4)
1129 return false;
1130
1131 // The loads come from two PHIs, each with two incoming values.
1132 PHINode *PSearch = dyn_cast<PHINode>(Search);
1133 PHINode *PNeedle = dyn_cast<PHINode>(Needle);
1134 if (!PSearch || PSearch->getNumIncomingValues() != 2 || !PNeedle ||
1135 PNeedle->getNumIncomingValues() != 2)
1136 return false;
1137
1138 // One PHI comes from the outer loop (PSearch), the other one from the inner
1139 // loop (PNeedle). PSearch effectively corresponds to IndPhi.
1140 if (InnerLoop->contains(PSearch))
1141 std::swap(PSearch, PNeedle);
1142 if (PSearch != &Header->front() || PNeedle != &MatchBB->front())
1143 return false;
1144
1145 // The incoming values of both PHI nodes should be a gep of 1.
1146 Value *SearchStart = PSearch->getIncomingValue(0);
1147 Value *SearchIndex = PSearch->getIncomingValue(1);
1148 if (CurLoop->contains(PSearch->getIncomingBlock(0)))
1149 std::swap(SearchStart, SearchIndex);
1150
1151 Value *NeedleStart = PNeedle->getIncomingValue(0);
1152 Value *NeedleIndex = PNeedle->getIncomingValue(1);
1153 if (InnerLoop->contains(PNeedle->getIncomingBlock(0)))
1154 std::swap(NeedleStart, NeedleIndex);
1155
1156 // Match the GEPs.
1157 if (!match(SearchIndex, m_GEP(m_Specific(PSearch), m_One())) ||
1158 !match(NeedleIndex, m_GEP(m_Specific(PNeedle), m_One())))
1159 return false;
1160
1161 // Check the GEPs result type matches `CharTy'.
1162 GetElementPtrInst *GEPSearch = cast<GetElementPtrInst>(SearchIndex);
1163 GetElementPtrInst *GEPNeedle = cast<GetElementPtrInst>(NeedleIndex);
1164 if (GEPSearch->getResultElementType() != CharTy ||
1165 GEPNeedle->getResultElementType() != CharTy)
1166 return false;
1167
1168 // InnerBB should increment the address of the needle pointer.
1169 //
1170 // InnerBB:
1171 // %17 = getelementptr inbounds i8, ptr %20, i64 1
1172 // %18 = icmp eq ptr %17, %10
1173 // br i1 %18, label %OuterBB, label %MatchBB
1174 BasicBlock *OuterBB;
1175 Value *NeedleEnd;
1176 if (!match(InnerBB->getTerminator(),
1178 m_Value(NeedleEnd)),
1179 m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
1180 !CurLoop->contains(OuterBB))
1181 return false;
1182
1183 // OuterBB should increment the address of the search element pointer.
1184 //
1185 // OuterBB:
1186 // %24 = getelementptr inbounds i8, ptr %14, i64 1
1187 // %25 = icmp eq ptr %24, %6
1188 // br i1 %25, label %ExitFail, label %Header
1189 BasicBlock *ExitFail;
1190 Value *SearchEnd;
1191 if (!match(OuterBB->getTerminator(),
1193 m_Value(SearchEnd)),
1194 m_BasicBlock(ExitFail), m_Specific(Header))))
1195 return false;
1196
1197 if (!CurLoop->isLoopInvariant(SearchStart) ||
1198 !CurLoop->isLoopInvariant(SearchEnd) ||
1199 !CurLoop->isLoopInvariant(NeedleStart) ||
1200 !CurLoop->isLoopInvariant(NeedleEnd))
1201 return false;
1202
1203 LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n");
1204
1205 transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart,
1206 SearchEnd, NeedleStart, NeedleEnd);
1207 return true;
1208}
1209
1210Value *LoopIdiomVectorize::expandFindFirstByte(
1211 IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
1212 Value *IndPhi, BasicBlock *ExitSucc, BasicBlock *ExitFail,
1213 Value *SearchStart, Value *SearchEnd, Value *NeedleStart,
1214 Value *NeedleEnd) {
1215 // Set up some types and constants that we intend to reuse.
1216 auto *PtrTy = Builder.getPtrTy();
1217 auto *I64Ty = Builder.getInt64Ty();
1218 auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF);
1219 auto *CharVTy = ScalableVectorType::get(CharTy, VF);
1220 auto *ConstVF = ConstantInt::get(I64Ty, VF);
1221
1222 // Other common arguments.
1223 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1224 LLVMContext &Ctx = Preheader->getContext();
1225 Value *Passthru = ConstantInt::getNullValue(CharVTy);
1226
1227 // Split block in the original loop preheader.
1228 // SPH is the new preheader to the old scalar loop.
1229 BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1230 nullptr, "scalar_preheader");
1231
1232 // Create the blocks that we're going to use.
1233 //
1234 // We will have the following loops:
1235 // (O) Outer loop where we iterate over the elements of the search array.
1236 // (I) Inner loop where we iterate over the elements of the needle array.
1237 //
1238 // Overall, the blocks do the following:
1239 // (0) Check if the arrays can't cross page boundaries. If so go to (1),
1240 // otherwise fall back to the original scalar loop.
1241 // (1) Load the search array. Go to (2).
1242 // (2) (a) Load the needle array.
1243 // (b) Splat the first element to the inactive lanes.
1244 // (c) Accumulate any matches found. If we haven't reached the end of the
1245 // needle array loop back to (2), otherwise go to (3).
1246 // (3) Test if we found any match. If so go to (4), otherwise go to (5).
1247 // (4) Compute the index of the first match and exit.
1248 // (5) Check if we've reached the end of the search array. If not loop back to
1249 // (1), otherwise exit.
1250 // Blocks (0,4) are not part of any loop. Blocks (1,3,5) and (2) belong to the
1251 // outer and inner loops, respectively.
1252 BasicBlock *BB0 = BasicBlock::Create(Ctx, "mem_check", SPH->getParent(), SPH);
1253 BasicBlock *BB1 =
1254 BasicBlock::Create(Ctx, "find_first_vec_header", SPH->getParent(), SPH);
1255 BasicBlock *BB2 =
1256 BasicBlock::Create(Ctx, "needle_check_vec", SPH->getParent(), SPH);
1257 BasicBlock *BB3 =
1258 BasicBlock::Create(Ctx, "match_check_vec", SPH->getParent(), SPH);
1259 BasicBlock *BB4 =
1260 BasicBlock::Create(Ctx, "calculate_match", SPH->getParent(), SPH);
1261 BasicBlock *BB5 =
1262 BasicBlock::Create(Ctx, "search_check_vec", SPH->getParent(), SPH);
1263
1264 // Update LoopInfo with the new loops.
1265 auto OuterLoop = LI->AllocateLoop();
1266 auto InnerLoop = LI->AllocateLoop();
1267
1268 if (auto ParentLoop = CurLoop->getParentLoop()) {
1269 ParentLoop->addBasicBlockToLoop(BB0, *LI);
1270 ParentLoop->addChildLoop(OuterLoop);
1271 ParentLoop->addBasicBlockToLoop(BB4, *LI);
1272 } else {
1273 LI->addTopLevelLoop(OuterLoop);
1274 }
1275
1276 // Add the inner loop to the outer.
1277 OuterLoop->addChildLoop(InnerLoop);
1278
1279 // Add the new basic blocks to the corresponding loops.
1280 OuterLoop->addBasicBlockToLoop(BB1, *LI);
1281 OuterLoop->addBasicBlockToLoop(BB3, *LI);
1282 OuterLoop->addBasicBlockToLoop(BB5, *LI);
1283 InnerLoop->addBasicBlockToLoop(BB2, *LI);
1284
1285 // Update the terminator added by SplitBlock to branch to the first block.
1286 Preheader->getTerminator()->setSuccessor(0, BB0);
1287 DTU.applyUpdates({{DominatorTree::Delete, Preheader, SPH},
1288 {DominatorTree::Insert, Preheader, BB0}});
1289
1290 // (0) Check if we could be crossing a page boundary; if so, fallback to the
1291 // old scalar loops. Also create a predicate of VF elements to be used in the
1292 // vector loops.
1293 Builder.SetInsertPoint(BB0);
1294 Value *ISearchStart =
1295 Builder.CreatePtrToInt(SearchStart, I64Ty, "search_start_int");
1296 Value *ISearchEnd =
1297 Builder.CreatePtrToInt(SearchEnd, I64Ty, "search_end_int");
1298 Value *SearchIdxInit = Constant::getNullValue(I64Ty);
1299 Value *SearchTripCount =
1300 Builder.CreateZExt(Builder.CreatePtrDiff(CharTy, SearchEnd, SearchStart,
1301 "search_trip_count"),
1302 I64Ty);
1303 Value *INeedleStart =
1304 Builder.CreatePtrToInt(NeedleStart, I64Ty, "needle_start_int");
1305 Value *INeedleEnd =
1306 Builder.CreatePtrToInt(NeedleEnd, I64Ty, "needle_end_int");
1307 Value *NeedleIdxInit = Constant::getNullValue(I64Ty);
1308 Value *NeedleTripCount =
1309 Builder.CreateZExt(Builder.CreatePtrDiff(CharTy, NeedleEnd, NeedleStart,
1310 "needle_trip_count"),
1311 I64Ty);
1312 Value *PredVF =
1313 Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
1314 {ConstantInt::get(I64Ty, 0), ConstVF});
1315
1316 const uint64_t MinPageSize = TTI->getMinPageSize().value();
1317 const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
1318 Value *SearchStartPage =
1319 Builder.CreateLShr(ISearchStart, AddrShiftAmt, "search_start_page");
1320 Value *SearchEndPage =
1321 Builder.CreateLShr(ISearchEnd, AddrShiftAmt, "search_end_page");
1322 Value *NeedleStartPage =
1323 Builder.CreateLShr(INeedleStart, AddrShiftAmt, "needle_start_page");
1324 Value *NeedleEndPage =
1325 Builder.CreateLShr(INeedleEnd, AddrShiftAmt, "needle_end_page");
1326 Value *SearchPageCmp =
1327 Builder.CreateICmpNE(SearchStartPage, SearchEndPage, "search_page_cmp");
1328 Value *NeedlePageCmp =
1329 Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage, "needle_page_cmp");
1330
1331 Value *CombinedPageCmp =
1332 Builder.CreateOr(SearchPageCmp, NeedlePageCmp, "combined_page_cmp");
1333 BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1);
1334 CombinedPageBr->setMetadata(LLVMContext::MD_prof,
1335 MDBuilder(Ctx).createBranchWeights(10, 90));
1336 DTU.applyUpdates(
1337 {{DominatorTree::Insert, BB0, SPH}, {DominatorTree::Insert, BB0, BB1}});
1338
1339 // (1) Load the search array and branch to the inner loop.
1340 Builder.SetInsertPoint(BB1);
1341 PHINode *SearchIdx = Builder.CreatePHI(I64Ty, 2, "search_idx");
1342 Value *PredSearch = Builder.CreateIntrinsic(
1343 Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
1344 {SearchIdx, SearchTripCount}, nullptr, "search_pred");
1345 PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked");
1346 Value *Search = Builder.CreateGEP(CharTy, SearchStart, SearchIdx, "psearch");
1347 Value *LoadSearch = Builder.CreateMaskedLoad(
1348 CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec");
1349 Value *MatchInit = Constant::getNullValue(PredVTy);
1350 Builder.CreateBr(BB2);
1351 DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
1352
1353 // (2) Inner loop.
1354 Builder.SetInsertPoint(BB2);
1355 PHINode *NeedleIdx = Builder.CreatePHI(I64Ty, 2, "needle_idx");
1356 PHINode *Match = Builder.CreatePHI(PredVTy, 2, "pmatch");
1357
1358 // (2.a) Load the needle array.
1359 Value *PredNeedle = Builder.CreateIntrinsic(
1360 Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
1361 {NeedleIdx, NeedleTripCount}, nullptr, "needle_pred");
1362 PredNeedle = Builder.CreateAnd(PredVF, PredNeedle, "needle_masked");
1363 Value *Needle = Builder.CreateGEP(CharTy, NeedleStart, NeedleIdx, "pneedle");
1364 Value *LoadNeedle = Builder.CreateMaskedLoad(
1365 CharVTy, Needle, Align(1), PredNeedle, Passthru, "needle_load_vec");
1366
1367 // (2.b) Splat the first element to the inactive lanes.
1368 Value *Needle0 =
1369 Builder.CreateExtractElement(LoadNeedle, uint64_t(0), "needle0");
1370 Value *Needle0Splat = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
1371 Needle0, "needle0");
1372 LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat,
1373 "needle_splat");
1374 LoadNeedle = Builder.CreateExtractVector(
1375 FixedVectorType::get(CharTy, VF), LoadNeedle, uint64_t(0), "needle_vec");
1376
1377 // (2.c) Accumulate matches.
1378 Value *MatchSeg = Builder.CreateIntrinsic(
1379 Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
1380 {LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_segment");
1381 Value *MatchAcc = Builder.CreateOr(Match, MatchSeg, "match_accumulator");
1382 Value *NextNeedleIdx =
1383 Builder.CreateAdd(NeedleIdx, ConstVF, "needle_idx_next");
1384 Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedleIdx, NeedleTripCount),
1385 BB2, BB3);
1386 DTU.applyUpdates(
1387 {{DominatorTree::Insert, BB2, BB2}, {DominatorTree::Insert, BB2, BB3}});
1388
1389 // (3) Check if we found a match.
1390 Builder.SetInsertPoint(BB3);
1391 PHINode *MatchPredAccLCSSA = Builder.CreatePHI(PredVTy, 1, "match_pred");
1392 Value *IfAnyMatch = Builder.CreateOrReduce(MatchPredAccLCSSA);
1393 Builder.CreateCondBr(IfAnyMatch, BB4, BB5);
1394 DTU.applyUpdates(
1395 {{DominatorTree::Insert, BB3, BB4}, {DominatorTree::Insert, BB3, BB5}});
1396
1397 // (4) We found a match. Compute the index of its location and exit.
1398 Builder.SetInsertPoint(BB4);
1399 PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1, "match_start");
1400 PHINode *MatchPredLCSSA = Builder.CreatePHI(PredVTy, 1, "match_vec");
1401 Value *MatchCnt = Builder.CreateIntrinsic(
1402 Intrinsic::experimental_cttz_elts, {I64Ty, PredVTy},
1403 {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}, nullptr,
1404 "match_idx");
1405 Value *MatchVal =
1406 Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt, "match_res");
1407 Builder.CreateBr(ExitSucc);
1408 DTU.applyUpdates({{DominatorTree::Insert, BB4, ExitSucc}});
1409
1410 // (5) Check if we've reached the end of the search array.
1411 Builder.SetInsertPoint(BB5);
1412 Value *NextSearchIdx =
1413 Builder.CreateAdd(SearchIdx, ConstVF, "search_idx_next");
1414 Builder.CreateCondBr(Builder.CreateICmpULT(NextSearchIdx, SearchTripCount),
1415 BB1, ExitFail);
1416 DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
1417 {DominatorTree::Insert, BB5, ExitFail}});
1418
1419 // Set up the PHI nodes.
1420 SearchIdx->addIncoming(SearchIdxInit, BB0);
1421 SearchIdx->addIncoming(NextSearchIdx, BB5);
1422 NeedleIdx->addIncoming(NeedleIdxInit, BB1);
1423 NeedleIdx->addIncoming(NextNeedleIdx, BB2);
1424 Match->addIncoming(MatchInit, BB1);
1425 Match->addIncoming(MatchAcc, BB2);
1426 // These are needed to retain LCSSA form.
1427 MatchPredAccLCSSA->addIncoming(MatchAcc, BB2);
1428 MatchLCSSA->addIncoming(Search, BB3);
1429 MatchPredLCSSA->addIncoming(MatchPredAccLCSSA, BB3);
1430
1431 // Ensure all Phis in the successors of BB4/BB5 have an incoming value from
1432 // them.
1433 fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitSucc, BB4);
1434 if (ExitSucc != ExitFail)
1435 fixSuccessorPhis(CurLoop, IndPhi, MatchVal, ExitFail, BB5);
1436
1437 if (VerifyLoops) {
1438 OuterLoop->verifyLoop();
1439 InnerLoop->verifyLoop();
1440 if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI))
1441 report_fatal_error("Loops must remain in LCSSA form!");
1442 }
1443
1444 return MatchVal;
1445}
1446
1447void LoopIdiomVectorize::transformFindFirstByte(
1448 PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
1449 BasicBlock *ExitFail, Value *SearchStart, Value *SearchEnd,
1450 Value *NeedleStart, Value *NeedleEnd) {
1451 // Insert the find first byte code at the end of the preheader block.
1452 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1453 BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
1454 IRBuilder<> Builder(PHBranch);
1455 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
1456 Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
1457
1458 expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail,
1459 SearchStart, SearchEnd, NeedleStart, NeedleEnd);
1460
1461 assert(PHBranch->isUnconditional() &&
1462 "Expected preheader to terminate with an unconditional branch.");
1463
1464 if (VerifyLoops && CurLoop->getParentLoop()) {
1465 CurLoop->getParentLoop()->verifyLoop();
1466 if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
1467 report_fatal_error("Loops must remain in LCSSA form!");
1468 }
1469}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define DEBUG_TYPE
static MDNode * createBranchWeights(LLVMContext &Context, uint64_t TrueWeight, uint64_t FalseWeight)
static cl::opt< bool > VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false), cl::desc("Verify loops generated Loop Idiom Vectorize Pass."))
static cl::opt< bool > DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden, cl::init(false), cl::desc("Disable Loop Idiom Vectorize Pass."))
static void fixSuccessorPhis(Loop *L, Value *ScalarRes, Value *VectorRes, BasicBlock *SuccBB, BasicBlock *IncBB)
static cl::opt< LoopIdiomVectorizeStyle > LITVecStyle("loop-idiom-vectorize-style", cl::Hidden, cl::desc("The vectorization style for loop idiom transform."), cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked", "Use masked vector intrinsics"), clEnumValN(LoopIdiomVectorizeStyle::Predicated, "predicated", "Use VP intrinsics")), cl::init(LoopIdiomVectorizeStyle::Masked))
static cl::opt< bool > DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte", cl::Hidden, cl::init(false), cl::desc("Do not convert find-first-byte loop(s)."))
static cl::opt< unsigned > ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden, cl::desc("The vectorization factor for byte-compare patterns."), cl::init(16))
static cl::opt< bool > DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden, cl::init(false), cl::desc("Proceed with Loop Idiom Vectorize Pass, but do " "not convert byte-compare loop(s)."))
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static bool isSimple(Instruction *I)
#define LLVM_DEBUG(...)
Definition Debug.h:114
static cl::opt< unsigned > MinPageSize("min-page-size", cl::init(0), cl::Hidden, cl::desc("Use this to override the target's minimum page size."))
This pass exposes codegen information to IR-level passes.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:539
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
const Instruction & front() const
Definition BasicBlock.h:493
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
bool isUnconditional() const
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
LLVM_ABI bool isInBounds() const
Determine whether the GEP has the inbounds flag.
Type * getResultElementType() const
unsigned getNumIndices() const
Module * getParent()
Get the module that this global value is contained inside of...
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition IRBuilder.h:497
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2324
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition IRBuilder.h:1096
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:546
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2549
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:502
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1516
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:561
Value * CreateVScale(Type *Ty, const Twine &Name="")
Create a call to llvm.vscale.<Ty>().
Definition IRBuilder.h:957
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition IRBuilder.h:566
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2312
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:1944
LLVM_ABI CallInst * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2473
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2308
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateCountTrailingZeroElems(Type *ResTy, Value *Mask, bool ZeroIsPoison=true, const Twine &Name="")
Create a call to llvm.experimental_cttz_elts.
Definition IRBuilder.h:1137
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1423
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1200
LLVM_ABI Value * CreatePtrDiff(Value *LHS, Value *RHS, const Twine &Name="", bool IsNUW=false)
Return the difference between two pointer values.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1854
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2054
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1554
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1406
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2166
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2040
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:604
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition IRBuilder.h:1194
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateICmpULE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2328
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1576
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:551
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1440
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
bool isSimple() const
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
Class to represent scalable SIMD vectors.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:824
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:427
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
const ParentTy * getParent() const
Definition ilist_node.h:34
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
br_match m_UnconditionalBr(BasicBlock *&Succ)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
auto m_GEP(const OperandTypes &...Ops)
Matches GetElementPtrInst.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
OuterAnalysisManagerProxy< FunctionAnalysisManager, Loop, LoopStandardAnalysisResults & > FunctionAnalysisManagerLoopProxy
A proxy from a FunctionAnalysisManager to a Loop.
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...