//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where it kicks in, it can be a significant
// performance win.
//
// If compiling for code size, we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
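//
// For example, the canonical idioms this pass recognizes are:
//
//   for (i) A[i] = 0;    =>  memset(A, 0, n);
//   for (i) A[i] = B[i]; =>  memcpy(A, B, n);
//
// with the library call emitted in the loop preheader once the loop is known
// to touch every byte of the destination range and to contain no conflicting
// accesses.
//
//===----------------------------------------------------------------------===//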
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
// Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(CurLoop))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
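
// For example, an i32 store of the constant 0x01020304 is not bytewise
// splattable, so it cannot become a plain memset; getMemSetPatternValue
// instead replicates the 4-byte constant four times into a 16-byte pattern,
// conceptually [4 x i32] [0x01020304, 0x01020304, 0x01020304, 0x01020304],
// which is handed to memset_pattern16 on targets that provide it.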

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which store
  // integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic is not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then
    // we know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}
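
// To illustrate the classification for i32 stores whose pointer SCEV is
// {A,+,4} on the current loop:
//   A[i] = 0;          // bytewise splat             -> LegalStoreKind::Memset
//   A[i] = 0x01020304; // constant, not splattable   -> LegalStoreKind::MemsetPattern
//   A[i] = B[i];       // strided load, same stride  -> LegalStoreKind::Memcpy
// A volatile store, a non-affine pointer, or a stride that does not match the
// store size yields LegalStoreKind::None.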

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if the given stores can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
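
// The chain logic above is what catches hand-unrolled loops. For example:
// \code
//   void zero2(long *A, long N) {
//     for (long I = 0; I < N; I += 2) {
//       A[I] = 0;     // 8-byte store with a 16-byte stride: not a memset alone
//       A[I + 1] = 0; // consecutive with the store above
//     }
//   }
// \endcode
// Neither i64 store covers its 16-byte stride by itself, but the pair is
// consecutive and together touches every byte, so the two-store chain reaches
// processLoopStridedStore with an accumulated StoreSize of 16.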

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over from
      // the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}
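
// For example, a loop copying one fixed-size row per iteration:
// \code
//   void copyrows(char *Dst, const char *Src, long Rows) {
//     for (long I = 0; I < Rows; ++I)
//       memcpy(Dst + I * 16, Src + I * 16, 16);
//   }
// \endcode
// Both pointer SCEVs stride by 16, which matches the 16-byte copy length, so
// every byte is touched exactly once and the loop collapses to a single
// memcpy(Dst, Src, Rows * 16).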

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass does not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset lengths and
    // strides that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}
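
// For example, zeroing one row per iteration with a non-constant length:
// \code
//   void zerorows(char *P, long Rows, long RowBytes) {
//     for (long I = 0; I < Rows; ++I)
//       memset(P + I * RowBytes, 0, RowBytes);
//   }
// \endcode
// The memset length is not a constant, but the pointer stride and the length
// are the same loop-invariant SCEV (RowBytes), so the loop is folded into the
// single call memset(P, 0, Rows * RowBytes).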

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] of 100 will always return may-alias
  // with a store to &A[100]; we need StoreLoc to be "A" with a size of 100,
  // which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index
  return SE->getMinusSCEV(Start, Index);
}
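
// Worked example: a loop storing i32s downward from &A[9] with a backedge-taken
// count of 9 has Start = &A[9] and StoreSize = 4, so Index = 9 * 4 = 36 bytes
// and the recomputed base is &A[9] - 36 bytes = &A[0], exactly where the
// equivalent forward memset must begin.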

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV =
      SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}
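
// That is, NumBytes = TripCount * StoreSize = (BECount + 1) * StoreSize; e.g. a
// loop with a backedge-taken count of 99 that stores 4 bytes per iteration
// yields a 100 * 4 = 400-byte memset/memcpy.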

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    AAMDNodes AATags = TheStore->getAAMetadata();
    for (Instruction *Store : Stores)
      AATags = AATags.merge(Store->getAAMetadata());
    if (auto CI = dyn_cast<ConstantInt>(NumBytes))
      AATags = AATags.extendTo(CI->getZExtValue());
    else
      AATags = AATags.extendTo(-1);

    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    // Everything is emitted in default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergeable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  } else
    return Changed;

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same stride,
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
                StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
} // namespace
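
// For example, with "for (i) A[i] = A[i + 10];" over a single char array, both
// base pointers resolve to the same object A, so IsSameObject is true and a
// plain memcpy would be an overlapping (undefined) copy. Because the load
// offset (10) lies beyond the bytes the store writes at each step for this
// positive stride, loadAndStoreMayFormMemmove accepts it and a memmove is
// emitted instead.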

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant size; currently we expect a constant store
  // size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  // If the load or store are atomic, then they must necessarily be unordered
  // by previous checks.
  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(
          StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
          /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags.TBAA,
                               AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  } else {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size, then
    // we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;

    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
}

/// Check if the given conditional branch is based on the comparison between
/// a variable and zero, and if the variable is non-zero (or zero when
/// JmpOnZero is true), control branches to the loop entry. If the branch
/// matches this behavior, the variable involved in the comparison is returned.
/// This function is called to see if the precondition and postcondition of the
/// loop are in desirable form.
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
                             bool JmpOnZero = false) {
  if (!BI || !BI->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;

  BasicBlock *TrueSucc = BI->getSuccessor(0);
  BasicBlock *FalseSucc = BI->getSuccessor(1);
  if (JmpOnZero)
    std::swap(TrueSucc, FalseSucc);

  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
    return Cond->getOperand(0);

  return nullptr;
}
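
// For example, given a latch that ends in
// \code
//   %cmp = icmp ne i64 %x, 0
//   br i1 %cmp, label %loop.entry, label %exit
// \endcode
// matchCondition returns %x; with JmpOnZero == true it instead matches the
// inverted form that branches to the loop entry when %x is zero.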
1510
1511// Check if the recurrence variable `VarX` is in the right form to create
1512// the idiom. Returns the value coerced to a PHINode if so.
1514 BasicBlock *LoopEntry) {
1515 auto *PhiX = dyn_cast<PHINode>(VarX);
1516 if (PhiX && PhiX->getParent() == LoopEntry &&
1517 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1518 return PhiX;
1519 return nullptr;
1520}
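
// For example, given VarX = %x1 and DefX = %x2 in
// \code
//   loop:
//     %x1 = phi i32 [ %x0, %entry ], [ %x2, %loop ]
//     <...>
// \endcode
// getRecurrenceVar returns the PHI node %x1 when LoopEntry is %loop.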
1521
1522/// Return true iff the idiom is detected in the loop.
1523///
1524/// Additionally:
1525/// 1) \p CntInst is set to the instruction counting the population bit.
1526/// 2) \p CntPhi is set to the corresponding phi node.
1527/// 3) \p Var is set to the value whose population bits are being counted.
1528///
1529/// The core idiom we are trying to detect is:
1530/// \code
1531/// if (x0 != 0)
1532/// goto loop-exit // the precondition of the loop
1533/// cnt0 = init-val;
1534/// do {
1535/// x1 = phi (x0, x2);
1536/// cnt1 = phi(cnt0, cnt2);
1537///
1538/// cnt2 = cnt1 + 1;
1539/// ...
1540/// x2 = x1 & (x1 - 1);
1541/// ...
1542 /// } while (x2 != 0);
1543///
1544/// loop-exit:
1545/// \endcode
1546static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1547 Instruction *&CntInst, PHINode *&CntPhi,
1548 Value *&Var) {
1549 // The loop-back branch must match this pattern:
1550 // "if (a != 0) goto loop-entry".
1551 BasicBlock *LoopEntry;
1552 Instruction *DefX2, *CountInst;
1553 Value *VarX1, *VarX0;
1554 PHINode *PhiX, *CountPhi;
1555
1556 DefX2 = CountInst = nullptr;
1557 VarX1 = VarX0 = nullptr;
1558 PhiX = CountPhi = nullptr;
1559 LoopEntry = *(CurLoop->block_begin());
1560
1561 // step 1: Check if the loop-back branch is in desirable form.
1562 {
1563 if (Value *T = matchCondition(
1564 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1565 DefX2 = dyn_cast<Instruction>(T);
1566 else
1567 return false;
1568 }
1569
1570 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1571 {
1572 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1573 return false;
1574
1575 BinaryOperator *SubOneOp;
1576
1577 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1578 VarX1 = DefX2->getOperand(1);
1579 else {
1580 VarX1 = DefX2->getOperand(0);
1581 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1582 }
1583 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1584 return false;
1585
1586 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1587 if (!Dec ||
1588 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1589 (SubOneOp->getOpcode() == Instruction::Add &&
1590 Dec->isMinusOne()))) {
1591 return false;
1592 }
1593 }
1594
1595 // step 3: Check the recurrence of variable X
1596 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1597 if (!PhiX)
1598 return false;
1599
1600 // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1601 {
1602 CountInst = nullptr;
1603 for (Instruction &Inst : llvm::make_range(
1604 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1605 if (Inst.getOpcode() != Instruction::Add)
1606 continue;
1607
1608 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1609 if (!Inc || !Inc->isOne())
1610 continue;
1611
1612 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1613 if (!Phi)
1614 continue;
1615
1616 // Check if the result of the instruction is live out of the loop.
1617 bool LiveOutLoop = false;
1618 for (User *U : Inst.users()) {
1619 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1620 LiveOutLoop = true;
1621 break;
1622 }
1623 }
1624
1625 if (LiveOutLoop) {
1626 CountInst = &Inst;
1627 CountPhi = Phi;
1628 break;
1629 }
1630 }
1631
1632 if (!CountInst)
1633 return false;
1634 }
1635
1636 // step 5: check if the precondition is in this form:
1637 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1638 {
1639 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1640 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1641 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1642 return false;
1643
1644 CntInst = CountInst;
1645 CntPhi = CountPhi;
1646 Var = T;
1647 }
1648
1649 return true;
1650}
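
// In C, the canonical source for this idiom is Kernighan's bit-counting loop,
// roughly:
// \code
//   int cnt = init;
//   while (x != 0) {
//     x &= x - 1; // clears the lowest set bit
//     cnt++;
//   }
// \endcode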
1651
1652/// Return true if the idiom is detected in the loop.
1653///
1654/// Additionally:
1655/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1656/// or nullptr if there is no such.
1657/// 2) \p CntPhi is set to the corresponding phi node
1658/// or nullptr if there is no such.
1659/// 3) \p Var is set to the value whose CTLZ could be used.
1660/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1661///
1662/// The core idiom we are trying to detect is:
1663/// \code
1664/// if (x0 == 0)
1665/// goto loop-exit // the precondition of the loop
1666/// cnt0 = init-val;
1667/// do {
1668/// x = phi (x0, x.next); //PhiX
1669/// cnt = phi(cnt0, cnt.next);
1670///
1671/// cnt.next = cnt + 1;
1672/// ...
1673/// x.next = x >> 1; // DefX
1674/// ...
1675/// } while(x.next != 0);
1676///
1677/// loop-exit:
1678/// \endcode
1679static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1680 Intrinsic::ID &IntrinID, Value *&InitX,
1681 Instruction *&CntInst, PHINode *&CntPhi,
1682 Instruction *&DefX) {
1683 BasicBlock *LoopEntry;
1684 Value *VarX = nullptr;
1685
1686 DefX = nullptr;
1687 CntInst = nullptr;
1688 CntPhi = nullptr;
1689 LoopEntry = *(CurLoop->block_begin());
1690
1691 // step 1: Check if the loop-back branch is in desirable form.
1692 if (Value *T = matchCondition(
1693 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1694 DefX = dyn_cast<Instruction>(T);
1695 else
1696 return false;
1697
1698 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1699 if (!DefX || !DefX->isShift())
1700 return false;
1701 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1702 Intrinsic::ctlz;
1703 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1704 if (!Shft || !Shft->isOne())
1705 return false;
1706 VarX = DefX->getOperand(0);
1707
1708 // step 3: Check the recurrence of variable X
1709 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1710 if (!PhiX)
1711 return false;
1712
1713 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1714
1715 // Make sure the initial value can't be negative otherwise the ashr in the
1716 // loop might never reach zero which would make the loop infinite.
1717 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1718 return false;
1719
1720 // step 4: Find the instruction which counts the shifts: cnt.next = cnt + 1
1721 // or cnt.next = cnt + -1.
1722 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1723 // then all uses of "cnt.next" could be optimized to the trip count
1724 // plus "cnt0". Currently it is not optimized.
1725 // This step could be used to detect POPCNT instruction:
1726 // cnt.next = cnt + (x.next & 1)
1727 for (Instruction &Inst : llvm::make_range(
1728 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1729 if (Inst.getOpcode() != Instruction::Add)
1730 continue;
1731
1732 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1733 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1734 continue;
1735
1736 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1737 if (!Phi)
1738 continue;
1739
1740 CntInst = &Inst;
1741 CntPhi = Phi;
1742 break;
1743 }
1744 if (!CntInst)
1745 return false;
1746
1747 return true;
1748}
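
// In C, a typical source for this idiom is a loop counting how many shifts
// are needed before the value reaches zero, roughly:
// \code
//   int cnt = init;
//   do {
//     cnt++;
//     x >>= 1;
//   } while (x != 0);
// \endcode
// (or `x <<= 1`, in which case the trip count is expressible via CTTZ).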
1749
1750/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1751 /// to countable (with CTLZ / CTTZ trip count). Returns true if a CTLZ / CTTZ
1752 /// intrinsic was inserted to compute the new trip count; otherwise false.
1753bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1754 // Give up if the loop has multiple blocks or multiple backedges.
1755 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1756 return false;
1757
1758 Intrinsic::ID IntrinID;
1759 Value *InitX;
1760 Instruction *DefX = nullptr;
1761 PHINode *CntPhi = nullptr;
1762 Instruction *CntInst = nullptr;
1763 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1764 // this is always 6.
1765 size_t IdiomCanonicalSize = 6;
1766
1767 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1768 CntInst, CntPhi, DefX))
1769 return false;
1770
1771 bool IsCntPhiUsedOutsideLoop = false;
1772 for (User *U : CntPhi->users())
1773 if (!CurLoop->contains(cast<Instruction>(U))) {
1774 IsCntPhiUsedOutsideLoop = true;
1775 break;
1776 }
1777 bool IsCntInstUsedOutsideLoop = false;
1778 for (User *U : CntInst->users())
1779 if (!CurLoop->contains(cast<Instruction>(U))) {
1780 IsCntInstUsedOutsideLoop = true;
1781 break;
1782 }
1783 // If both CntInst and CntPhi are used outside the loop the profitability
1784 // is questionable.
1785 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1786 return false;
1787
1788 // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1789 // when X is 0. If we cannot guarantee X != 0, we need to check this
1790 // when expanding the intrinsic.
1791 bool ZeroCheck = false;
1792 // It is safe to assume the Preheader exists, as it was checked in the
1793 // parent function runOnLoop.
1794 BasicBlock *PH = CurLoop->getLoopPreheader();
1795
1796 // If we are using the count instruction outside the loop, make sure we
1797 // have a zero check as a precondition. Without the check the loop would run
1798 // one iteration before any check of the input value. This means 0 and 1
1799 // would have identical behavior in the original loop, and thus a zero check is required.
1800 if (!IsCntPhiUsedOutsideLoop) {
1801 auto *PreCondBB = PH->getSinglePredecessor();
1802 if (!PreCondBB)
1803 return false;
1804 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1805 if (!PreCondBI)
1806 return false;
1807 if (matchCondition(PreCondBI, PH) != InitX)
1808 return false;
1809 ZeroCheck = true;
1810 }
1811
1812 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1813 // profitable if we delete the loop.
1814
1815 // The loop has only 6 instructions:
1816 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1817 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1818 // %shr = ashr %n.addr.0, 1
1819 // %tobool = icmp eq %shr, 0
1820 // %inc = add nsw %i.0, 1
1821 // br i1 %tobool
1822
1823 const Value *Args[] = {InitX,
1824 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1825
1826 // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1827 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1828 uint32_t HeaderSize =
1829 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1830
1831 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1832 InstructionCost Cost =
1833 TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1834 if (HeaderSize != IdiomCanonicalSize &&
1835 Cost > TargetTransformInfo::TCC_Basic)
1836 return false;
1837
1838 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1839 DefX->getDebugLoc(), ZeroCheck,
1840 IsCntPhiUsedOutsideLoop);
1841 return true;
1842}
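
// For example, for a 32-bit `n` known to be non-zero, a loop such as
// \code
//   int cnt = 0;
//   do { cnt++; n >>= 1; } while (n != 0);
// \endcode
// runs exactly 32 - ctlz(n) iterations, so the count becomes computable in
// the preheader and the loop itself becomes countable (and often dead).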
1843
1844/// Recognizes a population count idiom in a non-countable loop.
1845///
1846/// If detected, transforms the relevant code to issue the popcount intrinsic
1847/// function call, and returns true; otherwise, returns false.
1848bool LoopIdiomRecognize::recognizePopcount() {
1849 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1850 return false;
1851
1852 // Counting population is usually done with only a few arithmetic instructions.
1853 // Such instructions can be easily "absorbed" by vacant slots in a
1854 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1855 // in a compact loop.
1856
1857 // Give up if the loop has multiple blocks or multiple backedges.
1858 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1859 return false;
1860
1861 BasicBlock *LoopBody = *(CurLoop->block_begin());
1862 if (LoopBody->size() >= 20) {
1863 // The loop is too big, bail out.
1864 return false;
1865 }
1866
1867 // It should have a preheader containing nothing but an unconditional branch.
1868 BasicBlock *PH = CurLoop->getLoopPreheader();
1869 if (!PH || &PH->front() != PH->getTerminator())
1870 return false;
1871 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1872 if (!EntryBI || EntryBI->isConditional())
1873 return false;
1874
1875 // It should have a precondition block where the generated popcount intrinsic
1876 // function can be inserted.
1877 auto *PreCondBB = PH->getSinglePredecessor();
1878 if (!PreCondBB)
1879 return false;
1880 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1881 if (!PreCondBI || PreCondBI->isUnconditional())
1882 return false;
1883
1884 Instruction *CntInst;
1885 PHINode *CntPhi;
1886 Value *Val;
1887 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1888 return false;
1889
1890 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1891 return true;
1892}
1893
1894 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1895 const DebugLoc &DL) {
1896 Value *Ops[] = {Val};
1897 Type *Tys[] = {Val->getType()};
1898
1899 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1900 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1901 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1902 CI->setDebugLoc(DL);
1903
1904 return CI;
1905}
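
// For an i32 value this emits, at the builder's insertion point:
// \code
//   %ctpop = call i32 @llvm.ctpop.i32(i32 %val)
// \endcode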
1906
1907 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1908 const DebugLoc &DL, bool ZeroCheck,
1909 Intrinsic::ID IID) {
1910 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1911 Type *Tys[] = {Val->getType()};
1912
1913 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1914 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1915 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1916 CI->setDebugLoc(DL);
1917
1918 return CI;
1919}
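
// For an i32 value with IID == Intrinsic::ctlz and ZeroCheck == true this
// emits:
// \code
//   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true)
// \endcode
// where the i1 flag states that a zero input yields an undefined result.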
1920
1921/// Transform the following loop (Using CTLZ, CTTZ is similar):
1922/// loop:
1923/// CntPhi = PHI [Cnt0, CntInst]
1924/// PhiX = PHI [InitX, DefX]
1925/// CntInst = CntPhi + 1
1926/// DefX = PhiX >> 1
1927/// LOOP_BODY
1928/// Br: loop if (DefX != 0)
1929/// Use(CntPhi) or Use(CntInst)
1930///
1931/// Into:
1932/// If CntPhi used outside the loop:
1933/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1934/// Count = CountPrev + 1
1935/// else
1936/// Count = BitWidth(InitX) - CTLZ(InitX)
1937/// loop:
1938/// CntPhi = PHI [Cnt0, CntInst]
1939/// PhiX = PHI [InitX, DefX]
1940/// PhiCount = PHI [Count, Dec]
1941/// CntInst = CntPhi + 1
1942/// DefX = PhiX >> 1
1943/// Dec = PhiCount - 1
1944/// LOOP_BODY
1945/// Br: loop if (Dec != 0)
1946/// Use(CountPrev + Cnt0) // Use(CntPhi)
1947/// or
1948/// Use(Count + Cnt0) // Use(CntInst)
1949///
1950/// If LOOP_BODY is empty the loop will be deleted.
1951/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1952void LoopIdiomRecognize::transformLoopToCountable(
1953 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1954 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1955 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1956 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1957
1958 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1959 IRBuilder<> Builder(PreheaderBr);
1960 Builder.SetCurrentDebugLocation(DL);
1961
1962 // If there are no uses of CntPhi, create:
1963 // Count = BitWidth - CTLZ(InitX);
1964 // NewCount = Count;
1965 // If there are uses of CntPhi create:
1966 // NewCount = BitWidth - CTLZ(InitX >> 1);
1967 // Count = NewCount + 1;
1968 Value *InitXNext;
1969 if (IsCntPhiUsedOutsideLoop) {
1970 if (DefX->getOpcode() == Instruction::AShr)
1971 InitXNext = Builder.CreateAShr(InitX, 1);
1972 else if (DefX->getOpcode() == Instruction::LShr)
1973 InitXNext = Builder.CreateLShr(InitX, 1);
1974 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1975 InitXNext = Builder.CreateShl(InitX, 1);
1976 else
1977 llvm_unreachable("Unexpected opcode!");
1978 } else
1979 InitXNext = InitX;
1980 Value *Count =
1981 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1982 Type *CountTy = Count->getType();
1983 Count = Builder.CreateSub(
1984 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
1985 Value *NewCount = Count;
1986 if (IsCntPhiUsedOutsideLoop)
1987 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
1988
1989 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
1990
1991 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1992 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
1993 // If the counter was being incremented in the loop, add NewCount to the
1994 // counter's initial value, but only if the initial value is not zero.
1995 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1996 if (!InitConst || !InitConst->isZero())
1997 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1998 } else {
1999 // If the count was being decremented in the loop, subtract NewCount from
2000 // the counter's initial value.
2001 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2002 }
2003
2004 // Step 2: Insert new IV and loop condition:
2005 // loop:
2006 // ...
2007 // PhiCount = PHI [Count, Dec]
2008 // ...
2009 // Dec = PhiCount - 1
2010 // ...
2011 // Br: loop if (Dec != 0)
2012 BasicBlock *Body = *(CurLoop->block_begin());
2013 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2014 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2015
2016 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2017
2018 Builder.SetInsertPoint(LbCond);
2019 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2020 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2021
2022 TcPhi->addIncoming(Count, Preheader);
2023 TcPhi->addIncoming(TcDec, Body);
2024
2025 CmpInst::Predicate Pred =
2026 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2027 LbCond->setPredicate(Pred);
2028 LbCond->setOperand(0, TcDec);
2029 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2030
2031 // Step 3: All the references to the original counter outside
2032 // the loop are replaced with the NewCount
2033 if (IsCntPhiUsedOutsideLoop)
2034 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2035 else
2036 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2037
2038 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2039 // loop. The loop would otherwise not be deleted even if it becomes empty.
2040 SE->forgetLoop(CurLoop);
2041}
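
// For instance, for an i32 input with ZeroCheck set and CntPhi not used
// outside the loop, the preheader ends up computing:
// \code
//   %ctlz = call i32 @llvm.ctlz.i32(i32 %init, i1 true)
//   %count = sub i32 32, %ctlz
// \endcode
// and the rewritten loop counts %count down to zero.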
2042
2043void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2044 Instruction *CntInst,
2045 PHINode *CntPhi, Value *Var) {
2046 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2047 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2048 const DebugLoc &DL = CntInst->getDebugLoc();
2049
2050 // Assuming before transformation, the loop is following:
2051 // if (x) // the precondition
2052 // do { cnt++; x &= x - 1; } while(x);
2053
2054 // Step 1: Insert the ctpop instruction at the end of the precondition block
2055 IRBuilder<> Builder(PreCondBr);
2056 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2057 {
2058 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2059 NewCount = PopCntZext =
2060 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2061
2062 if (NewCount != PopCnt)
2063 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2064
2065 // TripCnt is exactly the number of iterations the loop has
2066 TripCnt = NewCount;
2067
2068 // If the population counter's initial value is not zero, insert Add Inst.
2069 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2070 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2071 if (!InitConst || !InitConst->isZero()) {
2072 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2073 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2074 }
2075 }
2076
2077 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2078 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2079 // function would be partial dead code, and downstream passes will drag
2080 // it back from the precondition block to the preheader.
2081 {
2082 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2083
2084 Value *Opnd0 = PopCntZext;
2085 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2086 if (PreCond->getOperand(0) != Var)
2087 std::swap(Opnd0, Opnd1);
2088
2089 ICmpInst *NewPreCond = cast<ICmpInst>(
2090 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2091 PreCondBr->setCondition(NewPreCond);
2092
2093 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2094 }
2095
2096 // Step 3: Note that the population count is exactly the trip count of the
2097 // loop in question, which enables us to convert the loop from a
2098 // noncountable loop into a countable one. The benefit is twofold:
2099 //
2100 // - If the loop only counts population, the entire loop becomes dead after
2101 // the transformation. It is a lot easier to prove a countable loop dead
2102 // than to prove a noncountable one. (In some C dialects, an infinite loop
2103 // isn't dead even if it computes nothing useful. In general, DCE needs
2104 // to prove a noncountable loop finite before safely deleting it.)
2105 //
2106 // - If the loop also performs something else, it remains alive.
2107 // Since it is transformed to countable form, it can be aggressively
2108 // optimized by some optimizations which are in general not applicable
2109 // to a noncountable loop.
2110 //
2111 // After this step, this loop (conceptually) would look like the following:
2112 // newcnt = __builtin_ctpop(x);
2113 // t = newcnt;
2114 // if (x)
2115 // do { cnt++; x &= x-1; t--; } while (t > 0);
2116 BasicBlock *Body = *(CurLoop->block_begin());
2117 {
2118 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2119 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2120 Type *Ty = TripCnt->getType();
2121
2122 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2123
2124 Builder.SetInsertPoint(LbCond);
2125 Instruction *TcDec = cast<Instruction>(
2126 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2127 "tcdec", false, true));
2128
2129 TcPhi->addIncoming(TripCnt, PreHead);
2130 TcPhi->addIncoming(TcDec, Body);
2131
2132 CmpInst::Predicate Pred =
2133 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2134 LbCond->setPredicate(Pred);
2135 LbCond->setOperand(0, TcDec);
2136 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2137 }
2138
2139 // Step 4: All the references to the original population counter outside
2140 // the loop are replaced with the NewCount -- the value returned from
2141 // __builtin_ctpop().
2142 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2143
2144 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2145 // loop. The loop would otherwise not be deleted even if it becomes empty.
2146 SE->forgetLoop(CurLoop);
2147}
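
// In C terms, the net effect on a pure popcount loop is:
// \code
//   cnt = init + __builtin_popcount(x); // the loop body becomes dead
// \endcode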
2148
2149/// Match loop-invariant value.
2150template <typename SubPattern_t> struct match_LoopInvariant {
2151 SubPattern_t SubPattern;
2152 const Loop *L;
2153
2154 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2155 : SubPattern(SP), L(L) {}
2156
2157 template <typename ITy> bool match(ITy *V) {
2158 return L->isLoopInvariant(V) && SubPattern.match(V);
2159 }
2160};
2161
2162/// Matches if the value is loop-invariant.
2163template <typename Ty>
2164inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2165 return match_LoopInvariant<Ty>(M, L);
2166}
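
// Example use, mirroring the bit-test matcher below:
// \code
//   Value *BitPos;
//   if (match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop)))
//     ; // V is a loop-invariant `1 << %bitpos`
// \endcode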
2167
2168/// Return true if the idiom is detected in the loop.
2169///
2170/// The core idiom we are trying to detect is:
2171/// \code
2172/// entry:
2173/// <...>
2174/// %bitmask = shl i32 1, %bitpos
2175/// br label %loop
2176///
2177/// loop:
2178/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2179/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2180/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2181/// %x.next = shl i32 %x.curr, 1
2182/// <...>
2183/// br i1 %x.curr.isbitunset, label %loop, label %end
2184///
2185/// end:
2186/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2187/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2188/// <...>
2189/// \endcode
2190static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2191 Value *&BitMask, Value *&BitPos,
2192 Value *&CurrX, Instruction *&NextX) {
2194 " Performing shift-until-bittest idiom detection.\n");
2195
2196 // Give up if the loop has multiple blocks or multiple backedges.
2197 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2198 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2199 return false;
2200 }
2201
2202 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2203 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2204 assert(LoopPreheaderBB && "There is always a loop preheader.");
2205
2206 using namespace PatternMatch;
2207
2208 // Step 1: Check if the loop backedge is in desirable form.
2209
2210 ICmpInst::Predicate Pred;
2211 Value *CmpLHS, *CmpRHS;
2212 BasicBlock *TrueBB, *FalseBB;
2213 if (!match(LoopHeaderBB->getTerminator(),
2214 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2215 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2216 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2217 return false;
2218 }
2219
2220 // Step 2: Check if the backedge's condition is in desirable form.
2221
2222 auto MatchVariableBitMask = [&]() {
2223 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2224 match(CmpLHS,
2225 m_c_And(m_Value(CurrX),
2226 m_CombineAnd(
2227 m_Value(BitMask),
2228 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2229 CurLoop))));
2230 };
2231 auto MatchConstantBitMask = [&]() {
2232 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2233 match(CmpLHS, m_And(m_Value(CurrX),
2234 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2235 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2236 };
2237 auto MatchDecomposableConstantBitMask = [&]() {
2238 APInt Mask;
2239 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2240 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2241 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2242 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2243 };
2244
2245 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2246 !MatchDecomposableConstantBitMask()) {
2247 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2248 return false;
2249 }
2250
2251 // Step 3: Check if the recurrence is in desirable form.
2252 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2253 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2254 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2255 return false;
2256 }
2257
2258 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2259 NextX =
2260 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2261
2262 assert(CurLoop->isLoopInvariant(BaseX) &&
2263 "Expected BaseX to be avaliable in the preheader!");
2264
2265 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2266 // FIXME: support right-shift?
2267 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2268 return false;
2269 }
2270
2271 // Step 4: Check if the backedge's destinations are in desirable form.
2272
2274 "Should only get equality predicates here.");
2275
2276 // cmp-br is commutative, so canonicalize to a single variant.
2277 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2278 Pred = ICmpInst::getInversePredicate(Pred);
2279 std::swap(TrueBB, FalseBB);
2280 }
2281
2282 // We expect to exit loop when comparison yields false,
2283 // so when it yields true we should branch back to loop header.
2284 if (TrueBB != LoopHeaderBB) {
2285 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2286 return false;
2287 }
2288
2289 // Okay, idiom checks out.
2290 return true;
2291}
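
// In C, this corresponds to shifting a value left until a chosen bit is set,
// roughly:
// \code
//   while ((x & (1u << bitpos)) == 0)
//     x <<= 1;
// \endcode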
2292
2293/// Look for the following loop:
2294/// \code
2295/// entry:
2296/// <...>
2297/// %bitmask = shl i32 1, %bitpos
2298/// br label %loop
2299///
2300/// loop:
2301/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2302/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2303/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2304/// %x.next = shl i32 %x.curr, 1
2305/// <...>
2306/// br i1 %x.curr.isbitunset, label %loop, label %end
2307///
2308/// end:
2309/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2310/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2311/// <...>
2312/// \endcode
2313///
2314/// And transform it into:
2315/// \code
2316/// entry:
2317/// %bitmask = shl i32 1, %bitpos
2318/// %lowbitmask = add i32 %bitmask, -1
2319/// %mask = or i32 %lowbitmask, %bitmask
2320/// %x.masked = and i32 %x, %mask
2321/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2322/// i1 true)
2323/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2324/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2325/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2326/// %tripcount = add i32 %backedgetakencount, 1
2327/// %x.curr = shl i32 %x, %backedgetakencount
2328/// %x.next = shl i32 %x, %tripcount
2329/// br label %loop
2330///
2331/// loop:
2332/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2333/// %loop.iv.next = add nuw i32 %loop.iv, 1
2334/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2335/// <...>
2336/// br i1 %loop.ivcheck, label %end, label %loop
2337///
2338/// end:
2339/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2340/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2341/// <...>
2342/// \endcode
2343bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2344 bool MadeChange = false;
2345
2346 Value *X, *BitMask, *BitPos, *XCurr;
2347 Instruction *XNext;
2348 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2349 XNext)) {
2351 " shift-until-bittest idiom detection failed.\n");
2352 return MadeChange;
2353 }
2354 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2355
2356 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2357 // but is it profitable to transform?
2358
2359 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2360 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2361 assert(LoopPreheaderBB && "There is always a loop preheader.");
2362
2363 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2364 assert(SuccessorBB && "There is only a single successor.");
2365
2366 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2367 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2368
2369 Intrinsic::ID IntrID = Intrinsic::ctlz;
2370 Type *Ty = X->getType();
2371 unsigned Bitwidth = Ty->getScalarSizeInBits();
2372
2373 TargetTransformInfo::TargetCostKind CostKind =
2374 TargetTransformInfo::TCK_SizeAndLatency;
2375
2376 // The rewrite is considered to be unprofitable if and only if the
2377 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2378 // making the loop countable, even if nothing else changes.
2379 IntrinsicCostAttributes Attrs(
2380 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2381 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2382 if (Cost > TargetTransformInfo::TCC_Basic) {
2383 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2384 " Intrinsic is too costly, not beneficial\n");
2385 return MadeChange;
2386 }
2387 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2388 TargetTransformInfo::TCC_Basic) {
2389 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2390 return MadeChange;
2391 }
2392
2393 // Ok, transform appears worthwhile.
2394 MadeChange = true;
2395
2396 // Step 1: Compute the loop trip count.
2397
2398 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2399 BitPos->getName() + ".lowbitmask");
2400 Value *Mask =
2401 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2402 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2403 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2404 IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2405 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2406 Value *XMaskedNumActiveBits = Builder.CreateSub(
2407 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2408 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2409 /*HasNSW=*/Bitwidth != 2);
2410 Value *XMaskedLeadingOnePos =
2411 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2412 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2413 /*HasNSW=*/Bitwidth > 2);
2414
2415 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2416 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2417 /*HasNUW=*/true, /*HasNSW=*/true);
2418 // We know loop's backedge-taken count, but what's loop's trip count?
2419 // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2420 Value *LoopTripCount =
2421 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2422 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2423 /*HasNSW=*/Bitwidth != 2);
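// For instance, with i32 %x = 1 and %bitpos = 5: %bitmask = 32, %mask = 63,
// %x.masked = 1, ctlz gives 31, numactivebits = 1, leadingonepos = 0,
// so backedgetakencount = 5 and tripcount = 6, matching the five shifts the
// original loop performs before bit 5 of %x.curr becomes set.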
2424
2425 // Step 2: Compute the recurrence's final value without a loop.
2426
2427 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2428 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2429 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2430 NewX->takeName(XCurr);
2431 if (auto *I = dyn_cast<Instruction>(NewX))
2432 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2433
2434 Value *NewXNext;
2435 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2436 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2437 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2438 // that isn't the case, we'll need to emit an alternative, safe IR.
2439 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2440 PatternMatch::match(
2441 BitPos, PatternMatch::m_SpecificInt_ICMP(
2442 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2443 Ty->getScalarSizeInBits() - 1))))
2444 NewXNext = Builder.CreateShl(X, LoopTripCount);
2445 else {
2446 // Otherwise, just additionally shift by one. It's the smallest solution,
2447 // alternatively, we could check that NewX is INT_MIN (or BitPos is
2448 // bitwidth(x) - 1) and select 0 instead.
2449 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2450 }
2451
2452 NewXNext->takeName(XNext);
2453 if (auto *I = dyn_cast<Instruction>(NewXNext))
2454 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2455
2456 // Step 3: Adjust the successor basic block to receive the computed
2457 // recurrence's final value instead of the recurrence itself.
2458
2459 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2460 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2461
2462 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2463
2464 // The new canonical induction variable.
2465 Builder.SetInsertPoint(&LoopHeaderBB->front());
2466 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2467
2468 // The induction itself.
2469 // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2470 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2471 auto *IVNext =
2472 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2473 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2474
2475 // The loop trip count check.
2476 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2477 CurLoop->getName() + ".ivcheck");
2478 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2479 LoopHeaderBB->getTerminator()->eraseFromParent();
2480
2481 // Populate the IV PHI.
2482 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2483 IV->addIncoming(IVNext, LoopHeaderBB);
2484
2485 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2486 // loop. The loop would otherwise not be deleted even if it becomes empty.
2487
2488 SE->forgetLoop(CurLoop);
2489
2490 // Other passes will take care of actually deleting the loop if possible.
2491
2492 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2493
2494 ++NumShiftUntilBitTest;
2495 return MadeChange;
2496}
2497
2498/// Return true if the idiom is detected in the loop.
2499///
2500/// The core idiom we are trying to detect is:
2501/// \code
2502/// entry:
2503/// <...>
2504/// %start = <...>
2505/// %extraoffset = <...>
2506/// <...>
2507/// br label %for.cond
2508///
2509/// loop:
2510/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2511/// %nbits = add nsw i8 %iv, %extraoffset
2512/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2513/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2514/// %iv.next = add i8 %iv, 1
2515/// <...>
2516/// br i1 %val.shifted.iszero, label %end, label %loop
2517///
2518/// end:
2519/// %iv.res = phi i8 [ %iv, %loop ] <...>
2520/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2521/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2522/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2523/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2524/// <...>
2525/// \endcode
2526 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2527 Instruction *&ValShiftedIsZero,
2528 Intrinsic::ID &IntrinID, Instruction *&IV,
2529 Value *&Start, Value *&Val,
2530 const SCEV *&ExtraOffsetExpr,
2531 bool &InvertedCond) {
2533 " Performing shift-until-zero idiom detection.\n");
2534
2535 // Give up if the loop has multiple blocks or multiple backedges.
2536 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2537 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2538 return false;
2539 }
2540
2541 Instruction *ValShifted, *NBits, *IVNext;
2542 Value *ExtraOffset;
2543
2544 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2545 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2546 assert(LoopPreheaderBB && "There is always a loop preheader.");
2547
2548 using namespace PatternMatch;
2549
2550 // Step 1: Check if the loop backedge and its condition are in desirable form.
2551
2552 ICmpInst::Predicate Pred;
2553 BasicBlock *TrueBB, *FalseBB;
2554 if (!match(LoopHeaderBB->getTerminator(),
2555 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2556 m_BasicBlock(FalseBB))) ||
2557 !match(ValShiftedIsZero,
2558 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2559 !ICmpInst::isEquality(Pred)) {
2560 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2561 return false;
2562 }
2563
2564 // Step 2: Check if the comparison's operand is in desirable form.
2565 // FIXME: Val could be a one-input PHI node, which we should look past.
2566 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2567 m_Instruction(NBits)))) {
2568 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
2569 return false;
2570 }
2571 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2572 : Intrinsic::ctlz;
2573
2574 // Step 3: Check if the shift amount is in desirable form.
2575
2576 if (match(NBits, m_c_Add(m_Instruction(IV),
2577 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2578 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2579 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2580 else if (match(NBits,
2581 m_Sub(m_Instruction(IV),
2582 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2583 NBits->hasNoSignedWrap())
2584 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2585 else {
2586 IV = NBits;
2587 ExtraOffsetExpr = SE->getZero(NBits->getType());
2588 }
2589
2590 // Step 4: Check if the recurrence is in desirable form.
2591 auto *IVPN = dyn_cast<PHINode>(IV);
2592 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2593 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2594 return false;
2595 }
2596
2597 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2598 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2599
2600 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2601 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2602 return false;
2603 }
2604
2605 // Step 4: Check if the backedge's destinations are in desirable form.
2606
2608 "Should only get equality predicates here.");
2609
2610 // cmp-br is commutative, so canonicalize to a single variant.
2611 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2612 if (InvertedCond) {
2613 Pred = ICmpInst::getInversePredicate(Pred);
2614 std::swap(TrueBB, FalseBB);
2615 }
2616
2617 // We expect to exit loop when comparison yields true,
2618 // so when it yields false we should branch back to loop header.
2619 if (FalseBB != LoopHeaderBB) {
2620 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2621 return false;
2622 }
2623
2624 // The new, countable, loop will certainly only run a known number of
2625 // iterations; it won't be infinite. But the old loop might be infinite
2626 // under certain conditions. For logical shifts, the value will become zero
2627 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2628 // right-shift, if the sign bit was set, the value will never become zero,
2629 // and the loop may never finish.
2630 if (ValShifted->getOpcode() == Instruction::AShr &&
2631 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2632 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2633 return false;
2634 }
2635
2636 // Okay, idiom checks out.
2637 return true;
2638}
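
// In C, this matches loops that advance a shift amount until the shifted
// value becomes zero, roughly:
// \code
//   int iv = start;
//   while ((val >> (iv + extraoffset)) != 0)
//     iv++;
// \endcode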
2639
2640/// Look for the following loop:
2641/// \code
2642/// entry:
2643/// <...>
2644/// %start = <...>
2645/// %extraoffset = <...>
2646/// <...>
2647/// br label %for.cond
2648///
2649/// loop:
2650/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2651/// %nbits = add nsw i8 %iv, %extraoffset
2652/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2653/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2654/// %iv.next = add i8 %iv, 1
2655/// <...>
2656/// br i1 %val.shifted.iszero, label %end, label %loop
2657///
2658/// end:
2659/// %iv.res = phi i8 [ %iv, %loop ] <...>
2660/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2661/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2662/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2663/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2664/// <...>
2665/// \endcode
2666///
2667/// And transform it into:
2668/// \code
2669/// entry:
2670/// <...>
2671/// %start = <...>
2672/// %extraoffset = <...>
2673/// <...>
2674/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2675/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2676/// %extraoffset.neg = sub i8 0, %extraoffset
2677/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2678/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2679/// %loop.tripcount = sub i8 %iv.final, %start
2680/// br label %loop
2681///
2682/// loop:
2683/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2684/// %loop.iv.next = add i8 %loop.iv, 1
2685/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2686/// %iv = add i8 %loop.iv, %start
2687/// <...>
2688/// br i1 %loop.ivcheck, label %end, label %loop
2689///
2690/// end:
2691/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2692/// <...>
2693/// \endcode
2694bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2695 bool MadeChange = false;
2696
2697 Instruction *ValShiftedIsZero;
2698 Intrinsic::ID IntrID;
2699 Instruction *IV;
2700 Value *Start, *Val;
2701 const SCEV *ExtraOffsetExpr;
2702 bool InvertedCond;
2703 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2704 Start, Val, ExtraOffsetExpr, InvertedCond)) {
2706 " shift-until-zero idiom detection failed.\n");
2707 return MadeChange;
2708 }
2709 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2710
2711 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2712 // but is it profitable to transform?
2713
2714 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2715 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2716 assert(LoopPreheaderBB && "There is always a loop preheader.");
2717
2718 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2719 assert(SuccessorBB && "There is only a single successor.");
2720
2721 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2722 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2723
2724 Type *Ty = Val->getType();
2725 unsigned Bitwidth = Ty->getScalarSizeInBits();
2726
2727 TargetTransformInfo::TargetCostKind CostKind =
2728 TargetTransformInfo::TCK_SizeAndLatency;
2729
2730 // The rewrite is considered to be unprofitable if and only if the
2731 // intrinsic we'll use is not cheap. Note that we are okay with *just*
2732 // making the loop countable, even if nothing else changes.
2733 IntrinsicCostAttributes Attrs(
2734 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2735 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2736 if (Cost > TargetTransformInfo::TCC_Basic) {
2737 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2738 " Intrinsic is too costly, not beneficial\n");
2739 return MadeChange;
2740 }
2741
2742 // Ok, transform appears worthwhile.
2743 MadeChange = true;
2744
2745 bool OffsetIsZero = false;
2746 if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2747 OffsetIsZero = ExtraOffsetExprC->isZero();
2748
2749 // Step 1: Compute the loop's final IV value / trip count.
2750
2751 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2752 IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2753 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2754 Value *ValNumActiveBits = Builder.CreateSub(
2755 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2756 Val->getName() + ".numactivebits", /*HasNUW=*/true,
2757 /*HasNSW=*/Bitwidth != 2);
2758
2759 SCEVExpander Expander(*SE, *DL, "loop-idiom");
2760 Expander.setInsertPoint(&*Builder.GetInsertPoint());
2761 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2762
2763 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2764 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2765 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2766 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2767 {ValNumActiveBitsOffset, Start},
2768 /*FMFSource=*/nullptr, "iv.final");
2769
2770 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2771 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2772 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2773 // FIXME: or when the offset was `add nuw`
2774
2775 // We know loop's backedge-taken count, but what's loop's trip count?
2776 Value *LoopTripCount =
2777 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2778 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2779 /*HasNSW=*/Bitwidth != 2);
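// For instance, with i8 %val = 40 (0b00101000), %start = 0 and a zero
// %extraoffset: numactivebits = 8 - ctlz(40) = 6, iv.final = smax(6, 0) = 6,
// so backedgetakencount = 6 and tripcount = 7, matching the seven header
// executions (iv = 0 .. 6) of the original loop.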
2780
2781 // Step 2: Adjust the successor basic block to receive the original
2782 // induction variable's final value instead of the original IV itself.
2783
2784 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2785
2786 // Step 3: Rewrite the loop into a countable form, with canonical IV.
2787
2788 // The new canonical induction variable.
2789 Builder.SetInsertPoint(&LoopHeaderBB->front());
2790 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2791
2792 // The induction itself.
2793 Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2794 auto *CIVNext =
2795 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2796 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2797
2798 // The loop trip count check.
2799 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2800 CurLoop->getName() + ".ivcheck");
2801 auto *NewIVCheck = CIVCheck;
2802 if (InvertedCond) {
2803 NewIVCheck = Builder.CreateNot(CIVCheck);
2804 NewIVCheck->takeName(ValShiftedIsZero);
2805 }
2806
2807 // The original IV, but rebased to be an offset to the CIV.
2808 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2809 /*HasNSW=*/true); // FIXME: what about NUW?
2810 IVDePHId->takeName(IV);
2811
2812 // The loop terminator.
2813 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2814 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2815 LoopHeaderBB->getTerminator()->eraseFromParent();
2816
2817 // Populate the IV PHI.
2818 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2819 CIV->addIncoming(CIVNext, LoopHeaderBB);
2820
2821 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2822 // loop. The loop would otherwise not be deleted even if it becomes empty.
2823
2824 SE->forgetLoop(CurLoop);
2825
2826 // Step 5: Try to cleanup the loop's body somewhat.
2827 IV->replaceAllUsesWith(IVDePHId);
2828 IV->eraseFromParent();
2829
2830 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2831 ValShiftedIsZero->eraseFromParent();
2832
2833 // Other passes will take care of actually deleting the loop if possible.
2834
2835 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2836
2837 ++NumShiftUntilZero;
2838 return MadeChange;
2839}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
assume Assume Builder
static const Function * getParent(const Value *V)
SmallVector< MachineOperand, 4 > Cond
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
std::string Name
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define DEBUG_TYPE
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
IRTranslator LLVM IR MI
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
#define DEBUG_TYPE
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
static void deleteDeadInstruction(Instruction *I)
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
Contains a collection of routines for determining if a given instruction is guaranteed to execute if ...
This header defines various interfaces for pass management in LLVM.
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
verify safepoint Safepoint IR Verifier
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static const uint32_t IV[8]
Definition: blake3_impl.h:77
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:75
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1443
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1520
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:708
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
iterator end()
Definition: BasicBlock.h:328
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:326
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:216
const Instruction & front() const
Definition: BasicBlock.h:338
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:292
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
size_t size() const
Definition: BasicBlock.h:336
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
BinaryOps getOpcode() const
Definition: InstrTypes.h:391
Conditional or Unconditional Branch instruction.
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1412
This class represents a function call, abstracting a target machine's calling convention.
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:804
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:711
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:741
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:734
@ ICMP_EQ
equal
Definition: InstrTypes.h:732
@ ICMP_NE
not equal
Definition: InstrTypes.h:733
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:825
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:801
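A sketch of the predicate API; the wrapper function is hypothetical, but this is the canonical inversion pattern:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Flip a comparison in place: ICMP_EQ becomes ICMP_NE, ICMP_UGT becomes
// ICMP_ULE, and so on, via getInversePredicate().
static void invertCompare(ICmpInst *Cmp) {
  Cmp->setPredicate(Cmp->getInversePredicate());
}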
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1235
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2213
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fixed width vector obtained from logBase2 of C.
Definition: Constants.cpp:2643
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:209
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:203
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:197
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for its type.
Definition: Constants.h:145
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:847
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:403
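A short sketch of the constant factories listed above, assuming Ty is an integer or vector-of-integer type (the helper itself is illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static void buildCommonConstants(Type *Ty) {
  Constant *One = ConstantInt::get(Ty, 1);           // splats if Ty is a vector
  Constant *AllOnes = Constant::getAllOnesValue(Ty); // every bit set, i.e. -1
  ConstantInt *True = ConstantInt::getBool(Ty->getContext(), true);
  (void)One; (void)AllOnes; (void)True;
}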
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Definition: DerivedTypes.h:165
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalObject.
Definition: Globals.cpp:128
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:227
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:447
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2307
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2564
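A sketch of the IRBuilder insertion pattern; the block and callee are assumed to be set up by the caller:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Insert a call just before a block's terminator, the usual spot for code
// expanded into a loop preheader.
static CallInst *emitCallBeforeTerminator(BasicBlock *BB, FunctionCallee Callee,
                                          ArrayRef<Value *> Args) {
  IRBuilder<> Builder(BB->getTerminator());
  return Builder.CreateCall(Callee, Args);
}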
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:365
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
Definition: Instruction.cpp:70
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
const BasicBlock * getParent() const
Definition: Instruction.h:90
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:74
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1594
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:168
bool isShift() const
Definition: Instruction.h:175
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:82
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:362
This class provides an interface for updating the loop pass manager based on mutations to the loop nest.
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:214
bool isUnordered() const
Definition: Instructions.h:258
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:220
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
block_iterator block_begin() const
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:47
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
StringRef getName() const
Definition: LoopInfo.h:391
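To tie the Loop queries above together, a hedged sketch of a typical candidate filter (the predicate is illustrative, not this pass's exact legality check):

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// A loop is easy to reason about when it has a preheader to insert code
// into, a single backedge, and the value of interest is loop-invariant.
static bool isSimpleCandidate(const Loop *L, const Value *V) {
  return L->getLoopPreheader() && L->getNumBackEdges() == 1 &&
         L->isLoopInvariant(V);
}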
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that feed it, giving the original input.
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it, giving the original input.
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:936
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:700
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will have.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1743
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyzed scalars to rewrite expressions in canonical form.
const SCEV * getOperand(unsigned i) const
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which cannot result in overflow.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect ScalarEvolution's ability to compute a trip count.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
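A sketch tying the ScalarEvolution entries above together (the helper name is hypothetical):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Ask SCEV for the loop's trip count, bailing out when the backedge-taken
// count is not analyzable.
static const SCEV *tryGetTripCount(ScalarEvolution &SE, const Loop *L) {
  const SCEV *BECount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BECount))
    return nullptr;
  return SE.getTripCountFromExitCount(BECount);
}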
A vector that has set insertion semantics.
Definition: SetVector.h:51
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:219
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:152
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its queries in order to avoid complicated analysis.
Definition: MustExecute.h:110
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop; checks the loop body and header for instructions that may throw exceptions.
Definition: MustExecute.cpp:51
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is computed contains an instruction that may throw or otherwise exit abnormally.
Definition: MustExecute.cpp:47
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
Definition: SmallPtrSet.h:344
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:379
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
iterator end() const
Definition: SmallPtrSet.h:408
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
iterator begin() const
Definition: SmallPtrSet.h:403
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:450
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the 'N' template parameter.
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
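A sketch of the dedup-then-visit idiom these containers enable (the helper is illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Keep the first occurrence of each instruction, preserving order.
static void collectUnique(ArrayRef<Instruction *> In,
                          SmallVectorImpl<Instruction *> &Out) {
  SmallPtrSet<Instruction *, 8> Seen;
  for (Instruction *I : In)
    if (Seen.insert(I).second) // insert().second is false on duplicates
      Out.push_back(I);
}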
An instruction for storing to memory.
Definition: Instructions.h:301
Align getAlign() const
Definition: Instructions.h:345
Value * getValueOperand()
Definition: Instructions.h:390
Value * getPointerOperand()
Definition: Instructions.h:393
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
Provides information about what library functions are available for the current target.
bool has(LibFunc F) const
Tests whether a library function is available.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
unsigned getAtomicMemIntrinsicMaxElementSize() const
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Basic
The cost of a typical 'add' instruction.
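A sketch of the popcount-support query mentioned in the file's TODO list (the threshold helper is hypothetical):

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Only form a ctpop-style idiom when the target has fast hardware popcount
// at this bit width.
static bool popcntIsFast(const TargetTransformInfo &TTI, unsigned Bits) {
  return TTI.getPopcntSupport(Bits) ==
         TargetTransformInfo::PSK_FastHardware;
}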
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1724
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:535
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to 'V' instead of 'this' when the use is outside the given block.
Definition: Value.cpp:587
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1069
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:384
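The Value methods above combine into the standard replace-and-erase sequence; a minimal sketch:

#include "llvm/IR/Instruction.h"
using namespace llvm;

// Replace an instruction's result with a new value and delete the original.
static void replaceAndErase(Instruction *Old, Value *New) {
  New->takeName(Old);           // preserve the readable name
  Old->replaceAllUsesWith(New); // rewrite every use to point at New
  Old->eraseFromParent();       // unlink from the block and delete
}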
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:182
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
self_iterator getIterator()
Definition: ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1465
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:979
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:544
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:716
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:772
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:517
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:224
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:168
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:537
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:991
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eq/ne/...) to Threshold.
Definition: PatternMatch.h:598
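A sketch of how these combinators compose; the helper below is illustrative, matching the canonical "(X & (X + -1)) == 0" power-of-two-or-zero test and binding X on success (m_Deferred and m_AllOnes are additional PatternMatch helpers not listed above):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Match 'icmp eq (and X, (add X, -1)), 0' with the operands of the 'and'
// in either order; X is bound before m_Deferred(X) is consulted.
static bool matchPow2OrZeroTest(Value *V, Value *&X) {
  ICmpInst::Predicate Pred;
  return match(V, m_ICmp(Pred, m_c_And(m_Value(X),
                                       m_Add(m_Deferred(X), m_AllOnes())),
                         m_Zero())) &&
         Pred == ICmpInst::ICMP_EQ;
}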
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:465
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
DiagnosticInfoOptimizationBase::Argument NV
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:529
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value, returning the original object being addressed.
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on the target and also that it is a Function with the right type in the current Module.
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1117
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
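A sketch of the availability-gated emission these two helpers support; memset_pattern16 is the library function this pass actually cares about, and construction of FnTy is left to the caller:

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
using namespace llvm;

// Return the callee only when the target really provides the function.
static FunctionCallee tryGetMemsetPattern16(Module *M,
                                            const TargetLibraryInfo &TLI,
                                            FunctionType *FnTy) {
  if (!isLibFuncEmittable(M, &TLI, LibFunc_memset_pattern16))
    return FunctionCallee();
  return getOrInsertLibFunc(M, TLI, LibFunc_memset_pattern16, FnTy,
                            AttributeList());
}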
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
@ ModRef
The access may reference and may modify the value stored in memory.
@ Mod
The access may modify the value stored in memory.
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:89
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Returns true if the given value is known to be non-negative.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it is.
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
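A companion sketch of decomposeBitTestICmp in use (the wrapper is hypothetical; checking for ICMP_NE selects the "(X & Mask) != 0" form):

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Try to view an integer compare as a bit test, binding the tested value
// and the mask on success.
static bool viewAsBitTest(ICmpInst *Cmp, Value *&X, APInt &Mask) {
  CmpInst::Predicate Pred;
  return decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1), Pred, X,
                              Mask) &&
         Pred == CmpInst::ICMP_NE;
}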
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition: Metadata.h:651
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:671
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:674
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:668
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes that apply to both.
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:677
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes of length Len.
Definition: Metadata.h:718
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
static bool Memcpy
When true, Memcpy is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to the loop passes below it.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)