//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

32#include "llvm/ADT/APInt.h"
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/DenseMap.h"
35#include "llvm/ADT/MapVector.h"
36#include "llvm/ADT/SetVector.h"
39#include "llvm/ADT/Statistic.h"
40#include "llvm/ADT/StringRef.h"
56#include "llvm/IR/BasicBlock.h"
57#include "llvm/IR/Constant.h"
58#include "llvm/IR/Constants.h"
59#include "llvm/IR/DataLayout.h"
60#include "llvm/IR/DebugLoc.h"
62#include "llvm/IR/Dominators.h"
63#include "llvm/IR/GlobalValue.h"
65#include "llvm/IR/IRBuilder.h"
66#include "llvm/IR/InstrTypes.h"
67#include "llvm/IR/Instruction.h"
70#include "llvm/IR/Intrinsics.h"
71#include "llvm/IR/LLVMContext.h"
72#include "llvm/IR/Module.h"
73#include "llvm/IR/PassManager.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/User.h"
77#include "llvm/IR/Value.h"
78#include "llvm/IR/ValueHandle.h"
81#include "llvm/Support/Debug.h"
88#include <algorithm>
89#include <cassert>
90#include <cstdint>
91#include <utility>
92#include <vector>
93
using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);
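
// With DEBUG_TYPE "loop-idiom", the options above surface on the opt command
// line as -disable-loop-idiom-all, -disable-loop-idiom-memset and
// -disable-loop-idiom-memcpy. An illustrative invocation (not part of this
// file): "opt -passes=loop-idiom -disable-loop-idiom-memset in.ll -S".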

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
// Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}
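
// Example (illustrative): a store whose pointer SCEV is {%base,+,4}<%loop>
// advances by 4 bytes per iteration, so the returned stride is 4; a stride
// of -4 would mean the loop walks backwards through memory.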

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
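
// Example (illustrative): for a strided store of the i32 constant 0x01020304,
// Size is 4 bytes, so this returns a [4 x i32] ConstantArray holding four
// copies of 0x01020304, i.e. the full 16-byte pattern memset_pattern16
// expects; an i64 constant would be replicated twice instead.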

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic is not yet supported
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }
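
  // Examples (illustrative): an i32 store of -1 is byte-splattable (every
  // byte is 0xFF) and qualifies for plain memset above; an i32 store of
  // 0x01020304 has no single splat byte but can still become a
  // memset_pattern16 call on targets that provide it.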

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if these stores can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate creates the best chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
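
// Example of the hand-unrolled case handled above (illustrative C):
//
//   char *p; unsigned n;                 // assumed declarations
//   for (unsigned i = 0; i < n; i += 2) {
//     p[i] = 0;     // 1-byte store
//     p[i + 1] = 0; // consecutive 1-byte store
//   }
//
// Neither store alone covers the 2-byte stride, but chained together their
// combined size equals the stride, so the loop becomes memset(p, 0, n).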

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over from
      // the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass would not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset length and stride
    // that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}
925
926/// mayLoopAccessLocation - Return true if the specified loop might access the
927/// specified pointer location, which is a loop-strided access. The 'Access'
928/// argument specifies what the verboten forms of access are (read or write).
929static bool
931 const SCEV *BECount, const SCEV *StoreSizeSCEV,
932 AliasAnalysis &AA,
933 SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
934 // Get the location that may be stored across the loop. Since the access is
935 // strided positively through memory, we say that the modified location starts
936 // at the pointer and has infinite size.
938
939 // If the loop iterates a fixed number of times, we can refine the access size
940 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
941 const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
942 const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
943 if (BECst && ConstSize) {
944 std::optional<uint64_t> BEInt = BECst->getAPInt().tryZExtValue();
945 std::optional<uint64_t> SizeInt = ConstSize->getAPInt().tryZExtValue();
946 // FIXME: Should this check for overflow?
947 if (BEInt && SizeInt)
948 AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
949 }
950
951 // TODO: For this to be really effective, we have to dive into the pointer
952 // operand in the store. Store to &A[i] of 100 will always return may alias
953 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
954 // which will then no-alias a store to &A[100].
955 MemoryLocation StoreLoc(Ptr, AccessSize);
956
957 for (BasicBlock *B : L->blocks())
958 for (Instruction &I : *B)
959 if (!IgnoredInsts.contains(&I) &&
960 isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
961 return true;
962 return false;
963}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index
  return SE->getMinusSCEV(Start, Index);
}
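
// Worked example (illustrative): a loop storing 4 bytes per iteration
// downward from Start with BECount == 9 (10 iterations) writes Start,
// Start-4, ..., Start-36; Index folds to 9 * 4, so the returned base
// pointer is Start - 36, the lowest address written.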

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV =
      SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}
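
// Example (illustrative): with BECount == n - 1 (trip count n) and a 4-byte
// store size, NumBytes folds to 4 * n, the total byte count the emitted
// memset or memcpy must cover.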

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
    return Changed;

  AAMDNodes AATags = TheStore->getAAMetadata();
  for (Instruction *Store : Stores)
    AATags = AATags.merge(Store->getAAMetadata());
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else {
    assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
    // Everything is emitted in default address space
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = GV;
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});

    // Set the TBAA info if present.
    if (AATags.TBAA)
      NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA);

    if (AATags.Scope)
      NewCall->setMetadata(LLVMContext::MD_alias_scope, AATags.Scope);

    if (AATags.NoAlias)
      NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias);
  }

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}
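
// Overall effect (illustrative): a loop such as
//
//   for (unsigned i = 0; i < n; ++i)
//     a[i] = 0; // a assumed to be a char array
//
// with a computable trip count becomes a single memset(a, 0, n) emitted in
// the loop preheader, after which the now-dead stores are deleted.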

/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
} // namespace
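
// Example (illustrative): in "for (i = 0; i < n; ++i) p[i] = p[i + 1];" the
// load and store share one underlying object, with LoadOff 1 and StoreOff 0
// for 1-byte elements; the checks above allow memmove(p, p + 1, n) here,
// while a plain memcpy would be unsafe because the regions overlap.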

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant size; Currently expect constant store size
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For memmove case it's not enough to guarantee that loop doesn't access
    // TheStore and TheLoad. Additionally we need to make sure that TheStore is
    // the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
                                              Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  // If the load or store are atomic, then they must necessarily be unordered
  // by previous checks.
  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(
          StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
          /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags.TBAA,
                               AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  } else {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size, then
    // we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;

    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}
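
// Overall effect (illustrative): "for (i = 0; i < n; ++i) a[i] = b[i];" over
// byte arrays becomes memcpy(a, b, n) when the loop cannot otherwise touch
// either region, or memmove(a, b, n) when source and destination are parts
// of the same underlying object.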

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
}

/// Check if the given conditional branch is based on the comparison between
/// a variable and zero, and if the variable is non-zero (or zero when
/// JmpOnZero is true), control branches to the loop entry. If the branch
/// matches this behavior, the variable involved in the comparison is returned.
/// This function will be called to see if the precondition and postcondition
/// of the loop are in desirable form.
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
                             bool JmpOnZero = false) {
  if (!BI || !BI->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;

  BasicBlock *TrueSucc = BI->getSuccessor(0);
  BasicBlock *FalseSucc = BI->getSuccessor(1);
  if (JmpOnZero)
    std::swap(TrueSucc, FalseSucc);

  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
    return Cond->getOperand(0);

  return nullptr;
}
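
// Example (illustrative): given the loop-back branch
//
//   %cmp = icmp ne i64 %x, 0
//   br i1 %cmp, label %loop, label %exit
//
// matchCondition(BI, %loop) returns %x; with JmpOnZero == true it would
// instead accept "br i1 %cmp, label %exit, label %loop".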
1519
1520// Check if the recurrence variable `VarX` is in the right form to create
1521// the idiom. Returns the value coerced to a PHINode if so.
1522 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1523 BasicBlock *LoopEntry) {
1524 auto *PhiX = dyn_cast<PHINode>(VarX);
1525 if (PhiX && PhiX->getParent() == LoopEntry &&
1526 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1527 return PhiX;
1528 return nullptr;
1529}
1530
1531/// Return true iff the idiom is detected in the loop.
1532///
1533/// Additionally:
1534/// 1) \p CntInst is set to the instruction counting the population bit.
1535/// 2) \p CntPhi is set to the corresponding phi node.
1536/// 3) \p Var is set to the value whose population bits are being counted.
1537///
1538/// The core idiom we are trying to detect is:
1539/// \code
1540 /// if (x0 == 0)
1541 /// goto loop-exit // the precondition of the loop
1542/// cnt0 = init-val;
1543/// do {
1544/// x1 = phi (x0, x2);
1545/// cnt1 = phi(cnt0, cnt2);
1546///
1547/// cnt2 = cnt1 + 1;
1548/// ...
1549/// x2 = x1 & (x1 - 1);
1550/// ...
1551 /// } while(x2 != 0);
1552///
1553/// loop-exit:
1554/// \endcode
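/// A C-level equivalent of this idiom (an illustrative sketch, not part of
/// the original comment):
/// \code
///   int cnt = init_val;
///   if (x) {
///     do {
///       cnt++;
///       x &= x - 1; // clears the lowest set bit of x
///     } while (x);
///   }
/// \endcode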
1555static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1556 Instruction *&CntInst, PHINode *&CntPhi,
1557 Value *&Var) {
1558 // step 1: Check to see if the loop-back branch matches this pattern:
1559 // "if (a!=0) goto loop-entry".
1560 BasicBlock *LoopEntry;
1561 Instruction *DefX2, *CountInst;
1562 Value *VarX1, *VarX0;
1563 PHINode *PhiX, *CountPhi;
1564
1565 DefX2 = CountInst = nullptr;
1566 VarX1 = VarX0 = nullptr;
1567 PhiX = CountPhi = nullptr;
1568 LoopEntry = *(CurLoop->block_begin());
1569
1570 // step 1: Check if the loop-back branch is in desirable form.
1571 {
1572 if (Value *T = matchCondition(
1573 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1574 DefX2 = dyn_cast<Instruction>(T);
1575 else
1576 return false;
1577 }
1578
1579 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1580 {
1581 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1582 return false;
1583
1584 BinaryOperator *SubOneOp;
1585
1586 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1587 VarX1 = DefX2->getOperand(1);
1588 else {
1589 VarX1 = DefX2->getOperand(0);
1590 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1591 }
1592 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1593 return false;
1594
1595 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1596 if (!Dec ||
1597 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1598 (SubOneOp->getOpcode() == Instruction::Add &&
1599 Dec->isMinusOne()))) {
1600 return false;
1601 }
1602 }
1603
1604 // step 3: Check the recurrence of variable X
1605 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1606 if (!PhiX)
1607 return false;
1608
1609 // step 4: Find the instruction that counts the population: cnt2 = cnt1 + 1
1610 {
1611 CountInst = nullptr;
1612 for (Instruction &Inst : llvm::make_range(
1613 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1614 if (Inst.getOpcode() != Instruction::Add)
1615 continue;
1616
1617 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1618 if (!Inc || !Inc->isOne())
1619 continue;
1620
1621 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1622 if (!Phi)
1623 continue;
1624
1625 // Check if the result of the instruction is live out of the loop.
1626 bool LiveOutLoop = false;
1627 for (User *U : Inst.users()) {
1628 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1629 LiveOutLoop = true;
1630 break;
1631 }
1632 }
1633
1634 if (LiveOutLoop) {
1635 CountInst = &Inst;
1636 CountPhi = Phi;
1637 break;
1638 }
1639 }
1640
1641 if (!CountInst)
1642 return false;
1643 }
1644
1645 // step 5: check if the precondition is in this form:
1646 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1647 {
1648 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1649 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1650 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1651 return false;
1652
1653 CntInst = CountInst;
1654 CntPhi = CountPhi;
1655 Var = T;
1656 }
1657
1658 return true;
1659}
1660
1661/// Return true if the idiom is detected in the loop.
1662///
1663/// Additionally:
1664/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1665/// or nullptr if there is no such.
1666/// 2) \p CntPhi is set to the corresponding phi node
1667/// or nullptr if there is no such.
1668/// 3) \p Var is set to the value whose CTLZ could be used.
1669/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1670///
1671/// The core idiom we are trying to detect is:
1672/// \code
1673/// if (x0 == 0)
1674/// goto loop-exit // the precondition of the loop
1675/// cnt0 = init-val;
1676/// do {
1677/// x = phi (x0, x.next); //PhiX
1678/// cnt = phi(cnt0, cnt.next);
1679///
1680/// cnt.next = cnt + 1;
1681/// ...
1682/// x.next = x >> 1; // DefX
1683/// ...
1684/// } while(x.next != 0);
1685///
1686/// loop-exit:
1687/// \endcode
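/// A C-level equivalent of this idiom (an illustrative sketch, not part of
/// the original comment):
/// \code
///   int cnt = init_val;
///   if (x) {
///     do {
///       cnt++;
///       x >>= 1;
///     } while (x);
///   }
/// \endcode
/// For a non-zero 32-bit x, cnt ends up init_val plus the number of
/// significant bits in x, i.e. 32 - ctlz(x), which is what the rewrite below
/// computes directly.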
1688static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1689 Intrinsic::ID &IntrinID, Value *&InitX,
1690 Instruction *&CntInst, PHINode *&CntPhi,
1691 Instruction *&DefX) {
1692 BasicBlock *LoopEntry;
1693 Value *VarX = nullptr;
1694
1695 DefX = nullptr;
1696 CntInst = nullptr;
1697 CntPhi = nullptr;
1698 LoopEntry = *(CurLoop->block_begin());
1699
1700 // step 1: Check if the loop-back branch is in desirable form.
1701 if (Value *T = matchCondition(
1702 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1703 DefX = dyn_cast<Instruction>(T);
1704 else
1705 return false;
1706
1707 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1708 if (!DefX || !DefX->isShift())
1709 return false;
1710 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1711 Intrinsic::ctlz;
1712 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1713 if (!Shft || !Shft->isOne())
1714 return false;
1715 VarX = DefX->getOperand(0);
1716
1717 // step 3: Check the recurrence of variable X
1718 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1719 if (!PhiX)
1720 return false;
1721
1722 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1723
1724 // Make sure the initial value can't be negative; otherwise the ashr in the
1725 // loop might never reach zero, which would make the loop infinite.
1726 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1727 return false;
1728
1729 // step 4: Find the instruction that does the counting: cnt.next = cnt + 1
1730 // or cnt.next = cnt + -1.
1731 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1732 // then all uses of "cnt.next" could be optimized to the trip count
1733 // plus "cnt0". Currently it is not optimized.
1734 // This step could be used to detect POPCNT instruction:
1735 // cnt.next = cnt + (x.next & 1)
1736 for (Instruction &Inst : llvm::make_range(
1737 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1738 if (Inst.getOpcode() != Instruction::Add)
1739 continue;
1740
1741 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1742 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1743 continue;
1744
1745 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1746 if (!Phi)
1747 continue;
1748
1749 CntInst = &Inst;
1750 CntPhi = Phi;
1751 break;
1752 }
1753 if (!CntInst)
1754 return false;
1755
1756 return true;
1757}
1758
1759 /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1760 /// to countable (with a CTLZ / CTTZ trip count). Returns true if CTLZ / CTTZ
1761 /// was inserted as the new trip count; otherwise, returns false.
1762bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1763 // Give up if the loop has multiple blocks or multiple backedges.
1764 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1765 return false;
1766
1767 Intrinsic::ID IntrinID;
1768 Value *InitX;
1769 Instruction *DefX = nullptr;
1770 PHINode *CntPhi = nullptr;
1771 Instruction *CntInst = nullptr;
1772 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1773 // this is always 6.
1774 size_t IdiomCanonicalSize = 6;
1775
1776 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1777 CntInst, CntPhi, DefX))
1778 return false;
1779
1780 bool IsCntPhiUsedOutsideLoop = false;
1781 for (User *U : CntPhi->users())
1782 if (!CurLoop->contains(cast<Instruction>(U))) {
1783 IsCntPhiUsedOutsideLoop = true;
1784 break;
1785 }
1786 bool IsCntInstUsedOutsideLoop = false;
1787 for (User *U : CntInst->users())
1788 if (!CurLoop->contains(cast<Instruction>(U))) {
1789 IsCntInstUsedOutsideLoop = true;
1790 break;
1791 }
1792 // If both CntInst and CntPhi are used outside the loop the profitability
1793 // is questionable.
1794 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1795 return false;
1796
1797 // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1798 // when X is 0. If we cannot guarantee X != 0, we need to check for this
1799 // when expanding.
1800 bool ZeroCheck = false;
1801 // It is safe to assume the preheader exists, as this was checked in the
1802 // parent function runOnLoop.
1803 BasicBlock *PH = CurLoop->getLoopPreheader();
1804
1805 // If we are using the count instruction outside the loop, make sure we
1806 // have a zero check as a precondition. Without the check the loop would run
1807 // one iteration before any check of the input value. This means 0 and 1
1808 // would behave identically in the original loop, and thus we require the zero check.
1809 if (!IsCntPhiUsedOutsideLoop) {
1810 auto *PreCondBB = PH->getSinglePredecessor();
1811 if (!PreCondBB)
1812 return false;
1813 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1814 if (!PreCondBI)
1815 return false;
1816 if (matchCondition(PreCondBI, PH) != InitX)
1817 return false;
1818 ZeroCheck = true;
1819 }
1820
1821 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1822 // profitable if we delete the loop.
1823
1824 // The loop has only 6 instructions:
1825 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1826 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1827 // %shr = ashr %n.addr.0, 1
1828 // %tobool = icmp eq %shr, 0
1829 // %inc = add nsw %i.0, 1
1830 // br i1 %tobool
1831
1832 const Value *Args[] = {InitX,
1833 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1834
1835 // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1836 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1837 uint32_t HeaderSize =
1838 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1839
1840 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1841 InstructionCost Cost = TTI->getIntrinsicInstrCost(
1842 Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1843 if (HeaderSize != IdiomCanonicalSize &&
1844 Cost > TargetTransformInfo::TCC_Basic)
1845 return false;
1846
1847 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1848 DefX->getDebugLoc(), ZeroCheck,
1849 IsCntPhiUsedOutsideLoop);
1850 return true;
1851}
1852
1853/// Recognizes a population count idiom in a non-countable loop.
1854///
1855/// If detected, transforms the relevant code to issue the popcount intrinsic
1856/// function call, and returns true; otherwise, returns false.
1857bool LoopIdiomRecognize::recognizePopcount() {
1858 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1859 return false;
1860
1861 // Counting population is usually done with a few arithmetic instructions.
1862 // Such instructions can be easily "absorbed" by vacant slots in a
1863 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1864 // in a compact loop.
1865
1866 // Give up if the loop has multiple blocks or multiple backedges.
1867 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1868 return false;
1869
1870 BasicBlock *LoopBody = *(CurLoop->block_begin());
1871 if (LoopBody->size() >= 20) {
1872 // The loop is too big, bail out.
1873 return false;
1874 }
1875
1876 // It should have a preheader containing nothing but an unconditional branch.
1877 BasicBlock *PH = CurLoop->getLoopPreheader();
1878 if (!PH || &PH->front() != PH->getTerminator())
1879 return false;
1880 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1881 if (!EntryBI || EntryBI->isConditional())
1882 return false;
1883
1884 // It should have a precondition block where the generated popcount intrinsic
1885 // function can be inserted.
1886 auto *PreCondBB = PH->getSinglePredecessor();
1887 if (!PreCondBB)
1888 return false;
1889 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1890 if (!PreCondBI || PreCondBI->isUnconditional())
1891 return false;
1892
1893 Instruction *CntInst;
1894 PHINode *CntPhi;
1895 Value *Val;
1896 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1897 return false;
1898
1899 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1900 return true;
1901}
1902
1903 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1904 const DebugLoc &DL) {
1905 Value *Ops[] = {Val};
1906 Type *Tys[] = {Val->getType()};
1907
1908 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1909 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1910 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1911 CI->setDebugLoc(DL);
1912
1913 return CI;
1914}
1915
1916 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1917 const DebugLoc &DL, bool ZeroCheck,
1918 Intrinsic::ID IID) {
1919 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1920 Type *Tys[] = {Val->getType()};
1921
1922 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1923 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1924 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1925 CI->setDebugLoc(DL);
1926
1927 return CI;
1928}
1929
1930/// Transform the following loop (Using CTLZ, CTTZ is similar):
1931/// loop:
1932/// CntPhi = PHI [Cnt0, CntInst]
1933/// PhiX = PHI [InitX, DefX]
1934/// CntInst = CntPhi + 1
1935/// DefX = PhiX >> 1
1936/// LOOP_BODY
1937/// Br: loop if (DefX != 0)
1938/// Use(CntPhi) or Use(CntInst)
1939///
1940/// Into:
1941/// If CntPhi used outside the loop:
1942/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1943/// Count = CountPrev + 1
1944/// else
1945/// Count = BitWidth(InitX) - CTLZ(InitX)
1946/// loop:
1947/// CntPhi = PHI [Cnt0, CntInst]
1948/// PhiX = PHI [InitX, DefX]
1949/// PhiCount = PHI [Count, Dec]
1950/// CntInst = CntPhi + 1
1951/// DefX = PhiX >> 1
1952/// Dec = PhiCount - 1
1953/// LOOP_BODY
1954/// Br: loop if (Dec != 0)
1955/// Use(CountPrev + Cnt0) // Use(CntPhi)
1956/// or
1957/// Use(Count + Cnt0) // Use(CntInst)
1958///
1959/// If LOOP_BODY is empty the loop will be deleted.
1960/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
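/// As a worked example (illustrative numbers, not from the original comment):
/// for a 32-bit InitX = 26 (0b11010), CTLZ(26) = 27, so
/// Count = 32 - 27 = 5, matching the five shifts 26 -> 13 -> 6 -> 3 -> 1 -> 0
/// that the original loop performs before DefX reaches zero.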
1961void LoopIdiomRecognize::transformLoopToCountable(
1962 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1963 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1964 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1965 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1966
1967 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1968 IRBuilder<> Builder(PreheaderBr);
1969 Builder.SetCurrentDebugLocation(DL);
1970
1971 // If there are no uses of CntPhi, create:
1972 // Count = BitWidth - CTLZ(InitX);
1973 // NewCount = Count;
1974 // If there are uses of CntPhi create:
1975 // NewCount = BitWidth - CTLZ(InitX >> 1);
1976 // Count = NewCount + 1;
1977 Value *InitXNext;
1978 if (IsCntPhiUsedOutsideLoop) {
1979 if (DefX->getOpcode() == Instruction::AShr)
1980 InitXNext = Builder.CreateAShr(InitX, 1);
1981 else if (DefX->getOpcode() == Instruction::LShr)
1982 InitXNext = Builder.CreateLShr(InitX, 1);
1983 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1984 InitXNext = Builder.CreateShl(InitX, 1);
1985 else
1986 llvm_unreachable("Unexpected opcode!");
1987 } else
1988 InitXNext = InitX;
1989 Value *Count =
1990 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1991 Type *CountTy = Count->getType();
1992 Count = Builder.CreateSub(
1993 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
1994 Value *NewCount = Count;
1995 if (IsCntPhiUsedOutsideLoop)
1996 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
1997
1998 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
1999
2000 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2001 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2002 // If the counter was being incremented in the loop, add NewCount to the
2003 // counter's initial value, but only if the initial value is not zero.
2004 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2005 if (!InitConst || !InitConst->isZero())
2006 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2007 } else {
2008 // If the count was being decremented in the loop, subtract NewCount from
2009 // the counter's initial value.
2010 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2011 }
2012
2013 // Step 2: Insert new IV and loop condition:
2014 // loop:
2015 // ...
2016 // PhiCount = PHI [Count, Dec]
2017 // ...
2018 // Dec = PhiCount - 1
2019 // ...
2020 // Br: loop if (Dec != 0)
2021 BasicBlock *Body = *(CurLoop->block_begin());
2022 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2023 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2024
2025 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
2026 TcPhi->insertBefore(Body->begin());
2027
2028 Builder.SetInsertPoint(LbCond);
2029 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2030 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2031
2032 TcPhi->addIncoming(Count, Preheader);
2033 TcPhi->addIncoming(TcDec, Body);
2034
2035 CmpInst::Predicate Pred =
2036 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2037 LbCond->setPredicate(Pred);
2038 LbCond->setOperand(0, TcDec);
2039 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2040
2041 // Step 3: All the references to the original counter outside
2042 // the loop are replaced with the NewCount
2043 if (IsCntPhiUsedOutsideLoop)
2044 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2045 else
2046 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2047
2048 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2049 // loop. The loop would otherwise not be deleted even if it becomes empty.
2050 SE->forgetLoop(CurLoop);
2051}
2052
2053void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2054 Instruction *CntInst,
2055 PHINode *CntPhi, Value *Var) {
2056 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2057 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2058 const DebugLoc &DL = CntInst->getDebugLoc();
2059
2060 // Assuming before transformation, the loop is following:
2061 // if (x) // the precondition
2062 // do { cnt++; x &= x - 1; } while(x);
2063
2064 // Step 1: Insert the ctpop instruction at the end of the precondition block
2065 IRBuilder<> Builder(PreCondBr);
2066 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2067 {
2068 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2069 NewCount = PopCntZext =
2070 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2071
2072 if (NewCount != PopCnt)
2073 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2074
2075 // TripCnt is exactly the number of iterations the loop has
2076 TripCnt = NewCount;
2077
2078 // If the population counter's initial value is not zero, insert Add Inst.
2079 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2080 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2081 if (!InitConst || !InitConst->isZero()) {
2082 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2083 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2084 }
2085 }
2086
2087 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2088 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2089 // function would be partially dead code, and downstream passes would drag
2090 // it back from the precondition block to the preheader.
2091 {
2092 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2093
2094 Value *Opnd0 = PopCntZext;
2095 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2096 if (PreCond->getOperand(0) != Var)
2097 std::swap(Opnd0, Opnd1);
2098
2099 ICmpInst *NewPreCond = cast<ICmpInst>(
2100 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2101 PreCondBr->setCondition(NewPreCond);
2102
2102
2103 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2104 }
2105
2106 // Step 3: Note that the population count is exactly the trip count of the
2107 // loop in question, which enables us to convert the loop from a noncountable
2108 // loop into a countable one. The benefit is twofold:
2109 //
2110 // - If the loop only counts population, the entire loop becomes dead after
2111 // the transformation. It is a lot easier to prove a countable loop dead
2112 // than to prove a noncountable one. (In some C dialects, an infinite loop
2113 // isn't dead even if it computes nothing useful. In general, DCE needs
2114 // to prove a noncountable loop finite before it can safely delete it.)
2115 //
2116 // - If the loop also performs something else, it remains alive.
2117 // Since it is transformed to countable form, it can be aggressively
2118 // optimized by some optimizations which are in general not applicable
2119 // to a noncountable loop.
2120 //
2121 // After this step, this loop (conceptually) would look like the following:
2122 // newcnt = __builtin_ctpop(x);
2123 // t = newcnt;
2124 // if (x)
2125 // do { cnt++; x &= x-1; t--; } while (t > 0);
2126 BasicBlock *Body = *(CurLoop->block_begin());
2127 {
2128 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2129 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2130 Type *Ty = TripCnt->getType();
2131
2132 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
2133 TcPhi->insertBefore(Body->begin());
2134
2135 Builder.SetInsertPoint(LbCond);
2136 Instruction *TcDec = cast<Instruction>(
2137 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2138 "tcdec", false, true));
2139
2140 TcPhi->addIncoming(TripCnt, PreHead);
2141 TcPhi->addIncoming(TcDec, Body);
2142
2143 CmpInst::Predicate Pred =
2144 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2145 LbCond->setPredicate(Pred);
2146 LbCond->setOperand(0, TcDec);
2147 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2148 }
2149
2150 // Step 4: All the references to the original population counter outside
2151 // the loop are replaced with the NewCount -- the value returned from
2152 // __builtin_ctpop().
2153 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2154
2155 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2156 // loop. The loop would otherwise not be deleted even if it becomes empty.
2157 SE->forgetLoop(CurLoop);
2158}
2159
2160/// Match loop-invariant value.
2161template <typename SubPattern_t> struct match_LoopInvariant {
2162 SubPattern_t SubPattern;
2163 const Loop *L;
2164
2165 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2166 : SubPattern(SP), L(L) {}
2167
2168 template <typename ITy> bool match(ITy *V) {
2169 return L->isLoopInvariant(V) && SubPattern.match(V);
2170 }
2171};
2172
2173/// Matches if the value is loop-invariant.
2174template <typename Ty>
2175inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2176 return match_LoopInvariant<Ty>(M, L);
2177}
2178
2179/// Return true if the idiom is detected in the loop.
2180///
2181/// The core idiom we are trying to detect is:
2182/// \code
2183/// entry:
2184/// <...>
2185/// %bitmask = shl i32 1, %bitpos
2186/// br label %loop
2187///
2188/// loop:
2189/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2190/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2191/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2192/// %x.next = shl i32 %x.curr, 1
2193/// <...>
2194/// br i1 %x.curr.isbitunset, label %loop, label %end
2195///
2196/// end:
2197/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2198/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2199/// <...>
2200/// \endcode
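/// A C-level equivalent of this idiom (an illustrative sketch, not part of
/// the original comment):
/// \code
///   unsigned bitmask = 1u << bitpos;
///   while (!(x & bitmask))
///     x <<= 1;
/// \endcode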
2201static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2202 Value *&BitMask, Value *&BitPos,
2203 Value *&CurrX, Instruction *&NextX) {
2205 " Performing shift-until-bittest idiom detection.\n");
2206
2207 // Give up if the loop has multiple blocks or multiple backedges.
2208 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2209 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2210 return false;
2211 }
2212
2213 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2214 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2215 assert(LoopPreheaderBB && "There is always a loop preheader.");
2216
2217 using namespace PatternMatch;
2218
2219 // Step 1: Check if the loop backedge is in desirable form.
2220
2221 ICmpInst::Predicate Pred;
2222 Value *CmpLHS, *CmpRHS;
2223 BasicBlock *TrueBB, *FalseBB;
2224 if (!match(LoopHeaderBB->getTerminator(),
2225 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2226 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2227 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2228 return false;
2229 }
2230
2231 // Step 2: Check if the backedge's condition is in desirable form.
2232
2233 auto MatchVariableBitMask = [&]() {
2234 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2235 match(CmpLHS,
2236 m_c_And(m_Value(CurrX),
2237 m_CombineAnd(
2238 m_Value(BitMask),
2239 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2240 CurLoop))));
2241 };
2242 auto MatchConstantBitMask = [&]() {
2243 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2244 match(CmpLHS, m_And(m_Value(CurrX),
2245 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2246 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2247 };
2248 auto MatchDecomposableConstantBitMask = [&]() {
2249 APInt Mask;
2250 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2251 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2252 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2253 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2254 };
2255
2256 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2257 !MatchDecomposableConstantBitMask()) {
2258 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2259 return false;
2260 }
2261
2262 // Step 3: Check if the recurrence is in desirable form.
2263 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2264 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2265 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2266 return false;
2267 }
2268
2269 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2270 NextX =
2271 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2272
2273 assert(CurLoop->isLoopInvariant(BaseX) &&
2274 "Expected BaseX to be avaliable in the preheader!");
2275
2276 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2277 // FIXME: support right-shift?
2278 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2279 return false;
2280 }
2281
2282 // Step 4: Check if the backedge's destinations are in desirable form.
2283
2285 "Should only get equality predicates here.");
2286
2287 // cmp-br is commutative, so canonicalize to a single variant.
2288 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2289 Pred = ICmpInst::getInversePredicate(Pred);
2290 std::swap(TrueBB, FalseBB);
2291 }
2292
2293 // We expect to exit the loop when the comparison yields false,
2294 // so when it yields true we should branch back to the loop header.
2295 if (TrueBB != LoopHeaderBB) {
2296 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2297 return false;
2298 }
2299
2300 // Okay, idiom checks out.
2301 return true;
2302}
2303
2304/// Look for the following loop:
2305/// \code
2306/// entry:
2307/// <...>
2308/// %bitmask = shl i32 1, %bitpos
2309/// br label %loop
2310///
2311/// loop:
2312/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2313/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2314/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2315/// %x.next = shl i32 %x.curr, 1
2316/// <...>
2317/// br i1 %x.curr.isbitunset, label %loop, label %end
2318///
2319/// end:
2320/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2321/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2322/// <...>
2323/// \endcode
2324///
2325/// And transform it into:
2326/// \code
2327/// entry:
2328/// %bitmask = shl i32 1, %bitpos
2329/// %lowbitmask = add i32 %bitmask, -1
2330/// %mask = or i32 %lowbitmask, %bitmask
2331/// %x.masked = and i32 %x, %mask
2332/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2333/// i1 true)
2334/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2335/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2336/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2337/// %tripcount = add i32 %backedgetakencount, 1
2338/// %x.curr = shl i32 %x, %backedgetakencount
2339/// %x.next = shl i32 %x, %tripcount
2340/// br label %loop
2341///
2342/// loop:
2343/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2344/// %loop.iv.next = add nuw i32 %loop.iv, 1
2345/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2346/// <...>
2347/// br i1 %loop.ivcheck, label %end, label %loop
2348///
2349/// end:
2350/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2351/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2352/// <...>
2353/// \endcode
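/// As a worked example of the arithmetic (illustrative numbers, not from the
/// original comment): for i32 %x = 1 and %bitpos = 5, %mask = 0b111111 and
/// %x.masked = 1, so %x.masked.numleadingzeros = 31, %x.masked.numactivebits
/// = 1 and %x.masked.leadingonepos = 0. That gives %backedgetakencount =
/// 5 - 0 = 5 and %tripcount = 6, hence %x.curr = 1 << 5 = 32 and
/// %x.next = 1 << 6 = 64, exactly the values the original loop produces.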
2354bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2355 bool MadeChange = false;
2356
2357 Value *X, *BitMask, *BitPos, *XCurr;
2358 Instruction *XNext;
2359 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2360 XNext)) {
2362 " shift-until-bittest idiom detection failed.\n");
2363 return MadeChange;
2364 }
2365 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2366
2367 // Ok, it is the idiom we were looking for; we *could* transform this loop,
2368 // but is it profitable to transform?
2369
2370 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2371 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2372 assert(LoopPreheaderBB && "There is always a loop preheader.");
2373
2374 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2375 assert(SuccessorBB && "There is only a single successor.");
2376
2377 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2378 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2379
2380 Intrinsic::ID IntrID = Intrinsic::ctlz;
2381 Type *Ty = X->getType();
2382 unsigned Bitwidth = Ty->getScalarSizeInBits();
2383
2384 TargetTransformInfo::TargetCostKind CostKind =
2385 TargetTransformInfo::TCK_SizeAndLatency;
2386
2387 // The rewrite is considered to be unprofitable if and only if the
2388 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2389 // making the loop countable, even if nothing else changes.
2390 IntrinsicCostAttributes Attrs(
2391 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
2392 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2393 if (Cost > TargetTransformInfo::TCC_Basic) {
2394 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2395 " Intrinsic is too costly, not beneficial\n");
2396 return MadeChange;
2397 }
2398 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2399 TargetTransformInfo::TCC_Basic) {
2400 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2401 return MadeChange;
2402 }
2403
2404 // Ok, transform appears worthwhile.
2405 MadeChange = true;
2406
2407 if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
2408 // BitMask may be computed from BitPos; freeze BitPos so we can increase
2409 // its use count.
2410 std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
2411 if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
2412 InsertPt = BitPosI->getInsertionPointAfterDef();
2413 else
2414 InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
2415 if (!InsertPt)
2416 return false;
2417 FreezeInst *BitPosFrozen =
2418 new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
2419 BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
2420 return U.getUser() != BitPosFrozen;
2421 });
2422 BitPos = BitPosFrozen;
2423 }
2424
2425 // Step 1: Compute the loop trip count.
2426
2427 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2428 BitPos->getName() + ".lowbitmask");
2429 Value *Mask =
2430 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2431 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2432 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2433 IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
2434 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2435 Value *XMaskedNumActiveBits = Builder.CreateSub(
2436 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2437 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2438 /*HasNSW=*/Bitwidth != 2);
2439 Value *XMaskedLeadingOnePos =
2440 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2441 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2442 /*HasNSW=*/Bitwidth > 2);
2443
2444 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2445 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2446 /*HasNUW=*/true, /*HasNSW=*/true);
2447 // We know the loop's backedge-taken count, but what's the loop's trip count?
2448 // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
2449 Value *LoopTripCount =
2450 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2451 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2452 /*HasNSW=*/Bitwidth != 2);
2453
2454 // Step 2: Compute the recurrence's final value without a loop.
2455
2456 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2457 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2458 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2459 NewX->takeName(XCurr);
2460 if (auto *I = dyn_cast<Instruction>(NewX))
2461 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2462
2463 Value *NewXNext;
2464 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2465 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2466 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2467 // that isn't the case, we'll need to emit an alternative, safe IR.
2468 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2469 match(BitPos,
2470 m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
2471 APInt(Ty->getScalarSizeInBits(),
2472 Ty->getScalarSizeInBits() - 1))))
2473 NewXNext = Builder.CreateShl(X, LoopTripCount);
2474 else {
2475 // Otherwise, just additionally shift by one. It's the smallest solution;
2476 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2477 // bitwidth(x) - 1) and select 0 instead.
2478 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2479 }
2480
2481 NewXNext->takeName(XNext);
2482 if (auto *I = dyn_cast<Instruction>(NewXNext))
2483 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2484
2485 // Step 3: Adjust the successor basic block to receive the computed
2486 // recurrence's final value instead of the recurrence itself.
2487
2488 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2489 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2490
2491 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2492
2493 // The new canonical induction variable.
2494 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
2495 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2496
2497 // The induction itself.
2498 // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
2499 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2500 auto *IVNext =
2501 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2502 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2503
2504 // The loop trip count check.
2505 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2506 CurLoop->getName() + ".ivcheck");
2507 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2508 LoopHeaderBB->getTerminator()->eraseFromParent();
2509
2510 // Populate the IV PHI.
2511 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2512 IV->addIncoming(IVNext, LoopHeaderBB);
2513
2514 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2515 // loop. The loop would otherwise not be deleted even if it becomes empty.
2516
2517 SE->forgetLoop(CurLoop);
2518
2519 // Other passes will take care of actually deleting the loop if possible.
2520
2521 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2522
2523 ++NumShiftUntilBitTest;
2524 return MadeChange;
2525}
2526
2527/// Return true if the idiom is detected in the loop.
2528///
2529/// The core idiom we are trying to detect is:
2530/// \code
2531/// entry:
2532/// <...>
2533/// %start = <...>
2534/// %extraoffset = <...>
2535/// <...>
2536/// br label %for.cond
2537///
2538/// loop:
2539/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2540/// %nbits = add nsw i8 %iv, %extraoffset
2541/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2542/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2543/// %iv.next = add i8 %iv, 1
2544/// <...>
2545/// br i1 %val.shifted.iszero, label %end, label %loop
2546///
2547/// end:
2548/// %iv.res = phi i8 [ %iv, %loop ] <...>
2549/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2550/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2551/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2552/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2553/// <...>
2554/// \endcode
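/// A C-level equivalent of this idiom (an illustrative sketch, not part of
/// the original comment):
/// \code
///   int iv = start;
///   while (val >> (iv + extraoffset))
///     iv++;
/// \endcode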
2555 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2556 Instruction *&ValShiftedIsZero,
2557 Intrinsic::ID &IntrinID, Instruction *&IV,
2558 Value *&Start, Value *&Val,
2559 const SCEV *&ExtraOffsetExpr,
2560 bool &InvertedCond) {
2562 " Performing shift-until-zero idiom detection.\n");
2563
2564 // Give up if the loop has multiple blocks or multiple backedges.
2565 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2566 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2567 return false;
2568 }
2569
2570 Instruction *ValShifted, *NBits, *IVNext;
2571 Value *ExtraOffset;
2572
2573 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2574 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2575 assert(LoopPreheaderBB && "There is always a loop preheader.");
2576
2577 using namespace PatternMatch;
2578
2579 // Step 1: Check if the loop backedge and its condition are in desirable form.
2580
2581 ICmpInst::Predicate Pred;
2582 BasicBlock *TrueBB, *FalseBB;
2583 if (!match(LoopHeaderBB->getTerminator(),
2584 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2585 m_BasicBlock(FalseBB))) ||
2586 !match(ValShiftedIsZero,
2587 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2588 !ICmpInst::isEquality(Pred)) {
2589 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2590 return false;
2591 }
2592
2593 // Step 2: Check if the comparison's operand is in desirable form.
2594 // FIXME: Val could be a one-input PHI node, which we should look past.
2595 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2596 m_Instruction(NBits)))) {
2597 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
2598 return false;
2599 }
2600 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2601 : Intrinsic::ctlz;
2602
2603 // Step 3: Check if the shift amount is in desirable form.
2604
2605 if (match(NBits, m_c_Add(m_Instruction(IV),
2606 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2607 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2608 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2609 else if (match(NBits,
2610 m_Sub(m_Instruction(IV),
2611 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2612 NBits->hasNoSignedWrap())
2613 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2614 else {
2615 IV = NBits;
2616 ExtraOffsetExpr = SE->getZero(NBits->getType());
2617 }
2618
2619 // Step 4: Check if the recurrence is in desirable form.
2620 auto *IVPN = dyn_cast<PHINode>(IV);
2621 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2622 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2623 return false;
2624 }
2625
2626 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2627 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2628
2629 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2630 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2631 return false;
2632 }
2633
2634 // Step 5: Check if the backedge's destinations are in desirable form.
2635
2637 "Should only get equality predicates here.");
2638
2639 // cmp-br is commutative, so canonicalize to a single variant.
2640 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2641 if (InvertedCond) {
2642 Pred = ICmpInst::getInversePredicate(Pred);
2643 std::swap(TrueBB, FalseBB);
2644 }
2645
2646 // We expect to exit the loop when the comparison yields true,
2647 // so when it yields false we should branch back to the loop header.
2648 if (FalseBB != LoopHeaderBB) {
2649 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2650 return false;
2651 }
2652
2653 // The new, countable loop will certainly only run a known number of
2654 // iterations; it won't be infinite. But the old loop might be infinite
2655 // under certain conditions. For logical shifts, the value will become zero
2656 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2657 // right-shift, if the sign bit was set, the value will never become zero,
2658 // and the loop may never finish.
2659 if (ValShifted->getOpcode() == Instruction::AShr &&
2660 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2661 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2662 return false;
2663 }
2664
2665 // Okay, idiom checks out.
2666 return true;
2667}
2668
2669/// Look for the following loop:
2670/// \code
2671/// entry:
2672/// <...>
2673/// %start = <...>
2674/// %extraoffset = <...>
2675/// <...>
2676/// br label %for.cond
2677///
2678/// loop:
2679/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2680/// %nbits = add nsw i8 %iv, %extraoffset
2681/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2682/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2683/// %iv.next = add i8 %iv, 1
2684/// <...>
2685/// br i1 %val.shifted.iszero, label %end, label %loop
2686///
2687/// end:
2688/// %iv.res = phi i8 [ %iv, %loop ] <...>
2689/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2690/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2691/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2692/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2693/// <...>
2694/// \endcode
2695///
2696/// And transform it into:
2697/// \code
2698/// entry:
2699/// <...>
2700/// %start = <...>
2701/// %extraoffset = <...>
2702/// <...>
2703/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2704/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2705/// %extraoffset.neg = sub i8 0, %extraoffset
2706/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2707/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2708/// %loop.tripcount = sub i8 %iv.final, %start
2709/// br label %loop
2710///
2711/// loop:
2712/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2713/// %loop.iv.next = add i8 %loop.iv, 1
2714/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2715/// %iv = add i8 %loop.iv, %start
2716/// <...>
2717/// br i1 %loop.ivcheck, label %end, label %loop
2718///
2719/// end:
2720/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2721/// <...>
2722/// \endcode
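/// As a worked example (illustrative numbers, not from the original comment):
/// for i8 %val = 26 (0b00011010) with %start = 0 and %extraoffset = 0,
/// ctlz(26) = 3, so %val.numactivebits = 8 - 3 = 5 and
/// %iv.final = smax(5 + 0, 0) = 5; the original loop likewise exits once
/// 26 >> %iv becomes zero, i.e. at %iv = 5.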
2723bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2724 bool MadeChange = false;
2725
2726 Instruction *ValShiftedIsZero;
2727 Intrinsic::ID IntrID;
2728 Instruction *IV;
2729 Value *Start, *Val;
2730 const SCEV *ExtraOffsetExpr;
2731 bool InvertedCond;
2732 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2733 Start, Val, ExtraOffsetExpr, InvertedCond)) {
2735 " shift-until-zero idiom detection failed.\n");
2736 return MadeChange;
2737 }
2738 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2739
2740 // Ok, it is the idiom we were looking for; we *could* transform this loop,
2741 // but is it profitable to transform?
2742
2743 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2744 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2745 assert(LoopPreheaderBB && "There is always a loop preheader.");
2746
2747 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2748 assert(SuccessorBB && "There is only a single successor.");
2749
2750 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2751 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2752
2753 Type *Ty = Val->getType();
2754 unsigned Bitwidth = Ty->getScalarSizeInBits();
2755
2755
2756 TargetTransformInfo::TargetCostKind CostKind =
2757 TargetTransformInfo::TCK_SizeAndLatency;
2758
2759 // The rewrite is considered to be unprofitable if and only if the
2760 // intrinsic we'll use is not cheap. Note that we are okay with *just*
2761 // making the loop countable, even if nothing else changes.
2762 IntrinsicCostAttributes Attrs(
2763 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
2764 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2765 if (Cost > TargetTransformInfo::TCC_Basic) {
2766 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2767 " Intrinsic is too costly, not beneficial\n");
2768 return MadeChange;
2769 }
2770
2771 // Ok, transform appears worthwhile.
2772 MadeChange = true;
2773
2774 bool OffsetIsZero = false;
2775 if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2776 OffsetIsZero = ExtraOffsetExprC->isZero();
2777
2778 // Step 1: Compute the loop's final IV value / trip count.
2779
2780 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2781 IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
2782 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2783 Value *ValNumActiveBits = Builder.CreateSub(
2784 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2785 Val->getName() + ".numactivebits", /*HasNUW=*/true,
2786 /*HasNSW=*/Bitwidth != 2);
2787
2788 SCEVExpander Expander(*SE, *DL, "loop-idiom");
2789 Expander.setInsertPoint(&*Builder.GetInsertPoint());
2790 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2791
2792 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2793 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2794 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2795 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2796 {ValNumActiveBitsOffset, Start},
2797 /*FMFSource=*/nullptr, "iv.final");
2798
2799 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2800 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2801 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2802 // FIXME: or when the offset was `add nuw`
2803
2804 // We know loop's backedge-taken count, but what's loop's trip count?
2805 Value *LoopTripCount =
2806 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2807 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2808 /*HasNSW=*/Bitwidth != 2);
2809
2810 // Step 2: Adjust the successor basic block to receive the original
2811 // induction variable's final value instead of the orig. IV itself.
2812
2813 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2814
2815 // Step 3: Rewrite the loop into a countable form, with canonical IV.
2816
2817 // The new canonical induction variable.
2818 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
2819 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2820
2821 // The induction itself.
2822 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
2823 auto *CIVNext =
2824 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2825 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2826
2827 // The loop trip count check.
2828 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2829 CurLoop->getName() + ".ivcheck");
2830 auto *NewIVCheck = CIVCheck;
2831 if (InvertedCond) {
2832 NewIVCheck = Builder.CreateNot(CIVCheck);
2833 NewIVCheck->takeName(ValShiftedIsZero);
2834 }
2835
2836 // The original IV, but rebased to be an offset to the CIV.
2837 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2838 /*HasNSW=*/true); // FIXME: what about NUW?
2839 IVDePHId->takeName(IV);
2840
2841 // The loop terminator.
2842 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2843 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2844 LoopHeaderBB->getTerminator()->eraseFromParent();
2845
2846 // Populate the IV PHI.
2847 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2848 CIV->addIncoming(CIVNext, LoopHeaderBB);
2849
2850 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2851 // loop. The loop would otherwise not be deleted even if it becomes empty.
2852
2853 SE->forgetLoop(CurLoop);
2854
2855 // Step 5: Try to clean up the loop's body somewhat.
2856 IV->replaceAllUsesWith(IVDePHId);
2857 IV->eraseFromParent();
2858
2859 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2860 ValShiftedIsZero->eraseFromParent();
2861
2862 // Other passes will take care of actually deleting the loop if possible.
2863
2864 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2865
2866 ++NumShiftUntilZero;
2867 return MadeChange;
2868}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
std::string Name
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define DEBUG_TYPE
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
IRTranslator LLVM IR MI
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
#define DEBUG_TYPE
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
static void deleteDeadInstruction(Instruction *I)
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
Contains a collection of routines for determining if a given instruction is guaranteed to execute if ...
This header defines various interfaces for pass management in LLVM.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
verify safepoint Safepoint IR Verifier
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static const uint32_t IV[8]
Definition: blake3_impl.h:78
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:77
std::optional< uint64_t > tryZExtValue() const
Get zero extended value if possible.
Definition: APInt.h:1511
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1447
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:451
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:248
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:372
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:365
const Instruction & front() const
Definition: BasicBlock.h:461
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:457
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
Definition: BasicBlock.cpp:428
size_t size() const
Definition: BasicBlock.h:459
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
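These BasicBlock accessors are the standard toolkit for scanning a block while ignoring debug instructions. A minimal sketch under that reading (the helper name is illustrative):
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
// Count the instructions in BB, skipping debug (and pseudo) operations.
unsigned countRealInstructions(const llvm::BasicBlock &BB) {
  unsigned N = 0;
  for (const llvm::Instruction &I : BB.instructionsWithoutDebug()) {
    (void)I;   // only the count matters here
    ++N;
  }
  return N;
}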
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
Conditional or Unconditional Branch instruction.
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
This class represents a function call, abstracting a target machine's calling convention.
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:850
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:847
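The predicate accessors above allow a compare to be canonicalized in place rather than rebuilt. A minimal sketch, not this file's actual code:
#include "llvm/IR/Instructions.h"
// Invert an integer compare in place: EQ <-> NE, UGT <-> ULE, and so on.
void invertCompare(llvm::ICmpInst *Cmp) {
  Cmp->setPredicate(Cmp->getInversePredicate());
}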
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1292
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fix...
Definition: Constants.cpp:2586
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:218
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:212
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:206
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:864
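A minimal sketch of the ConstantInt convenience predicates listed here (the classify helper is illustrative):
#include "llvm/IR/Constants.h"
// Bucket a constant the way idiom checks commonly do.
const char *classify(const llvm::ConstantInt *CI) {
  if (CI->isZero())     return "zero";
  if (CI->isOne())      return "one";
  if (CI->isMinusOne()) return "all-ones";
  return "other";       // CI->getZExtValue() gives the raw 64-bit value
}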
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
NodeT * getRoot() const
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
This class represents a freeze function that returns a random concrete value if an operand is either a ...
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalObject.
Definition: Globals.cpp:137
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:231
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:459
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:172
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2410
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
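A minimal sketch of the IRBuilder entry points above, assuming the caller has already positioned the builder and declared the callee:
#include "llvm/IR/IRBuilder.h"
// Emit `Callee(i1 true)` at the builder's current insertion point.
llvm::CallInst *emitBoolCall(llvm::IRBuilder<> &B, llvm::FunctionCallee Callee) {
  llvm::Value *Flag = B.getInt1(true);   // constant i1 true
  return B.CreateCall(Callee, {Flag});   // inserted into B.GetInsertBlock()
}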
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:97
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:476
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:66
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1706
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isShift() const
Definition: Instruction.h:281
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:473
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:173
Value * getPointerOperand()
Definition: Instructions.h:252
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:202
bool isUnordered() const
Definition: Instructions.h:246
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:208
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
block_iterator block_begin() const
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
StringRef getName() const
Definition: LoopInfo.h:393
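These Loop queries are the shape checks a loop transform typically runs before attempting a rewrite. A minimal sketch with an illustrative policy, not this file's actual gating logic:
#include "llvm/Analysis/LoopInfo.h"
// Reject loops whose shape rules out a simple rewrite.
bool isSimpleLoopShape(const llvm::Loop *L) {
  if (!L->getLoopPreheader()) return false;  // need somewhere to hoist code
  if (!L->getExitBlock())     return false;  // require a unique exit block
  return L->getNumBackEdges() == 1;          // single latch
}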
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
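These intrinsic wrappers are normally reached by casting down from a plain instruction. A minimal sketch (the helper name is illustrative):
#include "llvm/IR/IntrinsicInst.h"
// Pull the length operand out of a memset/memcpy, if that is what I is.
llvm::Value *memOpLength(llvm::Instruction *I) {
  if (auto *MSI = llvm::dyn_cast<llvm::MemSetInst>(I))
    return MSI->getLength();
  if (auto *MCI = llvm::dyn_cast<llvm::MemCpyInst>(I))
    return MCI->getLength();
  return nullptr;   // not a mem set/transfer intrinsic
}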
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:928
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1814
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
const SCEV * getOperand(unsigned i) const
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
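A minimal sketch combining the ScalarEvolution queries above: confirm the backedge-taken count is analyzable before deriving a trip count (standalone of this file's actual flow):
#include "llvm/Analysis/ScalarEvolution.h"
// Return the trip-count SCEV, or null if the loop is not analyzable.
const llvm::SCEV *tryTripCount(llvm::ScalarEvolution &SE, const llvm::Loop *L) {
  if (!SE.hasLoopInvariantBackedgeTakenCount(L))
    return nullptr;                               // would be SCEVCouldNotCompute
  const llvm::SCEV *BECount = SE.getBackedgeTakenCount(L);
  return SE.getTripCountFromExitCount(BECount);   // backedge count + 1, overflow-safe
}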
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:110
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop; checks the loop body & header for the possibility of may-throw exc...
Definition: MustExecute.cpp:51
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is computed contains an instruction that may throw ...
Definition: MustExecute.cpp:47
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:361
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:418
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
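A minimal sketch of the SmallPtrSet operations listed here, deduplicating a worklist (the helper is illustrative):
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Instruction.h"
// Returns true only the first time I is seen.
bool visitOnce(llvm::SmallPtrSet<llvm::Instruction *, 8> &Seen,
               llvm::Instruction *I) {
  return Seen.insert(I).second;   // insert() reports whether insertion happened
}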
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:289
Align getAlign() const
Definition: Instructions.h:328
Value * getValueOperand()
Definition: Instructions.h:373
Value * getPointerOperand()
Definition: Instructions.h:376
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Provides information about what library functions are available for the current target.
bool has(LibFunc F) const
Tests whether a library function is available.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
unsigned getAtomicMemIntrinsicMaxElementSize() const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCC_Basic
The cost of a typical 'add' instruction.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:592
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
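A minimal sketch tying the use-list operations above together, the way a transform typically retires a replaced instruction (assumes NewV is valid at every use of OldI):
#include "llvm/IR/Instruction.h"
// Replace OldI everywhere, keep its name, then delete it.
void retire(llvm::Instruction *OldI, llvm::Value *NewV) {
  NewV->takeName(OldI);            // preserve the IR name for readability
  OldI->replaceAllUsesWith(NewV);  // rewrite every use edge
  OldI->eraseFromParent();         // OldI is now dead; unlink and delete it
}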
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
@ HeaderSize
Definition: BTF.h:61
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1484
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:619
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:245
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:189
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eq/ne/...) to Threshold.
Definition: PatternMatch.h:698
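These PatternMatch combinators compose into the structural checks idiom recognition relies on. A minimal sketch matching x & (x - 1), the clear-lowest-set-bit form that underlies the popcount idiom (the function name is illustrative):
#include "llvm/IR/PatternMatch.h"
// Does V compute X & (X - 1) for some X, in either operand order?
bool matchesClearLowBit(llvm::Value *V, llvm::Value *&X) {
  using namespace llvm::PatternMatch;
  return match(V, m_c_And(m_Value(X),
                          m_Sub(m_Specific(X), m_One())));
}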
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that in the current Module it is a...
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1150
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
@ ModRef
The access may reference and may modify the value stored in memory.
@ Mod
The access may modify the value stored in memory.
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:84
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
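isBytewiseValue is the natural gatekeeper for memset formation: a store can only become a memset if the stored pattern reduces to one repeated byte. A minimal sketch under that reading (the helper is illustrative):
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
// Can this store's value be synthesized by memset?
bool storableAsMemset(llvm::StoreInst *SI, const llvm::DataLayout &DL) {
  return llvm::isBytewiseValue(SI->getValueOperand(), DL) != nullptr;
}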
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
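A minimal sketch of decomposeBitTestICmp, presumably listed here in support of the shift-until-bittest recognition (the wrapper is illustrative):
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/IR/Instructions.h"
// Try to view an integer compare as "(X & Mask) pred 0".
bool asBitTest(llvm::ICmpInst *Cmp, llvm::Value *&X, llvm::APInt &Mask) {
  llvm::CmpInst::Predicate Pred;
  return llvm::decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
                                    Pred, X, Mask);
}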
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:760
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:780
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:783
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:777
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes...
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:786
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes o...
Definition: Metadata.h:827
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
static bool Memcpy
When true, Memcpy is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)