LLVM 23.0.0git
LoopIdiomRecognize.cpp
Go to the documentation of this file.
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize: memcmp, etc.
24//
25// This could recognize common matrix multiplies and dot product idioms and
26// replace them with calls to BLAS (if linked in??).
27//
28//===----------------------------------------------------------------------===//
29
31#include "llvm/ADT/APInt.h"
32#include "llvm/ADT/ArrayRef.h"
33#include "llvm/ADT/DenseMap.h"
34#include "llvm/ADT/MapVector.h"
35#include "llvm/ADT/SetVector.h"
38#include "llvm/ADT/Statistic.h"
39#include "llvm/ADT/StringRef.h"
57#include "llvm/IR/BasicBlock.h"
58#include "llvm/IR/Constant.h"
59#include "llvm/IR/Constants.h"
60#include "llvm/IR/DataLayout.h"
61#include "llvm/IR/DebugLoc.h"
63#include "llvm/IR/Dominators.h"
64#include "llvm/IR/GlobalValue.h"
66#include "llvm/IR/IRBuilder.h"
67#include "llvm/IR/InstrTypes.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/LLVMContext.h"
73#include "llvm/IR/Module.h"
74#include "llvm/IR/PassManager.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/User.h"
79#include "llvm/IR/Value.h"
80#include "llvm/IR/ValueHandle.h"
83#include "llvm/Support/Debug.h"
90#include <algorithm>
91#include <cassert>
92#include <cstdint>
93#include <utility>
94
95using namespace llvm;
96using namespace SCEVPatternMatch;
97
98#define DEBUG_TYPE "loop-idiom"
99
100STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
101STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
102STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
103STATISTIC(NumStrLen, "Number of strlen's and wcslen's formed from loop loads");
105 NumShiftUntilBitTest,
106 "Number of uncountable loops recognized as 'shift until bitttest' idiom");
107STATISTIC(NumShiftUntilZero,
108 "Number of uncountable loops recognized as 'shift until zero' idiom");
109
110namespace llvm {
113 DisableLIRPAll("disable-" DEBUG_TYPE "-all",
114 cl::desc("Options to disable Loop Idiom Recognize Pass."),
117
120 DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
121 cl::desc("Proceed with loop idiom recognize pass, but do "
122 "not convert loop(s) to memset."),
125
128 DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
129 cl::desc("Proceed with loop idiom recognize pass, but do "
130 "not convert loop(s) to memcpy."),
133
136 DisableLIRPStrlen("disable-loop-idiom-strlen",
137 cl::desc("Proceed with loop idiom recognize pass, but do "
138 "not convert loop(s) to strlen."),
141
144 EnableLIRPWcslen("disable-loop-idiom-wcslen",
145 cl::desc("Proceed with loop idiom recognize pass, "
146 "enable conversion of loop(s) to wcslen."),
149
152 DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize",
153 cl::desc("Proceed with loop idiom recognize pass, "
154 "but do not optimize CRC loops."),
156 cl::init(false), cl::ReallyHidden);
157
159 "use-lir-code-size-heurs",
160 cl::desc("Use loop idiom recognition code size heuristics when compiling "
161 "with -Os/-Oz"),
162 cl::init(true), cl::Hidden);
163
165 "loop-idiom-force-memset-pattern-intrinsic",
166 cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false),
167 cl::Hidden);
168
170
171} // namespace llvm
172
173namespace {
174
175class LoopIdiomRecognize {
176 Loop *CurLoop = nullptr;
178 DominatorTree *DT;
179 LoopInfo *LI;
180 ScalarEvolution *SE;
183 const DataLayout *DL;
185 bool ApplyCodeSizeHeuristics;
186 std::unique_ptr<MemorySSAUpdater> MSSAU;
187
188public:
189 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
190 LoopInfo *LI, ScalarEvolution *SE,
192 const TargetTransformInfo *TTI, MemorySSA *MSSA,
193 const DataLayout *DL,
195 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
196 if (MSSA)
197 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
198 }
199
200 bool runOnLoop(Loop *L);
201
202private:
203 using StoreList = SmallVector<StoreInst *, 8>;
204 using StoreListMap = MapVector<Value *, StoreList>;
205
206 StoreListMap StoreRefsForMemset;
207 StoreListMap StoreRefsForMemsetPattern;
208 StoreList StoreRefsForMemcpy;
209 bool HasMemset;
210 bool HasMemsetPattern;
211 bool HasMemcpy;
212
213 /// Return code for isLegalStore()
214 enum LegalStoreKind {
215 None = 0,
216 Memset,
217 MemsetPattern,
218 Memcpy,
219 UnorderedAtomicMemcpy,
220 DontUse // Dummy retval never to be used. Allows catching errors in retval
221 // handling.
222 };
223
224 /// \name Countable Loop Idiom Handling
225 /// @{
226
227 bool runOnCountableLoop();
228 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
229 SmallVectorImpl<BasicBlock *> &ExitBlocks);
230
231 void collectStores(BasicBlock *BB);
232 LegalStoreKind isLegalStore(StoreInst *SI);
233 enum class ForMemset { No, Yes };
234 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
235 ForMemset For);
236
237 template <typename MemInst>
238 bool processLoopMemIntrinsic(
239 BasicBlock *BB,
240 bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
241 const SCEV *BECount);
242 bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
243 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
244
245 bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
246 MaybeAlign StoreAlignment, Value *StoredVal,
247 Instruction *TheStore,
248 SmallPtrSetImpl<Instruction *> &Stores,
249 const SCEVAddRecExpr *Ev, const SCEV *BECount,
250 bool IsNegStride, bool IsLoopMemset = false);
251 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
252 bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
253 const SCEV *StoreSize, MaybeAlign StoreAlign,
254 MaybeAlign LoadAlign, Instruction *TheStore,
255 Instruction *TheLoad,
256 const SCEVAddRecExpr *StoreEv,
257 const SCEVAddRecExpr *LoadEv,
258 const SCEV *BECount);
259 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
260 bool IsLoopMemset = false);
261 bool optimizeCRCLoop(const PolynomialInfo &Info);
262
263 /// @}
264 /// \name Noncountable Loop Idiom Handling
265 /// @{
266
267 bool runOnNoncountableLoop();
268
269 bool recognizePopcount();
270 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
271 PHINode *CntPhi, Value *Var);
272 bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
273 bool ZeroCheck, size_t CanonicalSize);
274 bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
275 Instruction *DefX, PHINode *CntPhi,
276 Instruction *CntInst);
277 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
278 bool recognizeShiftUntilLessThan();
279 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
280 Instruction *CntInst, PHINode *CntPhi,
281 Value *Var, Instruction *DefX,
282 const DebugLoc &DL, bool ZeroCheck,
283 bool IsCntPhiUsedOutsideLoop,
284 bool InsertSub = false);
285
286 bool recognizeShiftUntilBitTest();
287 bool recognizeShiftUntilZero();
288 bool recognizeAndInsertStrLen();
289
290 /// @}
291};
292} // end anonymous namespace
293
296 LPMUpdater &) {
298 return PreservedAnalyses::all();
299
300 const auto *DL = &L.getHeader()->getDataLayout();
301
302 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
303 // pass. Function analyses need to be preserved across loop transformations
304 // but ORE cannot be preserved (see comment before the pass definition).
305 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
306
307 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
308 AR.MSSA, DL, ORE);
309 if (!LIR.runOnLoop(&L))
310 return PreservedAnalyses::all();
311
313 if (AR.MSSA)
314 PA.preserve<MemorySSAAnalysis>();
315 return PA;
316}
317
319 I->replaceAllUsesWith(PoisonValue::get(I->getType()));
320 I->eraseFromParent();
321}
322
323//===----------------------------------------------------------------------===//
324//
325// Implementation of LoopIdiomRecognize
326//
327//===----------------------------------------------------------------------===//
328
329bool LoopIdiomRecognize::runOnLoop(Loop *L) {
330 CurLoop = L;
331 // If the loop could not be converted to canonical form, it must have an
332 // indirectbr in it, just give up.
333 if (!L->getLoopPreheader())
334 return false;
335
336 // Disable loop idiom recognition if the function's name is a common idiom.
337 StringRef Name = L->getHeader()->getParent()->getName();
338 if (Name == "memset" || Name == "memcpy" || Name == "strlen" ||
339 Name == "wcslen")
340 return false;
341
342 // Determine if code size heuristics need to be applied.
343 ApplyCodeSizeHeuristics =
344 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
345
346 HasMemset = TLI->has(LibFunc_memset);
347 // TODO: Unconditionally enable use of the memset pattern intrinsic (or at
348 // least, opt-in via target hook) once we are confident it will never result
349 // in worse codegen than without. For now, use it only when the target
350 // supports memset_pattern16 libcall (or unless this is overridden by
351 // command line option).
352 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
353 HasMemcpy = TLI->has(LibFunc_memcpy);
354
355 if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic ||
356 HasMemcpy || !DisableLIRP::HashRecognize)
358 return runOnCountableLoop();
359
360 return runOnNoncountableLoop();
361}
362
363bool LoopIdiomRecognize::runOnCountableLoop() {
364 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
366 "runOnCountableLoop() called on a loop without a predictable"
367 "backedge-taken count");
368
369 // If this loop executes exactly one time, then it should be peeled, not
370 // optimized by this pass.
371 if (BECount->isZero())
372 return false;
373
375 CurLoop->getUniqueExitBlocks(ExitBlocks);
376
377 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
378 << CurLoop->getHeader()->getParent()->getName()
379 << "] Countable Loop %" << CurLoop->getHeader()->getName()
380 << "\n");
381
382 // The following transforms hoist stores/memsets into the loop pre-header.
383 // Give up if the loop has instructions that may throw.
384 SimpleLoopSafetyInfo SafetyInfo;
385 SafetyInfo.computeLoopSafetyInfo(CurLoop);
386 if (SafetyInfo.anyBlockMayThrow())
387 return false;
388
389 bool MadeChange = false;
390
391 // Scan all the blocks in the loop that are not in subloops.
392 for (auto *BB : CurLoop->getBlocks()) {
393 // Ignore blocks in subloops.
394 if (LI->getLoopFor(BB) != CurLoop)
395 continue;
396
397 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
398 }
399
400 // Optimize a CRC loop if HashRecognize found one, provided we're not
401 // optimizing for size.
402 if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics)
403 if (auto Res = HashRecognize(*CurLoop, *SE).getResult())
404 optimizeCRCLoop(*Res);
405
406 return MadeChange;
407}
408
409static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
410 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
411 return ConstStride->getAPInt();
412}
413
414/// getMemSetPatternValue - If a strided store of the specified value is safe to
415/// turn into a memset.patternn intrinsic, return the Constant that should
416/// be passed in. Otherwise, return null.
417///
418/// TODO this function could allow more constants than it does today (e.g.
419/// those over 16 bytes) now it has transitioned to being used for the
420/// memset.pattern intrinsic rather than directly the memset_pattern16
421/// libcall.
423 // FIXME: This could check for UndefValue because it can be merged into any
424 // other valid pattern.
425
426 // If the value isn't a constant, we can't promote it to being in a constant
427 // array. We could theoretically do a store to an alloca or something, but
428 // that doesn't seem worthwhile.
430 if (!C || isa<ConstantExpr>(C))
431 return nullptr;
432
433 // Only handle simple values that are a power of two bytes in size.
434 uint64_t Size = DL->getTypeSizeInBits(V->getType());
435 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
436 return nullptr;
437
438 // Don't care enough about darwin/ppc to implement this.
439 if (DL->isBigEndian())
440 return nullptr;
441
442 // Convert to size in bytes.
443 Size /= 8;
444
445 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
446 // if the top and bottom are the same (e.g. for vectors and large integers).
447 if (Size > 16)
448 return nullptr;
449
450 // For now, don't handle types that aren't int, floats, or pointers.
451 Type *CTy = C->getType();
452 if (!CTy->isIntOrPtrTy() && !CTy->isFloatingPointTy())
453 return nullptr;
454
455 return C;
456}
457
458LoopIdiomRecognize::LegalStoreKind
459LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
460 // Don't touch volatile stores.
461 if (SI->isVolatile())
462 return LegalStoreKind::None;
463 // We only want simple or unordered-atomic stores.
464 if (!SI->isUnordered())
465 return LegalStoreKind::None;
466
467 // Avoid merging nontemporal stores.
468 if (SI->getMetadata(LLVMContext::MD_nontemporal))
469 return LegalStoreKind::None;
470
471 Value *StoredVal = SI->getValueOperand();
472 Value *StorePtr = SI->getPointerOperand();
473
474 // Don't convert stores of non-integral pointer types to memsets (which stores
475 // integers).
476 if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
477 return LegalStoreKind::None;
478
479 // Reject stores that are so large that they overflow an unsigned.
480 // When storing out scalable vectors we bail out for now, since the code
481 // below currently only works for constant strides.
482 TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
483 if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
484 (SizeInBits.getFixedValue() >> 32) != 0)
485 return LegalStoreKind::None;
486
487 // See if the pointer expression is an AddRec like {base,+,1} on the current
488 // loop, which indicates a strided store. If we have something else, it's a
489 // random store we can't handle.
490 const SCEV *StoreEv = SE->getSCEV(StorePtr);
491 const SCEVConstant *Stride;
492 if (!match(StoreEv, m_scev_AffineAddRec(m_SCEV(), m_SCEVConstant(Stride),
493 m_SpecificLoop(CurLoop))))
494 return LegalStoreKind::None;
495
496 // See if the store can be turned into a memset.
497
498 // If the stored value is a byte-wise value (like i32 -1), then it may be
499 // turned into a memset of i8 -1, assuming that all the consecutive bytes
500 // are stored. A store of i32 0x01020304 can never be turned into a memset,
501 // but it can be turned into memset_pattern if the target supports it.
502 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
503
504 // Note: memset and memset_pattern on unordered-atomic is yet not supported
505 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
506
507 // If we're allowed to form a memset, and the stored value would be
508 // acceptable for memset, use it.
509 if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
510 // Verify that the stored value is loop invariant. If not, we can't
511 // promote the memset.
512 CurLoop->isLoopInvariant(SplatValue)) {
513 // It looks like we can use SplatValue.
514 return LegalStoreKind::Memset;
515 }
516 if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) &&
518 // Don't create memset_pattern16s with address spaces.
519 StorePtr->getType()->getPointerAddressSpace() == 0 &&
520 getMemSetPatternValue(StoredVal, DL)) {
521 // It looks like we can use PatternValue!
522 return LegalStoreKind::MemsetPattern;
523 }
524
525 // Otherwise, see if the store can be turned into a memcpy.
526 if (HasMemcpy && !DisableLIRP::Memcpy) {
527 // Check to see if the stride matches the size of the store. If so, then we
528 // know that every byte is touched in the loop.
529 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
530 APInt StrideAP = Stride->getAPInt();
531 if (StoreSize != StrideAP && StoreSize != -StrideAP)
532 return LegalStoreKind::None;
533
534 // The store must be feeding a non-volatile load.
535 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
536
537 // Only allow non-volatile loads
538 if (!LI || LI->isVolatile())
539 return LegalStoreKind::None;
540 // Only allow simple or unordered-atomic loads
541 if (!LI->isUnordered())
542 return LegalStoreKind::None;
543
544 // See if the pointer expression is an AddRec like {base,+,1} on the current
545 // loop, which indicates a strided load. If we have something else, it's a
546 // random load we can't handle.
547 const SCEV *LoadEv = SE->getSCEV(LI->getPointerOperand());
548
549 // The store and load must share the same stride.
550 if (!match(LoadEv, m_scev_AffineAddRec(m_SCEV(), m_scev_Specific(Stride),
551 m_SpecificLoop(CurLoop))))
552 return LegalStoreKind::None;
553
554 // Success. This store can be converted into a memcpy.
555 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
556 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
557 : LegalStoreKind::Memcpy;
558 }
559 // This store can't be transformed into a memset/memcpy.
560 return LegalStoreKind::None;
561}
562
563void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
564 StoreRefsForMemset.clear();
565 StoreRefsForMemsetPattern.clear();
566 StoreRefsForMemcpy.clear();
567 for (Instruction &I : *BB) {
569 if (!SI)
570 continue;
571
572 // Make sure this is a strided store with a constant stride.
573 switch (isLegalStore(SI)) {
574 case LegalStoreKind::None:
575 // Nothing to do
576 break;
577 case LegalStoreKind::Memset: {
578 // Find the base pointer.
579 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
580 StoreRefsForMemset[Ptr].push_back(SI);
581 } break;
582 case LegalStoreKind::MemsetPattern: {
583 // Find the base pointer.
584 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
585 StoreRefsForMemsetPattern[Ptr].push_back(SI);
586 } break;
587 case LegalStoreKind::Memcpy:
588 case LegalStoreKind::UnorderedAtomicMemcpy:
589 StoreRefsForMemcpy.push_back(SI);
590 break;
591 default:
592 assert(false && "unhandled return value");
593 break;
594 }
595 }
596}
597
598/// runOnLoopBlock - Process the specified block, which lives in a counted loop
599/// with the specified backedge count. This block is known to be in the current
600/// loop and not in any subloops.
601bool LoopIdiomRecognize::runOnLoopBlock(
602 BasicBlock *BB, const SCEV *BECount,
603 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
604 // We can only promote stores in this block if they are unconditionally
605 // executed in the loop. For a block to be unconditionally executed, it has
606 // to dominate all the exit blocks of the loop. Verify this now.
607 for (BasicBlock *ExitBlock : ExitBlocks)
608 if (!DT->dominates(BB, ExitBlock))
609 return false;
610
611 bool MadeChange = false;
612 // Look for store instructions, which may be optimized to memset/memcpy.
613 collectStores(BB);
614
615 // Look for a single store or sets of stores with a common base, which can be
616 // optimized into a memset (memset_pattern). The latter most commonly happens
617 // with structs and handunrolled loops.
618 for (auto &SL : StoreRefsForMemset)
619 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
620
621 for (auto &SL : StoreRefsForMemsetPattern)
622 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
623
624 // Optimize the store into a memcpy, if it feeds an similarly strided load.
625 for (auto &SI : StoreRefsForMemcpy)
626 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
627
628 MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
629 BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
630 MadeChange |= processLoopMemIntrinsic<MemSetInst>(
631 BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
632
633 return MadeChange;
634}
635
636/// See if this store(s) can be promoted to a memset.
637bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
638 const SCEV *BECount, ForMemset For) {
639 // Try to find consecutive stores that can be transformed into memsets.
640 SetVector<StoreInst *> Heads, Tails;
642
643 // Do a quadratic search on all of the given stores and find
644 // all of the pairs of stores that follow each other.
645 SmallVector<unsigned, 16> IndexQueue;
646 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
647 assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
648
649 Value *FirstStoredVal = SL[i]->getValueOperand();
650 Value *FirstStorePtr = SL[i]->getPointerOperand();
651 const SCEVAddRecExpr *FirstStoreEv =
652 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
653 APInt FirstStride = getStoreStride(FirstStoreEv);
654 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
655
656 // See if we can optimize just this store in isolation.
657 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
658 Heads.insert(SL[i]);
659 continue;
660 }
661
662 Value *FirstSplatValue = nullptr;
663 Constant *FirstPatternValue = nullptr;
664
665 if (For == ForMemset::Yes)
666 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
667 else
668 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
669
670 assert((FirstSplatValue || FirstPatternValue) &&
671 "Expected either splat value or pattern value.");
672
673 IndexQueue.clear();
674 // If a store has multiple consecutive store candidates, search Stores
675 // array according to the sequence: from i+1 to e, then from i-1 to 0.
676 // This is because usually pairing with immediate succeeding or preceding
677 // candidate create the best chance to find memset opportunity.
678 unsigned j = 0;
679 for (j = i + 1; j < e; ++j)
680 IndexQueue.push_back(j);
681 for (j = i; j > 0; --j)
682 IndexQueue.push_back(j - 1);
683
684 for (auto &k : IndexQueue) {
685 assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
686 Value *SecondStorePtr = SL[k]->getPointerOperand();
687 const SCEVAddRecExpr *SecondStoreEv =
688 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
689 APInt SecondStride = getStoreStride(SecondStoreEv);
690
691 if (FirstStride != SecondStride)
692 continue;
693
694 Value *SecondStoredVal = SL[k]->getValueOperand();
695 Value *SecondSplatValue = nullptr;
696 Constant *SecondPatternValue = nullptr;
697
698 if (For == ForMemset::Yes)
699 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
700 else
701 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
702
703 assert((SecondSplatValue || SecondPatternValue) &&
704 "Expected either splat value or pattern value.");
705
706 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
707 if (For == ForMemset::Yes) {
708 if (isa<UndefValue>(FirstSplatValue))
709 FirstSplatValue = SecondSplatValue;
710 if (FirstSplatValue != SecondSplatValue)
711 continue;
712 } else {
713 if (isa<UndefValue>(FirstPatternValue))
714 FirstPatternValue = SecondPatternValue;
715 if (FirstPatternValue != SecondPatternValue)
716 continue;
717 }
718 Tails.insert(SL[k]);
719 Heads.insert(SL[i]);
720 ConsecutiveChain[SL[i]] = SL[k];
721 break;
722 }
723 }
724 }
725
726 // We may run into multiple chains that merge into a single chain. We mark the
727 // stores that we transformed so that we don't visit the same store twice.
728 SmallPtrSet<Value *, 16> TransformedStores;
729 bool Changed = false;
730
731 // For stores that start but don't end a link in the chain:
732 for (StoreInst *I : Heads) {
733 if (Tails.count(I))
734 continue;
735
736 // We found a store instr that starts a chain. Now follow the chain and try
737 // to transform it.
738 SmallPtrSet<Instruction *, 8> AdjacentStores;
739 StoreInst *HeadStore = I;
740 unsigned StoreSize = 0;
741
742 // Collect the chain into a list.
743 while (Tails.count(I) || Heads.count(I)) {
744 if (TransformedStores.count(I))
745 break;
746 AdjacentStores.insert(I);
747
748 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
749 // Move to the next value in the chain.
750 I = ConsecutiveChain[I];
751 }
752
753 Value *StoredVal = HeadStore->getValueOperand();
754 Value *StorePtr = HeadStore->getPointerOperand();
755 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
756 APInt Stride = getStoreStride(StoreEv);
757
758 // Check to see if the stride matches the size of the stores. If so, then
759 // we know that every byte is touched in the loop.
760 if (StoreSize != Stride && StoreSize != -Stride)
761 continue;
762
763 bool IsNegStride = StoreSize == -Stride;
764
765 Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
766 const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
767 if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
768 MaybeAlign(HeadStore->getAlign()), StoredVal,
769 HeadStore, AdjacentStores, StoreEv, BECount,
770 IsNegStride)) {
771 TransformedStores.insert_range(AdjacentStores);
772 Changed = true;
773 }
774 }
775
776 return Changed;
777}
778
779/// processLoopMemIntrinsic - Template function for calling different processor
780/// functions based on mem intrinsic type.
781template <typename MemInst>
782bool LoopIdiomRecognize::processLoopMemIntrinsic(
783 BasicBlock *BB,
784 bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
785 const SCEV *BECount) {
786 bool MadeChange = false;
787 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
788 Instruction *Inst = &*I++;
789 // Look for memory instructions, which may be optimized to a larger one.
790 if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
791 WeakTrackingVH InstPtr(&*I);
792 if (!(this->*Processor)(MI, BECount))
793 continue;
794 MadeChange = true;
795
796 // If processing the instruction invalidated our iterator, start over from
797 // the top of the block.
798 if (!InstPtr)
799 I = BB->begin();
800 }
801 }
802 return MadeChange;
803}
804
805/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
806bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
807 const SCEV *BECount) {
808 // We can only handle non-volatile memcpys with a constant size.
809 if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
810 return false;
811
812 // If we're not allowed to hack on memcpy, we fail.
813 if ((!HasMemcpy && !MCI->isForceInlined()) || DisableLIRP::Memcpy)
814 return false;
815
816 Value *Dest = MCI->getDest();
817 Value *Source = MCI->getSource();
818 if (!Dest || !Source)
819 return false;
820
821 // See if the load and store pointer expressions are AddRec like {base,+,1} on
822 // the current loop, which indicates a strided load and store. If we have
823 // something else, it's a random load or store we can't handle.
824 const SCEV *StoreEv = SE->getSCEV(Dest);
825 const SCEV *LoadEv = SE->getSCEV(Source);
826 const APInt *StoreStrideValue, *LoadStrideValue;
827 if (!match(StoreEv,
828 m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StoreStrideValue),
829 m_SpecificLoop(CurLoop))) ||
830 !match(LoadEv,
831 m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(LoadStrideValue),
832 m_SpecificLoop(CurLoop))))
833 return false;
834
835 // Reject memcpys that are so large that they overflow an unsigned.
836 uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
837 if ((SizeInBytes >> 32) != 0)
838 return false;
839
840 // Huge stride value - give up
841 if (StoreStrideValue->getBitWidth() > 64 ||
842 LoadStrideValue->getBitWidth() > 64)
843 return false;
844
845 if (SizeInBytes != *StoreStrideValue && SizeInBytes != -*StoreStrideValue) {
846 ORE.emit([&]() {
847 return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
848 << ore::NV("Inst", "memcpy") << " in "
849 << ore::NV("Function", MCI->getFunction())
850 << " function will not be hoisted: "
851 << ore::NV("Reason", "memcpy size is not equal to stride");
852 });
853 return false;
854 }
855
856 int64_t StoreStrideInt = StoreStrideValue->getSExtValue();
857 int64_t LoadStrideInt = LoadStrideValue->getSExtValue();
858 // Check if the load stride matches the store stride.
859 if (StoreStrideInt != LoadStrideInt)
860 return false;
861
862 return processLoopStoreOfLoopLoad(
863 Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
864 MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI,
865 cast<SCEVAddRecExpr>(StoreEv), cast<SCEVAddRecExpr>(LoadEv), BECount);
866}
867
868/// processLoopMemSet - See if this memset can be promoted to a large memset.
869bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
870 const SCEV *BECount) {
871 // We can only handle non-volatile memsets.
872 if (MSI->isVolatile())
873 return false;
874
875 // If we're not allowed to hack on memset, we fail.
876 if (!HasMemset || DisableLIRP::Memset)
877 return false;
878
879 Value *Pointer = MSI->getDest();
880
881 // See if the pointer expression is an AddRec like {base,+,1} on the current
882 // loop, which indicates a strided store. If we have something else, it's a
883 // random store we can't handle.
884 const SCEV *Ev = SE->getSCEV(Pointer);
885 const SCEV *PointerStrideSCEV;
886 if (!match(Ev, m_scev_AffineAddRec(m_SCEV(), m_SCEV(PointerStrideSCEV),
887 m_SpecificLoop(CurLoop)))) {
888 LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
889 return false;
890 }
891
892 SCEVUse MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
893
894 bool IsNegStride = false;
895 const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
896
897 if (IsConstantSize) {
898 // Memset size is constant.
899 // Check if the pointer stride matches the memset size. If so, then
900 // we know that every byte is touched in the loop.
901 LLVM_DEBUG(dbgs() << " memset size is constant\n");
902 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
903 const APInt *Stride;
904 if (!match(PointerStrideSCEV, m_scev_APInt(Stride)))
905 return false;
906
907 if (SizeInBytes != *Stride && SizeInBytes != -*Stride)
908 return false;
909
910 IsNegStride = SizeInBytes == -*Stride;
911 } else {
912 // Memset size is non-constant.
913 // Check if the pointer stride matches the memset size.
914 // To be conservative, the pass would not promote pointers that aren't in
915 // address space zero. Also, the pass only handles memset length and stride
916 // that are invariant for the top level loop.
917 LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
918 if (Pointer->getType()->getPointerAddressSpace() != 0) {
919 LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
920 << "abort\n");
921 return false;
922 }
923 if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
924 LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
925 << "abort\n");
926 return false;
927 }
928
929 // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
930 IsNegStride = PointerStrideSCEV->isNonConstantNegative();
931 SCEVUse PositiveStrideSCEV =
932 IsNegStride ? SCEVUse(SE->getNegativeSCEV(PointerStrideSCEV))
933 : SCEVUse(PointerStrideSCEV);
934 LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
935 << " PositiveStrideSCEV: " << *PositiveStrideSCEV
936 << "\n");
937
938 if (PositiveStrideSCEV != MemsetSizeSCEV) {
939 // If an expression is covered by the loop guard, compare again and
940 // proceed with optimization if equal.
941 const SCEV *FoldedPositiveStride =
942 SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
943 const SCEV *FoldedMemsetSize =
944 SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);
945
946 LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
947 << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
948 << " FoldedPositiveStride: " << *FoldedPositiveStride
949 << "\n");
950
951 if (FoldedPositiveStride != FoldedMemsetSize) {
952 LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
953 return false;
954 }
955 }
956 }
957
958 // Verify that the memset value is loop invariant. If not, we can't promote
959 // the memset.
960 Value *SplatValue = MSI->getValue();
961 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
962 return false;
963
965 MSIs.insert(MSI);
966 return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
967 MSI->getDestAlign(), SplatValue, MSI, MSIs,
968 cast<SCEVAddRecExpr>(Ev), BECount, IsNegStride,
969 /*IsLoopMemset=*/true);
970}
971
972/// mayLoopAccessLocation - Return true if the specified loop might access the
973/// specified pointer location, which is a loop-strided access. The 'Access'
974/// argument specifies what the verboten forms of access are (read or write).
975static bool
977 const SCEV *BECount, const SCEV *StoreSizeSCEV,
979 SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
980 // Get the location that may be stored across the loop. Since the access is
981 // strided positively through memory, we say that the modified location starts
982 // at the pointer and has infinite size.
984
985 // If the loop iterates a fixed number of times, we can refine the access size
986 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
987 const APInt *BECst, *ConstSize;
988 if (match(BECount, m_scev_APInt(BECst)) &&
989 match(StoreSizeSCEV, m_scev_APInt(ConstSize))) {
990 std::optional<uint64_t> BEInt = BECst->tryZExtValue();
991 std::optional<uint64_t> SizeInt = ConstSize->tryZExtValue();
992 // FIXME: Should this check for overflow?
993 if (BEInt && SizeInt)
994 AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
995 }
996
997 // TODO: For this to be really effective, we have to dive into the pointer
998 // operand in the store. Store to &A[i] of 100 will always return may alias
999 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
1000 // which will then no-alias a store to &A[100].
1001 MemoryLocation StoreLoc(Ptr, AccessSize);
1002
1003 for (BasicBlock *B : L->blocks())
1004 for (Instruction &I : *B)
1005 if (!IgnoredInsts.contains(&I) &&
1006 isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
1007 return true;
1008 return false;
1009}
1010
1011// If we have a negative stride, Start refers to the end of the memory location
1012// we're trying to memset. Therefore, we need to recompute the base pointer,
1013// which is just Start - BECount*Size.
1014static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
1015 Type *IntPtr, const SCEV *StoreSizeSCEV,
1016 ScalarEvolution *SE) {
1017 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
1018 if (!StoreSizeSCEV->isOne()) {
1019 // index = back edge count * store size
1020 Index = SE->getMulExpr(Index,
1021 SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
1023 }
1024 // base pointer = start - index * store size
1025 return SE->getMinusSCEV(Start, Index);
1026}
1027
1028/// Compute the number of bytes as a SCEV from the backedge taken count.
1029///
1030/// This also maps the SCEV into the provided type and tries to handle the
1031/// computation in a way that will fold cleanly.
1032static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
1033 const SCEV *StoreSizeSCEV, Loop *CurLoop,
1034 const DataLayout *DL, ScalarEvolution *SE) {
1035 const SCEV *TripCountSCEV =
1036 SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
1037 return SE->getMulExpr(TripCountSCEV,
1038 SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
1040}
1041
1042/// processLoopStridedStore - We see a strided store of some value. If we can
1043/// transform this into a memset or memset_pattern in the loop preheader, do so.
1044bool LoopIdiomRecognize::processLoopStridedStore(
1045 Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
1046 Value *StoredVal, Instruction *TheStore,
1048 const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
1049 Module *M = TheStore->getModule();
1050
1051 // The trip count of the loop and the base pointer of the addrec SCEV is
1052 // guaranteed to be loop invariant, which means that it should dominate the
1053 // header. This allows us to insert code for it in the preheader.
1054 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
1055 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1056 IRBuilder<> Builder(Preheader->getTerminator());
1057 SCEVExpander Expander(*SE, "loop-idiom");
1058 SCEVExpanderCleaner ExpCleaner(Expander);
1059
1060 Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
1061 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
1062
1063 bool Changed = false;
1064 const SCEV *Start = Ev->getStart();
1065 // Handle negative strided loops.
1066 if (IsNegStride)
1067 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
1068
1069 // TODO: ideally we should still be able to generate memset if SCEV expander
1070 // is taught to generate the dependencies at the latest point.
1071 if (!Expander.isSafeToExpand(Start))
1072 return Changed;
1073
1074 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
1075 // this into a memset in the loop preheader now if we want. However, this
1076 // would be unsafe to do if there is anything else in the loop that may read
1077 // or write to the aliased location. Check for any overlap by generating the
1078 // base pointer and checking the region.
1079 Value *BasePtr =
1080 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
1081
1082 // From here on out, conservatively report to the pass manager that we've
1083 // changed the IR, even if we later clean up these added instructions. There
1084 // may be structural differences e.g. in the order of use lists not accounted
1085 // for in just a textual dump of the IR. This is written as a variable, even
1086 // though statically all the places this dominates could be replaced with
1087 // 'true', with the hope that anyone trying to be clever / "more precise" with
1088 // the return value will read this comment, and leave them alone.
1089 Changed = true;
1090
1091 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1092 StoreSizeSCEV, *AA, Stores))
1093 return Changed;
1094
1095 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
1096 return Changed;
1097
1098 // Okay, everything looks good, insert the memset.
1099 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
1100 Constant *PatternValue = nullptr;
1101 if (!SplatValue)
1102 PatternValue = getMemSetPatternValue(StoredVal, DL);
1103
1104 // MemsetArg is the number of bytes for the memset libcall, and the number
1105 // of pattern repetitions if the memset.pattern intrinsic is being used.
1106 Value *MemsetArg;
1107 std::optional<int64_t> BytesWritten;
1108
1109 if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) {
1110 const SCEV *TripCountS =
1111 SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop);
1112 if (!Expander.isSafeToExpand(TripCountS))
1113 return Changed;
1114 const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1115 if (!ConstStoreSize)
1116 return Changed;
1117 Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy,
1118 Preheader->getTerminator());
1119 uint64_t PatternRepsPerTrip =
1120 (ConstStoreSize->getValue()->getZExtValue() * 8) /
1121 DL->getTypeSizeInBits(PatternValue->getType());
1122 // If ConstStoreSize is not equal to the width of PatternValue, then
1123 // MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else
1124 // MemSetArg is just TripCount.
1125 MemsetArg =
1126 PatternRepsPerTrip == 1
1127 ? TripCount
1128 : Builder.CreateMul(TripCount,
1129 Builder.getIntN(IntIdxTy->getIntegerBitWidth(),
1130 PatternRepsPerTrip));
1131 if (auto *CI = dyn_cast<ConstantInt>(TripCount))
1132 BytesWritten =
1133 CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue();
1134
1135 } else {
1136 const SCEV *NumBytesS =
1137 getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1138
1139 // TODO: ideally we should still be able to generate memset if SCEV expander
1140 // is taught to generate the dependencies at the latest point.
1141 if (!Expander.isSafeToExpand(NumBytesS))
1142 return Changed;
1143 MemsetArg =
1144 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1145 if (auto *CI = dyn_cast<ConstantInt>(MemsetArg))
1146 BytesWritten = CI->getZExtValue();
1147 }
1148 assert(MemsetArg && "MemsetArg should have been set");
1149
1150 AAMDNodes AATags = TheStore->getAAMetadata();
1151 for (Instruction *Store : Stores)
1152 AATags = AATags.merge(Store->getAAMetadata());
1153 if (BytesWritten)
1154 AATags = AATags.extendTo(BytesWritten.value());
1155 else
1156 AATags = AATags.extendTo(-1);
1157
1158 CallInst *NewCall;
1159 if (SplatValue) {
1160 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, MemsetArg,
1161 MaybeAlign(StoreAlignment),
1162 /*isVolatile=*/false, AATags);
1163 } else if (ForceMemsetPatternIntrinsic ||
1164 isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
1165 assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");
1166
1167 NewCall = Builder.CreateIntrinsic(
1168 Intrinsic::experimental_memset_pattern,
1169 {DestInt8PtrTy, PatternValue->getType(), IntIdxTy},
1170 {BasePtr, PatternValue, MemsetArg,
1171 ConstantInt::getFalse(M->getContext())});
1172 if (StoreAlignment)
1173 cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment);
1174 NewCall->setAAMetadata(AATags);
1175 } else {
1176 // Neither a memset, nor memset_pattern16
1177 return Changed;
1178 }
1179
1180 NewCall->setDebugLoc(TheStore->getDebugLoc());
1181
1182 if (MSSAU) {
1183 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1184 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1185 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1186 }
1187
1188 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
1189 << " from store to: " << *Ev << " at: " << *TheStore
1190 << "\n");
1191
1192 ORE.emit([&]() {
1193 OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
1194 NewCall->getDebugLoc(), Preheader);
1195 R << "Transformed loop-strided store in "
1196 << ore::NV("Function", TheStore->getFunction())
1197 << " function into a call to "
1198 << ore::NV("NewFunction", NewCall->getCalledFunction())
1199 << "() intrinsic";
1200 if (!Stores.empty())
1201 R << ore::setExtraArgs();
1202 for (auto *I : Stores) {
1203 R << ore::NV("FromBlock", I->getParent()->getName())
1204 << ore::NV("ToBlock", Preheader->getName());
1205 }
1206 return R;
1207 });
1208
1209 // Okay, the memset has been formed. Zap the original store and anything that
1210 // feeds into it.
1211 for (auto *I : Stores) {
1212 if (MSSAU)
1213 MSSAU->removeMemoryAccess(I, true);
1215 }
1216 if (MSSAU && VerifyMemorySSA)
1217 MSSAU->getMemorySSA()->verifyMemorySSA();
1218 ++NumMemSet;
1219 ExpCleaner.markResultUsed();
1220 return true;
1221}
1222
1223/// If the stored value is a strided load in the same loop with the same stride
1224/// this may be transformable into a memcpy. This kicks in for stuff like
1225/// for (i) A[i] = B[i];
1226bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1227 const SCEV *BECount) {
1228 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1229
1230 Value *StorePtr = SI->getPointerOperand();
1231 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1232 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1233
1234 // The store must be feeding a non-volatile load.
1235 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1236 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1237
1238 // See if the pointer expression is an AddRec like {base,+,1} on the current
1239 // loop, which indicates a strided load. If we have something else, it's a
1240 // random load we can't handle.
1241 Value *LoadPtr = LI->getPointerOperand();
1242 const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1243
1244 const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
1245 return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
1246 SI->getAlign(), LI->getAlign(), SI, LI,
1247 StoreEv, LoadEv, BECount);
1248}
1249
1250namespace {
1251class MemmoveVerifier {
1252public:
1253 explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
1254 const DataLayout &DL)
1256 LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
1258 StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
1259 IsSameObject(BP1 == BP2) {}
1260
1261 bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
1262 const Instruction &TheLoad,
1263 bool IsMemCpy) const {
1264 if (IsMemCpy) {
1265 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1266 // for negative stride.
1267 if ((!IsNegStride && LoadOff <= StoreOff) ||
1268 (IsNegStride && LoadOff >= StoreOff))
1269 return false;
1270 } else {
1271 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1272 // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
1273 int64_t LoadSize =
1274 DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
1275 if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
1276 return false;
1277 if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
1278 (IsNegStride && LoadOff + LoadSize > StoreOff))
1279 return false;
1280 }
1281 return true;
1282 }
1283
1284private:
1285 const DataLayout &DL;
1286 int64_t LoadOff = 0;
1287 int64_t StoreOff = 0;
1288 const Value *BP1;
1289 const Value *BP2;
1290
1291public:
1292 const bool IsSameObject;
1293};
1294} // namespace
1295
1296bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1297 Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
1298 MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1299 Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
1300 const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
1301
1302 // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1303 // conservatively bail here, since otherwise we may have to transform
1304 // llvm.memcpy.inline into llvm.memcpy which is illegal.
1305 if (auto *MCI = dyn_cast<MemCpyInst>(TheStore); MCI && MCI->isForceInlined())
1306 return false;
1307
1308 // The trip count of the loop and the base pointer of the addrec SCEV is
1309 // guaranteed to be loop invariant, which means that it should dominate the
1310 // header. This allows us to insert code for it in the preheader.
1311 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1312 IRBuilder<> Builder(Preheader->getTerminator());
1313 SCEVExpander Expander(*SE, "loop-idiom");
1314
1315 SCEVExpanderCleaner ExpCleaner(Expander);
1316
1317 bool Changed = false;
1318 const SCEV *StrStart = StoreEv->getStart();
1319 unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1320 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1321
1322 APInt Stride = getStoreStride(StoreEv);
1323 const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1324
1325 // TODO: Deal with non-constant size; Currently expect constant store size
1326 assert(ConstStoreSize && "store size is expected to be a constant");
1327
1328 int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1329 bool IsNegStride = StoreSize == -Stride;
1330
1331 // Handle negative strided loops.
1332 if (IsNegStride)
1333 StrStart =
1334 getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1335
1336 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1337 // this into a memcpy in the loop preheader now if we want. However, this
1338 // would be unsafe to do if there is anything else in the loop that may read
1339 // or write the memory region we're storing to. This includes the load that
1340 // feeds the stores. Check for an alias by generating the base address and
1341 // checking everything.
1342 Value *StoreBasePtr = Expander.expandCodeFor(
1343 StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());
1344
1345 // From here on out, conservatively report to the pass manager that we've
1346 // changed the IR, even if we later clean up these added instructions. There
1347 // may be structural differences e.g. in the order of use lists not accounted
1348 // for in just a textual dump of the IR. This is written as a variable, even
1349 // though statically all the places this dominates could be replaced with
1350 // 'true', with the hope that anyone trying to be clever / "more precise" with
1351 // the return value will read this comment, and leave them alone.
1352 Changed = true;
1353
1354 SmallPtrSet<Instruction *, 2> IgnoredInsts;
1355 IgnoredInsts.insert(TheStore);
1356
1357 bool IsMemCpy = isa<MemCpyInst>(TheStore);
1358 const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1359
1360 bool LoopAccessStore =
1361 mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1362 StoreSizeSCEV, *AA, IgnoredInsts);
1363 if (LoopAccessStore) {
1364 // For memmove case it's not enough to guarantee that loop doesn't access
1365 // TheStore and TheLoad. Additionally we need to make sure that TheStore is
1366 // the only user of TheLoad.
1367 if (!TheLoad->hasOneUse())
1368 return Changed;
1369 IgnoredInsts.insert(TheLoad);
1370 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
1371 BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
1372 ORE.emit([&]() {
1373 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
1374 TheStore)
1375 << ore::NV("Inst", InstRemark) << " in "
1376 << ore::NV("Function", TheStore->getFunction())
1377 << " function will not be hoisted: "
1378 << ore::NV("Reason", "The loop may access store location");
1379 });
1380 return Changed;
1381 }
1382 IgnoredInsts.erase(TheLoad);
1383 }
1384
1385 const SCEV *LdStart = LoadEv->getStart();
1386 unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1387
1388 // Handle negative strided loops.
1389 if (IsNegStride)
1390 LdStart =
1391 getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1392
1393 // For a memcpy, we have to make sure that the input array is not being
1394 // mutated by the loop.
1395 Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
1396 Preheader->getTerminator());
1397
1398 // If the store is a memcpy instruction, we must check if it will write to
1399 // the load memory locations. So remove it from the ignored stores.
1400 MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
1401 if (IsMemCpy && !Verifier.IsSameObject)
1402 IgnoredInsts.erase(TheStore);
1403 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1404 StoreSizeSCEV, *AA, IgnoredInsts)) {
1405 ORE.emit([&]() {
1406 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1407 << ore::NV("Inst", InstRemark) << " in "
1408 << ore::NV("Function", TheStore->getFunction())
1409 << " function will not be hoisted: "
1410 << ore::NV("Reason", "The loop may access load location");
1411 });
1412 return Changed;
1413 }
1414
1415 bool IsAtomic = TheStore->isAtomic() || TheLoad->isAtomic();
1416 bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1417
1418 if (IsAtomic) {
1419 // For now don't support unordered atomic memmove.
1420 if (UseMemMove)
1421 return Changed;
1422
1423 // We cannot allow unaligned ops for unordered load/store, so reject
1424 // anything where the alignment isn't at least the element size.
1425 assert((StoreAlign && LoadAlign) &&
1426 "Expect unordered load/store to have align.");
1427 if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
1428 return Changed;
1429
1430 // If the element.atomic memcpy is not lowered into explicit
1431 // loads/stores later, then it will be lowered into an element-size
1432 // specific lib call. If the lib call doesn't exist for our store size, then
1433 // we shouldn't generate the memcpy.
1434 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1435 return Changed;
1436 }
1437
1438 if (UseMemMove)
1439 if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
1440 IsMemCpy))
1441 return Changed;
1442
1443 if (avoidLIRForMultiBlockLoop())
1444 return Changed;
1445
1446 // Okay, everything is safe, we can transform this!
1447
1448 const SCEV *NumBytesS =
1449 getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1450
1451 Value *NumBytes =
1452 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1453
1454 AAMDNodes AATags = TheLoad->getAAMetadata();
1455 AAMDNodes StoreAATags = TheStore->getAAMetadata();
1456 AATags = AATags.merge(StoreAATags);
1457 if (auto CI = dyn_cast<ConstantInt>(NumBytes))
1458 AATags = AATags.extendTo(CI->getZExtValue());
1459 else
1460 AATags = AATags.extendTo(-1);
1461
1462 CallInst *NewCall = nullptr;
1463 // Check whether to generate an unordered atomic memcpy:
1464 // If the load or store are atomic, then they must necessarily be unordered
1465 // by previous checks.
1466 if (!IsAtomic) {
1467 if (UseMemMove)
1468 NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
1469 LoadAlign, NumBytes,
1470 /*isVolatile=*/false, AATags);
1471 else
1472 NewCall =
1473 Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
1474 NumBytes, /*isVolatile=*/false, AATags);
1475 } else {
1476 // Create the call.
1477 // Note that unordered atomic loads/stores are *required* by the spec to
1478 // have an alignment but non-atomic loads/stores may not.
1479 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1480 StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
1481 AATags);
1482 }
1483 NewCall->setDebugLoc(TheStore->getDebugLoc());
1484
1485 if (MSSAU) {
1486 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1487 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1488 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1489 }
1490
1491 LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1492 << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1493 << "\n"
1494 << " from store ptr=" << *StoreEv << " at: " << *TheStore
1495 << "\n");
1496
1497 ORE.emit([&]() {
1498 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1499 NewCall->getDebugLoc(), Preheader)
1500 << "Formed a call to "
1501 << ore::NV("NewFunction", NewCall->getCalledFunction())
1502 << "() intrinsic from " << ore::NV("Inst", InstRemark)
1503 << " instruction in " << ore::NV("Function", TheStore->getFunction())
1504 << " function"
1506 << ore::NV("FromBlock", TheStore->getParent()->getName())
1507 << ore::NV("ToBlock", Preheader->getName());
1508 });
1509
1510 // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1511 // and anything that feeds into it.
1512 if (MSSAU)
1513 MSSAU->removeMemoryAccess(TheStore, true);
1514 deleteDeadInstruction(TheStore);
1515 if (MSSAU && VerifyMemorySSA)
1516 MSSAU->getMemorySSA()->verifyMemorySSA();
1517 if (UseMemMove)
1518 ++NumMemMove;
1519 else
1520 ++NumMemCpy;
1521 ExpCleaner.markResultUsed();
1522 return true;
1523}
1524
1525// When compiling for codesize we avoid idiom recognition for a multi-block loop
1526// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1527//
1528bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1529 bool IsLoopMemset) {
1530 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1531 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1532 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1533 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1534 << " avoided: multi-block top-level loop\n");
1535 return true;
1536 }
1537 }
1538
1539 return false;
1540}
1541
1542bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
1543 // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using
1544 // carry-less multiplication instructions, which is more efficient than our
1545 // Sarwate table-lookup optimization. Hence, until we're able to emit
1546 // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom,
1547 // disable the optimization for Hexagon.
1548 Module &M = *CurLoop->getHeader()->getModule();
1549 Triple TT(M.getTargetTriple());
1550 if (TT.getArch() == Triple::hexagon)
1551 return false;
1552
1553 // First, create a new GlobalVariable corresponding to the
1554 // Sarwate-lookup-table.
1555 Type *CRCTy = Info.LHS->getType();
1556 unsigned CRCBW = CRCTy->getIntegerBitWidth();
1557 std::array<Constant *, 256> CRCConstants;
1558 transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped),
1559 CRCConstants.begin(),
1560 [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); });
1561 Constant *ConstArray =
1562 ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants);
1563 GlobalVariable *GV =
1564 new GlobalVariable(M, ConstArray->getType(), true,
1565 GlobalValue::PrivateLinkage, ConstArray, ".crctable");
1566
1569
1570 // Next, mark all PHIs for removal except IV.
1571 {
1572 for (PHINode &PN : CurLoop->getHeader()->phis()) {
1573 if (&PN == IV)
1574 continue;
1575 PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
1576 Cleanup.push_back(&PN);
1577 }
1578 }
1579
1580 // Next, fix up the trip count.
1581 {
1582 unsigned NewBTC = (Info.TripCount / 8) - 1;
1583 BasicBlock *LoopBlk = CurLoop->getLoopLatch();
1584 CondBrInst *BrInst = cast<CondBrInst>(LoopBlk->getTerminator());
1585 CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
1588 Instruction *ExitCond = CurLoop->getLatchCmpInst();
1589 Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC);
1590 IRBuilder<> Builder(ExitCond);
1591 Value *NewExitCond =
1592 Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond");
1593 ExitCond->replaceAllUsesWith(NewExitCond);
1594 deleteDeadInstruction(ExitCond);
1595 }
1596
1597 // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all
1598 // uses of ComputedValue.
1599 //
1600 // Little-endian:
1601 // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)]
1602 // Big-Endian:
1603 // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)]
1604 {
1605 auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) {
1606 return Builder.CreateZExtOrTrunc(
1607 Op, IntegerType::getInt8Ty(Op->getContext()), Name);
1608 };
1609 auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op,
1610 const Twine &Name) {
1611 Type *OpTy = Op->getType();
1612
1613 // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to
1614 // use the CRC's bitwidth as the reference for shifting right.
1615 return LoByte(Builder,
1616 CRCBW > 8 ? Builder.CreateLShr(
1617 Op, ConstantInt::get(OpTy, CRCBW - 8), Name)
1618 : Op,
1619 Name + ".lo.byte");
1620 };
1621
1622 IRBuilder<> Builder(CurLoop->getHeader(),
1623 CurLoop->getHeader()->getFirstNonPHIIt());
1624
1625 // Create the CRC PHI, and initialize its incoming value to the initial
1626 // value of CRC.
1627 PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc");
1628 CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader());
1629
1630 // CRC is now an evolving variable, initialized to the PHI.
1631 Value *CRC = CRCPhi;
1632
1633 // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte
1634 // of LHSAux), if LHSAux is non-nullptr.
1635 Value *Indexer = CRC;
1636 if (Value *Data = Info.LHSAux) {
1637 Type *DataTy = Data->getType();
1638
1639 // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we
1640 // shift right by that amount, and take the lo-byte (in the little-endian
1641 // case), or shift left by that amount, and take the hi-idx (in the
1642 // big-endian case).
1643 Value *IVBits = Builder.CreateZExtOrTrunc(
1644 Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer");
1645 Value *DataIndexer =
1646 Info.ByteOrderSwapped
1647 ? Builder.CreateShl(Data, IVBits, "data.indexer")
1648 : Builder.CreateLShr(Data, IVBits, "data.indexer");
1649 Indexer = Builder.CreateXor(
1650 DataIndexer,
1651 Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"),
1652 "crc.data.indexer");
1653 }
1654
1655 Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi")
1656 : LoByte(Builder, Indexer, "indexer.lo");
1657
1658 // Always index into a GEP using the index type.
1659 Indexer = Builder.CreateZExt(
1660 Indexer, SE->getDataLayout().getIndexType(GV->getType()),
1661 "indexer.ext");
1662
1663 // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC].
1664 Value *CRCTableGEP =
1665 Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd");
1666 Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld");
1667
1668 // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of
1669 // CRC-8.
1670 Value *CRCNext = CRCTableLd;
1671 if (CRCBW > 8) {
1672 Value *CRCShift = Info.ByteOrderSwapped
1673 ? Builder.CreateShl(CRC, 8, "crc.be.shift")
1674 : Builder.CreateLShr(CRC, 8, "crc.le.shift");
1675 CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next");
1676 }
1677
1678 // Connect the back-edge for the loop, and RAUW the ComputedValue.
1679 CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch());
1680 Info.ComputedValue->replaceUsesOutsideBlock(CRCNext,
1681 CurLoop->getLoopLatch());
1682 }
1683
1684 // Cleanup.
1685 {
1686 for (PHINode *PN : Cleanup)
1688 SE->forgetLoop(CurLoop);
1689 }
1690 return true;
1691}
1692
1693bool LoopIdiomRecognize::runOnNoncountableLoop() {
1694 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1695 << CurLoop->getHeader()->getParent()->getName()
1696 << "] Noncountable Loop %"
1697 << CurLoop->getHeader()->getName() << "\n");
1698
1699 return recognizePopcount() || recognizeAndInsertFFS() ||
1700 recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
1701 recognizeShiftUntilLessThan() || recognizeAndInsertStrLen();
1702}
1703
1704/// Check if the given conditional branch is based on the comparison between
1705/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1706/// true), the control yields to the loop entry. If the branch matches the
1707/// behavior, the variable involved in the comparison is returned. This function
1708/// will be called to see if the precondition and postcondition of the loop are
1709/// in desirable form.
1711 bool JmpOnZero = false) {
1713 if (!Cond)
1714 return nullptr;
1715
1716 auto *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1717 if (!CmpZero || !CmpZero->isZero())
1718 return nullptr;
1719
1720 BasicBlock *TrueSucc = BI->getSuccessor(0);
1721 BasicBlock *FalseSucc = BI->getSuccessor(1);
1722 if (JmpOnZero)
1723 std::swap(TrueSucc, FalseSucc);
1724
1725 ICmpInst::Predicate Pred = Cond->getPredicate();
1726 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1727 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1728 return Cond->getOperand(0);
1729
1730 return nullptr;
1731}
1732
1733namespace {
1734
1735class StrlenVerifier {
1736public:
1737 explicit StrlenVerifier(const Loop *CurLoop, ScalarEvolution *SE,
1738 const TargetLibraryInfo *TLI)
1739 : CurLoop(CurLoop), SE(SE), TLI(TLI) {}
1740
1741 bool isValidStrlenIdiom() {
1742 // Give up if the loop has multiple blocks, multiple backedges, or
1743 // multiple exit blocks
1744 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
1745 !CurLoop->getUniqueExitBlock())
1746 return false;
1747
1748 // It should have a preheader and a branch instruction.
1749 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1750 if (!Preheader ||
1752 return false;
1753
1754 // The loop exit must be conditioned on an icmp with 0 the null terminator.
1755 // The icmp operand has to be a load on some SSA reg that increments
1756 // by 1 in the loop.
1757 BasicBlock *LoopBody = *CurLoop->block_begin();
1758
1759 // Skip if the body is too big as it most likely is not a strlen idiom.
1760 if (!LoopBody || LoopBody->size() >= 15)
1761 return false;
1762
1763 CondBrInst *LoopTerm = dyn_cast<CondBrInst>(LoopBody->getTerminator());
1764 if (!LoopTerm)
1765 return false;
1766 Value *LoopCond = matchCondition(LoopTerm, LoopBody);
1767 if (!LoopCond)
1768 return false;
1769
1770 LoadInst *LoopLoad = dyn_cast<LoadInst>(LoopCond);
1771 if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
1772 return false;
1773
1774 OperandType = LoopLoad->getType();
1775 if (!OperandType || !OperandType->isIntegerTy())
1776 return false;
1777
1778 // See if the pointer expression is an AddRec with constant step a of form
1779 // ({n,+,a}) where a is the width of the char type.
1780 Value *IncPtr = LoopLoad->getPointerOperand();
1781 const SCEV *LoadEv = SE->getSCEV(IncPtr);
1782 const APInt *Step;
1783 if (!match(LoadEv,
1784 m_scev_AffineAddRec(m_SCEV(LoadBaseEv), m_scev_APInt(Step))))
1785 return false;
1786
1787 LLVM_DEBUG(dbgs() << "pointer load scev: " << *LoadEv << "\n");
1788
1789 unsigned StepSize = Step->getZExtValue();
1790
1791 // Verify that StepSize is consistent with platform char width.
1792 OpWidth = OperandType->getIntegerBitWidth();
1793 unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
1794 if (OpWidth != StepSize * 8)
1795 return false;
1796 if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
1797 return false;
1798 if (OpWidth >= 16)
1799 if (OpWidth != WcharSize * 8)
1800 return false;
1801
1802 // Scan every instruction in the loop to ensure there are no side effects.
1803 for (Instruction &I : *LoopBody)
1804 if (I.mayHaveSideEffects())
1805 return false;
1806
1807 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1808 if (!LoopExitBB)
1809 return false;
1810
1811 for (PHINode &PN : LoopExitBB->phis()) {
1812 if (!SE->isSCEVable(PN.getType()))
1813 return false;
1814
1815 const SCEV *Ev = SE->getSCEV(&PN);
1816 if (!Ev)
1817 return false;
1818
1819 LLVM_DEBUG(dbgs() << "loop exit phi scev: " << *Ev << "\n");
1820
1821 // Since we verified that the loop trip count will be a valid strlen
1822 // idiom, we can expand all lcssa phi with {n,+,1} as (n + strlen) and use
1823 // SCEVExpander materialize the loop output.
1824 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1825 if (!AddRecEv || !AddRecEv->isAffine())
1826 return false;
1827
1828 // We only want RecAddExpr with recurrence step that is constant. This
1829 // is good enough for all the idioms we want to recognize. Later we expand
1830 // and materialize the recurrence as {base,+,a} -> (base + a * strlen)
1831 if (!isa<SCEVConstant>(AddRecEv->getStepRecurrence(*SE)))
1832 return false;
1833 }
1834
1835 return true;
1836 }
1837
1838public:
1839 const Loop *CurLoop;
1840 ScalarEvolution *SE;
1841 const TargetLibraryInfo *TLI;
1842
1843 unsigned OpWidth;
1844 ConstantInt *StepSizeCI;
1845 const SCEV *LoadBaseEv;
1847};
1848
1849} // namespace
1850
1851/// The Strlen Idiom we are trying to detect has the following structure
1852///
1853/// preheader:
1854/// ...
1855/// br label %body, ...
1856///
1857/// body:
1858/// ... ; %0 is incremented by a gep
1859/// %1 = load i8, ptr %0, align 1
1860/// %2 = icmp eq i8 %1, 0
1861/// br i1 %2, label %exit, label %body
1862///
1863/// exit:
1864/// %lcssa = phi [%0, %body], ...
1865///
1866/// We expect the strlen idiom to have a load of a character type that
1867/// is compared against '\0', and such load pointer operand must have scev
1868/// expression of the form {%str,+,c} where c is a ConstantInt of the
1869/// appropiate character width for the idiom, and %str is the base of the string
1870/// And, that all lcssa phis have the form {...,+,n} where n is a constant,
1871///
1872/// When transforming the output of the strlen idiom, the lccsa phi are
1873/// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a * strlen)
1874/// and all subsequent uses are replaced. For example,
1875///
1876/// \code{.c}
1877/// const char* base = str;
1878/// while (*str != '\0')
1879/// ++str;
1880/// size_t result = str - base;
1881/// \endcode
1882///
1883/// will be transformed as follows: The idiom will be replaced by a strlen
1884/// computation to compute the address of the null terminator of the string.
1885///
1886/// \code{.c}
1887/// const char* base = str;
1888/// const char* end = base + strlen(str);
1889/// size_t result = end - base;
1890/// \endcode
1891///
1892/// In the case we index by an induction variable, as long as the induction
1893/// variable has a constant int increment, we can replace all such indvars
1894/// with the closed form computation of strlen
1895///
1896/// \code{.c}
1897/// size_t i = 0;
1898/// while (str[i] != '\0')
1899/// ++i;
1900/// size_t result = i;
1901/// \endcode
1902///
1903/// Will be replaced by
1904///
1905/// \code{.c}
1906/// size_t i = 0 + strlen(str);
1907/// size_t result = i;
1908/// \endcode
1909///
1910bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
1911 if (DisableLIRP::All)
1912 return false;
1913
1914 StrlenVerifier Verifier(CurLoop, SE, TLI);
1915
1916 if (!Verifier.isValidStrlenIdiom())
1917 return false;
1918
1919 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1920 BasicBlock *LoopBody = *CurLoop->block_begin();
1921 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1922 CondBrInst *LoopTerm = cast<CondBrInst>(LoopBody->getTerminator());
1923 assert(Preheader && LoopBody && LoopExitBB &&
1924 "Should be verified to be valid by StrlenVerifier");
1925
1926 if (Verifier.OpWidth == 8) {
1928 return false;
1929 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_strlen))
1930 return false;
1931 } else {
1933 return false;
1934 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_wcslen))
1935 return false;
1936 }
1937
1938 IRBuilder<> Builder(Preheader->getTerminator());
1939 Builder.SetCurrentDebugLocation(CurLoop->getStartLoc());
1940 SCEVExpander Expander(*SE, "strlen_idiom");
1941 Value *MaterialzedBase = Expander.expandCodeFor(
1942 Verifier.LoadBaseEv, Verifier.LoadBaseEv->getType(),
1943 Builder.GetInsertPoint());
1944
1945 Value *StrLenFunc = nullptr;
1946 if (Verifier.OpWidth == 8) {
1947 StrLenFunc = emitStrLen(MaterialzedBase, Builder, *DL, TLI);
1948 } else {
1949 StrLenFunc = emitWcsLen(MaterialzedBase, Builder, *DL, TLI);
1950 }
1951 assert(StrLenFunc && "Failed to emit strlen function.");
1952
1953 const SCEV *StrlenEv = SE->getSCEV(StrLenFunc);
1955 for (PHINode &PN : LoopExitBB->phis()) {
1956 // We can now materialize the loop output as all phi have scev {base,+,a}.
1957 // We expand the phi as:
1958 // %strlen = call i64 @strlen(%str)
1959 // %phi.new = base expression + step * %strlen
1960 const SCEV *Ev = SE->getSCEV(&PN);
1961 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1962 const SCEVConstant *Step =
1964 const SCEV *Base = AddRecEv->getStart();
1965
1966 // It is safe to truncate to base since if base is narrower than size_t
1967 // the equivalent user code will have to truncate anyways.
1968 const SCEV *NewEv = SE->getAddExpr(
1970 StrlenEv, Base->getType())));
1971
1972 Value *MaterializedPHI = Expander.expandCodeFor(NewEv, NewEv->getType(),
1973 Builder.GetInsertPoint());
1974 Expander.clear();
1975 PN.replaceAllUsesWith(MaterializedPHI);
1976 Cleanup.push_back(&PN);
1977 }
1978
1979 // All LCSSA Loop Phi are dead, the left over dead loop body can be cleaned
1980 // up by later passes
1981 for (PHINode *PN : Cleanup)
1983
1984 // LoopDeletion only delete invariant loops with known trip-count. We can
1985 // update the condition so it will reliablely delete the invariant loop
1986 assert((LoopTerm->getSuccessor(0) == LoopBody ||
1987 LoopTerm->getSuccessor(1) == LoopBody) &&
1988 "loop body must have a successor that is it self");
1989 ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
1990 ? Builder.getFalse()
1991 : Builder.getTrue();
1992 LoopTerm->setCondition(NewLoopCond);
1993 SE->forgetLoop(CurLoop);
1994
1995 ++NumStrLen;
1996 LLVM_DEBUG(dbgs() << " Formed strlen idiom: " << *StrLenFunc << "\n");
1997 ORE.emit([&]() {
1998 return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
1999 CurLoop->getStartLoc(), Preheader)
2000 << "Transformed " << StrLenFunc->getName() << " loop idiom";
2001 });
2002
2003 return true;
2004}
2005
2006/// Check if the given conditional branch is based on an unsigned less-than
2007/// comparison between a variable and a constant, and if the comparison is false
2008/// the control yields to the loop entry. If the branch matches the behaviour,
2009/// the variable involved in the comparison is returned.
2011 APInt &Threshold) {
2013 if (!Cond)
2014 return nullptr;
2015
2016 ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
2017 if (!CmpConst)
2018 return nullptr;
2019
2020 BasicBlock *FalseSucc = BI->getSuccessor(1);
2021 ICmpInst::Predicate Pred = Cond->getPredicate();
2022
2023 if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
2024 Threshold = CmpConst->getValue();
2025 return Cond->getOperand(0);
2026 }
2027
2028 return nullptr;
2029}
2030
2031// Check if the recurrence variable `VarX` is in the right form to create
2032// the idiom. Returns the value coerced to a PHINode if so.
2034 BasicBlock *LoopEntry) {
2035 auto *PhiX = dyn_cast<PHINode>(VarX);
2036 if (PhiX && PhiX->getParent() == LoopEntry &&
2037 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
2038 return PhiX;
2039 return nullptr;
2040}
2041
2042/// Return true if the idiom is detected in the loop.
2043///
2044/// Additionally:
2045/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2046/// or nullptr if there is no such.
2047/// 2) \p CntPhi is set to the corresponding phi node
2048/// or nullptr if there is no such.
2049/// 3) \p InitX is set to the value whose CTLZ could be used.
2050/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2051/// 5) \p Threshold is set to the constant involved in the unsigned less-than
2052/// comparison.
2053///
2054/// The core idiom we are trying to detect is:
2055/// \code
2056/// if (x0 < 2)
2057/// goto loop-exit // the precondition of the loop
2058/// cnt0 = init-val
2059/// do {
2060/// x = phi (x0, x.next); //PhiX
2061/// cnt = phi (cnt0, cnt.next)
2062///
2063/// cnt.next = cnt + 1;
2064/// ...
2065/// x.next = x >> 1; // DefX
2066/// } while (x >= 4)
2067/// loop-exit:
2068/// \endcode
2070 Intrinsic::ID &IntrinID,
2071 Value *&InitX, Instruction *&CntInst,
2072 PHINode *&CntPhi, Instruction *&DefX,
2073 APInt &Threshold) {
2074 BasicBlock *LoopEntry;
2075
2076 DefX = nullptr;
2077 CntInst = nullptr;
2078 CntPhi = nullptr;
2079 LoopEntry = *(CurLoop->block_begin());
2080
2081 // step 1: Check if the loop-back branch is in desirable form.
2082 auto *EntryBI = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
2083 if (!EntryBI)
2084 return false;
2085 if (Value *T = matchShiftULTCondition(EntryBI, LoopEntry, Threshold))
2086 DefX = dyn_cast<Instruction>(T);
2087 else
2088 return false;
2089
2090 // step 2: Check the recurrence of variable X
2091 if (!DefX || !isa<PHINode>(DefX))
2092 return false;
2093
2094 PHINode *VarPhi = cast<PHINode>(DefX);
2095 int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
2096 if (Idx == -1)
2097 return false;
2098
2099 DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
2100 if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
2101 return false;
2102
2103 // step 3: detect instructions corresponding to "x.next = x >> 1"
2104 if (DefX->getOpcode() != Instruction::LShr)
2105 return false;
2106
2107 IntrinID = Intrinsic::ctlz;
2109 if (!Shft || !Shft->isOne())
2110 return false;
2111
2112 InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2113
2114 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
2115 // or cnt.next = cnt + -1.
2116 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2117 // then all uses of "cnt.next" could be optimized to the trip count
2118 // plus "cnt0". Currently it is not optimized.
2119 // This step could be used to detect POPCNT instruction:
2120 // cnt.next = cnt + (x.next & 1)
2121 for (Instruction &Inst :
2122 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2123 if (Inst.getOpcode() != Instruction::Add)
2124 continue;
2125
2127 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2128 continue;
2129
2130 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2131 if (!Phi)
2132 continue;
2133
2134 CntInst = &Inst;
2135 CntPhi = Phi;
2136 break;
2137 }
2138 if (!CntInst)
2139 return false;
2140
2141 return true;
2142}
2143
2144/// Return true iff the idiom is detected in the loop.
2145///
2146/// Additionally:
2147/// 1) \p CntInst is set to the instruction counting the population bit.
2148/// 2) \p CntPhi is set to the corresponding phi node.
2149/// 3) \p Var is set to the value whose population bits are being counted.
2150///
2151/// The core idiom we are trying to detect is:
2152/// \code
2153/// if (x0 != 0)
2154/// goto loop-exit // the precondition of the loop
2155/// cnt0 = init-val;
2156/// do {
2157/// x1 = phi (x0, x2);
2158/// cnt1 = phi(cnt0, cnt2);
2159///
2160/// cnt2 = cnt1 + 1;
2161/// ...
2162/// x2 = x1 & (x1 - 1);
2163/// ...
2164/// } while(x != 0);
2165///
2166/// loop-exit:
2167/// \endcode
2168static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
2169 Instruction *&CntInst, PHINode *&CntPhi,
2170 Value *&Var) {
2171 // step 1: Check to see if the look-back branch match this pattern:
2172 // "if (a!=0) goto loop-entry".
2173 BasicBlock *LoopEntry;
2174 Instruction *DefX2, *CountInst;
2175 Value *VarX1, *VarX0;
2176 PHINode *PhiX, *CountPhi;
2177
2178 DefX2 = CountInst = nullptr;
2179 VarX1 = VarX0 = nullptr;
2180 PhiX = CountPhi = nullptr;
2181 LoopEntry = *(CurLoop->block_begin());
2182
2183 // step 1: Check if the loop-back branch is in desirable form.
2184 {
2185 auto *LoopTerm = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
2186 if (!LoopTerm)
2187 return false;
2188 DefX2 = dyn_cast_or_null<Instruction>(matchCondition(LoopTerm, LoopEntry));
2189 }
2190
2191 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
2192 {
2193 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
2194 return false;
2195
2196 BinaryOperator *SubOneOp;
2197
2198 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
2199 VarX1 = DefX2->getOperand(1);
2200 else {
2201 VarX1 = DefX2->getOperand(0);
2202 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
2203 }
2204 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
2205 return false;
2206
2207 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
2208 if (!Dec ||
2209 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
2210 (SubOneOp->getOpcode() == Instruction::Add &&
2211 Dec->isMinusOne()))) {
2212 return false;
2213 }
2214 }
2215
2216 // step 3: Check the recurrence of variable X
2217 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
2218 if (!PhiX)
2219 return false;
2220
2221 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
2222 {
2223 CountInst = nullptr;
2224 for (Instruction &Inst :
2225 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2226 if (Inst.getOpcode() != Instruction::Add)
2227 continue;
2228
2230 if (!Inc || !Inc->isOne())
2231 continue;
2232
2233 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2234 if (!Phi)
2235 continue;
2236
2237 // Check if the result of the instruction is live of the loop.
2238 bool LiveOutLoop = false;
2239 for (User *U : Inst.users()) {
2240 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
2241 LiveOutLoop = true;
2242 break;
2243 }
2244 }
2245
2246 if (LiveOutLoop) {
2247 CountInst = &Inst;
2248 CountPhi = Phi;
2249 break;
2250 }
2251 }
2252
2253 if (!CountInst)
2254 return false;
2255 }
2256
2257 // step 5: check if the precondition is in this form:
2258 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
2259 {
2260 auto *PreCondBr = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
2261 if (!PreCondBr)
2262 return false;
2263 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
2264 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
2265 return false;
2266
2267 CntInst = CountInst;
2268 CntPhi = CountPhi;
2269 Var = T;
2270 }
2271
2272 return true;
2273}
2274
2275/// Return true if the idiom is detected in the loop.
2276///
2277/// Additionally:
2278/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2279/// or nullptr if there is no such.
2280/// 2) \p CntPhi is set to the corresponding phi node
2281/// or nullptr if there is no such.
2282/// 3) \p Var is set to the value whose CTLZ could be used.
2283/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2284///
2285/// The core idiom we are trying to detect is:
2286/// \code
2287/// if (x0 == 0)
2288/// goto loop-exit // the precondition of the loop
2289/// cnt0 = init-val;
2290/// do {
2291/// x = phi (x0, x.next); //PhiX
2292/// cnt = phi(cnt0, cnt.next);
2293///
2294/// cnt.next = cnt + 1;
2295/// ...
2296/// x.next = x >> 1; // DefX
2297/// ...
2298/// } while(x.next != 0);
2299///
2300/// loop-exit:
2301/// \endcode
2302static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
2303 Intrinsic::ID &IntrinID, Value *&InitX,
2304 Instruction *&CntInst, PHINode *&CntPhi,
2305 Instruction *&DefX) {
2306 BasicBlock *LoopEntry;
2307 Value *VarX = nullptr;
2308
2309 DefX = nullptr;
2310 CntInst = nullptr;
2311 CntPhi = nullptr;
2312 LoopEntry = *(CurLoop->block_begin());
2313
2314 // step 1: Check if the loop-back branch is in desirable form.
2315 auto *LoopTerm = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
2316 if (!LoopTerm)
2317 return false;
2318 DefX = dyn_cast_or_null<Instruction>(matchCondition(LoopTerm, LoopEntry));
2319
2320 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
2321 if (!DefX || !DefX->isShift())
2322 return false;
2323 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
2324 Intrinsic::ctlz;
2326 if (!Shft || !Shft->isOne())
2327 return false;
2328 VarX = DefX->getOperand(0);
2329
2330 // step 3: Check the recurrence of variable X
2331 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
2332 if (!PhiX)
2333 return false;
2334
2335 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2336
2337 // Make sure the initial value can't be negative otherwise the ashr in the
2338 // loop might never reach zero which would make the loop infinite.
2339 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
2340 return false;
2341
2342 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
2343 // or cnt.next = cnt + -1.
2344 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2345 // then all uses of "cnt.next" could be optimized to the trip count
2346 // plus "cnt0". Currently it is not optimized.
2347 // This step could be used to detect POPCNT instruction:
2348 // cnt.next = cnt + (x.next & 1)
2349 for (Instruction &Inst :
2350 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2351 if (Inst.getOpcode() != Instruction::Add)
2352 continue;
2353
2355 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2356 continue;
2357
2358 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2359 if (!Phi)
2360 continue;
2361
2362 CntInst = &Inst;
2363 CntPhi = Phi;
2364 break;
2365 }
2366 if (!CntInst)
2367 return false;
2368
2369 return true;
2370}
2371
2372// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
2373// profitable if we delete the loop.
2374bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
2375 Value *InitX, bool ZeroCheck,
2376 size_t CanonicalSize) {
2377 const Value *Args[] = {InitX,
2378 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
2379
2380 uint32_t HeaderSize = CurLoop->getHeader()->size();
2381
2382 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
2383 InstructionCost Cost = TTI->getIntrinsicInstrCost(
2385 if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
2386 return false;
2387
2388 return true;
2389}
2390
2391/// Convert CTLZ / CTTZ idiom loop into countable loop.
2392/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
2393/// returns false.
2394bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
2395 Value *InitX, Instruction *DefX,
2396 PHINode *CntPhi,
2397 Instruction *CntInst) {
2398 bool IsCntPhiUsedOutsideLoop = false;
2399 for (User *U : CntPhi->users())
2400 if (!CurLoop->contains(cast<Instruction>(U))) {
2401 IsCntPhiUsedOutsideLoop = true;
2402 break;
2403 }
2404 bool IsCntInstUsedOutsideLoop = false;
2405 for (User *U : CntInst->users())
2406 if (!CurLoop->contains(cast<Instruction>(U))) {
2407 IsCntInstUsedOutsideLoop = true;
2408 break;
2409 }
2410 // If both CntInst and CntPhi are used outside the loop the profitability
2411 // is questionable.
2412 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
2413 return false;
2414
2415 // For some CPUs result of CTLZ(X) intrinsic is undefined
2416 // when X is 0. If we can not guarantee X != 0, we need to check this
2417 // when expand.
2418 bool ZeroCheck = false;
2419 // It is safe to assume Preheader exist as it was checked in
2420 // parent function RunOnLoop.
2421 BasicBlock *PH = CurLoop->getLoopPreheader();
2422
2423 // If we are using the count instruction outside the loop, make sure we
2424 // have a zero check as a precondition. Without the check the loop would run
2425 // one iteration for before any check of the input value. This means 0 and 1
2426 // would have identical behavior in the original loop and thus
2427 if (!IsCntPhiUsedOutsideLoop) {
2428 auto *PreCondBB = PH->getSinglePredecessor();
2429 if (!PreCondBB)
2430 return false;
2431 auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
2432 if (!PreCondBI)
2433 return false;
2434 if (matchCondition(PreCondBI, PH) != InitX)
2435 return false;
2436 ZeroCheck = true;
2437 }
2438
2439 // FFS idiom loop has only 6 instructions:
2440 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2441 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2442 // %shr = ashr %n.addr.0, 1
2443 // %tobool = icmp eq %shr, 0
2444 // %inc = add nsw %i.0, 1
2445 // br i1 %tobool
2446 size_t IdiomCanonicalSize = 6;
2447 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2448 return false;
2449
2450 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2451 DefX->getDebugLoc(), ZeroCheck,
2452 IsCntPhiUsedOutsideLoop);
2453 return true;
2454}
2455
2456/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
2457/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
2458/// trip count returns true; otherwise, returns false.
2459bool LoopIdiomRecognize::recognizeAndInsertFFS() {
2460 // Give up if the loop has multiple blocks or multiple backedges.
2461 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2462 return false;
2463
2464 Intrinsic::ID IntrinID;
2465 Value *InitX;
2466 Instruction *DefX = nullptr;
2467 PHINode *CntPhi = nullptr;
2468 Instruction *CntInst = nullptr;
2469
2470 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
2471 DefX))
2472 return false;
2473
2474 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2475}
2476
2477bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2478 // Give up if the loop has multiple blocks or multiple backedges.
2479 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2480 return false;
2481
2482 Intrinsic::ID IntrinID;
2483 Value *InitX;
2484 Instruction *DefX = nullptr;
2485 PHINode *CntPhi = nullptr;
2486 Instruction *CntInst = nullptr;
2487
2488 APInt LoopThreshold;
2489 if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
2490 CntPhi, DefX, LoopThreshold))
2491 return false;
2492
2493 if (LoopThreshold == 2) {
2494 // Treat as regular FFS.
2495 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2496 }
2497
2498 // Look for Floor Log2 Idiom.
2499 if (LoopThreshold != 4)
2500 return false;
2501
2502 // Abort if CntPhi is used outside of the loop.
2503 for (User *U : CntPhi->users())
2504 if (!CurLoop->contains(cast<Instruction>(U)))
2505 return false;
2506
2507 // It is safe to assume Preheader exist as it was checked in
2508 // parent function RunOnLoop.
2509 BasicBlock *PH = CurLoop->getLoopPreheader();
2510 auto *PreCondBB = PH->getSinglePredecessor();
2511 if (!PreCondBB)
2512 return false;
2513 auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
2514 if (!PreCondBI)
2515 return false;
2516
2517 APInt PreLoopThreshold;
2518 if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
2519 PreLoopThreshold != 2)
2520 return false;
2521
2522 bool ZeroCheck = true;
2523
2524 // the loop has only 6 instructions:
2525 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2526 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2527 // %shr = ashr %n.addr.0, 1
2528 // %tobool = icmp ult %n.addr.0, C
2529 // %inc = add nsw %i.0, 1
2530 // br i1 %tobool
2531 size_t IdiomCanonicalSize = 6;
2532 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2533 return false;
2534
2535 // log2(x) = w − 1 − clz(x)
2536 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2537 DefX->getDebugLoc(), ZeroCheck,
2538 /*IsCntPhiUsedOutsideLoop=*/false,
2539 /*InsertSub=*/true);
2540 return true;
2541}
2542
2543/// Recognizes a population count idiom in a non-countable loop.
2544///
2545/// If detected, transforms the relevant code to issue the popcount intrinsic
2546/// function call, and returns true; otherwise, returns false.
2547bool LoopIdiomRecognize::recognizePopcount() {
2548 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
2549 return false;
2550
2551 // Counting population are usually conducted by few arithmetic instructions.
2552 // Such instructions can be easily "absorbed" by vacant slots in a
2553 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
2554 // in a compact loop.
2555
2556 // Give up if the loop has multiple blocks or multiple backedges.
2557 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2558 return false;
2559
2560 BasicBlock *LoopBody = *(CurLoop->block_begin());
2561 if (LoopBody->size() >= 20) {
2562 // The loop is too big, bail out.
2563 return false;
2564 }
2565
2566 // It should have a preheader containing nothing but an unconditional branch.
2567 BasicBlock *PH = CurLoop->getLoopPreheader();
2568 if (!PH || &PH->front() != PH->getTerminator())
2569 return false;
2570 auto *EntryBI = dyn_cast<UncondBrInst>(PH->getTerminator());
2571 if (!EntryBI)
2572 return false;
2573
2574 // It should have a precondition block where the generated popcount intrinsic
2575 // function can be inserted.
2576 auto *PreCondBB = PH->getSinglePredecessor();
2577 if (!PreCondBB)
2578 return false;
2579 auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
2580 if (!PreCondBI)
2581 return false;
2582
2583 Instruction *CntInst;
2584 PHINode *CntPhi;
2585 Value *Val;
2586 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
2587 return false;
2588
2589 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
2590 return true;
2591}
2592
2594 const DebugLoc &DL) {
2595 Value *Ops[] = {Val};
2596 Type *Tys[] = {Val->getType()};
2597
2598 CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops);
2599 CI->setDebugLoc(DL);
2600
2601 return CI;
2602}
2603
2605 const DebugLoc &DL, bool ZeroCheck,
2606 Intrinsic::ID IID) {
2607 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2608 Type *Tys[] = {Val->getType()};
2609
2610 CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops);
2611 CI->setDebugLoc(DL);
2612
2613 return CI;
2614}
2615
2616/// Transform the following loop (Using CTLZ, CTTZ is similar):
2617/// loop:
2618/// CntPhi = PHI [Cnt0, CntInst]
2619/// PhiX = PHI [InitX, DefX]
2620/// CntInst = CntPhi + 1
2621/// DefX = PhiX >> 1
2622/// LOOP_BODY
2623/// Br: loop if (DefX != 0)
2624/// Use(CntPhi) or Use(CntInst)
2625///
2626/// Into:
2627/// If CntPhi used outside the loop:
2628/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2629/// Count = CountPrev + 1
2630/// else
2631/// Count = BitWidth(InitX) - CTLZ(InitX)
2632/// loop:
2633/// CntPhi = PHI [Cnt0, CntInst]
2634/// PhiX = PHI [InitX, DefX]
2635/// PhiCount = PHI [Count, Dec]
2636/// CntInst = CntPhi + 1
2637/// DefX = PhiX >> 1
2638/// Dec = PhiCount - 1
2639/// LOOP_BODY
2640/// Br: loop if (Dec != 0)
2641/// Use(CountPrev + Cnt0) // Use(CntPhi)
2642/// or
2643/// Use(Count + Cnt0) // Use(CntInst)
2644///
2645/// If LOOP_BODY is empty the loop will be deleted.
2646/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2647void LoopIdiomRecognize::transformLoopToCountable(
2648 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2649 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2650 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2651 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2652 IRBuilder<> Builder(Preheader->getTerminator());
2653 Builder.SetCurrentDebugLocation(DL);
2654
2655 // If there are no uses of CntPhi crate:
2656 // Count = BitWidth - CTLZ(InitX);
2657 // NewCount = Count;
2658 // If there are uses of CntPhi create:
2659 // NewCount = BitWidth - CTLZ(InitX >> 1);
2660 // Count = NewCount + 1;
2661 Value *InitXNext;
2662 if (IsCntPhiUsedOutsideLoop) {
2663 if (DefX->getOpcode() == Instruction::AShr)
2664 InitXNext = Builder.CreateAShr(InitX, 1);
2665 else if (DefX->getOpcode() == Instruction::LShr)
2666 InitXNext = Builder.CreateLShr(InitX, 1);
2667 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2668 InitXNext = Builder.CreateShl(InitX, 1);
2669 else
2670 llvm_unreachable("Unexpected opcode!");
2671 } else
2672 InitXNext = InitX;
2673 Value *Count =
2674 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2675 Type *CountTy = Count->getType();
2676 Count = Builder.CreateSub(
2677 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2678 if (InsertSub)
2679 Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
2680 Value *NewCount = Count;
2681 if (IsCntPhiUsedOutsideLoop)
2682 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2683
2684 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2685
2686 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2687 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2688 // If the counter was being incremented in the loop, add NewCount to the
2689 // counter's initial value, but only if the initial value is not zero.
2690 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2691 if (!InitConst || !InitConst->isZero())
2692 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2693 } else {
2694 // If the count was being decremented in the loop, subtract NewCount from
2695 // the counter's initial value.
2696 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2697 }
2698
2699 // Step 2: Insert new IV and loop condition:
2700 // loop:
2701 // ...
2702 // PhiCount = PHI [Count, Dec]
2703 // ...
2704 // Dec = PhiCount - 1
2705 // ...
2706 // Br: loop if (Dec != 0)
2707 BasicBlock *Body = *(CurLoop->block_begin());
2708 auto *LbBr = cast<CondBrInst>(Body->getTerminator());
2709 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2710
2711 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
2712 TcPhi->insertBefore(Body->begin());
2713
2714 Builder.SetInsertPoint(LbCond);
2715 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2716 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2717
2718 TcPhi->addIncoming(Count, Preheader);
2719 TcPhi->addIncoming(TcDec, Body);
2720
2721 CmpInst::Predicate Pred =
2722 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2723 LbCond->setPredicate(Pred);
2724 LbCond->setOperand(0, TcDec);
2725 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2726
2727 // Step 3: All the references to the original counter outside
2728 // the loop are replaced with the NewCount
2729 if (IsCntPhiUsedOutsideLoop)
2730 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2731 else
2732 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2733
2734 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2735 // loop. The loop would otherwise not be deleted even if it becomes empty.
2736 SE->forgetLoop(CurLoop);
2737}
2738
2739void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2740 Instruction *CntInst,
2741 PHINode *CntPhi, Value *Var) {
2742 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2743 auto *PreCondBr = cast<CondBrInst>(PreCondBB->getTerminator());
2744 const DebugLoc &DL = CntInst->getDebugLoc();
2745
2746 // Assuming before transformation, the loop is following:
2747 // if (x) // the precondition
2748 // do { cnt++; x &= x - 1; } while(x);
2749
2750 // Step 1: Insert the ctpop instruction at the end of the precondition block
2751 IRBuilder<> Builder(PreCondBr);
2752 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2753 {
2754 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2755 NewCount = PopCntZext =
2756 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2757
2758 if (NewCount != PopCnt)
2759 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2760
2761 // TripCnt is exactly the number of iterations the loop has
2762 TripCnt = NewCount;
2763
2764 // If the population counter's initial value is not zero, insert Add Inst.
2765 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2766 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2767 if (!InitConst || !InitConst->isZero()) {
2768 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2769 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2770 }
2771 }
2772
2773 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2774 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2775 // function would be partial dead code, and downstream passes will drag
2776 // it back from the precondition block to the preheader.
2777 {
2778 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2779
2780 Value *Opnd0 = PopCntZext;
2781 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2782 if (PreCond->getOperand(0) != Var)
2783 std::swap(Opnd0, Opnd1);
2784
2785 ICmpInst *NewPreCond = cast<ICmpInst>(
2786 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2787 PreCondBr->setCondition(NewPreCond);
2788
2790 }
2791
2792 // Step 3: Note that the population count is exactly the trip count of the
2793 // loop in question, which enable us to convert the loop from noncountable
2794 // loop into a countable one. The benefit is twofold:
2795 //
2796 // - If the loop only counts population, the entire loop becomes dead after
2797 // the transformation. It is a lot easier to prove a countable loop dead
2798 // than to prove a noncountable one. (In some C dialects, an infinite loop
2799 // isn't dead even if it computes nothing useful. In general, DCE needs
2800 // to prove a noncountable loop finite before safely delete it.)
2801 //
2802 // - If the loop also performs something else, it remains alive.
2803 // Since it is transformed to countable form, it can be aggressively
2804 // optimized by some optimizations which are in general not applicable
2805 // to a noncountable loop.
2806 //
2807 // After this step, this loop (conceptually) would look like following:
2808 // newcnt = __builtin_ctpop(x);
2809 // t = newcnt;
2810 // if (x)
2811 // do { cnt++; x &= x-1; t--) } while (t > 0);
2812 BasicBlock *Body = *(CurLoop->block_begin());
2813 {
2814 auto *LbBr = cast<CondBrInst>(Body->getTerminator());
2815 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2816 Type *Ty = TripCnt->getType();
2817
2818 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
2819 TcPhi->insertBefore(Body->begin());
2820
2821 Builder.SetInsertPoint(LbCond);
2823 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2824 "tcdec", false, true));
2825
2826 TcPhi->addIncoming(TripCnt, PreHead);
2827 TcPhi->addIncoming(TcDec, Body);
2828
2829 CmpInst::Predicate Pred =
2830 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2831 LbCond->setPredicate(Pred);
2832 LbCond->setOperand(0, TcDec);
2833 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2834 }
2835
2836 // Step 4: All the references to the original population counter outside
2837 // the loop are replaced with the NewCount -- the value returned from
2838 // __builtin_ctpop().
2839 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2840
2841 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2842 // loop. The loop would otherwise not be deleted even if it becomes empty.
2843 SE->forgetLoop(CurLoop);
2844}
2845
2846/// Match loop-invariant value.
2847template <typename SubPattern_t> struct match_LoopInvariant {
2848 SubPattern_t SubPattern;
2849 const Loop *L;
2850
2851 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2852 : SubPattern(SP), L(L) {}
2853
2854 template <typename ITy> bool match(ITy *V) const {
2855 return L->isLoopInvariant(V) && SubPattern.match(V);
2856 }
2857};
2858
2859/// Matches if the value is loop-invariant.
2860template <typename Ty>
2861inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2862 return match_LoopInvariant<Ty>(M, L);
2863}
2864
2865/// Return true if the idiom is detected in the loop.
2866///
2867/// The core idiom we are trying to detect is:
2868/// \code
2869/// entry:
2870/// <...>
2871/// %bitmask = shl i32 1, %bitpos
2872/// br label %loop
2873///
2874/// loop:
2875/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2876/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2877/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2878/// %x.next = shl i32 %x.curr, 1
2879/// <...>
2880/// br i1 %x.curr.isbitunset, label %loop, label %end
2881///
2882/// end:
2883/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2884/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2885/// <...>
2886/// \endcode
2887static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2888 Value *&BitMask, Value *&BitPos,
2889 Value *&CurrX, Instruction *&NextX) {
2891 " Performing shift-until-bittest idiom detection.\n");
2892
2893 // Give up if the loop has multiple blocks or multiple backedges.
2894 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2895 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2896 return false;
2897 }
2898
2899 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2900 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2901 assert(LoopPreheaderBB && "There is always a loop preheader.");
2902
2903 using namespace PatternMatch;
2904
2905 // Step 1: Check if the loop backedge is in desirable form.
2906
2907 CmpPredicate Pred;
2908 Value *CmpLHS, *CmpRHS;
2909 BasicBlock *TrueBB, *FalseBB;
2910 if (!match(LoopHeaderBB->getTerminator(),
2911 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2912 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2913 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2914 return false;
2915 }
2916
2917 // Step 2: Check if the backedge's condition is in desirable form.
2918
2919 auto MatchVariableBitMask = [&]() {
2920 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2921 match(CmpLHS,
2922 m_c_And(m_Value(CurrX),
2924 m_Value(BitMask),
2925 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2926 CurLoop))));
2927 };
2928
2929 auto MatchDecomposableConstantBitMask = [&]() {
2930 auto Res = llvm::decomposeBitTestICmp(
2931 CmpLHS, CmpRHS, Pred, /*LookThroughTrunc=*/true,
2932 /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
2933 if (Res && Res->Mask.isPowerOf2()) {
2934 assert(ICmpInst::isEquality(Res->Pred));
2935 Pred = Res->Pred;
2936 CurrX = Res->X;
2937 BitMask = ConstantInt::get(CurrX->getType(), Res->Mask);
2938 BitPos = ConstantInt::get(CurrX->getType(), Res->Mask.logBase2());
2939 return true;
2940 }
2941 return false;
2942 };
2943
2944 if (!MatchVariableBitMask() && !MatchDecomposableConstantBitMask()) {
2945 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2946 return false;
2947 }
2948
2949 // Step 3: Check if the recurrence is in desirable form.
2950 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2951 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2952 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2953 return false;
2954 }
2955
2956 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2957 NextX =
2958 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2959
2960 assert(CurLoop->isLoopInvariant(BaseX) &&
2961 "Expected BaseX to be available in the preheader!");
2962
2963 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2964 // FIXME: support right-shift?
2965 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2966 return false;
2967 }
2968
2969 // Step 4: Check if the backedge's destinations are in desirable form.
2970
2972 "Should only get equality predicates here.");
2973
2974 // cmp-br is commutative, so canonicalize to a single variant.
2975 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2976 Pred = ICmpInst::getInversePredicate(Pred);
2977 std::swap(TrueBB, FalseBB);
2978 }
2979
2980 // We expect to exit loop when comparison yields false,
2981 // so when it yields true we should branch back to loop header.
2982 if (TrueBB != LoopHeaderBB) {
2983 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2984 return false;
2985 }
2986
2987 // Okay, idiom checks out.
2988 return true;
2989}
2990
2991/// Look for the following loop:
2992/// \code
2993/// entry:
2994/// <...>
2995/// %bitmask = shl i32 1, %bitpos
2996/// br label %loop
2997///
2998/// loop:
2999/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
3000/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
3001/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
3002/// %x.next = shl i32 %x.curr, 1
3003/// <...>
3004/// br i1 %x.curr.isbitunset, label %loop, label %end
3005///
3006/// end:
3007/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
3008/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
3009/// <...>
3010/// \endcode
3011///
3012/// And transform it into:
3013/// \code
3014/// entry:
3015/// %bitmask = shl i32 1, %bitpos
3016/// %lowbitmask = add i32 %bitmask, -1
3017/// %mask = or i32 %lowbitmask, %bitmask
3018/// %x.masked = and i32 %x, %mask
3019/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
3020/// i1 true)
3021/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
3022/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
3023/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
3024/// %tripcount = add i32 %backedgetakencount, 1
3025/// %x.curr = shl i32 %x, %backedgetakencount
3026/// %x.next = shl i32 %x, %tripcount
3027/// br label %loop
3028///
3029/// loop:
3030/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
3031/// %loop.iv.next = add nuw i32 %loop.iv, 1
3032/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
3033/// <...>
3034/// br i1 %loop.ivcheck, label %end, label %loop
3035///
3036/// end:
3037/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
3038/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
3039/// <...>
3040/// \endcode
3041bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
3042 bool MadeChange = false;
3043
3044 Value *X, *BitMask, *BitPos, *XCurr;
3045 Instruction *XNext;
3046 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
3047 XNext)) {
3049 " shift-until-bittest idiom detection failed.\n");
3050 return MadeChange;
3051 }
3052 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
3053
3054 // Ok, it is the idiom we were looking for, we *could* transform this loop,
3055 // but is it profitable to transform?
3056
3057 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3058 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3059 assert(LoopPreheaderBB && "There is always a loop preheader.");
3060
3061 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3062 assert(SuccessorBB && "There is only a single successor.");
3063
3064 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3065 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
3066
3067 Intrinsic::ID IntrID = Intrinsic::ctlz;
3068 Type *Ty = X->getType();
3069 unsigned Bitwidth = Ty->getScalarSizeInBits();
3070
3073
3074 // The rewrite is considered to be unprofitable iff and only iff the
3075 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
3076 // making the loop countable, even if nothing else changes.
3078 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
3079 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3082 " Intrinsic is too costly, not beneficial\n");
3083 return MadeChange;
3084 }
3085 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
3087 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
3088 return MadeChange;
3089 }
3090
3091 // Ok, transform appears worthwhile.
3092 MadeChange = true;
3093
3094 if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
3095 // BitMask may be computed from BitPos, Freeze BitPos so we can increase
3096 // it's use count.
3097 std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
3098 if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
3099 InsertPt = BitPosI->getInsertionPointAfterDef();
3100 else
3101 InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
3102 if (!InsertPt)
3103 return false;
3104 FreezeInst *BitPosFrozen =
3105 new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
3106 BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
3107 return U.getUser() != BitPosFrozen;
3108 });
3109 BitPos = BitPosFrozen;
3110 }
3111
3112 // Step 1: Compute the loop trip count.
3113
3114 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
3115 BitPos->getName() + ".lowbitmask");
3116 Value *Mask =
3117 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
3118 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
3119 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
3120 IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
3121 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
3122 Value *XMaskedNumActiveBits = Builder.CreateSub(
3123 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
3124 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
3125 /*HasNSW=*/Bitwidth != 2);
3126 Value *XMaskedLeadingOnePos =
3127 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
3128 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
3129 /*HasNSW=*/Bitwidth > 2);
3130
3131 Value *LoopBackedgeTakenCount = Builder.CreateSub(
3132 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
3133 /*HasNUW=*/true, /*HasNSW=*/true);
3134 // We know loop's backedge-taken count, but what's loop's trip count?
3135 // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
3136 Value *LoopTripCount =
3137 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3138 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3139 /*HasNSW=*/Bitwidth != 2);
3140
3141 // Step 2: Compute the recurrence's final value without a loop.
3142
3143 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
3144 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
3145 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
3146 NewX->takeName(XCurr);
3147 if (auto *I = dyn_cast<Instruction>(NewX))
3148 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3149
3150 Value *NewXNext;
3151 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
3152 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
3153 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
3154 // that isn't the case, we'll need to emit an alternative, safe IR.
3155 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
3159 Ty->getScalarSizeInBits() - 1))))
3160 NewXNext = Builder.CreateShl(X, LoopTripCount);
3161 else {
3162 // Otherwise, just additionally shift by one. It's the smallest solution,
3163 // alternatively, we could check that NewX is INT_MIN (or BitPos is )
3164 // and select 0 instead.
3165 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
3166 }
3167
3168 NewXNext->takeName(XNext);
3169 if (auto *I = dyn_cast<Instruction>(NewXNext))
3170 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3171
3172 // Step 3: Adjust the successor basic block to receive the computed
3173 // recurrence's final value instead of the recurrence itself.
3174
3175 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
3176 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
3177
3178 // Step 4: Rewrite the loop into a countable form, with canonical IV.
3179
3180 // The new canonical induction variable.
3181 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3182 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3183
3184 // The induction itself.
3185 // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
3186 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3187 auto *IVNext =
3188 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
3189 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3190
3191 // The loop trip count check.
3192 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
3193 CurLoop->getName() + ".ivcheck");
3194 SmallVector<uint32_t> BranchWeights;
3195 const bool HasBranchWeights =
3197 extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
3198
3199 auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
3200 if (HasBranchWeights) {
3201 if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
3202 std::swap(BranchWeights[0], BranchWeights[1]);
3203 // We're not changing the loop profile, so we can reuse the original loop's
3204 // profile.
3205 setBranchWeights(*BI, BranchWeights,
3206 /*IsExpected=*/false);
3207 }
3208
3209 LoopHeaderBB->getTerminator()->eraseFromParent();
3210
3211 // Populate the IV PHI.
3212 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3213 IV->addIncoming(IVNext, LoopHeaderBB);
3214
3215 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
3216 // loop. The loop would otherwise not be deleted even if it becomes empty.
3217
3218 SE->forgetLoop(CurLoop);
3219
3220 // Other passes will take care of actually deleting the loop if possible.
3221
3222 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
3223
3224 ++NumShiftUntilBitTest;
3225 return MadeChange;
3226}
3227
3228/// Return true if the idiom is detected in the loop.
3229///
3230/// The core idiom we are trying to detect is:
3231/// \code
3232/// entry:
3233/// <...>
3234/// %start = <...>
3235/// %extraoffset = <...>
3236/// <...>
3237/// br label %for.cond
3238///
3239/// loop:
3240/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
3241/// %nbits = add nsw i8 %iv, %extraoffset
3242/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3243/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3244/// %iv.next = add i8 %iv, 1
3245/// <...>
3246/// br i1 %val.shifted.iszero, label %end, label %loop
3247///
3248/// end:
3249/// %iv.res = phi i8 [ %iv, %loop ] <...>
3250/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3251/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3252/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3253/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3254/// <...>
3255/// \endcode
3257 Instruction *&ValShiftedIsZero,
3258 Intrinsic::ID &IntrinID, Instruction *&IV,
3259 Value *&Start, Value *&Val,
3260 const SCEV *&ExtraOffsetExpr,
3261 bool &InvertedCond) {
3263 " Performing shift-until-zero idiom detection.\n");
3264
3265 // Give up if the loop has multiple blocks or multiple backedges.
3266 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
3267 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
3268 return false;
3269 }
3270
3271 Instruction *ValShifted, *NBits, *IVNext;
3272 Value *ExtraOffset;
3273
3274 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3275 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3276 assert(LoopPreheaderBB && "There is always a loop preheader.");
3277
3278 using namespace PatternMatch;
3279
3280 // Step 1: Check if the loop backedge, condition is in desirable form.
3281
3282 CmpPredicate Pred;
3283 BasicBlock *TrueBB, *FalseBB;
3284 if (!match(LoopHeaderBB->getTerminator(),
3285 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
3286 m_BasicBlock(FalseBB))) ||
3287 !match(ValShiftedIsZero,
3288 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
3289 !ICmpInst::isEquality(Pred)) {
3290 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
3291 return false;
3292 }
3293
3294 // Step 2: Check if the comparison's operand is in desirable form.
3295 // FIXME: Val could be a one-input PHI node, which we should look past.
3296 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
3297 m_Instruction(NBits)))) {
3298 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
3299 return false;
3300 }
3301 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
3302 : Intrinsic::ctlz;
3303
3304 // Step 3: Check if the shift amount is in desirable form.
3305
3306 if (match(NBits, m_c_Add(m_Instruction(IV),
3307 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3308 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
3309 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
3310 else if (match(NBits,
3312 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3313 NBits->hasNoSignedWrap())
3314 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
3315 else {
3316 IV = NBits;
3317 ExtraOffsetExpr = SE->getZero(NBits->getType());
3318 }
3319
3320 // Step 4: Check if the recurrence is in desirable form.
3321 auto *IVPN = dyn_cast<PHINode>(IV);
3322 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
3323 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
3324 return false;
3325 }
3326
3327 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
3328 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
3329
3330 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
3331 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
3332 return false;
3333 }
3334
3335 // Step 4: Check if the backedge's destinations are in desirable form.
3336
3338 "Should only get equality predicates here.");
3339
3340 // cmp-br is commutative, so canonicalize to a single variant.
3341 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
3342 if (InvertedCond) {
3343 Pred = ICmpInst::getInversePredicate(Pred);
3344 std::swap(TrueBB, FalseBB);
3345 }
3346
3347 // We expect to exit loop when comparison yields true,
3348 // so when it yields false we should branch back to loop header.
3349 if (FalseBB != LoopHeaderBB) {
3350 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
3351 return false;
3352 }
3353
3354 // The new, countable, loop will certainly only run a known number of
3355 // iterations, It won't be infinite. But the old loop might be infinite
3356 // under certain conditions. For logical shifts, the value will become zero
3357 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
3358 // right-shift, iff the sign bit was set, the value will never become zero,
3359 // and the loop may never finish.
3360 if (ValShifted->getOpcode() == Instruction::AShr &&
3361 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
3362 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
3363 return false;
3364 }
3365
3366 // Okay, idiom checks out.
3367 return true;
3368}
3369
3370/// Look for the following loop:
3371/// \code
3372/// entry:
3373/// <...>
3374/// %start = <...>
3375/// %extraoffset = <...>
3376/// <...>
3377/// br label %loop
3378///
3379/// loop:
3380/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
3381/// %nbits = add nsw i8 %iv, %extraoffset
3382/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3383/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3384/// %iv.next = add i8 %iv, 1
3385/// <...>
3386/// br i1 %val.shifted.iszero, label %end, label %loop
3387///
3388/// end:
3389/// %iv.res = phi i8 [ %iv, %loop ] <...>
3390/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3391/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3392/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3393/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3394/// <...>
3395/// \endcode
3396///
3397/// And transform it into:
3398/// \code
3399/// entry:
3400/// <...>
3401/// %start = <...>
3402/// %extraoffset = <...>
3403/// <...>
3404/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
3405/// %val.numactivebits = sub i8 8, %val.numleadingzeros
3406/// %extraoffset.neg = sub i8 0, %extraoffset
3407/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
3408/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
3409/// %loop.tripcount = sub i8 %iv.final, %start
3410/// br label %loop
3411///
3412/// loop:
3413/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
3414/// %loop.iv.next = add i8 %loop.iv, 1
3415/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
3416/// %iv = add i8 %loop.iv, %start
3417/// <...>
3418/// br i1 %loop.ivcheck, label %end, label %loop
3419///
3420/// end:
3421/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
3422/// <...>
3423/// \endcode
3424bool LoopIdiomRecognize::recognizeShiftUntilZero() {
3425 bool MadeChange = false;
3426
3427 Instruction *ValShiftedIsZero;
3428 Intrinsic::ID IntrID;
3429 Instruction *IV;
3430 Value *Start, *Val;
3431 const SCEV *ExtraOffsetExpr;
3432 bool InvertedCond;
3433 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
3434 Start, Val, ExtraOffsetExpr, InvertedCond)) {
3436 " shift-until-zero idiom detection failed.\n");
3437 return MadeChange;
3438 }
3439 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
3440
3441 // Ok, it is the idiom we were looking for, we *could* transform this loop,
3442 // but is it profitable to transform?
3443
3444 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3445 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3446 assert(LoopPreheaderBB && "There is always a loop preheader.");
3447
3448 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3449 assert(SuccessorBB && "There is only a single successor.");
3450
3451 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3452 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
3453
3454 Type *Ty = Val->getType();
3455 unsigned Bitwidth = Ty->getScalarSizeInBits();
3456
3459
3460 // The rewrite is considered to be unprofitable iff and only iff the
3461 // intrinsic we'll use are not cheap. Note that we are okay with *just*
3462 // making the loop countable, even if nothing else changes.
3464 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
3465 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3468 " Intrinsic is too costly, not beneficial\n");
3469 return MadeChange;
3470 }
3471
3472 // Ok, transform appears worthwhile.
3473 MadeChange = true;
3474
3475 bool OffsetIsZero = ExtraOffsetExpr->isZero();
3476
3477 // Step 1: Compute the loop's final IV value / trip count.
3478
3479 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
3480 IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
3481 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
3482 Value *ValNumActiveBits = Builder.CreateSub(
3483 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
3484 Val->getName() + ".numactivebits", /*HasNUW=*/true,
3485 /*HasNSW=*/Bitwidth != 2);
3486
3487 SCEVExpander Expander(*SE, "loop-idiom");
3488 Expander.setInsertPoint(&*Builder.GetInsertPoint());
3489 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
3490
3491 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
3492 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
3493 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
3494 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
3495 {ValNumActiveBitsOffset, Start},
3496 /*FMFSource=*/nullptr, "iv.final");
3497
3498 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
3499 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
3500 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
3501 // FIXME: or when the offset was `add nuw`
3502
3503 // We know loop's backedge-taken count, but what's loop's trip count?
3504 Value *LoopTripCount =
3505 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3506 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3507 /*HasNSW=*/Bitwidth != 2);
3508
3509 // Step 2: Adjust the successor basic block to receive the original
3510 // induction variable's final value instead of the orig. IV itself.
3511
3512 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
3513
3514 // Step 3: Rewrite the loop into a countable form, with canonical IV.
3515
3516 // The new canonical induction variable.
3517 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3518 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3519
3520 // The induction itself.
3521 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
3522 auto *CIVNext =
3523 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
3524 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3525
3526 // The loop trip count check.
3527 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
3528 CurLoop->getName() + ".ivcheck");
3529 auto *NewIVCheck = CIVCheck;
3530 if (InvertedCond) {
3531 NewIVCheck = Builder.CreateNot(CIVCheck);
3532 NewIVCheck->takeName(ValShiftedIsZero);
3533 }
3534
3535 // The original IV, but rebased to be an offset to the CIV.
3536 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
3537 /*HasNSW=*/true); // FIXME: what about NUW?
3538 IVDePHId->takeName(IV);
3539
3540 // The loop terminator.
3541 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3542 SmallVector<uint32_t> BranchWeights;
3543 const bool HasBranchWeights =
3545 extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
3546
3547 auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
3548 if (HasBranchWeights) {
3549 if (InvertedCond)
3550 std::swap(BranchWeights[0], BranchWeights[1]);
3551 // We're not changing the loop profile, so we can reuse the original loop's
3552 // profile.
3553 setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
3554 }
3555 LoopHeaderBB->getTerminator()->eraseFromParent();
3556
3557 // Populate the IV PHI.
3558 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3559 CIV->addIncoming(CIVNext, LoopHeaderBB);
3560
3561 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
3562 // loop. The loop would otherwise not be deleted even if it becomes empty.
3563
3564 SE->forgetLoop(CurLoop);
3565
3566 // Step 5: Try to cleanup the loop's body somewhat.
3567 IV->replaceAllUsesWith(IVDePHId);
3568 IV->eraseFromParent();
3569
3570 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
3571 ValShiftedIsZero->eraseFromParent();
3572
3573 // Other passes will take care of actually deleting the loop if possible.
3574
3575 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
3576
3577 ++NumShiftUntilZero;
3578 return MadeChange;
3579}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
DXIL Resource Access
This file defines the DenseMap class.
#define DEBUG_TYPE
ManagedStatic< HTTPClientCleanup > Cleanup
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
static Value * matchShiftULTCondition(CondBrInst *BI, BasicBlock *LoopEntry, APInt &Threshold)
Check if the given conditional branch is based on an unsigned less-than comparison between a variable...
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX, APInt &Threshold)
Return true if the idiom is detected in the loop.
static Value * matchCondition(CondBrInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset....
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
static void deleteDeadInstruction(Instruction *I)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
Contains a collection of routines for determining if a given instruction is guaranteed to execute if ...
if(PassOpts->AAPipeline)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static bool isSimple(Instruction *I)
verify safepoint Safepoint IR Verifier
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
std::optional< uint64_t > tryZExtValue() const
Get zero extended value if possible.
Definition APInt.h:1567
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:462
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:449
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:518
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:472
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:470
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
This class represents a function call, abstracting a target machine's calling convention.
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition InstrTypes.h:768
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Conditional Branch instruction.
void setCondition(Value *V)
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition Constants.h:231
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:225
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:123
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This class represents a freeze function that returns random concrete value if an operand is either a ...
PointerType * getType() const
Global values are always pointers.
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
static CRCTable genSarwateTable(const APInt &GenPoly, bool ByteOrderSwapped)
Generate a lookup table of 256 entries by interleaving the generating polynomial.
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
static bool isEquality(Predicate P)
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition IRBuilder.h:497
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2095
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
LLVM_ABI bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
LLVM_ABI bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void setAAMetadata(const AAMDNodes &N)
Sets the AA metadata on this instruction from the AAMDNodes structure.
LLVM_ABI bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
bool isVolatile() const
Return true if this is a load from a volatile memory location.
bool isUnordered() const
Align getAlign() const
Return the alignment of the access that is being performed.
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
block_iterator block_begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:653
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
ICmpInst * getLatchCmpInst() const
Get the latch condition instruction.
Definition LoopInfo.cpp:198
StringRef getName() const
Definition LoopInfo.h:389
PHINode * getCanonicalInductionVariable() const
Check to see if the loop has a canonical induction variable: an integer recurrence that starts at 0 a...
Definition LoopInfo.cpp:174
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
MaybeAlign getDestAlign() const
bool isForceInlined() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:922
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
SCEVUse getOperand(unsigned i) const
This class represents an analyzed expression in the program.
LLVM_ABI bool isOne() const
Return true if the expression is a constant one.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
LLVM_ABI const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI const SCEV * getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Align getAlign() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
unsigned getWCharSize(const Module &M) const
Returns the size of the wchar_t type in bytes or 0 if the size is unknown.
bool has(LibFunc F) const
Tests whether a library function is available.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCC_Basic
The cost of a typical 'add' instruction.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition Type.h:272
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:427
LLVM_ABI void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition Value.cpp:611
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:561
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Value handle that is nullable, but tries to track the Value.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
const ParentTy * getParent() const
Definition ilist_node.h:34
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
@ HeaderSize
Definition BTF.h:61
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
OperandType
Operands are tagged with one of the values of this enum.
Definition MCInstrDesc.h:59
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eg/ne/...) to Threshold.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
class_match< const SCEVConstant > m_SCEVConstant()
specificloop_ty m_SpecificLoop(const Loop *L)
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bool match(const SCEV *S, const Pattern &P)
specificscev_ty m_scev_Specific(const SCEV *S)
Match if we have a specific specified SCEV.
class_match< const SCEV > m_SCEV()
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
constexpr double e
DiagnosticInfoOptimizationBase::Argument NV
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
cl::opt< bool > ProfcheckDisableMetadataFixes
Definition Metadata.cpp:64
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
static cl::opt< bool, true > EnableLIRPWcslen("disable-loop-idiom-wcslen", cl::desc("Proceed with loop idiom recognize pass, " "enable conversion of loop(s) to wcslen."), cl::location(DisableLIRP::Wcslen), cl::init(false), cl::ReallyHidden)
InstructionCost Cost
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static cl::opt< bool, true > DisableLIRPStrlen("disable-loop-idiom-strlen", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to strlen."), cl::location(DisableLIRP::Strlen), cl::init(false), cl::ReallyHidden)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
static cl::opt< bool > ForceMemsetPatternIntrinsic("loop-idiom-force-memset-pattern-intrinsic", cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), cl::Hidden)
LLVM_ABI bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that it in the current Module is a...
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected, bool ElideAllZero=false)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2026
LLVM_ABI bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
LLVM_ABI Value * emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI)
Emit a call to the strlen function to the builder, for the specified pointer.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition ModRef.h:28
@ ModRef
The access may reference and may modify the value stored in memory.
Definition ModRef.h:36
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
static cl::opt< bool, true > DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize", cl::desc("Proceed with loop idiom recognize pass, " "but do not optimize CRC loops."), cl::location(DisableLIRP::HashRecognize), cl::init(false), cl::ReallyHidden)
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
LLVM_ABI bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition MemorySSA.cpp:84
LLVM_ABI bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
DWARFExpression::Operation Op
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
LLVM_ABI Value * emitWcsLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI)
Emit a call to the wcslen function to the builder, for the specified pointer.
LLVM_ABI bool extractBranchWeights(const MDNode *ProfileData, SmallVectorImpl< uint32_t > &Weights)
Extract branch weights from MD_prof metadata.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
LLVM_ABI Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling " "with -Os/-Oz"), cl::init(true), cl::Hidden)
LLVM_ABI bool RecursivelyDeleteDeadPHINode(PHINode *PN, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
If the specified value is an effectively dead PHI node, due to being a def-use chain of single-use no...
Definition Local.cpp:643
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
std::optional< DecomposedBitTest > decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, bool LookThroughTrunc=true, bool AllowNonZeroC=false, bool DecomposeAnd=false)
Decompose an icmp into the form ((X & Mask) pred C) if possible.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
LLVM_ABI AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes...
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes o...
Definition Metadata.h:836
static bool Memcpy
When true, Memcpy is disabled.
static bool Wcslen
When true, Wcslen is disabled.
static bool Strlen
When true, Strlen is disabled.
static bool HashRecognize
When true, HashRecognize is disabled.
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
The structure that is returned when a polynomial algorithm was recognized by the analysis.
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)