LLVM 23.0.0git
LoopIdiomRecognize.cpp
Go to the documentation of this file.
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize: memcmp, etc.
24//
25// This could recognize common matrix multiplies and dot product idioms and
26// replace them with calls to BLAS (if linked in??).
27//
28//===----------------------------------------------------------------------===//
29
31#include "llvm/ADT/APInt.h"
32#include "llvm/ADT/ArrayRef.h"
33#include "llvm/ADT/DenseMap.h"
34#include "llvm/ADT/MapVector.h"
35#include "llvm/ADT/SetVector.h"
38#include "llvm/ADT/Statistic.h"
39#include "llvm/ADT/StringRef.h"
57#include "llvm/IR/BasicBlock.h"
58#include "llvm/IR/Constant.h"
59#include "llvm/IR/Constants.h"
60#include "llvm/IR/DataLayout.h"
61#include "llvm/IR/DebugLoc.h"
63#include "llvm/IR/Dominators.h"
64#include "llvm/IR/GlobalValue.h"
66#include "llvm/IR/IRBuilder.h"
67#include "llvm/IR/InstrTypes.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/LLVMContext.h"
73#include "llvm/IR/Module.h"
74#include "llvm/IR/PassManager.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/User.h"
79#include "llvm/IR/Value.h"
80#include "llvm/IR/ValueHandle.h"
83#include "llvm/Support/Debug.h"
90#include <algorithm>
91#include <cassert>
92#include <cstdint>
93#include <utility>
94
95using namespace llvm;
96using namespace SCEVPatternMatch;
97
98#define DEBUG_TYPE "loop-idiom"
99
100STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
101STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
102STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
103STATISTIC(NumStrLen, "Number of strlen's and wcslen's formed from loop loads");
105 NumShiftUntilBitTest,
106 "Number of uncountable loops recognized as 'shift until bitttest' idiom");
107STATISTIC(NumShiftUntilZero,
108 "Number of uncountable loops recognized as 'shift until zero' idiom");
109
110namespace llvm {
113 DisableLIRPAll("disable-" DEBUG_TYPE "-all",
114 cl::desc("Options to disable Loop Idiom Recognize Pass."),
117
120 DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
121 cl::desc("Proceed with loop idiom recognize pass, but do "
122 "not convert loop(s) to memset."),
125
128 DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
129 cl::desc("Proceed with loop idiom recognize pass, but do "
130 "not convert loop(s) to memcpy."),
133
136 DisableLIRPStrlen("disable-loop-idiom-strlen",
137 cl::desc("Proceed with loop idiom recognize pass, but do "
138 "not convert loop(s) to strlen."),
141
144 EnableLIRPWcslen("disable-loop-idiom-wcslen",
145 cl::desc("Proceed with loop idiom recognize pass, "
146 "enable conversion of loop(s) to wcslen."),
149
152 DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize",
153 cl::desc("Proceed with loop idiom recognize pass, "
154 "but do not optimize CRC loops."),
156 cl::init(false), cl::ReallyHidden);
157
159 "use-lir-code-size-heurs",
160 cl::desc("Use loop idiom recognition code size heuristics when compiling "
161 "with -Os/-Oz"),
162 cl::init(true), cl::Hidden);
163
165 "loop-idiom-force-memset-pattern-intrinsic",
166 cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false),
167 cl::Hidden);
168
170
171} // namespace llvm
172
173namespace {
174
175class LoopIdiomRecognize {
176 Loop *CurLoop = nullptr;
178 DominatorTree *DT;
179 LoopInfo *LI;
180 ScalarEvolution *SE;
183 const DataLayout *DL;
185 bool ApplyCodeSizeHeuristics;
186 std::unique_ptr<MemorySSAUpdater> MSSAU;
187
188public:
189 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
190 LoopInfo *LI, ScalarEvolution *SE,
192 const TargetTransformInfo *TTI, MemorySSA *MSSA,
193 const DataLayout *DL,
195 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
196 if (MSSA)
197 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
198 }
199
200 bool runOnLoop(Loop *L);
201
202private:
203 using StoreList = SmallVector<StoreInst *, 8>;
204 using StoreListMap = MapVector<Value *, StoreList>;
205
206 StoreListMap StoreRefsForMemset;
207 StoreListMap StoreRefsForMemsetPattern;
208 StoreList StoreRefsForMemcpy;
209 bool HasMemset;
210 bool HasMemsetPattern;
211 bool HasMemcpy;
212
213 /// Return code for isLegalStore()
214 enum LegalStoreKind {
215 None = 0,
216 Memset,
217 MemsetPattern,
218 Memcpy,
219 UnorderedAtomicMemcpy,
220 DontUse // Dummy retval never to be used. Allows catching errors in retval
221 // handling.
222 };
223
224 /// \name Countable Loop Idiom Handling
225 /// @{
226
227 bool runOnCountableLoop();
228 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
229 SmallVectorImpl<BasicBlock *> &ExitBlocks);
230
231 void collectStores(BasicBlock *BB);
232 LegalStoreKind isLegalStore(StoreInst *SI);
233 enum class ForMemset { No, Yes };
234 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
235 ForMemset For);
236
237 template <typename MemInst>
238 bool processLoopMemIntrinsic(
239 BasicBlock *BB,
240 bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
241 const SCEV *BECount);
242 bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
243 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
244
245 bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
246 MaybeAlign StoreAlignment, Value *StoredVal,
247 Instruction *TheStore,
248 SmallPtrSetImpl<Instruction *> &Stores,
249 const SCEVAddRecExpr *Ev, const SCEV *BECount,
250 bool IsNegStride, bool IsLoopMemset = false);
251 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
252 bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
253 const SCEV *StoreSize, MaybeAlign StoreAlign,
254 MaybeAlign LoadAlign, Instruction *TheStore,
255 Instruction *TheLoad,
256 const SCEVAddRecExpr *StoreEv,
257 const SCEVAddRecExpr *LoadEv,
258 const SCEV *BECount);
259 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
260 bool IsLoopMemset = false);
261 bool optimizeCRCLoop(const PolynomialInfo &Info);
262
263 /// @}
264 /// \name Noncountable Loop Idiom Handling
265 /// @{
266
267 bool runOnNoncountableLoop();
268
269 bool recognizePopcount();
270 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
271 PHINode *CntPhi, Value *Var);
272 bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
273 bool ZeroCheck, size_t CanonicalSize);
274 bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
275 Instruction *DefX, PHINode *CntPhi,
276 Instruction *CntInst);
277 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
278 bool recognizeShiftUntilLessThan();
279 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
280 Instruction *CntInst, PHINode *CntPhi,
281 Value *Var, Instruction *DefX,
282 const DebugLoc &DL, bool ZeroCheck,
283 bool IsCntPhiUsedOutsideLoop,
284 bool InsertSub = false);
285
286 bool recognizeShiftUntilBitTest();
287 bool recognizeShiftUntilZero();
288 bool recognizeAndInsertStrLen();
289
290 /// @}
291};
292} // end anonymous namespace
293
296 LPMUpdater &) {
298 return PreservedAnalyses::all();
299
300 const auto *DL = &L.getHeader()->getDataLayout();
301
302 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
303 // pass. Function analyses need to be preserved across loop transformations
304 // but ORE cannot be preserved (see comment before the pass definition).
305 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
306
307 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
308 AR.MSSA, DL, ORE);
309 if (!LIR.runOnLoop(&L))
310 return PreservedAnalyses::all();
311
313 if (AR.MSSA)
314 PA.preserve<MemorySSAAnalysis>();
315 return PA;
316}
317
319 I->replaceAllUsesWith(PoisonValue::get(I->getType()));
320 I->eraseFromParent();
321}
322
323//===----------------------------------------------------------------------===//
324//
325// Implementation of LoopIdiomRecognize
326//
327//===----------------------------------------------------------------------===//
328
329bool LoopIdiomRecognize::runOnLoop(Loop *L) {
330 CurLoop = L;
331 // If the loop could not be converted to canonical form, it must have an
332 // indirectbr in it, just give up.
333 if (!L->getLoopPreheader())
334 return false;
335
336 // Disable loop idiom recognition if the function's name is a common idiom.
337 StringRef Name = L->getHeader()->getParent()->getName();
338 if (Name == "memset" || Name == "memcpy" || Name == "strlen" ||
339 Name == "wcslen")
340 return false;
341
342 // Determine if code size heuristics need to be applied.
343 ApplyCodeSizeHeuristics =
344 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
345
346 HasMemset = TLI->has(LibFunc_memset);
347 // TODO: Unconditionally enable use of the memset pattern intrinsic (or at
348 // least, opt-in via target hook) once we are confident it will never result
349 // in worse codegen than without. For now, use it only when the target
350 // supports memset_pattern16 libcall (or unless this is overridden by
351 // command line option).
352 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
353 HasMemcpy = TLI->has(LibFunc_memcpy);
354
355 if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic ||
356 HasMemcpy || !DisableLIRP::HashRecognize)
358 return runOnCountableLoop();
359
360 return runOnNoncountableLoop();
361}
362
363bool LoopIdiomRecognize::runOnCountableLoop() {
364 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
366 "runOnCountableLoop() called on a loop without a predictable"
367 "backedge-taken count");
368
369 // If this loop executes exactly one time, then it should be peeled, not
370 // optimized by this pass.
371 if (BECount->isZero())
372 return false;
373
375 CurLoop->getUniqueExitBlocks(ExitBlocks);
376
377 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
378 << CurLoop->getHeader()->getParent()->getName()
379 << "] Countable Loop %" << CurLoop->getHeader()->getName()
380 << "\n");
381
382 // The following transforms hoist stores/memsets into the loop pre-header.
383 // Give up if the loop has instructions that may throw.
384 SimpleLoopSafetyInfo SafetyInfo;
385 SafetyInfo.computeLoopSafetyInfo(CurLoop);
386 if (SafetyInfo.anyBlockMayThrow())
387 return false;
388
389 bool MadeChange = false;
390
391 // Scan all the blocks in the loop that are not in subloops.
392 for (auto *BB : CurLoop->getBlocks()) {
393 // Ignore blocks in subloops.
394 if (LI->getLoopFor(BB) != CurLoop)
395 continue;
396
397 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
398 }
399
400 // Optimize a CRC loop if HashRecognize found one, provided we're not
401 // optimizing for size.
402 if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics)
403 if (auto Res = HashRecognize(*CurLoop, *SE).getResult())
404 optimizeCRCLoop(*Res);
405
406 return MadeChange;
407}
408
409static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
410 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
411 return ConstStride->getAPInt();
412}
413
414/// getMemSetPatternValue - If a strided store of the specified value is safe to
415/// turn into a memset.patternn intrinsic, return the Constant that should
416/// be passed in. Otherwise, return null.
417///
418/// TODO this function could allow more constants than it does today (e.g.
419/// those over 16 bytes) now it has transitioned to being used for the
420/// memset.pattern intrinsic rather than directly the memset_pattern16
421/// libcall.
423 // FIXME: This could check for UndefValue because it can be merged into any
424 // other valid pattern.
425
426 // If the value isn't a constant, we can't promote it to being in a constant
427 // array. We could theoretically do a store to an alloca or something, but
428 // that doesn't seem worthwhile.
430 if (!C || isa<ConstantExpr>(C))
431 return nullptr;
432
433 // Only handle simple values that are a power of two bytes in size.
434 uint64_t Size = DL->getTypeSizeInBits(V->getType());
435 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
436 return nullptr;
437
438 // Don't care enough about darwin/ppc to implement this.
439 if (DL->isBigEndian())
440 return nullptr;
441
442 // Convert to size in bytes.
443 Size /= 8;
444
445 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
446 // if the top and bottom are the same (e.g. for vectors and large integers).
447 if (Size > 16)
448 return nullptr;
449
450 // For now, don't handle types that aren't int, floats, or pointers.
451 Type *CTy = C->getType();
452 if (!CTy->isIntOrPtrTy() && !CTy->isFloatingPointTy())
453 return nullptr;
454
455 return C;
456}
457
458LoopIdiomRecognize::LegalStoreKind
459LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
460 // Don't touch volatile stores.
461 if (SI->isVolatile())
462 return LegalStoreKind::None;
463 // We only want simple or unordered-atomic stores.
464 if (!SI->isUnordered())
465 return LegalStoreKind::None;
466
467 // Avoid merging nontemporal stores.
468 if (SI->getMetadata(LLVMContext::MD_nontemporal))
469 return LegalStoreKind::None;
470
471 Value *StoredVal = SI->getValueOperand();
472 Value *StorePtr = SI->getPointerOperand();
473
474 if (DL->hasUnstableRepresentation(StoredVal->getType()))
475 return LegalStoreKind::None;
476
477 // Transformations could invalidate the external-state pointers
478 // memcpy - LangRef specifies that a valid memcpy must preserve external
479 // state, so no transformations are blocked by it.
480 // memset - We assume that a memset of 0 has an equivalent external state
481 // effect as a null pointer store. This is currently not explicitly
482 // specified, but is true of the one exemplar we have (CHERI
483 // capabilities). All other memset formations are not safe.
484 bool MustPreserveExternalState = DL->hasExternalState(StoredVal->getType()) &&
485 !isa<ConstantPointerNull>(StoredVal);
486
487 // Reject stores that are so large that they overflow an unsigned.
488 // When storing out scalable vectors we bail out for now, since the code
489 // below currently only works for constant strides.
490 TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
491 if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
492 (SizeInBits.getFixedValue() >> 32) != 0)
493 return LegalStoreKind::None;
494
495 // See if the pointer expression is an AddRec like {base,+,1} on the current
496 // loop, which indicates a strided store. If we have something else, it's a
497 // random store we can't handle.
498 const SCEV *StoreEv = SE->getSCEV(StorePtr);
499 const SCEVConstant *Stride;
500 if (!match(StoreEv, m_scev_AffineAddRec(m_SCEV(), m_SCEVConstant(Stride),
501 m_SpecificLoop(CurLoop))))
502 return LegalStoreKind::None;
503
504 // See if the store can be turned into a memset.
505
506 // If the stored value is a byte-wise value (like i32 -1), then it may be
507 // turned into a memset of i8 -1, assuming that all the consecutive bytes
508 // are stored. A store of i32 0x01020304 can never be turned into a memset,
509 // but it can be turned into memset_pattern if the target supports it.
510 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
511
512 // Note: memset and memset_pattern on unordered-atomic is yet not supported
513 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
514
515 // If we're allowed to form a memset, and the stored value would be
516 // acceptable for memset, use it.
517 if (!MustPreserveExternalState && !UnorderedAtomic && HasMemset &&
518 SplatValue && !DisableLIRP::Memset &&
519 // Verify that the stored value is loop invariant. If not, we can't
520 // promote the memset.
521 CurLoop->isLoopInvariant(SplatValue)) {
522 // It looks like we can use SplatValue.
523 return LegalStoreKind::Memset;
524 }
525 if (!MustPreserveExternalState && !UnorderedAtomic &&
526 (HasMemsetPattern || ForceMemsetPatternIntrinsic) &&
528 // Don't create memset_pattern16s with address spaces.
529 StorePtr->getType()->getPointerAddressSpace() == 0 &&
530 getMemSetPatternValue(StoredVal, DL)) {
531 // It looks like we can use PatternValue!
532 return LegalStoreKind::MemsetPattern;
533 }
534
535 // Otherwise, see if the store can be turned into a memcpy.
536 if (HasMemcpy && !DisableLIRP::Memcpy) {
537 // Check to see if the stride matches the size of the store. If so, then we
538 // know that every byte is touched in the loop.
539 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
540 APInt StrideAP = Stride->getAPInt();
541 if (StoreSize != StrideAP && StoreSize != -StrideAP)
542 return LegalStoreKind::None;
543
544 // The store must be feeding a non-volatile load.
545 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
546
547 // Only allow non-volatile loads
548 if (!LI || LI->isVolatile())
549 return LegalStoreKind::None;
550 // Only allow simple or unordered-atomic loads
551 if (!LI->isUnordered())
552 return LegalStoreKind::None;
553
554 // See if the pointer expression is an AddRec like {base,+,1} on the current
555 // loop, which indicates a strided load. If we have something else, it's a
556 // random load we can't handle.
557 const SCEV *LoadEv = SE->getSCEV(LI->getPointerOperand());
558
559 // The store and load must share the same stride.
560 if (!match(LoadEv, m_scev_AffineAddRec(m_SCEV(), m_scev_Specific(Stride),
561 m_SpecificLoop(CurLoop))))
562 return LegalStoreKind::None;
563
564 // Success. This store can be converted into a memcpy.
565 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
566 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
567 : LegalStoreKind::Memcpy;
568 }
569 // This store can't be transformed into a memset/memcpy.
570 return LegalStoreKind::None;
571}
572
573void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
574 StoreRefsForMemset.clear();
575 StoreRefsForMemsetPattern.clear();
576 StoreRefsForMemcpy.clear();
577 for (Instruction &I : *BB) {
579 if (!SI)
580 continue;
581
582 // Make sure this is a strided store with a constant stride.
583 switch (isLegalStore(SI)) {
584 case LegalStoreKind::None:
585 // Nothing to do
586 break;
587 case LegalStoreKind::Memset: {
588 // Find the base pointer.
589 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
590 StoreRefsForMemset[Ptr].push_back(SI);
591 } break;
592 case LegalStoreKind::MemsetPattern: {
593 // Find the base pointer.
594 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
595 StoreRefsForMemsetPattern[Ptr].push_back(SI);
596 } break;
597 case LegalStoreKind::Memcpy:
598 case LegalStoreKind::UnorderedAtomicMemcpy:
599 StoreRefsForMemcpy.push_back(SI);
600 break;
601 default:
602 assert(false && "unhandled return value");
603 break;
604 }
605 }
606}
607
608/// runOnLoopBlock - Process the specified block, which lives in a counted loop
609/// with the specified backedge count. This block is known to be in the current
610/// loop and not in any subloops.
611bool LoopIdiomRecognize::runOnLoopBlock(
612 BasicBlock *BB, const SCEV *BECount,
613 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
614 // We can only promote stores in this block if they are unconditionally
615 // executed in the loop. For a block to be unconditionally executed, it has
616 // to dominate all the exit blocks of the loop. Verify this now.
617 for (BasicBlock *ExitBlock : ExitBlocks)
618 if (!DT->dominates(BB, ExitBlock))
619 return false;
620
621 bool MadeChange = false;
622 // Look for store instructions, which may be optimized to memset/memcpy.
623 collectStores(BB);
624
625 // Look for a single store or sets of stores with a common base, which can be
626 // optimized into a memset (memset_pattern). The latter most commonly happens
627 // with structs and handunrolled loops.
628 for (auto &SL : StoreRefsForMemset)
629 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
630
631 for (auto &SL : StoreRefsForMemsetPattern)
632 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
633
634 // Optimize the store into a memcpy, if it feeds an similarly strided load.
635 for (auto &SI : StoreRefsForMemcpy)
636 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
637
638 MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
639 BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
640 MadeChange |= processLoopMemIntrinsic<MemSetInst>(
641 BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
642
643 return MadeChange;
644}
645
646/// See if this store(s) can be promoted to a memset.
647bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
648 const SCEV *BECount, ForMemset For) {
649 // Try to find consecutive stores that can be transformed into memsets.
650 SetVector<StoreInst *> Heads, Tails;
652
653 // Do a quadratic search on all of the given stores and find
654 // all of the pairs of stores that follow each other.
655 SmallVector<unsigned, 16> IndexQueue;
656 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
657 assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
658
659 Value *FirstStoredVal = SL[i]->getValueOperand();
660 Value *FirstStorePtr = SL[i]->getPointerOperand();
661 const SCEVAddRecExpr *FirstStoreEv =
662 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
663 APInt FirstStride = getStoreStride(FirstStoreEv);
664 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
665
666 // See if we can optimize just this store in isolation.
667 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
668 Heads.insert(SL[i]);
669 continue;
670 }
671
672 Value *FirstSplatValue = nullptr;
673 Constant *FirstPatternValue = nullptr;
674
675 if (For == ForMemset::Yes)
676 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
677 else
678 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
679
680 assert((FirstSplatValue || FirstPatternValue) &&
681 "Expected either splat value or pattern value.");
682
683 IndexQueue.clear();
684 // If a store has multiple consecutive store candidates, search Stores
685 // array according to the sequence: from i+1 to e, then from i-1 to 0.
686 // This is because usually pairing with immediate succeeding or preceding
687 // candidate create the best chance to find memset opportunity.
688 unsigned j = 0;
689 for (j = i + 1; j < e; ++j)
690 IndexQueue.push_back(j);
691 for (j = i; j > 0; --j)
692 IndexQueue.push_back(j - 1);
693
694 for (auto &k : IndexQueue) {
695 assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
696 Value *SecondStorePtr = SL[k]->getPointerOperand();
697 const SCEVAddRecExpr *SecondStoreEv =
698 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
699 APInt SecondStride = getStoreStride(SecondStoreEv);
700
701 if (FirstStride != SecondStride)
702 continue;
703
704 Value *SecondStoredVal = SL[k]->getValueOperand();
705 Value *SecondSplatValue = nullptr;
706 Constant *SecondPatternValue = nullptr;
707
708 if (For == ForMemset::Yes)
709 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
710 else
711 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
712
713 assert((SecondSplatValue || SecondPatternValue) &&
714 "Expected either splat value or pattern value.");
715
716 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
717 if (For == ForMemset::Yes) {
718 if (isa<UndefValue>(FirstSplatValue))
719 FirstSplatValue = SecondSplatValue;
720 if (FirstSplatValue != SecondSplatValue)
721 continue;
722 } else {
723 if (isa<UndefValue>(FirstPatternValue))
724 FirstPatternValue = SecondPatternValue;
725 if (FirstPatternValue != SecondPatternValue)
726 continue;
727 }
728 Tails.insert(SL[k]);
729 Heads.insert(SL[i]);
730 ConsecutiveChain[SL[i]] = SL[k];
731 break;
732 }
733 }
734 }
735
736 // We may run into multiple chains that merge into a single chain. We mark the
737 // stores that we transformed so that we don't visit the same store twice.
738 SmallPtrSet<Value *, 16> TransformedStores;
739 bool Changed = false;
740
741 // For stores that start but don't end a link in the chain:
742 for (StoreInst *I : Heads) {
743 if (Tails.count(I))
744 continue;
745
746 // We found a store instr that starts a chain. Now follow the chain and try
747 // to transform it.
748 SmallPtrSet<Instruction *, 8> AdjacentStores;
749 StoreInst *HeadStore = I;
750 unsigned StoreSize = 0;
751
752 // Collect the chain into a list.
753 while (Tails.count(I) || Heads.count(I)) {
754 if (TransformedStores.count(I))
755 break;
756 AdjacentStores.insert(I);
757
758 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
759 // Move to the next value in the chain.
760 I = ConsecutiveChain[I];
761 }
762
763 Value *StoredVal = HeadStore->getValueOperand();
764 Value *StorePtr = HeadStore->getPointerOperand();
765 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
766 APInt Stride = getStoreStride(StoreEv);
767
768 // Check to see if the stride matches the size of the stores. If so, then
769 // we know that every byte is touched in the loop.
770 if (StoreSize != Stride && StoreSize != -Stride)
771 continue;
772
773 bool IsNegStride = StoreSize == -Stride;
774
775 Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
776 const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
777 if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
778 MaybeAlign(HeadStore->getAlign()), StoredVal,
779 HeadStore, AdjacentStores, StoreEv, BECount,
780 IsNegStride)) {
781 TransformedStores.insert_range(AdjacentStores);
782 Changed = true;
783 }
784 }
785
786 return Changed;
787}
788
789/// processLoopMemIntrinsic - Template function for calling different processor
790/// functions based on mem intrinsic type.
791template <typename MemInst>
792bool LoopIdiomRecognize::processLoopMemIntrinsic(
793 BasicBlock *BB,
794 bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
795 const SCEV *BECount) {
796 bool MadeChange = false;
797 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
798 Instruction *Inst = &*I++;
799 // Look for memory instructions, which may be optimized to a larger one.
800 if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
801 WeakTrackingVH InstPtr(&*I);
802 if (!(this->*Processor)(MI, BECount))
803 continue;
804 MadeChange = true;
805
806 // If processing the instruction invalidated our iterator, start over from
807 // the top of the block.
808 if (!InstPtr)
809 I = BB->begin();
810 }
811 }
812 return MadeChange;
813}
814
815/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
816bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
817 const SCEV *BECount) {
818 // We can only handle non-volatile memcpys with a constant size.
819 if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
820 return false;
821
822 // If we're not allowed to hack on memcpy, we fail.
823 if ((!HasMemcpy && !MCI->isForceInlined()) || DisableLIRP::Memcpy)
824 return false;
825
826 Value *Dest = MCI->getDest();
827 Value *Source = MCI->getSource();
828 if (!Dest || !Source)
829 return false;
830
831 // See if the load and store pointer expressions are AddRec like {base,+,1} on
832 // the current loop, which indicates a strided load and store. If we have
833 // something else, it's a random load or store we can't handle.
834 const SCEV *StoreEv = SE->getSCEV(Dest);
835 const SCEV *LoadEv = SE->getSCEV(Source);
836 const APInt *StoreStrideValue, *LoadStrideValue;
837 if (!match(StoreEv,
838 m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StoreStrideValue),
839 m_SpecificLoop(CurLoop))) ||
840 !match(LoadEv,
841 m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(LoadStrideValue),
842 m_SpecificLoop(CurLoop))))
843 return false;
844
845 // Reject memcpys that are so large that they overflow an unsigned.
846 uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
847 if ((SizeInBytes >> 32) != 0)
848 return false;
849
850 // Huge stride value - give up
851 if (StoreStrideValue->getBitWidth() > 64 ||
852 LoadStrideValue->getBitWidth() > 64)
853 return false;
854
855 if (SizeInBytes != *StoreStrideValue && SizeInBytes != -*StoreStrideValue) {
856 ORE.emit([&]() {
857 return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
858 << ore::NV("Inst", "memcpy") << " in "
859 << ore::NV("Function", MCI->getFunction())
860 << " function will not be hoisted: "
861 << ore::NV("Reason", "memcpy size is not equal to stride");
862 });
863 return false;
864 }
865
866 int64_t StoreStrideInt = StoreStrideValue->getSExtValue();
867 int64_t LoadStrideInt = LoadStrideValue->getSExtValue();
868 // Check if the load stride matches the store stride.
869 if (StoreStrideInt != LoadStrideInt)
870 return false;
871
872 return processLoopStoreOfLoopLoad(
873 Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
874 MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI,
875 cast<SCEVAddRecExpr>(StoreEv), cast<SCEVAddRecExpr>(LoadEv), BECount);
876}
877
878/// processLoopMemSet - See if this memset can be promoted to a large memset.
/// Returns true if the in-loop memset was replaced by a single preheader
/// memset covering the whole strided region; BECount is the loop's
/// backedge-taken count, used to size the widened store.
879bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
880 const SCEV *BECount) {
881 // We can only handle non-volatile memsets.
882 if (MSI->isVolatile())
883 return false;
884
885 // If we're not allowed to hack on memset, we fail.
886 if (!HasMemset || DisableLIRP::Memset)
887 return false;
888
889 Value *Pointer = MSI->getDest();
890
891 // See if the pointer expression is an AddRec like {base,+,1} on the current
892 // loop, which indicates a strided store. If we have something else, it's a
893 // random store we can't handle.
894 const SCEV *Ev = SE->getSCEV(Pointer);
895 const SCEV *PointerStrideSCEV;
896 if (!match(Ev, m_scev_AffineAddRec(m_SCEV(), m_SCEV(PointerStrideSCEV),
897 m_SpecificLoop(CurLoop)))) {
898 LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
899 return false;
900 }
901
902 SCEVUse MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
903
// IsNegStride records whether the destination pointer walks downward in
// memory; it is decided separately for the constant- and non-constant-size
// paths below and later used to recompute the base pointer.
904 bool IsNegStride = false;
905 const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
906
907 if (IsConstantSize) {
908 // Memset size is constant.
909 // Check if the pointer stride matches the memset size. If so, then
910 // we know that every byte is touched in the loop.
911 LLVM_DEBUG(dbgs() << " memset size is constant\n");
912 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
913 const APInt *Stride;
914 if (!match(PointerStrideSCEV, m_scev_APInt(Stride)))
915 return false;
916
// Accept both forward (stride == size) and backward (stride == -size)
// traversals; anything else leaves gaps or overlaps.
917 if (SizeInBytes != *Stride && SizeInBytes != -*Stride)
918 return false;
919
920 IsNegStride = SizeInBytes == -*Stride;
921 } else {
922 // Memset size is non-constant.
923 // Check if the pointer stride matches the memset size.
924 // To be conservative, the pass would not promote pointers that aren't in
925 // address space zero. Also, the pass only handles memset length and stride
926 // that are invariant for the top level loop.
927 LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
928 if (Pointer->getType()->getPointerAddressSpace() != 0) {
929 LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
930 << "abort\n");
931 return false;
932 }
933 if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
934 LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
935 << "abort\n");
936 return false;
937 }
938
939 // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
940 IsNegStride = PointerStrideSCEV->isNonConstantNegative();
941 SCEVUse PositiveStrideSCEV =
942 IsNegStride ? SCEVUse(SE->getNegativeSCEV(PointerStrideSCEV))
943 : SCEVUse(PointerStrideSCEV);
944 LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
945 << " PositiveStrideSCEV: " << *PositiveStrideSCEV
946 << "\n");
947
948 if (PositiveStrideSCEV != MemsetSizeSCEV) {
949 // If an expression is covered by the loop guard, compare again and
950 // proceed with optimization if equal.
951 const SCEV *FoldedPositiveStride =
952 SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
953 const SCEV *FoldedMemsetSize =
954 SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);
955
956 LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
957 << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
958 << " FoldedPositiveStride: " << *FoldedPositiveStride
959 << "\n");
960
961 if (FoldedPositiveStride != FoldedMemsetSize) {
962 LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
963 return false;
964 }
965 }
966 }
967
968 // Verify that the memset value is loop invariant. If not, we can't promote
969 // the memset.
970 Value *SplatValue = MSI->getValue();
971 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
972 return false;
973
// NOTE(review): the rendered source skips original line 974 here, which
// presumably declares the MSIs set inserted into below — confirm against
// the upstream file.
975 MSIs.insert(MSI);
976 return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
977 MSI->getDestAlign(), SplatValue, MSI, MSIs,
978 cast<SCEVAddRecExpr>(Ev), BECount, IsNegStride,
979 /*IsLoopMemset=*/true);
980}
981
982/// mayLoopAccessLocation - Return true if the specified loop might access the
983/// specified pointer location, which is a loop-strided access. The 'Access'
984/// argument specifies what the verboten forms of access are (read or write).
/// Instructions listed in IgnoredInsts (the idiom's own loads/stores) are
/// exempt from the check.
985static bool
987 const SCEV *BECount, const SCEV *StoreSizeSCEV,
989 SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
990 // Get the location that may be stored across the loop. Since the access is
991 // strided positively through memory, we say that the modified location starts
992 // at the pointer and has infinite size.
// NOTE(review): the rendered source skips original line 993 here, which
// presumably initializes AccessSize (used below) to an "after pointer"
// location size — confirm against the upstream file.
994
995 // If the loop iterates a fixed number of times, we can refine the access size
996 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
997 const APInt *BECst, *ConstSize;
998 if (match(BECount, m_scev_APInt(BECst)) &&
999 match(StoreSizeSCEV, m_scev_APInt(ConstSize))) {
1000 std::optional<uint64_t> BEInt = BECst->tryZExtValue();
1001 std::optional<uint64_t> SizeInt = ConstSize->tryZExtValue();
1002 // FIXME: Should this check for overflow?
1003 if (BEInt && SizeInt)
1004 AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
1005 }
1006
1007 // TODO: For this to be really effective, we have to dive into the pointer
1008 // operand in the store. Store to &A[i] of 100 will always return may alias
1009 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
1010 // which will then no-alias a store to &A[100].
1011 MemoryLocation StoreLoc(Ptr, AccessSize);
1012
// Conservatively scan every instruction in the loop and ask alias analysis
// whether it may touch StoreLoc in one of the forbidden (Access) ways.
1013 for (BasicBlock *B : L->blocks())
1014 for (Instruction &I : *B)
1015 if (!IgnoredInsts.contains(&I) &&
1016 isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
1017 return true;
1018 return false;
1019}
1020
1021// If we have a negative stride, Start refers to the end of the memory location
1022// we're trying to memset. Therefore, we need to recompute the base pointer,
1023// which is just Start - BECount*Size.
// All arithmetic is done in IntPtr (the index type) so the subtraction folds
// cleanly with the pointer SCEV.
1024static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
1025 Type *IntPtr, const SCEV *StoreSizeSCEV,
1026 ScalarEvolution *SE) {
1027 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
1028 if (!StoreSizeSCEV->isOne()) {
1029 // index = back edge count * store size
// NOTE(review): the rendered source skips original line 1032 here,
// presumably the no-wrap flag argument to getMulExpr — confirm upstream.
1030 Index = SE->getMulExpr(Index,
1031 SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
1033 }
1034 // base pointer = start - index * store size
1035 return SE->getMinusSCEV(Start, Index);
1036}
1037
1038/// Compute the number of bytes as a SCEV from the backedge taken count.
1039///
1040/// This also maps the SCEV into the provided type and tries to handle the
1041/// computation in a way that will fold cleanly.
/// Result is TripCount * StoreSize, both widened/truncated to IntPtr first.
1042static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
1043 const SCEV *StoreSizeSCEV, Loop *CurLoop,
1044 const DataLayout *DL, ScalarEvolution *SE) {
1045 const SCEV *TripCountSCEV =
1046 SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
// NOTE(review): the rendered source skips original line 1049 here,
// presumably the no-wrap flag argument to getMulExpr — confirm upstream.
1047 return SE->getMulExpr(TripCountSCEV,
1048 SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
1050}
1051
1052/// processLoopStridedStore - We see a strided store of some value. If we can
1053/// transform this into a memset or memset_pattern in the loop preheader, do so.
/// Returns true only when the transformation was performed; any other path
/// returns the conservative Changed flag (true once SCEV expansion has
/// emitted preheader code, even if later bailed out and cleaned up).
1054bool LoopIdiomRecognize::processLoopStridedStore(
1055 Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
1056 Value *StoredVal, Instruction *TheStore,
// NOTE(review): the rendered source skips original line 1057 here (the
// remaining parameters, including the Stores set and the store AddRec Ev)
// — confirm against the upstream file.
1058 const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
1059 Module *M = TheStore->getModule();
1060
1061 // The trip count of the loop and the base pointer of the addrec SCEV is
1062 // guaranteed to be loop invariant, which means that it should dominate the
1063 // header. This allows us to insert code for it in the preheader.
1064 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
1065 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1066 IRBuilder<> Builder(Preheader->getTerminator());
1067 SCEVExpander Expander(*SE, "loop-idiom");
// ExpCleaner removes any expander-emitted instructions unless
// markResultUsed() is reached at the very end.
1068 SCEVExpanderCleaner ExpCleaner(Expander);
1069
1070 Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
1071 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
1072
1073 bool Changed = false;
1074 const SCEV *Start = Ev->getStart();
1075 // Handle negative strided loops.
1076 if (IsNegStride)
1077 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
1078
1079 // TODO: ideally we should still be able to generate memset if SCEV expander
1080 // is taught to generate the dependencies at the latest point.
1081 if (!Expander.isSafeToExpand(Start))
1082 return Changed;
1083
1084 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
1085 // this into a memset in the loop preheader now if we want. However, this
1086 // would be unsafe to do if there is anything else in the loop that may read
1087 // or write to the aliased location. Check for any overlap by generating the
1088 // base pointer and checking the region.
1089 Value *BasePtr =
1090 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
1091
1092 // From here on out, conservatively report to the pass manager that we've
1093 // changed the IR, even if we later clean up these added instructions. There
1094 // may be structural differences e.g. in the order of use lists not accounted
1095 // for in just a textual dump of the IR. This is written as a variable, even
1096 // though statically all the places this dominates could be replaced with
1097 // 'true', with the hope that anyone trying to be clever / "more precise" with
1098 // the return value will read this comment, and leave them alone.
1099 Changed = true;
1100
1101 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1102 StoreSizeSCEV, *AA, Stores))
1103 return Changed;
1104
1105 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
1106 return Changed;
1107
1108 // Okay, everything looks good, insert the memset.
// A splattable value becomes a plain memset; otherwise try to form a
// repeating pattern constant for memset_pattern / memset.pattern.
1109 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
1110 Constant *PatternValue = nullptr;
1111 if (!SplatValue)
1112 PatternValue = getMemSetPatternValue(StoredVal, DL);
1113
1114 // MemsetArg is the number of bytes for the memset libcall, and the number
1115 // of pattern repetitions if the memset.pattern intrinsic is being used.
1116 Value *MemsetArg;
1117 std::optional<int64_t> BytesWritten;
1118
1119 if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) {
1120 const SCEV *TripCountS =
1121 SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop);
1122 if (!Expander.isSafeToExpand(TripCountS))
1123 return Changed;
1124 const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1125 if (!ConstStoreSize)
1126 return Changed;
1127 Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy,
1128 Preheader->getTerminator());
1129 uint64_t PatternRepsPerTrip =
1130 (ConstStoreSize->getValue()->getZExtValue() * 8) /
1131 DL->getTypeSizeInBits(PatternValue->getType());
1132 // If ConstStoreSize is not equal to the width of PatternValue, then
1133 // MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else
1134 // MemSetArg is just TripCount.
1135 MemsetArg =
1136 PatternRepsPerTrip == 1
1137 ? TripCount
1138 : Builder.CreateMul(TripCount,
1139 Builder.getIntN(IntIdxTy->getIntegerBitWidth(),
1140 PatternRepsPerTrip));
// Record the exact byte count when statically known; used below to scope
// the merged AA metadata.
1141 if (auto *CI = dyn_cast<ConstantInt>(TripCount))
1142 BytesWritten =
1143 CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue();
1144
1145 } else {
1146 const SCEV *NumBytesS =
1147 getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1148
1149 // TODO: ideally we should still be able to generate memset if SCEV expander
1150 // is taught to generate the dependencies at the latest point.
1151 if (!Expander.isSafeToExpand(NumBytesS))
1152 return Changed;
1153 MemsetArg =
1154 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1155 if (auto *CI = dyn_cast<ConstantInt>(MemsetArg))
1156 BytesWritten = CI->getZExtValue();
1157 }
1158 assert(MemsetArg && "MemsetArg should have been set");
1159
// Merge AA metadata from all stores being replaced, then widen its extent
// to the full region (or drop the extent with -1 when unknown).
1160 AAMDNodes AATags = TheStore->getAAMetadata();
1161 for (Instruction *Store : Stores)
1162 AATags = AATags.merge(Store->getAAMetadata());
1163 if (BytesWritten)
1164 AATags = AATags.extendTo(BytesWritten.value());
1165 else
1166 AATags = AATags.extendTo(-1);
1167
1168 CallInst *NewCall;
1169 if (SplatValue) {
1170 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, MemsetArg,
1171 MaybeAlign(StoreAlignment),
1172 /*isVolatile=*/false, AATags);
1173 } else if (ForceMemsetPatternIntrinsic ||
1174 isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
1175 assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");
1176
1177 NewCall = Builder.CreateIntrinsic(
1178 Intrinsic::experimental_memset_pattern,
1179 {DestInt8PtrTy, PatternValue->getType(), IntIdxTy},
1180 {BasePtr, PatternValue, MemsetArg,
1181 ConstantInt::getFalse(M->getContext())});
1182 if (StoreAlignment)
1183 cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment);
1184 NewCall->setAAMetadata(AATags);
1185 } else {
1186 // Neither a memset, nor memset_pattern16
1187 return Changed;
1188 }
1189
1190 NewCall->setDebugLoc(TheStore->getDebugLoc());
1191
// Keep MemorySSA in sync: the new call defines memory in the preheader.
1192 if (MSSAU) {
1193 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1194 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1195 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1196 }
1197
1198 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
1199 << " from store to: " << *Ev << " at: " << *TheStore
1200 << "\n");
1201
1202 ORE.emit([&]() {
1203 OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
1204 NewCall->getDebugLoc(), Preheader);
1205 R << "Transformed loop-strided store in "
1206 << ore::NV("Function", TheStore->getFunction())
1207 << " function into a call to "
1208 << ore::NV("NewFunction", NewCall->getCalledFunction())
1209 << "() intrinsic";
1210 if (!Stores.empty())
1211 R << ore::setExtraArgs();
1212 for (auto *I : Stores) {
1213 R << ore::NV("FromBlock", I->getParent()->getName())
1214 << ore::NV("ToBlock", Preheader->getName());
1215 }
1216 return R;
1217 });
1218
1219 // Okay, the memset has been formed. Zap the original store and anything that
1220 // feeds into it.
1221 for (auto *I : Stores) {
1222 if (MSSAU)
1223 MSSAU->removeMemoryAccess(I, true);
// NOTE(review): the rendered source skips original line 1224 here,
// presumably the call deleting the dead store — confirm upstream.
1225 }
1226 if (MSSAU && VerifyMemorySSA)
1227 MSSAU->getMemorySSA()->verifyMemorySSA();
1228 ++NumMemSet;
// Keep the expander-emitted preheader code: the transformation succeeded.
1229 ExpCleaner.markResultUsed();
1230 return true;
1231}
1232
1233/// If the stored value is a strided load in the same loop with the same stride
1234/// this may be transformable into a memcpy. This kicks in for stuff like
1235/// for (i) A[i] = B[i];
1236bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1237 const SCEV *BECount) {
1238 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1239
1240 Value *StorePtr = SI->getPointerOperand();
1241 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1242 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1243
1244 // The store must be feeding a non-volatile load.
1245 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1246 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1247
1248 // See if the pointer expression is an AddRec like {base,+,1} on the current
1249 // loop, which indicates a strided load. If we have something else, it's a
1250 // random load we can't handle.
1251 Value *LoadPtr = LI->getPointerOperand();
1252 const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1253
1254 const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
1255 return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
1256 SI->getAlign(), LI->getAlign(), SI, LI,
1257 StoreEv, LoadEv, BECount);
1258}
1259
1260namespace {
// Helper that decides whether a load/store pair whose base pointers may
// belong to the same underlying object can still be legally lowered to a
// memmove (or, when they are provably distinct, a memcpy).
1261class MemmoveVerifier {
1262public:
// Strips pointer casts and decomposes both base pointers into
// (base object, constant byte offset); IsSameObject records whether the
// two decomposed bases are the identical Value.
1263 explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
1264 const DataLayout &DL)
// NOTE(review): the rendered source skips original lines 1265 and 1267
// here (the init-list entries computing BP1/BP2 via pointer-base
// decomposition) — confirm against the upstream file.
1266 LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
1268 StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
1269 IsSameObject(BP1 == BP2) {}
1270
// Returns true if, given the stride direction, the load region is entirely
// on the "safe" side of the store region so a memmove has the same
// semantics as the original loop.
1271 bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
1272 const Instruction &TheLoad,
1273 bool IsMemCpy) const {
1274 if (IsMemCpy) {
1275 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1276 // for negative stride.
1277 if ((!IsNegStride && LoadOff <= StoreOff) ||
1278 (IsNegStride && LoadOff >= StoreOff))
1279 return false;
1280 } else {
1281 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1282 // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
1283 int64_t LoadSize =
1284 DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
1285 if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
1286 return false;
1287 if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
1288 (IsNegStride && LoadOff + LoadSize > StoreOff))
1289 return false;
1290 }
1291 return true;
1292 }
1293
1294private:
1295 const DataLayout &DL;
// Constant byte offsets of the load/store bases within their objects,
// filled in by the pointer-base decomposition in the constructor.
1296 int64_t LoadOff = 0;
1297 int64_t StoreOff = 0;
// Decomposed base objects of the load (BP1) and store (BP2) pointers.
1298 const Value *BP1;
1299 const Value *BP2;
1300
1301public:
1302 const bool IsSameObject;
1303};
1304} // namespace
1305
// Generic form: attempt to turn a strided store fed by an equally-strided
// load (or an in-loop memcpy) into a single preheader memcpy / memmove /
// element-atomic memcpy. Returns true only when the rewrite happened; other
// paths return the conservative Changed flag once preheader code has been
// expanded.
1306bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1307 Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
1308 MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1309 Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
1310 const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
1311
1312 // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1313 // conservatively bail here, since otherwise we may have to transform
1314 // llvm.memcpy.inline into llvm.memcpy which is illegal.
1315 if (auto *MCI = dyn_cast<MemCpyInst>(TheStore); MCI && MCI->isForceInlined())
1316 return false;
1317
1318 // The trip count of the loop and the base pointer of the addrec SCEV is
1319 // guaranteed to be loop invariant, which means that it should dominate the
1320 // header. This allows us to insert code for it in the preheader.
1321 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1322 IRBuilder<> Builder(Preheader->getTerminator());
1323 SCEVExpander Expander(*SE, "loop-idiom");
1324
// Removes expander-emitted code unless markResultUsed() is reached.
1325 SCEVExpanderCleaner ExpCleaner(Expander);
1326
1327 bool Changed = false;
1328 const SCEV *StrStart = StoreEv->getStart();
1329 unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1330 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1331
1332 APInt Stride = getStoreStride(StoreEv);
1333 const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1334
1335 // TODO: Deal with non-constant size; Currently expect constant store size
1336 assert(ConstStoreSize && "store size is expected to be a constant");
1337
1338 int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1339 bool IsNegStride = StoreSize == -Stride;
1340
1341 // Handle negative strided loops.
1342 if (IsNegStride)
1343 StrStart =
1344 getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1345
1346 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1347 // this into a memcpy in the loop preheader now if we want. However, this
1348 // would be unsafe to do if there is anything else in the loop that may read
1349 // or write the memory region we're storing to. This includes the load that
1350 // feeds the stores. Check for an alias by generating the base address and
1351 // checking everything.
1352 Value *StoreBasePtr = Expander.expandCodeFor(
1353 StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());
1354
1355 // From here on out, conservatively report to the pass manager that we've
1356 // changed the IR, even if we later clean up these added instructions. There
1357 // may be structural differences e.g. in the order of use lists not accounted
1358 // for in just a textual dump of the IR. This is written as a variable, even
1359 // though statically all the places this dominates could be replaced with
1360 // 'true', with the hope that anyone trying to be clever / "more precise" with
1361 // the return value will read this comment, and leave them alone.
1362 Changed = true;
1363
1364 SmallPtrSet<Instruction *, 2> IgnoredInsts;
1365 IgnoredInsts.insert(TheStore);
1366
1367 bool IsMemCpy = isa<MemCpyInst>(TheStore);
1368 const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1369
// First check the store region ignoring only the store itself. If that
// fails, the pair may still be expressible as a memmove provided the load
// is also the store's only consumer.
1370 bool LoopAccessStore =
1371 mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1372 StoreSizeSCEV, *AA, IgnoredInsts);
1373 if (LoopAccessStore) {
1374 // For memmove case it's not enough to guarantee that loop doesn't access
1375 // TheStore and TheLoad. Additionally we need to make sure that TheStore is
1376 // the only user of TheLoad.
1377 if (!TheLoad->hasOneUse())
1378 return Changed;
1379 IgnoredInsts.insert(TheLoad);
1380 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
1381 BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
1382 ORE.emit([&]() {
1383 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
1384 TheStore)
1385 << ore::NV("Inst", InstRemark) << " in "
1386 << ore::NV("Function", TheStore->getFunction())
1387 << " function will not be hoisted: "
1388 << ore::NV("Reason", "The loop may access store location");
1389 });
1390 return Changed;
1391 }
1392 IgnoredInsts.erase(TheLoad);
1393 }
1394
1395 const SCEV *LdStart = LoadEv->getStart();
1396 unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1397
1398 // Handle negative strided loops.
1399 if (IsNegStride)
1400 LdStart =
1401 getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1402
1403 // For a memcpy, we have to make sure that the input array is not being
1404 // mutated by the loop.
1405 Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
1406 Preheader->getTerminator());
1407
1408 // If the store is a memcpy instruction, we must check if it will write to
1409 // the load memory locations. So remove it from the ignored stores.
1410 MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
1411 if (IsMemCpy && !Verifier.IsSameObject)
1412 IgnoredInsts.erase(TheStore);
1413 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1414 StoreSizeSCEV, *AA, IgnoredInsts)) {
1415 ORE.emit([&]() {
1416 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1417 << ore::NV("Inst", InstRemark) << " in "
1418 << ore::NV("Function", TheStore->getFunction())
1419 << " function will not be hoisted: "
1420 << ore::NV("Reason", "The loop may access load location");
1421 });
1422 return Changed;
1423 }
1424
1425 bool IsAtomic = TheStore->isAtomic() || TheLoad->isAtomic();
1426 bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1427
1428 if (IsAtomic) {
1429 // For now don't support unordered atomic memmove.
1430 if (UseMemMove)
1431 return Changed;
1432
1433 // We cannot allow unaligned ops for unordered load/store, so reject
1434 // anything where the alignment isn't at least the element size.
1435 assert((StoreAlign && LoadAlign) &&
1436 "Expect unordered load/store to have align.");
1437 if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
1438 return Changed;
1439
1440 // If the element.atomic memcpy is not lowered into explicit
1441 // loads/stores later, then it will be lowered into an element-size
1442 // specific lib call. If the lib call doesn't exist for our store size, then
1443 // we shouldn't generate the memcpy.
1444 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1445 return Changed;
1446 }
1447
1448 if (UseMemMove)
1449 if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
1450 IsMemCpy))
1451 return Changed;
1452
1453 if (avoidLIRForMultiBlockLoop())
1454 return Changed;
1455
1456 // Okay, everything is safe, we can transform this!
1457
1458 const SCEV *NumBytesS =
1459 getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1460
1461 Value *NumBytes =
1462 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1463
// Merge load + store AA metadata and widen its extent to the copied region
// (or drop the extent with -1 when the byte count isn't a constant).
1464 AAMDNodes AATags = TheLoad->getAAMetadata();
1465 AAMDNodes StoreAATags = TheStore->getAAMetadata();
1466 AATags = AATags.merge(StoreAATags);
1467 if (auto CI = dyn_cast<ConstantInt>(NumBytes))
1468 AATags = AATags.extendTo(CI->getZExtValue());
1469 else
1470 AATags = AATags.extendTo(-1);
1471
1472 CallInst *NewCall = nullptr;
1473 // Check whether to generate an unordered atomic memcpy:
1474 // If the load or store are atomic, then they must necessarily be unordered
1475 // by previous checks.
1476 if (!IsAtomic) {
1477 if (UseMemMove)
1478 NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
1479 LoadAlign, NumBytes,
1480 /*isVolatile=*/false, AATags);
1481 else
1482 NewCall =
1483 Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
1484 NumBytes, /*isVolatile=*/false, AATags);
1485 } else {
1486 // Create the call.
1487 // Note that unordered atomic loads/stores are *required* by the spec to
1488 // have an alignment but non-atomic loads/stores may not.
1489 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1490 StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
1491 AATags);
1492 }
1493 NewCall->setDebugLoc(TheStore->getDebugLoc());
1494
// Keep MemorySSA in sync: the new call defines memory in the preheader.
1495 if (MSSAU) {
1496 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1497 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1498 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1499 }
1500
1501 LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1502 << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1503 << "\n"
1504 << " from store ptr=" << *StoreEv << " at: " << *TheStore
1505 << "\n");
1506
1507 ORE.emit([&]() {
1508 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1509 NewCall->getDebugLoc(), Preheader)
1510 << "Formed a call to "
1511 << ore::NV("NewFunction", NewCall->getCalledFunction())
1512 << "() intrinsic from " << ore::NV("Inst", InstRemark)
1513 << " instruction in " << ore::NV("Function", TheStore->getFunction())
1514 << " function"
// NOTE(review): the rendered source skips original line 1515 here
// (presumably the setExtraArgs() remark marker) — confirm upstream.
1516 << ore::NV("FromBlock", TheStore->getParent()->getName())
1517 << ore::NV("ToBlock", Preheader->getName());
1518 });
1519
1520 // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1521 // and anything that feeds into it.
1522 if (MSSAU)
1523 MSSAU->removeMemoryAccess(TheStore, true);
1524 deleteDeadInstruction(TheStore);
1525 if (MSSAU && VerifyMemorySSA)
1526 MSSAU->getMemorySSA()->verifyMemorySSA();
1527 if (UseMemMove)
1528 ++NumMemMove;
1529 else
1530 ++NumMemCpy;
// Keep the expander-emitted preheader code: the transformation succeeded.
1531 ExpCleaner.markResultUsed();
1532 return true;
1533}
1534
1535// When compiling for codesize we avoid idiom recognition for a multi-block loop
1536// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1537//
1538bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1539 bool IsLoopMemset) {
1540 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1541 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1542 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1543 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1544 << " avoided: multi-block top-level loop\n");
1545 return true;
1546 }
1547 }
1548
1549 return false;
1550}
1551
// Replace a recognized bit-at-a-time CRC loop with a byte-at-a-time
// Sarwate table lookup: build a 256-entry lookup table as a private global,
// shrink the trip count by 8x, and rewrite the loop body to index the table.
// Always returns true (the loop has been rewritten) once past the
// target-specific bail-out.
1552bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
1553 // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using
1554 // carry-less multiplication instructions, which is more efficient than our
1555 // Sarwate table-lookup optimization. Hence, until we're able to emit
1556 // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom,
1557 // disable the optimization for Hexagon.
1558 Module &M = *CurLoop->getHeader()->getModule();
1559 Triple TT(M.getTargetTriple());
1560 if (TT.getArch() == Triple::hexagon)
1561 return false;
1562
1563 // First, create a new GlobalVariable corresponding to the
1564 // Sarwate-lookup-table.
1565 Type *CRCTy = Info.LHS->getType();
1566 unsigned CRCBW = CRCTy->getIntegerBitWidth();
1567 std::array<Constant *, 256> CRCConstants;
1568 transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped),
1569 CRCConstants.begin(),
1570 [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); });
1571 Constant *ConstArray =
1572 ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants);
1573 GlobalVariable *GV =
1574 new GlobalVariable(M, ConstArray->getType(), true,
1575 GlobalValue::PrivateLinkage, ConstArray, ".crctable");
1576
// NOTE(review): the rendered source skips original lines 1577-1578 here,
// which presumably bind IV and declare the Cleanup list used below —
// confirm against the upstream file.
1579
1580 // Next, mark all PHIs for removal except IV.
1581 {
1582 for (PHINode &PN : CurLoop->getHeader()->phis()) {
1583 if (&PN == IV)
1584 continue;
1585 PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
1586 Cleanup.push_back(&PN);
1587 }
1588 }
1589
1590 // Next, fix up the trip count.
// The byte-wise loop runs 1/8th as many iterations as the bit-wise one.
1591 {
1592 unsigned NewBTC = (Info.TripCount / 8) - 1;
1593 BasicBlock *LoopBlk = CurLoop->getLoopLatch();
1594 CondBrInst *BrInst = cast<CondBrInst>(LoopBlk->getTerminator());
// NOTE(review): the rendered source skips original lines 1596-1597 here,
// the two arms of the ternary selecting the exit predicate — confirm
// against the upstream file.
1595 CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
1598 Instruction *ExitCond = CurLoop->getLatchCmpInst();
1599 Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC);
1600 IRBuilder<> Builder(ExitCond);
1601 Value *NewExitCond =
1602 Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond");
1603 ExitCond->replaceAllUsesWith(NewExitCond);
1604 deleteDeadInstruction(ExitCond);
1605 }
1606
1607 // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all
1608 // uses of ComputedValue.
1609 //
1610 // Little-endian:
1611 // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)]
1612 // Big-Endian:
1613 // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)]
1614 {
// LoByte truncates a value to its low 8 bits (as i8).
1615 auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) {
1616 return Builder.CreateZExtOrTrunc(
1617 Op, IntegerType::getInt8Ty(Op->getContext()), Name);
1618 };
// HiIdx extracts the top byte relative to the CRC's bit width (for the
// big-endian formulation); for CRC-8 the value itself is the top byte.
1619 auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op,
1620 const Twine &Name) {
1621 Type *OpTy = Op->getType();
1622
1623 // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to
1624 // use the CRC's bitwidth as the reference for shifting right.
1625 return LoByte(Builder,
1626 CRCBW > 8 ? Builder.CreateLShr(
1627 Op, ConstantInt::get(OpTy, CRCBW - 8), Name)
1628 : Op,
1629 Name + ".lo.byte");
1630 };
1631
1632 IRBuilder<> Builder(CurLoop->getHeader(),
1633 CurLoop->getHeader()->getFirstNonPHIIt());
1634
1635 // Create the CRC PHI, and initialize its incoming value to the initial
1636 // value of CRC.
1637 PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc");
1638 CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader());
1639
1640 // CRC is now an evolving variable, initialized to the PHI.
1641 Value *CRC = CRCPhi;
1642
1643 // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte
1644 // of LHSAux), if LHSAux is non-nullptr.
1645 Value *Indexer = CRC;
1646 if (Value *Data = Info.LHSAux) {
1647 Type *DataTy = Data->getType();
1648
1649 // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we
1650 // shift right by that amount, and take the lo-byte (in the little-endian
1651 // case), or shift left by that amount, and take the hi-idx (in the
1652 // big-endian case).
1653 Value *IVBits = Builder.CreateZExtOrTrunc(
1654 Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer");
1655 Value *DataIndexer =
1656 Info.ByteOrderSwapped
1657 ? Builder.CreateShl(Data, IVBits, "data.indexer")
1658 : Builder.CreateLShr(Data, IVBits, "data.indexer");
1659 Indexer = Builder.CreateXor(
1660 DataIndexer,
1661 Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"),
1662 "crc.data.indexer");
1663 }
1664
1665 Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi")
1666 : LoByte(Builder, Indexer, "indexer.lo");
1667
1668 // Always index into a GEP using the index type.
1669 Indexer = Builder.CreateZExt(
1670 Indexer, SE->getDataLayout().getIndexType(GV->getType()),
1671 "indexer.ext");
1672
1673 // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC].
1674 Value *CRCTableGEP =
1675 Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd");
1676 Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld");
1677
1678 // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of
1679 // CRC-8.
1680 Value *CRCNext = CRCTableLd;
1681 if (CRCBW > 8) {
1682 Value *CRCShift = Info.ByteOrderSwapped
1683 ? Builder.CreateShl(CRC, 8, "crc.be.shift")
1684 : Builder.CreateLShr(CRC, 8, "crc.le.shift");
1685 CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next");
1686 }
1687
1688 // Connect the back-edge for the loop, and RAUW the ComputedValue.
1689 CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch());
1690 Info.ComputedValue->replaceUsesOutsideBlock(CRCNext,
1691 CurLoop->getLoopLatch());
1692 }
1693
1694 // Cleanup.
1695 {
1696 for (PHINode *PN : Cleanup)
// NOTE(review): the rendered source skips original line 1697 here,
// presumably the call deleting each dead PHI — confirm upstream.
1698 SE->forgetLoop(CurLoop);
1699 }
1700 return true;
1701}
1702
1703bool LoopIdiomRecognize::runOnNoncountableLoop() {
1704 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1705 << CurLoop->getHeader()->getParent()->getName()
1706 << "] Noncountable Loop %"
1707 << CurLoop->getHeader()->getName() << "\n");
1708
1709 return recognizePopcount() || recognizeAndInsertFFS() ||
1710 recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
1711 recognizeShiftUntilLessThan() || recognizeAndInsertStrLen();
1712}
1713
1714/// Check if the given conditional branch is based on the comparison between
1715/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1716/// true), the control yields to the loop entry. If the branch matches the
1717/// behavior, the variable involved in the comparison is returned. This function
1718/// will be called to see if the precondition and postcondition of the loop are
1719/// in desirable form.
// NOTE(review): the rendered source skips original lines 1720 and 1722
// here — the function signature line (taking the branch and LoopEntry) and
// the line initializing Cond from the branch's icmp condition — confirm
// against the upstream file.
1721 bool JmpOnZero = false) {
1723 if (!Cond)
1724 return nullptr;
1725
// Only compares against a literal zero qualify.
1726 auto *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1727 if (!CmpZero || !CmpZero->isZero())
1728 return nullptr;
1729
1730 BasicBlock *TrueSucc = BI->getSuccessor(0);
1731 BasicBlock *FalseSucc = BI->getSuccessor(1);
// JmpOnZero flips the successor roles so the same predicate logic below
// handles both "enter loop when non-zero" and "enter loop when zero".
1732 if (JmpOnZero)
1733 std::swap(TrueSucc, FalseSucc);
1734
1735 ICmpInst::Predicate Pred = Cond->getPredicate();
1736 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1737 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1738 return Cond->getOperand(0);
1739
1740 return nullptr;
1741}
1742
1743namespace {
1744
1745class StrlenVerifier {
1746public:
1747 explicit StrlenVerifier(const Loop *CurLoop, ScalarEvolution *SE,
1748 const TargetLibraryInfo *TLI)
1749 : CurLoop(CurLoop), SE(SE), TLI(TLI) {}
1750
1751 bool isValidStrlenIdiom() {
1752 // Give up if the loop has multiple blocks, multiple backedges, or
1753 // multiple exit blocks
1754 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
1755 !CurLoop->getUniqueExitBlock())
1756 return false;
1757
1758 // It should have a preheader and a branch instruction.
1759 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1760 if (!Preheader ||
1762 return false;
1763
1764 // The loop exit must be conditioned on an icmp with 0 the null terminator.
1765 // The icmp operand has to be a load on some SSA reg that increments
1766 // by 1 in the loop.
1767 BasicBlock *LoopBody = *CurLoop->block_begin();
1768
1769 // Skip if the body is too big as it most likely is not a strlen idiom.
1770 if (!LoopBody || LoopBody->size() >= 15)
1771 return false;
1772
1773 CondBrInst *LoopTerm = dyn_cast<CondBrInst>(LoopBody->getTerminator());
1774 if (!LoopTerm)
1775 return false;
1776 Value *LoopCond = matchCondition(LoopTerm, LoopBody);
1777 if (!LoopCond)
1778 return false;
1779
1780 LoadInst *LoopLoad = dyn_cast<LoadInst>(LoopCond);
1781 if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
1782 return false;
1783
1784 OperandType = LoopLoad->getType();
1785 if (!OperandType || !OperandType->isIntegerTy())
1786 return false;
1787
1788 // See if the pointer expression is an AddRec with constant step a of form
1789 // ({n,+,a}) where a is the width of the char type.
1790 Value *IncPtr = LoopLoad->getPointerOperand();
1791 const SCEV *LoadEv = SE->getSCEV(IncPtr);
1792 const APInt *Step;
1793 if (!match(LoadEv,
1794 m_scev_AffineAddRec(m_SCEV(LoadBaseEv), m_scev_APInt(Step))))
1795 return false;
1796
1797 LLVM_DEBUG(dbgs() << "pointer load scev: " << *LoadEv << "\n");
1798
1799 unsigned StepSize = Step->getZExtValue();
1800
1801 // Verify that StepSize is consistent with platform char width.
1802 OpWidth = OperandType->getIntegerBitWidth();
1803 unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
1804 if (OpWidth != StepSize * 8)
1805 return false;
1806 if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
1807 return false;
1808 if (OpWidth >= 16)
1809 if (OpWidth != WcharSize * 8)
1810 return false;
1811
1812 // Scan every instruction in the loop to ensure there are no side effects.
1813 for (Instruction &I : *LoopBody)
1814 if (I.mayHaveSideEffects())
1815 return false;
1816
1817 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1818 if (!LoopExitBB)
1819 return false;
1820
1821 for (PHINode &PN : LoopExitBB->phis()) {
1822 if (!SE->isSCEVable(PN.getType()))
1823 return false;
1824
1825 const SCEV *Ev = SE->getSCEV(&PN);
1826 if (!Ev)
1827 return false;
1828
1829 LLVM_DEBUG(dbgs() << "loop exit phi scev: " << *Ev << "\n");
1830
1831 // Since we verified that the loop trip count will be a valid strlen
1832 // idiom, we can expand all lcssa phi with {n,+,1} as (n + strlen) and use
1833 // SCEVExpander materialize the loop output.
1834 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1835 if (!AddRecEv || !AddRecEv->isAffine())
1836 return false;
1837
1838 // We only want RecAddExpr with recurrence step that is constant. This
1839 // is good enough for all the idioms we want to recognize. Later we expand
1840 // and materialize the recurrence as {base,+,a} -> (base + a * strlen)
1841 if (!isa<SCEVConstant>(AddRecEv->getStepRecurrence(*SE)))
1842 return false;
1843 }
1844
1845 return true;
1846 }
1847
1848public:
1849 const Loop *CurLoop;
1850 ScalarEvolution *SE;
1851 const TargetLibraryInfo *TLI;
1852
1853 unsigned OpWidth;
1854 ConstantInt *StepSizeCI;
1855 const SCEV *LoadBaseEv;
1857};
1858
1859} // namespace
1860
1861/// The Strlen Idiom we are trying to detect has the following structure
1862///
1863/// preheader:
1864/// ...
1865/// br label %body, ...
1866///
1867/// body:
1868/// ... ; %0 is incremented by a gep
1869/// %1 = load i8, ptr %0, align 1
1870/// %2 = icmp eq i8 %1, 0
1871/// br i1 %2, label %exit, label %body
1872///
1873/// exit:
1874/// %lcssa = phi [%0, %body], ...
1875///
1876/// We expect the strlen idiom to have a load of a character type that
1877/// is compared against '\0', and such load pointer operand must have scev
1878/// expression of the form {%str,+,c} where c is a ConstantInt of the
/// appropriate character width for the idiom, and %str is the base of the string
1880/// And, that all lcssa phis have the form {...,+,n} where n is a constant,
1881///
/// When transforming the output of the strlen idiom, the lcssa phis are
1883/// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a * strlen)
1884/// and all subsequent uses are replaced. For example,
1885///
1886/// \code{.c}
1887/// const char* base = str;
1888/// while (*str != '\0')
1889/// ++str;
1890/// size_t result = str - base;
1891/// \endcode
1892///
1893/// will be transformed as follows: The idiom will be replaced by a strlen
1894/// computation to compute the address of the null terminator of the string.
1895///
1896/// \code{.c}
1897/// const char* base = str;
1898/// const char* end = base + strlen(str);
1899/// size_t result = end - base;
1900/// \endcode
1901///
1902/// In the case we index by an induction variable, as long as the induction
1903/// variable has a constant int increment, we can replace all such indvars
1904/// with the closed form computation of strlen
1905///
1906/// \code{.c}
1907/// size_t i = 0;
1908/// while (str[i] != '\0')
1909/// ++i;
1910/// size_t result = i;
1911/// \endcode
1912///
1913/// Will be replaced by
1914///
1915/// \code{.c}
1916/// size_t i = 0 + strlen(str);
1917/// size_t result = i;
1918/// \endcode
1919///
1920bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
1921 if (DisableLIRP::All)
1922 return false;
1923
1924 StrlenVerifier Verifier(CurLoop, SE, TLI);
1925
1926 if (!Verifier.isValidStrlenIdiom())
1927 return false;
1928
1929 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1930 BasicBlock *LoopBody = *CurLoop->block_begin();
1931 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1932 CondBrInst *LoopTerm = cast<CondBrInst>(LoopBody->getTerminator());
1933 assert(Preheader && LoopBody && LoopExitBB &&
1934 "Should be verified to be valid by StrlenVerifier");
1935
1936 if (Verifier.OpWidth == 8) {
1938 return false;
1939 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_strlen))
1940 return false;
1941 } else {
1943 return false;
1944 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_wcslen))
1945 return false;
1946 }
1947
1948 IRBuilder<> Builder(Preheader->getTerminator());
1949 Builder.SetCurrentDebugLocation(CurLoop->getStartLoc());
1950 SCEVExpander Expander(*SE, "strlen_idiom");
1951 Value *MaterialzedBase = Expander.expandCodeFor(
1952 Verifier.LoadBaseEv, Verifier.LoadBaseEv->getType(),
1953 Builder.GetInsertPoint());
1954
1955 Value *StrLenFunc = nullptr;
1956 if (Verifier.OpWidth == 8) {
1957 StrLenFunc = emitStrLen(MaterialzedBase, Builder, *DL, TLI);
1958 } else {
1959 StrLenFunc = emitWcsLen(MaterialzedBase, Builder, *DL, TLI);
1960 }
1961 assert(StrLenFunc && "Failed to emit strlen function.");
1962
1963 const SCEV *StrlenEv = SE->getSCEV(StrLenFunc);
1965 for (PHINode &PN : LoopExitBB->phis()) {
1966 // We can now materialize the loop output as all phi have scev {base,+,a}.
1967 // We expand the phi as:
1968 // %strlen = call i64 @strlen(%str)
1969 // %phi.new = base expression + step * %strlen
1970 const SCEV *Ev = SE->getSCEV(&PN);
1971 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1972 const SCEVConstant *Step =
1974 const SCEV *Base = AddRecEv->getStart();
1975
1976 // It is safe to truncate to base since if base is narrower than size_t
1977 // the equivalent user code will have to truncate anyways.
1978 const SCEV *NewEv = SE->getAddExpr(
1980 StrlenEv, Base->getType())));
1981
1982 Value *MaterializedPHI = Expander.expandCodeFor(NewEv, NewEv->getType(),
1983 Builder.GetInsertPoint());
1984 Expander.clear();
1985 PN.replaceAllUsesWith(MaterializedPHI);
1986 Cleanup.push_back(&PN);
1987 }
1988
1989 // All LCSSA Loop Phi are dead, the left over dead loop body can be cleaned
1990 // up by later passes
1991 for (PHINode *PN : Cleanup)
1993
1994 // LoopDeletion only delete invariant loops with known trip-count. We can
1995 // update the condition so it will reliablely delete the invariant loop
1996 assert((LoopTerm->getSuccessor(0) == LoopBody ||
1997 LoopTerm->getSuccessor(1) == LoopBody) &&
1998 "loop body must have a successor that is it self");
1999 ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
2000 ? Builder.getFalse()
2001 : Builder.getTrue();
2002 LoopTerm->setCondition(NewLoopCond);
2003 SE->forgetLoop(CurLoop);
2004
2005 ++NumStrLen;
2006 LLVM_DEBUG(dbgs() << " Formed strlen idiom: " << *StrLenFunc << "\n");
2007 ORE.emit([&]() {
2008 return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
2009 CurLoop->getStartLoc(), Preheader)
2010 << "Transformed " << StrLenFunc->getName() << " loop idiom";
2011 });
2012
2013 return true;
2014}
2015
2016/// Check if the given conditional branch is based on an unsigned less-than
2017/// comparison between a variable and a constant, and if the comparison is false
2018/// the control yields to the loop entry. If the branch matches the behaviour,
2019/// the variable involved in the comparison is returned.
2021 APInt &Threshold) {
2023 if (!Cond)
2024 return nullptr;
2025
2026 ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
2027 if (!CmpConst)
2028 return nullptr;
2029
2030 BasicBlock *FalseSucc = BI->getSuccessor(1);
2031 ICmpInst::Predicate Pred = Cond->getPredicate();
2032
2033 if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
2034 Threshold = CmpConst->getValue();
2035 return Cond->getOperand(0);
2036 }
2037
2038 return nullptr;
2039}
2040
2041// Check if the recurrence variable `VarX` is in the right form to create
2042// the idiom. Returns the value coerced to a PHINode if so.
2044 BasicBlock *LoopEntry) {
2045 auto *PhiX = dyn_cast<PHINode>(VarX);
2046 if (PhiX && PhiX->getParent() == LoopEntry &&
2047 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
2048 return PhiX;
2049 return nullptr;
2050}
2051
2052/// Return true if the idiom is detected in the loop.
2053///
2054/// Additionally:
2055/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2056/// or nullptr if there is no such.
2057/// 2) \p CntPhi is set to the corresponding phi node
2058/// or nullptr if there is no such.
2059/// 3) \p InitX is set to the value whose CTLZ could be used.
2060/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2061/// 5) \p Threshold is set to the constant involved in the unsigned less-than
2062/// comparison.
2063///
2064/// The core idiom we are trying to detect is:
2065/// \code
2066/// if (x0 < 2)
2067/// goto loop-exit // the precondition of the loop
2068/// cnt0 = init-val
2069/// do {
2070/// x = phi (x0, x.next); //PhiX
2071/// cnt = phi (cnt0, cnt.next)
2072///
2073/// cnt.next = cnt + 1;
2074/// ...
2075/// x.next = x >> 1; // DefX
2076/// } while (x >= 4)
2077/// loop-exit:
2078/// \endcode
2080 Intrinsic::ID &IntrinID,
2081 Value *&InitX, Instruction *&CntInst,
2082 PHINode *&CntPhi, Instruction *&DefX,
2083 APInt &Threshold) {
2084 BasicBlock *LoopEntry;
2085
2086 DefX = nullptr;
2087 CntInst = nullptr;
2088 CntPhi = nullptr;
2089 LoopEntry = *(CurLoop->block_begin());
2090
2091 // step 1: Check if the loop-back branch is in desirable form.
2092 auto *EntryBI = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
2093 if (!EntryBI)
2094 return false;
2095 if (Value *T = matchShiftULTCondition(EntryBI, LoopEntry, Threshold))
2096 DefX = dyn_cast<Instruction>(T);
2097 else
2098 return false;
2099
2100 // step 2: Check the recurrence of variable X
2101 if (!DefX || !isa<PHINode>(DefX))
2102 return false;
2103
2104 PHINode *VarPhi = cast<PHINode>(DefX);
2105 int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
2106 if (Idx == -1)
2107 return false;
2108
2109 DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
2110 if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
2111 return false;
2112
2113 // step 3: detect instructions corresponding to "x.next = x >> 1"
2114 if (DefX->getOpcode() != Instruction::LShr)
2115 return false;
2116
2117 IntrinID = Intrinsic::ctlz;
2119 if (!Shft || !Shft->isOne())
2120 return false;
2121
2122 InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2123
2124 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
2125 // or cnt.next = cnt + -1.
2126 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2127 // then all uses of "cnt.next" could be optimized to the trip count
2128 // plus "cnt0". Currently it is not optimized.
2129 // This step could be used to detect POPCNT instruction:
2130 // cnt.next = cnt + (x.next & 1)
2131 for (Instruction &Inst :
2132 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2133 if (Inst.getOpcode() != Instruction::Add)
2134 continue;
2135
2137 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2138 continue;
2139
2140 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2141 if (!Phi)
2142 continue;
2143
2144 CntInst = &Inst;
2145 CntPhi = Phi;
2146 break;
2147 }
2148 if (!CntInst)
2149 return false;
2150
2151 return true;
2152}
2153
2154/// Return true iff the idiom is detected in the loop.
2155///
2156/// Additionally:
2157/// 1) \p CntInst is set to the instruction counting the population bit.
2158/// 2) \p CntPhi is set to the corresponding phi node.
2159/// 3) \p Var is set to the value whose population bits are being counted.
2160///
2161/// The core idiom we are trying to detect is:
2162/// \code
2163/// if (x0 != 0)
2164/// goto loop-exit // the precondition of the loop
2165/// cnt0 = init-val;
2166/// do {
2167/// x1 = phi (x0, x2);
2168/// cnt1 = phi(cnt0, cnt2);
2169///
2170/// cnt2 = cnt1 + 1;
2171/// ...
2172/// x2 = x1 & (x1 - 1);
2173/// ...
2174/// } while(x != 0);
2175///
2176/// loop-exit:
2177/// \endcode
2178static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
2179 Instruction *&CntInst, PHINode *&CntPhi,
2180 Value *&Var) {
2181 // step 1: Check to see if the look-back branch match this pattern:
2182 // "if (a!=0) goto loop-entry".
2183 BasicBlock *LoopEntry;
2184 Instruction *DefX2, *CountInst;
2185 Value *VarX1, *VarX0;
2186 PHINode *PhiX, *CountPhi;
2187
2188 DefX2 = CountInst = nullptr;
2189 VarX1 = VarX0 = nullptr;
2190 PhiX = CountPhi = nullptr;
2191 LoopEntry = *(CurLoop->block_begin());
2192
2193 // step 1: Check if the loop-back branch is in desirable form.
2194 {
2195 auto *LoopTerm = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
2196 if (!LoopTerm)
2197 return false;
2198 DefX2 = dyn_cast_or_null<Instruction>(matchCondition(LoopTerm, LoopEntry));
2199 }
2200
2201 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
2202 {
2203 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
2204 return false;
2205
2206 BinaryOperator *SubOneOp;
2207
2208 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
2209 VarX1 = DefX2->getOperand(1);
2210 else {
2211 VarX1 = DefX2->getOperand(0);
2212 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
2213 }
2214 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
2215 return false;
2216
2217 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
2218 if (!Dec ||
2219 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
2220 (SubOneOp->getOpcode() == Instruction::Add &&
2221 Dec->isMinusOne()))) {
2222 return false;
2223 }
2224 }
2225
2226 // step 3: Check the recurrence of variable X
2227 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
2228 if (!PhiX)
2229 return false;
2230
2231 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
2232 {
2233 CountInst = nullptr;
2234 for (Instruction &Inst :
2235 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2236 if (Inst.getOpcode() != Instruction::Add)
2237 continue;
2238
2240 if (!Inc || !Inc->isOne())
2241 continue;
2242
2243 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2244 if (!Phi)
2245 continue;
2246
2247 // Check if the result of the instruction is live of the loop.
2248 bool LiveOutLoop = false;
2249 for (User *U : Inst.users()) {
2250 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
2251 LiveOutLoop = true;
2252 break;
2253 }
2254 }
2255
2256 if (LiveOutLoop) {
2257 CountInst = &Inst;
2258 CountPhi = Phi;
2259 break;
2260 }
2261 }
2262
2263 if (!CountInst)
2264 return false;
2265 }
2266
2267 // step 5: check if the precondition is in this form:
2268 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
2269 {
2270 auto *PreCondBr = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
2271 if (!PreCondBr)
2272 return false;
2273 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
2274 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
2275 return false;
2276
2277 CntInst = CountInst;
2278 CntPhi = CountPhi;
2279 Var = T;
2280 }
2281
2282 return true;
2283}
2284
2285/// Return true if the idiom is detected in the loop.
2286///
2287/// Additionally:
2288/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2289/// or nullptr if there is no such.
2290/// 2) \p CntPhi is set to the corresponding phi node
2291/// or nullptr if there is no such.
2292/// 3) \p Var is set to the value whose CTLZ could be used.
2293/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2294///
2295/// The core idiom we are trying to detect is:
2296/// \code
2297/// if (x0 == 0)
2298/// goto loop-exit // the precondition of the loop
2299/// cnt0 = init-val;
2300/// do {
2301/// x = phi (x0, x.next); //PhiX
2302/// cnt = phi(cnt0, cnt.next);
2303///
2304/// cnt.next = cnt + 1;
2305/// ...
2306/// x.next = x >> 1; // DefX
2307/// ...
2308/// } while(x.next != 0);
2309///
2310/// loop-exit:
2311/// \endcode
2312static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
2313 Intrinsic::ID &IntrinID, Value *&InitX,
2314 Instruction *&CntInst, PHINode *&CntPhi,
2315 Instruction *&DefX) {
2316 BasicBlock *LoopEntry;
2317 Value *VarX = nullptr;
2318
2319 DefX = nullptr;
2320 CntInst = nullptr;
2321 CntPhi = nullptr;
2322 LoopEntry = *(CurLoop->block_begin());
2323
2324 // step 1: Check if the loop-back branch is in desirable form.
2325 auto *LoopTerm = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
2326 if (!LoopTerm)
2327 return false;
2328 DefX = dyn_cast_or_null<Instruction>(matchCondition(LoopTerm, LoopEntry));
2329
2330 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
2331 if (!DefX || !DefX->isShift())
2332 return false;
2333 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
2334 Intrinsic::ctlz;
2336 if (!Shft || !Shft->isOne())
2337 return false;
2338 VarX = DefX->getOperand(0);
2339
2340 // step 3: Check the recurrence of variable X
2341 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
2342 if (!PhiX)
2343 return false;
2344
2345 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2346
2347 // Make sure the initial value can't be negative otherwise the ashr in the
2348 // loop might never reach zero which would make the loop infinite.
2349 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
2350 return false;
2351
2352 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
2353 // or cnt.next = cnt + -1.
2354 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2355 // then all uses of "cnt.next" could be optimized to the trip count
2356 // plus "cnt0". Currently it is not optimized.
2357 // This step could be used to detect POPCNT instruction:
2358 // cnt.next = cnt + (x.next & 1)
2359 for (Instruction &Inst :
2360 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2361 if (Inst.getOpcode() != Instruction::Add)
2362 continue;
2363
2365 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2366 continue;
2367
2368 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2369 if (!Phi)
2370 continue;
2371
2372 CntInst = &Inst;
2373 CntPhi = Phi;
2374 break;
2375 }
2376 if (!CntInst)
2377 return false;
2378
2379 return true;
2380}
2381
2382// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
2383// profitable if we delete the loop.
2384bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
2385 Value *InitX, bool ZeroCheck,
2386 size_t CanonicalSize) {
2387 const Value *Args[] = {InitX,
2388 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
2389
2390 uint32_t HeaderSize = CurLoop->getHeader()->size();
2391
2392 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
2393 InstructionCost Cost = TTI->getIntrinsicInstrCost(
2395 if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
2396 return false;
2397
2398 return true;
2399}
2400
2401/// Convert CTLZ / CTTZ idiom loop into countable loop.
2402/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
2403/// returns false.
2404bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
2405 Value *InitX, Instruction *DefX,
2406 PHINode *CntPhi,
2407 Instruction *CntInst) {
2408 bool IsCntPhiUsedOutsideLoop = false;
2409 for (User *U : CntPhi->users())
2410 if (!CurLoop->contains(cast<Instruction>(U))) {
2411 IsCntPhiUsedOutsideLoop = true;
2412 break;
2413 }
2414 bool IsCntInstUsedOutsideLoop = false;
2415 for (User *U : CntInst->users())
2416 if (!CurLoop->contains(cast<Instruction>(U))) {
2417 IsCntInstUsedOutsideLoop = true;
2418 break;
2419 }
2420 // If both CntInst and CntPhi are used outside the loop the profitability
2421 // is questionable.
2422 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
2423 return false;
2424
2425 // For some CPUs result of CTLZ(X) intrinsic is undefined
2426 // when X is 0. If we can not guarantee X != 0, we need to check this
2427 // when expand.
2428 bool ZeroCheck = false;
2429 // It is safe to assume Preheader exist as it was checked in
2430 // parent function RunOnLoop.
2431 BasicBlock *PH = CurLoop->getLoopPreheader();
2432
2433 // If we are using the count instruction outside the loop, make sure we
2434 // have a zero check as a precondition. Without the check the loop would run
2435 // one iteration for before any check of the input value. This means 0 and 1
2436 // would have identical behavior in the original loop and thus
2437 if (!IsCntPhiUsedOutsideLoop) {
2438 auto *PreCondBB = PH->getSinglePredecessor();
2439 if (!PreCondBB)
2440 return false;
2441 auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
2442 if (!PreCondBI)
2443 return false;
2444 if (matchCondition(PreCondBI, PH) != InitX)
2445 return false;
2446 ZeroCheck = true;
2447 }
2448
2449 // FFS idiom loop has only 6 instructions:
2450 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2451 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2452 // %shr = ashr %n.addr.0, 1
2453 // %tobool = icmp eq %shr, 0
2454 // %inc = add nsw %i.0, 1
2455 // br i1 %tobool
2456 size_t IdiomCanonicalSize = 6;
2457 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2458 return false;
2459
2460 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2461 DefX->getDebugLoc(), ZeroCheck,
2462 IsCntPhiUsedOutsideLoop);
2463 return true;
2464}
2465
2466/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
2467/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
2468/// trip count returns true; otherwise, returns false.
2469bool LoopIdiomRecognize::recognizeAndInsertFFS() {
2470 // Give up if the loop has multiple blocks or multiple backedges.
2471 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2472 return false;
2473
2474 Intrinsic::ID IntrinID;
2475 Value *InitX;
2476 Instruction *DefX = nullptr;
2477 PHINode *CntPhi = nullptr;
2478 Instruction *CntInst = nullptr;
2479
2480 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
2481 DefX))
2482 return false;
2483
2484 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2485}
2486
2487bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2488 // Give up if the loop has multiple blocks or multiple backedges.
2489 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2490 return false;
2491
2492 Intrinsic::ID IntrinID;
2493 Value *InitX;
2494 Instruction *DefX = nullptr;
2495 PHINode *CntPhi = nullptr;
2496 Instruction *CntInst = nullptr;
2497
2498 APInt LoopThreshold;
2499 if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
2500 CntPhi, DefX, LoopThreshold))
2501 return false;
2502
2503 if (LoopThreshold == 2) {
2504 // Treat as regular FFS.
2505 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2506 }
2507
2508 // Look for Floor Log2 Idiom.
2509 if (LoopThreshold != 4)
2510 return false;
2511
2512 // Abort if CntPhi is used outside of the loop.
2513 for (User *U : CntPhi->users())
2514 if (!CurLoop->contains(cast<Instruction>(U)))
2515 return false;
2516
2517 // It is safe to assume Preheader exist as it was checked in
2518 // parent function RunOnLoop.
2519 BasicBlock *PH = CurLoop->getLoopPreheader();
2520 auto *PreCondBB = PH->getSinglePredecessor();
2521 if (!PreCondBB)
2522 return false;
2523 auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
2524 if (!PreCondBI)
2525 return false;
2526
2527 APInt PreLoopThreshold;
2528 if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
2529 PreLoopThreshold != 2)
2530 return false;
2531
2532 bool ZeroCheck = true;
2533
2534 // the loop has only 6 instructions:
2535 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2536 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2537 // %shr = ashr %n.addr.0, 1
2538 // %tobool = icmp ult %n.addr.0, C
2539 // %inc = add nsw %i.0, 1
2540 // br i1 %tobool
2541 size_t IdiomCanonicalSize = 6;
2542 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2543 return false;
2544
2545 // log2(x) = w − 1 − clz(x)
2546 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2547 DefX->getDebugLoc(), ZeroCheck,
2548 /*IsCntPhiUsedOutsideLoop=*/false,
2549 /*InsertSub=*/true);
2550 return true;
2551}
2552
2553/// Recognizes a population count idiom in a non-countable loop.
2554///
2555/// If detected, transforms the relevant code to issue the popcount intrinsic
2556/// function call, and returns true; otherwise, returns false.
2557bool LoopIdiomRecognize::recognizePopcount() {
2558 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
2559 return false;
2560
2561 // Counting population are usually conducted by few arithmetic instructions.
2562 // Such instructions can be easily "absorbed" by vacant slots in a
2563 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
2564 // in a compact loop.
2565
2566 // Give up if the loop has multiple blocks or multiple backedges.
2567 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2568 return false;
2569
2570 BasicBlock *LoopBody = *(CurLoop->block_begin());
2571 if (LoopBody->size() >= 20) {
2572 // The loop is too big, bail out.
2573 return false;
2574 }
2575
2576 // It should have a preheader containing nothing but an unconditional branch.
2577 BasicBlock *PH = CurLoop->getLoopPreheader();
2578 if (!PH || &PH->front() != PH->getTerminator())
2579 return false;
2580 auto *EntryBI = dyn_cast<UncondBrInst>(PH->getTerminator());
2581 if (!EntryBI)
2582 return false;
2583
2584 // It should have a precondition block where the generated popcount intrinsic
2585 // function can be inserted.
2586 auto *PreCondBB = PH->getSinglePredecessor();
2587 if (!PreCondBB)
2588 return false;
2589 auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
2590 if (!PreCondBI)
2591 return false;
2592
2593 Instruction *CntInst;
2594 PHINode *CntPhi;
2595 Value *Val;
2596 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
2597 return false;
2598
2599 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
2600 return true;
2601}
2602
2604 const DebugLoc &DL) {
2605 Value *Ops[] = {Val};
2606 Type *Tys[] = {Val->getType()};
2607
2608 CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops);
2609 CI->setDebugLoc(DL);
2610
2611 return CI;
2612}
2613
2615 const DebugLoc &DL, bool ZeroCheck,
2616 Intrinsic::ID IID) {
2617 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2618 Type *Tys[] = {Val->getType()};
2619
2620 CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops);
2621 CI->setDebugLoc(DL);
2622
2623 return CI;
2624}
2625
2626/// Transform the following loop (Using CTLZ, CTTZ is similar):
2627/// loop:
2628/// CntPhi = PHI [Cnt0, CntInst]
2629/// PhiX = PHI [InitX, DefX]
2630/// CntInst = CntPhi + 1
2631/// DefX = PhiX >> 1
2632/// LOOP_BODY
2633/// Br: loop if (DefX != 0)
2634/// Use(CntPhi) or Use(CntInst)
2635///
2636/// Into:
2637/// If CntPhi used outside the loop:
2638/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2639/// Count = CountPrev + 1
2640/// else
2641/// Count = BitWidth(InitX) - CTLZ(InitX)
2642/// loop:
2643/// CntPhi = PHI [Cnt0, CntInst]
2644/// PhiX = PHI [InitX, DefX]
2645/// PhiCount = PHI [Count, Dec]
2646/// CntInst = CntPhi + 1
2647/// DefX = PhiX >> 1
2648/// Dec = PhiCount - 1
2649/// LOOP_BODY
2650/// Br: loop if (Dec != 0)
2651/// Use(CountPrev + Cnt0) // Use(CntPhi)
2652/// or
2653/// Use(Count + Cnt0) // Use(CntInst)
2654///
2655/// If LOOP_BODY is empty the loop will be deleted.
2656/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2657void LoopIdiomRecognize::transformLoopToCountable(
2658    Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2659    PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2660    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2661  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2662  IRBuilder<> Builder(Preheader->getTerminator());
2663  Builder.SetCurrentDebugLocation(DL);
2664
2665  // If there are no uses of CntPhi create:
2666  //   Count = BitWidth - CTLZ(InitX);
2667  //   NewCount = Count;
2668  // If there are uses of CntPhi create:
2669  //   NewCount = BitWidth - CTLZ(InitX >> 1);
2670  //   Count = NewCount + 1;
2671  Value *InitXNext;
  // Mirror DefX's shift direction when pre-shifting InitX by one (see the
  // comment above): the count of InitX>>1 is the value CntPhi observes.
2672  if (IsCntPhiUsedOutsideLoop) {
2673    if (DefX->getOpcode() == Instruction::AShr)
2674      InitXNext = Builder.CreateAShr(InitX, 1);
2675    else if (DefX->getOpcode() == Instruction::LShr)
2676      InitXNext = Builder.CreateLShr(InitX, 1);
2677    else if (DefX->getOpcode() == Instruction::Shl) // cttz
2678      InitXNext = Builder.CreateShl(InitX, 1);
2679    else
2680      llvm_unreachable("Unexpected opcode!");
2681  } else
2682    InitXNext = InitX;
2683  Value *Count =
2684      createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2685  Type *CountTy = Count->getType();
2686  Count = Builder.CreateSub(
2687      ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
  // The caller may request the computed count be biased down by one.
2688  if (InsertSub)
2689    Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
2690  Value *NewCount = Count;
2691  if (IsCntPhiUsedOutsideLoop)
2692    Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2693
2694  NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2695
2696  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  // CntInst is "CntPhi + step"; a step operand of exactly 1 distinguishes an
  // incrementing counter from a decrementing one.
2697  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2698    // If the counter was being incremented in the loop, add NewCount to the
2699    // counter's initial value, but only if the initial value is not zero.
2700    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2701    if (!InitConst || !InitConst->isZero())
2702      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2703  } else {
2704    // If the count was being decremented in the loop, subtract NewCount from
2705    // the counter's initial value.
2706    NewCount = Builder.CreateSub(CntInitVal, NewCount);
2707  }
2708
2709  // Step 2: Insert new IV and loop condition:
2710  // loop:
2711  // ...
2712  // PhiCount = PHI [Count, Dec]
2713  // ...
2714  // Dec = PhiCount - 1
2715  // ...
2716  // Br: loop if (Dec != 0)
  // NOTE(review): the loop body is taken to be the loop's only block (the
  // detection code rejects multi-block loops).
2717  BasicBlock *Body = *(CurLoop->block_begin());
2718  auto *LbBr = cast<CondBrInst>(Body->getTerminator());
2719  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2720
2721  PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
2722  TcPhi->insertBefore(Body->begin());
2723
2724  Builder.SetInsertPoint(LbCond);
  // Dec = PhiCount - 1, marked nsw (trailing bools are HasNUW=false,
  // HasNSW=true).
2725  Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2726      TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2727
2728  TcPhi->addIncoming(Count, Preheader);
2729  TcPhi->addIncoming(TcDec, Body);
2730
  // Choose the predicate so the backedge keeps being taken exactly while
  // Dec != 0, regardless of which successor re-enters the loop.
2731  CmpInst::Predicate Pred =
2732      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2733  LbCond->setPredicate(Pred);
2734  LbCond->setOperand(0, TcDec);
2735  LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2736
2737  // Step 3: All the references to the original counter outside
2738  // the loop are replaced with the NewCount
2739  if (IsCntPhiUsedOutsideLoop)
2740    CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2741  else
2742    CntInst->replaceUsesOutsideBlock(NewCount, Body);
2743
2744  // step 4: Forget the "non-computable" trip-count SCEV associated with the
2745  // loop. The loop would otherwise not be deleted even if it becomes empty.
2746  SE->forgetLoop(CurLoop);
2747}
2748
2749void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2750                                                 Instruction *CntInst,
2751                                                 PHINode *CntPhi, Value *Var) {
  // Rewrites a "count set bits by clearing the lowest set bit" loop into a
  // single ctpop: the precondition then tests the popcount, and the loop (if
  // it stays alive for other work) runs on a new countable trip-count IV.
  // NOTE(review): a few physical lines are dropped in this excerpt (see the
  // numbering gaps flagged below); verify those spots against the full file.
2752  BasicBlock *PreHead = CurLoop->getLoopPreheader();
2753  auto *PreCondBr = cast<CondBrInst>(PreCondBB->getTerminator());
2754  const DebugLoc &DL = CntInst->getDebugLoc();
2755
2756  // Assuming before transformation, the loop is following:
2757  //  if (x) // the precondition
2758  //     do { cnt++; x &= x - 1; } while(x);
2759
2760  // Step 1: Insert the ctpop instruction at the end of the precondition block
2761  IRBuilder<> Builder(PreCondBr);
2762  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2763  {
2764    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
    // Widen/narrow the popcount to the counter's type before mixing them.
2765    NewCount = PopCntZext =
2766        Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2767
2768    if (NewCount != PopCnt)
2769      (cast<Instruction>(NewCount))->setDebugLoc(DL);
2770
2771    // TripCnt is exactly the number of iterations the loop has
2772    TripCnt = NewCount;
2773
2774    // If the population counter's initial value is not zero, insert Add Inst.
2775    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2776    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2777    if (!InitConst || !InitConst->isZero()) {
2778      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2779      (cast<Instruction>(NewCount))->setDebugLoc(DL);
2780    }
2781  }
2782
2783  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2784  //   "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2785  //   function would be partial dead code, and downstream passes will drag
2786  //   it back from the precondition block to the preheader.
2787  {
2788    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2789
    // Keep the original compare's operand order: swap when Var was on the RHS.
2790    Value *Opnd0 = PopCntZext;
2791    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2792    if (PreCond->getOperand(0) != Var)
2793      std::swap(Opnd0, Opnd1);
2794
2795    ICmpInst *NewPreCond = cast<ICmpInst>(
2796        Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2797    PreCondBr->setCondition(NewPreCond);
2798
    // NOTE(review): a line is missing here in this excerpt (number 2799) —
    // likely the replacement of the old compare's uses; confirm upstream.
2800  }
2801
2802  // Step 3: Note that the population count is exactly the trip count of the
2803  // loop in question, which enable us to convert the loop from noncountable
2804  // loop into a countable one. The benefit is twofold:
2805  //
2806  //  - If the loop only counts population, the entire loop becomes dead after
2807  //    the transformation. It is a lot easier to prove a countable loop dead
2808  //    than to prove a noncountable one. (In some C dialects, an infinite loop
2809  //    isn't dead even if it computes nothing useful. In general, DCE needs
2810  //    to prove a noncountable loop finite before safely delete it.)
2811  //
2812  //  - If the loop also performs something else, it remains alive.
2813  //    Since it is transformed to countable form, it can be aggressively
2814  //    optimized by some optimizations which are in general not applicable
2815  //    to a noncountable loop.
2816  //
2817  // After this step, this loop (conceptually) would look like following:
2818  //   newcnt = __builtin_ctpop(x);
2819  //   t = newcnt;
2820  //   if (x)
2821  //     do { cnt++; x &= x-1; t--) } while (t > 0);
2822  BasicBlock *Body = *(CurLoop->block_begin());
2823  {
2824    auto *LbBr = cast<CondBrInst>(Body->getTerminator());
2825    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2826    Type *Ty = TripCnt->getType();
2827
2828    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
2829    TcPhi->insertBefore(Body->begin());
2830
2831    Builder.SetInsertPoint(LbCond);
    // NOTE(review): the start of this statement (number 2832, defining TcDec
    // via cast<Instruction>(...)) is dropped in this excerpt; TcDec is used
    // below as the decremented trip-count IV.
2833        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2834                          "tcdec", false, true));
2835
2836    TcPhi->addIncoming(TripCnt, PreHead);
2837    TcPhi->addIncoming(TcDec, Body);
2838
    // Keep looping while the countdown is still positive, respecting which
    // successor re-enters the loop.
2839    CmpInst::Predicate Pred =
2840        (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2841    LbCond->setPredicate(Pred);
2842    LbCond->setOperand(0, TcDec);
2843    LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2844  }
2845
2846  // Step 4: All the references to the original population counter outside
2847  //  the loop are replaced with the NewCount -- the value returned from
2848  //  __builtin_ctpop().
2849  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2850
2851  // step 5: Forget the "non-computable" trip-count SCEV associated with the
2852  //  loop. The loop would otherwise not be deleted even if it becomes empty.
2853  SE->forgetLoop(CurLoop);
2854}
2855
2856/// Match loop-invariant value.
2857template <typename SubPattern_t> struct match_LoopInvariant {
2858 SubPattern_t SubPattern;
2859 const Loop *L;
2860
2861 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2862 : SubPattern(SP), L(L) {}
2863
2864 template <typename ITy> bool match(ITy *V) const {
2865 return L->isLoopInvariant(V) && SubPattern.match(V);
2866 }
2867};
2868
2869/// Matches if the value is loop-invariant.
2870template <typename Ty>
2871inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2872 return match_LoopInvariant<Ty>(M, L);
2873}
2874
2875/// Return true if the idiom is detected in the loop.
2876///
2877/// The core idiom we are trying to detect is:
2878/// \code
2879/// entry:
2880/// <...>
2881/// %bitmask = shl i32 1, %bitpos
2882/// br label %loop
2883///
2884/// loop:
2885/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2886/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2887/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2888/// %x.next = shl i32 %x.curr, 1
2889/// <...>
2890/// br i1 %x.curr.isbitunset, label %loop, label %end
2891///
2892/// end:
2893/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2894/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2895/// <...>
2896/// \endcode
2897static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2898                                         Value *&BitMask, Value *&BitPos,
2899                                         Value *&CurrX, Instruction *&NextX) {
  // NOTE(review): a few physical lines are dropped in this excerpt (numbers
  // 2900, 2933 and 2981 — parts of an LLVM_DEBUG statement, a matcher, and an
  // assert); the flagged spots below should be verified against the full file.
2901                    " Performing shift-until-bittest idiom detection.\n");
2902
2903  // Give up if the loop has multiple blocks or multiple backedges.
2904  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2905    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2906    return false;
2907  }
2908
2909  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2910  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2911  assert(LoopPreheaderBB && "There is always a loop preheader.");
2912
2913  using namespace PatternMatch;
2914
2915  // Step 1: Check if the loop backedge is in desirable form.
2916
2917  CmpPredicate Pred;
2918  Value *CmpLHS, *CmpRHS;
2919  BasicBlock *TrueBB, *FalseBB;
2920  if (!match(LoopHeaderBB->getTerminator(),
2921             m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2922                  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2923    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2924    return false;
2925  }
2926
2927  // Step 2: Check if the backedge's condition is in desirable form.
2928
  // Matcher for "(X & bitmask) ==/!= 0" where the mask is loop-invariant —
  // either an arbitrary invariant value or an invariant "1 << bitpos".
2929  auto MatchVariableBitMask = [&]() {
2930    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2931           match(CmpLHS,
2932                 m_c_And(m_Value(CurrX),
    // NOTE(review): a matcher line (number 2933, plausibly combining the two
    // mask forms below) is dropped in this excerpt.
2934                             m_Value(BitMask),
2935                             m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2936                                             CurLoop))));
2937  };
2938
  // Matcher for compares that decompose into a power-of-two constant bit test
  // (possibly through trunc/and); synthesizes BitMask/BitPos constants.
2939  auto MatchDecomposableConstantBitMask = [&]() {
2940    auto Res = llvm::decomposeBitTestICmp(
2941        CmpLHS, CmpRHS, Pred, /*LookThroughTrunc=*/true,
2942        /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
2943    if (Res && Res->Mask.isPowerOf2()) {
2944      assert(ICmpInst::isEquality(Res->Pred));
2945      Pred = Res->Pred;
2946      CurrX = Res->X;
2947      BitMask = ConstantInt::get(CurrX->getType(), Res->Mask);
2948      BitPos = ConstantInt::get(CurrX->getType(), Res->Mask.logBase2());
2949      return true;
2950    }
2951    return false;
2952  };
2953
2954  if (!MatchVariableBitMask() && !MatchDecomposableConstantBitMask()) {
2955    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2956    return false;
2957  }
2958
2959  // Step 3: Check if the recurrence is in desirable form.
2960  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2961  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2962    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2963    return false;
2964  }
2965
2966  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2967  NextX =
2968      dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2969
2970  assert(CurLoop->isLoopInvariant(BaseX) &&
2971         "Expected BaseX to be available in the preheader!");
2972
  // The recurrence must be exactly "CurrX << 1".
2973  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2974    // FIXME: support right-shift?
2975    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2976    return false;
2977  }
2978
2979  // Step 4: Check if the backedge's destinations are in desirable form.
2980
2982         "Should only get equality predicates here.");
2983
2984  // cmp-br is commutative, so canonicalize to a single variant.
2985  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2986    Pred = ICmpInst::getInversePredicate(Pred);
2987    std::swap(TrueBB, FalseBB);
2988  }
2989
2990  // We expect to exit loop when comparison yields false,
2991  // so when it yields true we should branch back to loop header.
2992  if (TrueBB != LoopHeaderBB) {
2993    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2994    return false;
2995  }
2996
2997  // Okay, idiom checks out.
2998  return true;
2999}
3000
3001/// Look for the following loop:
3002/// \code
3003/// entry:
3004/// <...>
3005/// %bitmask = shl i32 1, %bitpos
3006/// br label %loop
3007///
3008/// loop:
3009/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
3010/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
3011/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
3012/// %x.next = shl i32 %x.curr, 1
3013/// <...>
3014/// br i1 %x.curr.isbitunset, label %loop, label %end
3015///
3016/// end:
3017/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
3018/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
3019/// <...>
3020/// \endcode
3021///
3022/// And transform it into:
3023/// \code
3024/// entry:
3025/// %bitmask = shl i32 1, %bitpos
3026/// %lowbitmask = add i32 %bitmask, -1
3027/// %mask = or i32 %lowbitmask, %bitmask
3028/// %x.masked = and i32 %x, %mask
3029/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
3030/// i1 true)
3031/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
3032/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
3033/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
3034/// %tripcount = add i32 %backedgetakencount, 1
3035/// %x.curr = shl i32 %x, %backedgetakencount
3036/// %x.next = shl i32 %x, %tripcount
3037/// br label %loop
3038///
3039/// loop:
3040/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
3041/// %loop.iv.next = add nuw i32 %loop.iv, 1
3042/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
3043/// <...>
3044/// br i1 %loop.ivcheck, label %end, label %loop
3045///
3046/// end:
3047/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
3048/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
3049/// <...>
3050/// \endcode
3051bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
  // NOTE(review): several physical lines are dropped in this excerpt —
  // numbers 3058, 3081-3082, 3087, 3090-3091, 3096 (cost-kind setup and the
  // intrinsic/shift cost guards), 3165-3168 and 3206. Verify those regions
  // against the full file; the visible logic is documented as-is below.
3052  bool MadeChange = false;
3053
3054  Value *X, *BitMask, *BitPos, *XCurr;
3055  Instruction *XNext;
3056  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
3057                                    XNext)) {
3059                      " shift-until-bittest idiom detection failed.\n");
3060    return MadeChange;
3061  }
3062  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
3063
3064  // Ok, it is the idiom we were looking for, we *could* transform this loop,
3065  // but is it profitable to transform?
3066
3067  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3068  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3069  assert(LoopPreheaderBB && "There is always a loop preheader.");
3070
3071  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3072  assert(SuccessorBB && "There is only a single successor.");
3073
3074  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3075  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
3076
3077  Intrinsic::ID IntrID = Intrinsic::ctlz;
3078  Type *Ty = X->getType();
3079  unsigned Bitwidth = Ty->getScalarSizeInBits();
3080
3083
3084  // The rewrite is considered to be unprofitable iff and only iff the
3085  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
3086  // making the loop countable, even if nothing else changes.
3088      IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
3089  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3092                      " Intrinsic is too costly, not beneficial\n");
3093    return MadeChange;
3094  }
3095  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
3097    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
3098    return MadeChange;
3099  }
3100
3101  // Ok, transform appears worthwhile.
3102  MadeChange = true;
3103
  // BitPos is about to get multiple uses; freeze it so each use observes the
  // same (non-undef/poison) value.
3104  if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
3105    // BitMask may be computed from BitPos, Freeze BitPos so we can increase
3106    // it's use count.
3107    std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
3108    if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
3109      InsertPt = BitPosI->getInsertionPointAfterDef();
3110    else
3111      InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
3112    if (!InsertPt)
3113      return false;
3114    FreezeInst *BitPosFrozen =
3115        new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
    // Redirect every use of BitPos to the frozen value, except the freeze's
    // own operand (to avoid a self-cycle).
3116    BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
3117      return U.getUser() != BitPosFrozen;
3118    });
3119    BitPos = BitPosFrozen;
3120  }
3121
3122  // Step 1: Compute the loop trip count.
3123
3124  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
3125                                        BitPos->getName() + ".lowbitmask");
3126  Value *Mask =
3127      Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
3128  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
3129  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
3130      IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
3131      /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
3132  Value *XMaskedNumActiveBits = Builder.CreateSub(
3133      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
3134      XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
3135      /*HasNSW=*/Bitwidth != 2);
3136  Value *XMaskedLeadingOnePos =
3137      Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
3138                        XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
3139                        /*HasNSW=*/Bitwidth > 2);
3140
3141  Value *LoopBackedgeTakenCount = Builder.CreateSub(
3142      BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
3143      /*HasNUW=*/true, /*HasNSW=*/true);
3144  // We know loop's backedge-taken count, but what's loop's trip count?
3145  // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
3146  Value *LoopTripCount =
3147      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3148                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3149                        /*HasNSW=*/Bitwidth != 2);
3150
3151  // Step 2: Compute the recurrence's final value without a loop.
3152
3153  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
3154  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
3155  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
3156  NewX->takeName(XCurr);
3157  if (auto *I = dyn_cast<Instruction>(NewX))
3158    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3159
3160  Value *NewXNext;
3161  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
3162  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
3163  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
3164  // that isn't the case, we'll need to emit an alternative, safe IR.
  // NOTE(review): the condition's interior (numbers 3166-3168) is dropped in
  // this excerpt; only the first and last physical lines are visible.
3165  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
3169                                   Ty->getScalarSizeInBits() - 1))))
3170    NewXNext = Builder.CreateShl(X, LoopTripCount);
3171  else {
3172    // Otherwise, just additionally shift by one. It's the smallest solution,
3173    // alternatively, we could check that NewX is INT_MIN (or BitPos is )
3174    // and select 0 instead.
3175    NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
3176  }
3177
3178  NewXNext->takeName(XNext);
3179  if (auto *I = dyn_cast<Instruction>(NewXNext))
3180    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3181
3182  // Step 3: Adjust the successor basic block to receive the computed
3183  //         recurrence's final value instead of the recurrence itself.
3184
3185  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
3186  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
3187
3188  // Step 4: Rewrite the loop into a countable form, with canonical IV.
3189
3190  // The new canonical induction variable.
3191  Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3192  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3193
3194  // The induction itself.
3195  // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
3196  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3197  auto *IVNext =
3198      Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
3199                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3200
3201  // The loop trip count check.
3202  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
3203                                       CurLoop->getName() + ".ivcheck");
  // Capture the original branch weights before the terminator is replaced.
3204  SmallVector<uint32_t> BranchWeights;
3205  const bool HasBranchWeights =
3207      extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
3208
3209  auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
3210  if (HasBranchWeights) {
    // The new branch always exits on successor 0; swap the weights when the
    // old branch exited on successor 1.
3211    if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
3212      std::swap(BranchWeights[0], BranchWeights[1]);
3213    // We're not changing the loop profile, so we can reuse the original loop's
3214    // profile.
3215    setBranchWeights(*BI, BranchWeights,
3216                     /*IsExpected=*/false);
3217  }
3218
3219  LoopHeaderBB->getTerminator()->eraseFromParent();
3220
3221  // Populate the IV PHI.
3222  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3223  IV->addIncoming(IVNext, LoopHeaderBB);
3224
3225  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
3226  // loop. The loop would otherwise not be deleted even if it becomes empty.
3227
3228  SE->forgetLoop(CurLoop);
3229
3230  // Other passes will take care of actually deleting the loop if possible.
3231
3232  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
3233
3234  ++NumShiftUntilBitTest;
3235  return MadeChange;
3236}
3237
3238/// Return true if the idiom is detected in the loop.
3239///
3240/// The core idiom we are trying to detect is:
3241/// \code
3242/// entry:
3243/// <...>
3244/// %start = <...>
3245/// %extraoffset = <...>
3246/// <...>
3247/// br label %for.cond
3248///
3249/// loop:
3250/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
3251/// %nbits = add nsw i8 %iv, %extraoffset
3252/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3253/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3254/// %iv.next = add i8 %iv, 1
3255/// <...>
3256/// br i1 %val.shifted.iszero, label %end, label %loop
3257///
3258/// end:
3259/// %iv.res = phi i8 [ %iv, %loop ] <...>
3260/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3261/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3262/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3263/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3264/// <...>
3265/// \endcode
3267                                      Instruction *&ValShiftedIsZero,
3268                                      Intrinsic::ID &IntrinID, Instruction *&IV,
3269                                      Value *&Start, Value *&Val,
3270                                      const SCEV *&ExtraOffsetExpr,
3271                                      bool &InvertedCond) {
  // NOTE(review): the first physical line of this function's signature
  // (number 3266, carrying the function name and leading parameters) is
  // dropped in this excerpt, as are numbers 3272, 3321 and 3347 below.
3273                    " Performing shift-until-zero idiom detection.\n");
3274
3275  // Give up if the loop has multiple blocks or multiple backedges.
3276  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
3277    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
3278    return false;
3279  }
3280
3281  Instruction *ValShifted, *NBits, *IVNext;
3282  Value *ExtraOffset;
3283
3284  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3285  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3286  assert(LoopPreheaderBB && "There is always a loop preheader.");
3287
3288  using namespace PatternMatch;
3289
3290  // Step 1: Check if the loop backedge, condition is in desirable form.
3291
3292  CmpPredicate Pred;
3293  BasicBlock *TrueBB, *FalseBB;
3294  if (!match(LoopHeaderBB->getTerminator(),
3295             m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
3296                  m_BasicBlock(FalseBB))) ||
3297      !match(ValShiftedIsZero,
3298             m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
3299      !ICmpInst::isEquality(Pred)) {
3300    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
3301    return false;
3302  }
3303
3304  // Step 2: Check if the comparison's operand is in desirable form.
3305  // FIXME: Val could be a one-input PHI node, which we should look past.
3306  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
3307                                 m_Instruction(NBits)))) {
3308    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
3309    return false;
3310  }
  // Left-shift clears low bits first (count trailing zeros); right-shifts
  // clear high bits first (count leading zeros).
3311  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
3312                                                         : Intrinsic::ctlz;
3313
3314  // Step 3: Check if the shift amount is in desirable form.
3315
  // Accept: NBits = IV + ExtraOffset (needs nsw or nuw), NBits = IV -
  // ExtraOffset (needs nsw; its matcher line is dropped in this excerpt), or
  // fall back to treating NBits itself as the IV with a zero extra offset.
3316  if (match(NBits, m_c_Add(m_Instruction(IV),
3317                           m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3318      (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
3319    ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
3320  else if (match(NBits,
3322                 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3323           NBits->hasNoSignedWrap())
3324    ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
3325  else {
3326    IV = NBits;
3327    ExtraOffsetExpr = SE->getZero(NBits->getType());
3328  }
3329
3330  // Step 4: Check if the recurrence is in desirable form.
3331  auto *IVPN = dyn_cast<PHINode>(IV);
3332  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
3333    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
3334    return false;
3335  }
3336
3337  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
3338  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
3339
  // The IV must step by exactly +1 each iteration.
3340  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
3341    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
3342    return false;
3343  }
3344
3345  // Step 4: Check if the backedge's destinations are in desirable form.
3346
3348         "Should only get equality predicates here.");
3349
3350  // cmp-br is commutative, so canonicalize to a single variant.
3351  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
3352  if (InvertedCond) {
3353    Pred = ICmpInst::getInversePredicate(Pred);
3354    std::swap(TrueBB, FalseBB);
3355  }
3356
3357  // We expect to exit loop when comparison yields true,
3358  // so when it yields false we should branch back to loop header.
3359  if (FalseBB != LoopHeaderBB) {
3360    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
3361    return false;
3362  }
3363
3364  // The new, countable, loop will certainly only run a known number of
3365  // iterations, It won't be infinite. But the old loop might be infinite
3366  // under certain conditions. For logical shifts, the value will become zero
3367  // after at most bitwidth(%Val) loop iterations. However, for arithmetic
3368  // right-shift, iff the sign bit was set, the value will never become zero,
3369  // and the loop may never finish.
3370  if (ValShifted->getOpcode() == Instruction::AShr &&
3371      !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
3372    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
3373    return false;
3374  }
3375
3376  // Okay, idiom checks out.
3377  return true;
3378}
3379
3380/// Look for the following loop:
3381/// \code
3382/// entry:
3383/// <...>
3384/// %start = <...>
3385/// %extraoffset = <...>
3386/// <...>
3387/// br label %loop
3388///
3389/// loop:
3390/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
3391/// %nbits = add nsw i8 %iv, %extraoffset
3392/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3393/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3394/// %iv.next = add i8 %iv, 1
3395/// <...>
3396/// br i1 %val.shifted.iszero, label %end, label %loop
3397///
3398/// end:
3399/// %iv.res = phi i8 [ %iv, %loop ] <...>
3400/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3401/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3402/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3403/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3404/// <...>
3405/// \endcode
3406///
3407/// And transform it into:
3408/// \code
3409/// entry:
3410/// <...>
3411/// %start = <...>
3412/// %extraoffset = <...>
3413/// <...>
3414/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
3415/// %val.numactivebits = sub i8 8, %val.numleadingzeros
3416/// %extraoffset.neg = sub i8 0, %extraoffset
3417/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
3418/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
3419/// %loop.tripcount = sub i8 %iv.final, %start
3420/// br label %loop
3421///
3422/// loop:
3423/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
3424/// %loop.iv.next = add i8 %loop.iv, 1
3425/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
3426/// %iv = add i8 %loop.iv, %start
3427/// <...>
3428/// br i1 %loop.ivcheck, label %end, label %loop
3429///
3430/// end:
3431/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
3432/// <...>
3433/// \endcode
3434bool LoopIdiomRecognize::recognizeShiftUntilZero() {
  // NOTE(review): physical lines are dropped in this excerpt — numbers 3445,
  // 3466-3468 (likely the cost-kind setup), 3473, 3476-3477 (the
  // intrinsic-cost guard) and 3554. Verify those regions against the full
  // file; the visible logic is documented as-is below.
3435  bool MadeChange = false;
3436
3437  Instruction *ValShiftedIsZero;
3438  Intrinsic::ID IntrID;
3439  Instruction *IV;
3440  Value *Start, *Val;
3441  const SCEV *ExtraOffsetExpr;
3442  bool InvertedCond;
3443  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
3444                                 Start, Val, ExtraOffsetExpr, InvertedCond)) {
3446                      " shift-until-zero idiom detection failed.\n");
3447    return MadeChange;
3448  }
3449  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
3450
3451  // Ok, it is the idiom we were looking for, we *could* transform this loop,
3452  // but is it profitable to transform?
3453
3454  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3455  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3456  assert(LoopPreheaderBB && "There is always a loop preheader.");
3457
3458  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3459  assert(SuccessorBB && "There is only a single successor.");
3460
3461  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3462  Builder.SetCurrentDebugLocation(IV->getDebugLoc());
3463
3464  Type *Ty = Val->getType();
3465  unsigned Bitwidth = Ty->getScalarSizeInBits();
3466
3469
3470  // The rewrite is considered to be unprofitable iff and only iff the
3471  // intrinsic we'll use are not cheap. Note that we are okay with *just*
3472  // making the loop countable, even if nothing else changes.
3474      IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
3475  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3478                      " Intrinsic is too costly, not beneficial\n");
3479    return MadeChange;
3480  }
3481
3482  // Ok, transform appears worthwhile.
3483  MadeChange = true;
3484
3485  bool OffsetIsZero = ExtraOffsetExpr->isZero();
3486
3487  // Step 1: Compute the loop's final IV value / trip count.
3488
  // ctlz/cttz with is_zero_poison=false: Val may legitimately be zero here.
3489  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
3490      IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
3491      /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
3492  Value *ValNumActiveBits = Builder.CreateSub(
3493      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
3494      Val->getName() + ".numactivebits", /*HasNUW=*/true,
3495      /*HasNSW=*/Bitwidth != 2);
3496
3497  SCEVExpander Expander(*SE, "loop-idiom");
3498  Expander.setInsertPoint(&*Builder.GetInsertPoint());
3499  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
3500
3501  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
3502      ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
3503      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
  // Clamp against Start: the loop runs at least zero iterations even when the
  // computed final value would be below the starting IV.
3504  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
3505                                           {ValNumActiveBitsOffset, Start},
3506                                           /*FMFSource=*/nullptr, "iv.final");
3507
3508  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
3509      IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
3510      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
3511  // FIXME: or when the offset was `add nuw`
3512
3513  // We know loop's backedge-taken count, but what's loop's trip count?
3514  Value *LoopTripCount =
3515      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3516                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3517                        /*HasNSW=*/Bitwidth != 2);
3518
3519  // Step 2: Adjust the successor basic block to receive the original
3520  //         induction variable's final value instead of the orig. IV itself.
3521
3522  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
3523
3524  // Step 3: Rewrite the loop into a countable form, with canonical IV.
3525
3526  // The new canonical induction variable.
3527  Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3528  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3529
3530  // The induction itself.
3531  Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
3532  auto *CIVNext =
3533      Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
3534                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3535
3536  // The loop trip count check.
3537  auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
3538                                        CurLoop->getName() + ".ivcheck");
  // If the original exit condition was the NE form, in-loop users expect the
  // inverted value; keep the EQ form for the branch itself.
3539  auto *NewIVCheck = CIVCheck;
3540  if (InvertedCond) {
3541    NewIVCheck = Builder.CreateNot(CIVCheck);
3542    NewIVCheck->takeName(ValShiftedIsZero);
3543  }
3544
3545  // The original IV, but rebased to be an offset to the CIV.
3546  auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
3547                                     /*HasNSW=*/true); // FIXME: what about NUW?
3548  IVDePHId->takeName(IV);
3549
3550  // The loop terminator.
3551  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  // Capture the original branch weights before the terminator is replaced.
3552  SmallVector<uint32_t> BranchWeights;
3553  const bool HasBranchWeights =
3555      extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
3556
3557  auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
3558  if (HasBranchWeights) {
3559    if (InvertedCond)
3560      std::swap(BranchWeights[0], BranchWeights[1]);
3561    // We're not changing the loop profile, so we can reuse the original loop's
3562    // profile.
3563    setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
3564  }
3565  LoopHeaderBB->getTerminator()->eraseFromParent();
3566
3567  // Populate the IV PHI.
3568  CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3569  CIV->addIncoming(CIVNext, LoopHeaderBB);
3570
3571  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
3572  // loop. The loop would otherwise not be deleted even if it becomes empty.
3573
3574  SE->forgetLoop(CurLoop);
3575
3576  // Step 5: Try to cleanup the loop's body somewhat.
3577  IV->replaceAllUsesWith(IVDePHId);
3578  IV->eraseFromParent();
3579
3580  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
3581  ValShiftedIsZero->eraseFromParent();
3582
3583  // Other passes will take care of actually deleting the loop if possible.
3584
3585  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
3586
3587  ++NumShiftUntilZero;
3588  return MadeChange;
3589}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
DXIL Resource Access
This file defines the DenseMap class.
#define DEBUG_TYPE
ManagedStatic< HTTPClientCleanup > Cleanup
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
static Value * matchShiftULTCondition(CondBrInst *BI, BasicBlock *LoopEntry, APInt &Threshold)
Check if the given conditional branch is based on an unsigned less-than comparison between a variable...
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX, APInt &Threshold)
Return true if the idiom is detected in the loop.
static Value * matchCondition(CondBrInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset....
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
static void deleteDeadInstruction(Instruction *I)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
Contains a collection of routines for determining if a given instruction is guaranteed to execute if ...
if(PassOpts->AAPipeline)
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static bool isSimple(Instruction *I)
verify safepoint Safepoint IR Verifier
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
std::optional< uint64_t > tryZExtValue() const
Get zero extended value if possible.
Definition APInt.h:1567
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
size_t size() const
Definition BasicBlock.h:482
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
This class represents a function call, abstracting a target machine's calling convention.
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition InstrTypes.h:768
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Conditional Branch instruction.
void setCondition(Value *V)
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition Constants.h:231
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:225
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
A debug info location.
Definition DebugLoc.h:123
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This class represents a freeze function that returns random concrete value if an operand is either a ...
PointerType * getType() const
Global values are always pointers.
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
static CRCTable genSarwateTable(const APInt &GenPoly, bool ByteOrderSwapped)
Generate a lookup table of 256 entries by interleaving the generating polynomial.
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
static bool isEquality(Predicate P)
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition IRBuilder.h:497
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2096
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2812
LLVM_ABI bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
LLVM_ABI bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI void setAAMetadata(const AAMDNodes &N)
Sets the AA metadata on this instruction from the AAMDNodes structure.
LLVM_ABI bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
bool isVolatile() const
Return true if this is a load from a volatile memory location.
bool isUnordered() const
Align getAlign() const
Return the alignment of the access that is being performed.
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
block_iterator block_begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:653
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
ICmpInst * getLatchCmpInst() const
Get the latch condition instruction.
Definition LoopInfo.cpp:198
StringRef getName() const
Definition LoopInfo.h:389
PHINode * getCanonicalInductionVariable() const
Check to see if the loop has a canonical induction variable: an integer recurrence that starts at 0 a...
Definition LoopInfo.cpp:174
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
MaybeAlign getDestAlign() const
bool isForceInlined() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:922
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
SCEVUse getOperand(unsigned i) const
This class represents an analyzed expression in the program.
LLVM_ABI bool isOne() const
Return true if the expression is a constant one.
static constexpr auto FlagNUW
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
LLVM_ABI const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI const SCEV * getTruncateOrSignExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Align getAlign() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provides information about what library functions are available for the current target.
unsigned getWCharSize(const Module &M) const
Returns the size of the wchar_t type in bytes or 0 if the size is unknown.
bool has(LibFunc F) const
Tests whether a library function is available.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCC_Basic
The cost of a typical 'add' instruction.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition Type.h:272
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition Value.cpp:607
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:557
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Value handle that is nullable, but tries to track the Value.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
const ParentTy * getParent() const
Definition ilist_node.h:34
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
@ HeaderSize
Definition BTF.h:61
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
OperandType
Operands are tagged with one of the values of this enum.
Definition MCInstrDesc.h:59
match_combine_and< Ty... > m_CombineAnd(const Ty &...Ps)
Combine pattern matchers matching all of Ps patterns.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
auto m_BasicBlock()
Match an arbitrary basic block value and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eg/ne/...) to Threshold.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
specificloop_ty m_SpecificLoop(const Loop *L)
bool match(const SCEV *S, const Pattern &P)
specificscev_ty m_scev_Specific(const SCEV *S)
Match if we have a specific specified SCEV.
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
constexpr double e
DiagnosticInfoOptimizationBase::Argument NV
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
cl::opt< bool > ProfcheckDisableMetadataFixes
Definition LoopInfo.cpp:60
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
static cl::opt< bool, true > EnableLIRPWcslen("disable-loop-idiom-wcslen", cl::desc("Proceed with loop idiom recognize pass, " "enable conversion of loop(s) to wcslen."), cl::location(DisableLIRP::Wcslen), cl::init(false), cl::ReallyHidden)
InstructionCost Cost
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static cl::opt< bool, true > DisableLIRPStrlen("disable-loop-idiom-strlen", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to strlen."), cl::location(DisableLIRP::Strlen), cl::init(false), cl::ReallyHidden)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
static cl::opt< bool > ForceMemsetPatternIntrinsic("loop-idiom-force-memset-pattern-intrinsic", cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false), cl::Hidden)
LLVM_ABI bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that it in the current Module is a...
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected, bool ElideAllZero=false)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2026
LLVM_ABI bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
LLVM_ABI Value * emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI)
Emit a call to the strlen function to the builder, for the specified pointer.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition ModRef.h:28
@ ModRef
The access may reference and may modify the value stored in memory.
Definition ModRef.h:36
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
static cl::opt< bool, true > DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize", cl::desc("Proceed with loop idiom recognize pass, " "but do not optimize CRC loops."), cl::location(DisableLIRP::HashRecognize), cl::init(false), cl::ReallyHidden)
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition MemorySSA.cpp:84
LLVM_ABI bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
DWARFExpression::Operation Op
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
LLVM_ABI Value * emitWcsLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL, const TargetLibraryInfo *TLI)
Emit a call to the wcslen function to the builder, for the specified pointer.
LLVM_ABI bool extractBranchWeights(const MDNode *ProfileData, SmallVectorImpl< uint32_t > &Weights)
Extract branch weights from MD_prof metadata.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
LLVM_ABI Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it is represented with.
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling " "with -Os/-Oz"), cl::init(true), cl::Hidden)
LLVM_ABI bool RecursivelyDeleteDeadPHINode(PHINode *PN, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
If the specified value is an effectively dead PHI node, due to being a def-use chain of single-use no...
Definition Local.cpp:643
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal.address from the specified value, returning the original object being addressed.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
std::optional< DecomposedBitTest > decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate Pred, bool LookThroughTrunc=true, bool AllowNonZeroC=false, bool DecomposeAnd=false)
Decompose an icmp into the form ((X & Mask) pred C) if possible.
SCEVUseT< const SCEV * > SCEVUse
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
LLVM_ABI AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes...
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes o...
Definition Metadata.h:836
static bool Memcpy
When true, Memcpy is disabled.
static bool Wcslen
When true, Wcslen is disabled.
static bool Strlen
When true, Strlen is disabled.
static bool HashRecognize
When true, HashRecognize is disabled.
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
The structure that is returned when a polynomial algorithm was recognized by the analysis.
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)