LLVM API Documentation
00001 //===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This pass implements an idiom recognizer that transforms simple loops into a 00011 // non-loop form. In cases that this kicks in, it can be a significant 00012 // performance win. 00013 // 00014 //===----------------------------------------------------------------------===// 00015 // 00016 // TODO List: 00017 // 00018 // Future loop memory idioms to recognize: 00019 // memcmp, memmove, strlen, etc. 00020 // Future floating point idioms to recognize in -ffast-math mode: 00021 // fpowi 00022 // Future integer operation idioms to recognize: 00023 // ctpop, ctlz, cttz 00024 // 00025 // Beware that isel's default lowering for ctpop is highly inefficient for 00026 // i64 and larger types when i64 is legal and the value has few bits set. It 00027 // would be good to enhance isel to emit a loop for ctpop in this case. 00028 // 00029 // We should enhance the memset/memcpy recognition to handle multiple stores in 00030 // the loop. This would handle things like: 00031 // void foo(_Complex float *P) 00032 // for (i) { __real__(*P) = 0; __imag__(*P) = 0; } 00033 // 00034 // We should enhance this to handle negative strides through memory. 00035 // Alternatively (and perhaps better) we could rely on an earlier pass to force 00036 // forward iteration through memory, which is generally better for cache 00037 // behavior. Negative strides *do* happen for memset/memcpy loops. 00038 // 00039 // This could recognize common matrix multiplies and dot product idioms and 00040 // replace them with calls to BLAS (if linked in??). 00041 // 00042 //===----------------------------------------------------------------------===// 00043 00044 #define DEBUG_TYPE "loop-idiom" 00045 #include "llvm/Transforms/Scalar.h" 00046 #include "llvm/ADT/Statistic.h" 00047 #include "llvm/Analysis/AliasAnalysis.h" 00048 #include "llvm/Analysis/LoopPass.h" 00049 #include "llvm/Analysis/ScalarEvolutionExpander.h" 00050 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 00051 #include "llvm/Analysis/TargetTransformInfo.h" 00052 #include "llvm/Analysis/ValueTracking.h" 00053 #include "llvm/IR/DataLayout.h" 00054 #include "llvm/IR/IRBuilder.h" 00055 #include "llvm/IR/IntrinsicInst.h" 00056 #include "llvm/IR/Module.h" 00057 #include "llvm/Support/Debug.h" 00058 #include "llvm/Support/raw_ostream.h" 00059 #include "llvm/Target/TargetLibraryInfo.h" 00060 #include "llvm/Transforms/Utils/Local.h" 00061 using namespace llvm; 00062 00063 STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); 00064 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); 00065 00066 namespace { 00067 00068 class LoopIdiomRecognize; 00069 00070 /// This class defines some utility functions for loop idiom recognization. 00071 class LIRUtil { 00072 public: 00073 /// Return true iff the block contains nothing but an uncondition branch 00074 /// (aka goto instruction). 00075 static bool isAlmostEmpty(BasicBlock *); 00076 00077 static BranchInst *getBranch(BasicBlock *BB) { 00078 return dyn_cast<BranchInst>(BB->getTerminator()); 00079 } 00080 00081 /// Return the condition of the branch terminating the given basic block. 00082 static Value *getBrCondtion(BasicBlock *); 00083 00084 /// Derive the precondition block (i.e the block that guards the loop 00085 /// preheader) from the given preheader. 00086 static BasicBlock *getPrecondBb(BasicBlock *PreHead); 00087 }; 00088 00089 /// This class is to recoginize idioms of population-count conducted in 00090 /// a noncountable loop. Currently it only recognizes this pattern: 00091 /// \code 00092 /// while(x) {cnt++; ...; x &= x - 1; ...} 00093 /// \endcode 00094 class NclPopcountRecognize { 00095 LoopIdiomRecognize &LIR; 00096 Loop *CurLoop; 00097 BasicBlock *PreCondBB; 00098 00099 typedef IRBuilder<> IRBuilderTy; 00100 00101 public: 00102 explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR); 00103 bool recognize(); 00104 00105 private: 00106 /// Take a glimpse of the loop to see if we need to go ahead recoginizing 00107 /// the idiom. 00108 bool preliminaryScreen(); 00109 00110 /// Check if the given conditional branch is based on the comparison 00111 /// beween a variable and zero, and if the variable is non-zero, the 00112 /// control yeilds to the loop entry. If the branch matches the behavior, 00113 /// the variable involved in the comparion is returned. This function will 00114 /// be called to see if the precondition and postcondition of the loop 00115 /// are in desirable form. 00116 Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; 00117 00118 /// Return true iff the idiom is detected in the loop. and 1) \p CntInst 00119 /// is set to the instruction counting the pupulation bit. 2) \p CntPhi 00120 /// is set to the corresponding phi node. 3) \p Var is set to the value 00121 /// whose population bits are being counted. 00122 bool detectIdiom 00123 (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; 00124 00125 /// Insert ctpop intrinsic function and some obviously dead instructions. 00126 void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var); 00127 00128 /// Create llvm.ctpop.* intrinsic function. 00129 CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); 00130 }; 00131 00132 class LoopIdiomRecognize : public LoopPass { 00133 Loop *CurLoop; 00134 const DataLayout *TD; 00135 DominatorTree *DT; 00136 ScalarEvolution *SE; 00137 TargetLibraryInfo *TLI; 00138 const TargetTransformInfo *TTI; 00139 public: 00140 static char ID; 00141 explicit LoopIdiomRecognize() : LoopPass(ID) { 00142 initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); 00143 TD = 0; DT = 0; SE = 0; TLI = 0; TTI = 0; 00144 } 00145 00146 bool runOnLoop(Loop *L, LPPassManager &LPM); 00147 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, 00148 SmallVectorImpl<BasicBlock*> &ExitBlocks); 00149 00150 bool processLoopStore(StoreInst *SI, const SCEV *BECount); 00151 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); 00152 00153 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, 00154 unsigned StoreAlignment, 00155 Value *SplatValue, Instruction *TheStore, 00156 const SCEVAddRecExpr *Ev, 00157 const SCEV *BECount); 00158 bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, 00159 const SCEVAddRecExpr *StoreEv, 00160 const SCEVAddRecExpr *LoadEv, 00161 const SCEV *BECount); 00162 00163 /// This transformation requires natural loop information & requires that 00164 /// loop preheaders be inserted into the CFG. 00165 /// 00166 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 00167 AU.addRequired<LoopInfo>(); 00168 AU.addPreserved<LoopInfo>(); 00169 AU.addRequiredID(LoopSimplifyID); 00170 AU.addPreservedID(LoopSimplifyID); 00171 AU.addRequiredID(LCSSAID); 00172 AU.addPreservedID(LCSSAID); 00173 AU.addRequired<AliasAnalysis>(); 00174 AU.addPreserved<AliasAnalysis>(); 00175 AU.addRequired<ScalarEvolution>(); 00176 AU.addPreserved<ScalarEvolution>(); 00177 AU.addPreserved<DominatorTree>(); 00178 AU.addRequired<DominatorTree>(); 00179 AU.addRequired<TargetLibraryInfo>(); 00180 AU.addRequired<TargetTransformInfo>(); 00181 } 00182 00183 const DataLayout *getDataLayout() { 00184 return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>(); 00185 } 00186 00187 DominatorTree *getDominatorTree() { 00188 return DT ? DT : (DT=&getAnalysis<DominatorTree>()); 00189 } 00190 00191 ScalarEvolution *getScalarEvolution() { 00192 return SE ? SE : (SE = &getAnalysis<ScalarEvolution>()); 00193 } 00194 00195 TargetLibraryInfo *getTargetLibraryInfo() { 00196 return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>()); 00197 } 00198 00199 const TargetTransformInfo *getTargetTransformInfo() { 00200 return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>()); 00201 } 00202 00203 Loop *getLoop() const { return CurLoop; } 00204 00205 private: 00206 bool runOnNoncountableLoop(); 00207 bool runOnCountableLoop(); 00208 }; 00209 } 00210 00211 char LoopIdiomRecognize::ID = 0; 00212 INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", 00213 false, false) 00214 INITIALIZE_PASS_DEPENDENCY(LoopInfo) 00215 INITIALIZE_PASS_DEPENDENCY(DominatorTree) 00216 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 00217 INITIALIZE_PASS_DEPENDENCY(LCSSA) 00218 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 00219 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) 00220 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 00221 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 00222 INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", 00223 false, false) 00224 00225 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } 00226 00227 /// deleteDeadInstruction - Delete this instruction. Before we do, go through 00228 /// and zero out all the operands of this instruction. If any of them become 00229 /// dead, delete them and the computation tree that feeds them. 00230 /// 00231 static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, 00232 const TargetLibraryInfo *TLI) { 00233 SmallVector<Instruction*, 32> NowDeadInsts; 00234 00235 NowDeadInsts.push_back(I); 00236 00237 // Before we touch this instruction, remove it from SE! 00238 do { 00239 Instruction *DeadInst = NowDeadInsts.pop_back_val(); 00240 00241 // This instruction is dead, zap it, in stages. Start by removing it from 00242 // SCEV. 00243 SE.forgetValue(DeadInst); 00244 00245 for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { 00246 Value *Op = DeadInst->getOperand(op); 00247 DeadInst->setOperand(op, 0); 00248 00249 // If this operand just became dead, add it to the NowDeadInsts list. 00250 if (!Op->use_empty()) continue; 00251 00252 if (Instruction *OpI = dyn_cast<Instruction>(Op)) 00253 if (isInstructionTriviallyDead(OpI, TLI)) 00254 NowDeadInsts.push_back(OpI); 00255 } 00256 00257 DeadInst->eraseFromParent(); 00258 00259 } while (!NowDeadInsts.empty()); 00260 } 00261 00262 /// deleteIfDeadInstruction - If the specified value is a dead instruction, 00263 /// delete it and any recursively used instructions. 00264 static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, 00265 const TargetLibraryInfo *TLI) { 00266 if (Instruction *I = dyn_cast<Instruction>(V)) 00267 if (isInstructionTriviallyDead(I, TLI)) 00268 deleteDeadInstruction(I, SE, TLI); 00269 } 00270 00271 //===----------------------------------------------------------------------===// 00272 // 00273 // Implementation of LIRUtil 00274 // 00275 //===----------------------------------------------------------------------===// 00276 00277 // This fucntion will return true iff the given block contains nothing but goto. 00278 // A typical usage of this function is to check if the preheader fucntion is 00279 // "almost" empty such that generated intrinsic function can be moved across 00280 // preheader and to be placed at the end of the preconditiona block without 00281 // concerning of breaking data dependence. 00282 bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { 00283 if (BranchInst *Br = getBranch(BB)) { 00284 return Br->isUnconditional() && BB->size() == 1; 00285 } 00286 return false; 00287 } 00288 00289 Value *LIRUtil::getBrCondtion(BasicBlock *BB) { 00290 BranchInst *Br = getBranch(BB); 00291 return Br ? Br->getCondition() : 0; 00292 } 00293 00294 BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { 00295 if (BasicBlock *BB = PreHead->getSinglePredecessor()) { 00296 BranchInst *Br = getBranch(BB); 00297 return Br && Br->isConditional() ? BB : 0; 00298 } 00299 return 0; 00300 } 00301 00302 //===----------------------------------------------------------------------===// 00303 // 00304 // Implementation of NclPopcountRecognize 00305 // 00306 //===----------------------------------------------------------------------===// 00307 00308 NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): 00309 LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) { 00310 } 00311 00312 bool NclPopcountRecognize::preliminaryScreen() { 00313 const TargetTransformInfo *TTI = LIR.getTargetTransformInfo(); 00314 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) 00315 return false; 00316 00317 // Counting population are usually conducted by few arithmetic instrutions. 00318 // Such instructions can be easilly "absorbed" by vacant slots in a 00319 // non-compact loop. Therefore, recognizing popcount idiom only makes sense 00320 // in a compact loop. 00321 00322 // Give up if the loop has multiple blocks or multiple backedges. 00323 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) 00324 return false; 00325 00326 BasicBlock *LoopBody = *(CurLoop->block_begin()); 00327 if (LoopBody->size() >= 20) { 00328 // The loop is too big, bail out. 00329 return false; 00330 } 00331 00332 // It should have a preheader containing nothing but a goto instruction. 00333 BasicBlock *PreHead = CurLoop->getLoopPreheader(); 00334 if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead)) 00335 return false; 00336 00337 // It should have a precondition block where the generated popcount instrinsic 00338 // function will be inserted. 00339 PreCondBB = LIRUtil::getPrecondBb(PreHead); 00340 if (!PreCondBB) 00341 return false; 00342 00343 return true; 00344 } 00345 00346 Value *NclPopcountRecognize::matchCondition (BranchInst *Br, 00347 BasicBlock *LoopEntry) const { 00348 if (!Br || !Br->isConditional()) 00349 return 0; 00350 00351 ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition()); 00352 if (!Cond) 00353 return 0; 00354 00355 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); 00356 if (!CmpZero || !CmpZero->isZero()) 00357 return 0; 00358 00359 ICmpInst::Predicate Pred = Cond->getPredicate(); 00360 if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || 00361 (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) 00362 return Cond->getOperand(0); 00363 00364 return 0; 00365 } 00366 00367 bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, 00368 PHINode *&CntPhi, 00369 Value *&Var) const { 00370 // Following code tries to detect this idiom: 00371 // 00372 // if (x0 != 0) 00373 // goto loop-exit // the precondition of the loop 00374 // cnt0 = init-val; 00375 // do { 00376 // x1 = phi (x0, x2); 00377 // cnt1 = phi(cnt0, cnt2); 00378 // 00379 // cnt2 = cnt1 + 1; 00380 // ... 00381 // x2 = x1 & (x1 - 1); 00382 // ... 00383 // } while(x != 0); 00384 // 00385 // loop-exit: 00386 // 00387 00388 // step 1: Check to see if the look-back branch match this pattern: 00389 // "if (a!=0) goto loop-entry". 00390 BasicBlock *LoopEntry; 00391 Instruction *DefX2, *CountInst; 00392 Value *VarX1, *VarX0; 00393 PHINode *PhiX, *CountPhi; 00394 00395 DefX2 = CountInst = 0; 00396 VarX1 = VarX0 = 0; 00397 PhiX = CountPhi = 0; 00398 LoopEntry = *(CurLoop->block_begin()); 00399 00400 // step 1: Check if the loop-back branch is in desirable form. 00401 { 00402 if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry)) 00403 DefX2 = dyn_cast<Instruction>(T); 00404 else 00405 return false; 00406 } 00407 00408 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" 00409 { 00410 if (!DefX2 || DefX2->getOpcode() != Instruction::And) 00411 return false; 00412 00413 BinaryOperator *SubOneOp; 00414 00415 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) 00416 VarX1 = DefX2->getOperand(1); 00417 else { 00418 VarX1 = DefX2->getOperand(0); 00419 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); 00420 } 00421 if (!SubOneOp) 00422 return false; 00423 00424 Instruction *SubInst = cast<Instruction>(SubOneOp); 00425 ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); 00426 if (!Dec || 00427 !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || 00428 (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) { 00429 return false; 00430 } 00431 } 00432 00433 // step 3: Check the recurrence of variable X 00434 { 00435 PhiX = dyn_cast<PHINode>(VarX1); 00436 if (!PhiX || 00437 (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { 00438 return false; 00439 } 00440 } 00441 00442 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 00443 { 00444 CountInst = NULL; 00445 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), 00446 IterE = LoopEntry->end(); Iter != IterE; Iter++) { 00447 Instruction *Inst = Iter; 00448 if (Inst->getOpcode() != Instruction::Add) 00449 continue; 00450 00451 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); 00452 if (!Inc || !Inc->isOne()) 00453 continue; 00454 00455 PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); 00456 if (!Phi || Phi->getParent() != LoopEntry) 00457 continue; 00458 00459 // Check if the result of the instruction is live of the loop. 00460 bool LiveOutLoop = false; 00461 for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); 00462 I != E; I++) { 00463 if ((cast<Instruction>(*I))->getParent() != LoopEntry) { 00464 LiveOutLoop = true; break; 00465 } 00466 } 00467 00468 if (LiveOutLoop) { 00469 CountInst = Inst; 00470 CountPhi = Phi; 00471 break; 00472 } 00473 } 00474 00475 if (!CountInst) 00476 return false; 00477 } 00478 00479 // step 5: check if the precondition is in this form: 00480 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" 00481 { 00482 BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); 00483 Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader()); 00484 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) 00485 return false; 00486 00487 CntInst = CountInst; 00488 CntPhi = CountPhi; 00489 Var = T; 00490 } 00491 00492 return true; 00493 } 00494 00495 void NclPopcountRecognize::transform(Instruction *CntInst, 00496 PHINode *CntPhi, Value *Var) { 00497 00498 ScalarEvolution *SE = LIR.getScalarEvolution(); 00499 TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo(); 00500 BasicBlock *PreHead = CurLoop->getLoopPreheader(); 00501 BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); 00502 const DebugLoc DL = CntInst->getDebugLoc(); 00503 00504 // Assuming before transformation, the loop is following: 00505 // if (x) // the precondition 00506 // do { cnt++; x &= x - 1; } while(x); 00507 00508 // Step 1: Insert the ctpop instruction at the end of the precondition block 00509 IRBuilderTy Builder(PreCondBr); 00510 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; 00511 { 00512 PopCnt = createPopcntIntrinsic(Builder, Var, DL); 00513 NewCount = PopCntZext = 00514 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); 00515 00516 if (NewCount != PopCnt) 00517 (cast<Instruction>(NewCount))->setDebugLoc(DL); 00518 00519 // TripCnt is exactly the number of iterations the loop has 00520 TripCnt = NewCount; 00521 00522 // If the popoulation counter's initial value is not zero, insert Add Inst. 00523 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); 00524 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); 00525 if (!InitConst || !InitConst->isZero()) { 00526 NewCount = Builder.CreateAdd(NewCount, CntInitVal); 00527 (cast<Instruction>(NewCount))->setDebugLoc(DL); 00528 } 00529 } 00530 00531 // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to 00532 // "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic 00533 // function would be partial dead code, and downstream passes will drag 00534 // it back from the precondition block to the preheader. 00535 { 00536 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); 00537 00538 Value *Opnd0 = PopCntZext; 00539 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); 00540 if (PreCond->getOperand(0) != Var) 00541 std::swap(Opnd0, Opnd1); 00542 00543 ICmpInst *NewPreCond = 00544 cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); 00545 PreCond->replaceAllUsesWith(NewPreCond); 00546 00547 deleteDeadInstruction(PreCond, *SE, TLI); 00548 } 00549 00550 // Step 3: Note that the population count is exactly the trip count of the 00551 // loop in question, which enble us to to convert the loop from noncountable 00552 // loop into a countable one. The benefit is twofold: 00553 // 00554 // - If the loop only counts population, the entire loop become dead after 00555 // the transformation. It is lots easier to prove a countable loop dead 00556 // than to prove a noncountable one. (In some C dialects, a infite loop 00557 // isn't dead even if it computes nothing useful. In general, DCE needs 00558 // to prove a noncountable loop finite before safely delete it.) 00559 // 00560 // - If the loop also performs something else, it remains alive. 00561 // Since it is transformed to countable form, it can be aggressively 00562 // optimized by some optimizations which are in general not applicable 00563 // to a noncountable loop. 00564 // 00565 // After this step, this loop (conceptually) would look like following: 00566 // newcnt = __builtin_ctpop(x); 00567 // t = newcnt; 00568 // if (x) 00569 // do { cnt++; x &= x-1; t--) } while (t > 0); 00570 BasicBlock *Body = *(CurLoop->block_begin()); 00571 { 00572 BranchInst *LbBr = LIRUtil::getBranch(Body); 00573 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); 00574 Type *Ty = TripCnt->getType(); 00575 00576 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); 00577 00578 Builder.SetInsertPoint(LbCond); 00579 Value *Opnd1 = cast<Value>(TcPhi); 00580 Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1)); 00581 Instruction *TcDec = 00582 cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); 00583 00584 TcPhi->addIncoming(TripCnt, PreHead); 00585 TcPhi->addIncoming(TcDec, Body); 00586 00587 CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? 00588 CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; 00589 LbCond->setPredicate(Pred); 00590 LbCond->setOperand(0, TcDec); 00591 LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0))); 00592 } 00593 00594 // Step 4: All the references to the original population counter outside 00595 // the loop are replaced with the NewCount -- the value returned from 00596 // __builtin_ctpop(). 00597 { 00598 SmallVector<Value *, 4> CntUses; 00599 for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end(); 00600 I != E; I++) { 00601 if (cast<Instruction>(*I)->getParent() != Body) 00602 CntUses.push_back(*I); 00603 } 00604 for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) { 00605 (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount); 00606 } 00607 } 00608 00609 // step 5: Forget the "non-computable" trip-count SCEV associated with the 00610 // loop. The loop would otherwise not be deleted even if it becomes empty. 00611 SE->forgetLoop(CurLoop); 00612 } 00613 00614 CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, 00615 Value *Val, DebugLoc DL) { 00616 Value *Ops[] = { Val }; 00617 Type *Tys[] = { Val->getType() }; 00618 00619 Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); 00620 Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); 00621 CallInst *CI = IRBuilder.CreateCall(Func, Ops); 00622 CI->setDebugLoc(DL); 00623 00624 return CI; 00625 } 00626 00627 /// recognize - detect population count idiom in a non-countable loop. If 00628 /// detected, transform the relevant code to popcount intrinsic function 00629 /// call, and return true; otherwise, return false. 00630 bool NclPopcountRecognize::recognize() { 00631 00632 if (!LIR.getTargetTransformInfo()) 00633 return false; 00634 00635 LIR.getScalarEvolution(); 00636 00637 if (!preliminaryScreen()) 00638 return false; 00639 00640 Instruction *CntInst; 00641 PHINode *CntPhi; 00642 Value *Val; 00643 if (!detectIdiom(CntInst, CntPhi, Val)) 00644 return false; 00645 00646 transform(CntInst, CntPhi, Val); 00647 return true; 00648 } 00649 00650 //===----------------------------------------------------------------------===// 00651 // 00652 // Implementation of LoopIdiomRecognize 00653 // 00654 //===----------------------------------------------------------------------===// 00655 00656 bool LoopIdiomRecognize::runOnCountableLoop() { 00657 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); 00658 if (isa<SCEVCouldNotCompute>(BECount)) return false; 00659 00660 // If this loop executes exactly one time, then it should be peeled, not 00661 // optimized by this pass. 00662 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) 00663 if (BECst->getValue()->getValue() == 0) 00664 return false; 00665 00666 // We require target data for now. 00667 if (!getDataLayout()) 00668 return false; 00669 00670 // set DT 00671 (void)getDominatorTree(); 00672 00673 LoopInfo &LI = getAnalysis<LoopInfo>(); 00674 TLI = &getAnalysis<TargetLibraryInfo>(); 00675 00676 // set TLI 00677 (void)getTargetLibraryInfo(); 00678 00679 SmallVector<BasicBlock*, 8> ExitBlocks; 00680 CurLoop->getUniqueExitBlocks(ExitBlocks); 00681 00682 DEBUG(dbgs() << "loop-idiom Scanning: F[" 00683 << CurLoop->getHeader()->getParent()->getName() 00684 << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); 00685 00686 bool MadeChange = false; 00687 // Scan all the blocks in the loop that are not in subloops. 00688 for (Loop::block_iterator BI = CurLoop->block_begin(), 00689 E = CurLoop->block_end(); BI != E; ++BI) { 00690 // Ignore blocks in subloops. 00691 if (LI.getLoopFor(*BI) != CurLoop) 00692 continue; 00693 00694 MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks); 00695 } 00696 return MadeChange; 00697 } 00698 00699 bool LoopIdiomRecognize::runOnNoncountableLoop() { 00700 NclPopcountRecognize Popcount(*this); 00701 if (Popcount.recognize()) 00702 return true; 00703 00704 return false; 00705 } 00706 00707 bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { 00708 CurLoop = L; 00709 00710 // If the loop could not be converted to canonical form, it must have an 00711 // indirectbr in it, just give up. 00712 if (!L->getLoopPreheader()) 00713 return false; 00714 00715 // Disable loop idiom recognition if the function's name is a common idiom. 00716 StringRef Name = L->getHeader()->getParent()->getName(); 00717 if (Name == "memset" || Name == "memcpy") 00718 return false; 00719 00720 SE = &getAnalysis<ScalarEvolution>(); 00721 if (SE->hasLoopInvariantBackedgeTakenCount(L)) 00722 return runOnCountableLoop(); 00723 return runOnNoncountableLoop(); 00724 } 00725 00726 /// runOnLoopBlock - Process the specified block, which lives in a counted loop 00727 /// with the specified backedge count. This block is known to be in the current 00728 /// loop and not in any subloops. 00729 bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, 00730 SmallVectorImpl<BasicBlock*> &ExitBlocks) { 00731 // We can only promote stores in this block if they are unconditionally 00732 // executed in the loop. For a block to be unconditionally executed, it has 00733 // to dominate all the exit blocks of the loop. Verify this now. 00734 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) 00735 if (!DT->dominates(BB, ExitBlocks[i])) 00736 return false; 00737 00738 bool MadeChange = false; 00739 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { 00740 Instruction *Inst = I++; 00741 // Look for store instructions, which may be optimized to memset/memcpy. 00742 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 00743 WeakVH InstPtr(I); 00744 if (!processLoopStore(SI, BECount)) continue; 00745 MadeChange = true; 00746 00747 // If processing the store invalidated our iterator, start over from the 00748 // top of the block. 00749 if (InstPtr == 0) 00750 I = BB->begin(); 00751 continue; 00752 } 00753 00754 // Look for memset instructions, which may be optimized to a larger memset. 00755 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { 00756 WeakVH InstPtr(I); 00757 if (!processLoopMemSet(MSI, BECount)) continue; 00758 MadeChange = true; 00759 00760 // If processing the memset invalidated our iterator, start over from the 00761 // top of the block. 00762 if (InstPtr == 0) 00763 I = BB->begin(); 00764 continue; 00765 } 00766 } 00767 00768 return MadeChange; 00769 } 00770 00771 00772 /// processLoopStore - See if this store can be promoted to a memset or memcpy. 00773 bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { 00774 if (!SI->isSimple()) return false; 00775 00776 Value *StoredVal = SI->getValueOperand(); 00777 Value *StorePtr = SI->getPointerOperand(); 00778 00779 // Reject stores that are so large that they overflow an unsigned. 00780 uint64_t SizeInBits = TD->getTypeSizeInBits(StoredVal->getType()); 00781 if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) 00782 return false; 00783 00784 // See if the pointer expression is an AddRec like {base,+,1} on the current 00785 // loop, which indicates a strided store. If we have something else, it's a 00786 // random store we can't handle. 00787 const SCEVAddRecExpr *StoreEv = 00788 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); 00789 if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) 00790 return false; 00791 00792 // Check to see if the stride matches the size of the store. If so, then we 00793 // know that every byte is touched in the loop. 00794 unsigned StoreSize = (unsigned)SizeInBits >> 3; 00795 const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); 00796 00797 if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) { 00798 // TODO: Could also handle negative stride here someday, that will require 00799 // the validity check in mayLoopAccessLocation to be updated though. 00800 // Enable this to print exact negative strides. 00801 if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) { 00802 dbgs() << "NEGATIVE STRIDE: " << *SI << "\n"; 00803 dbgs() << "BB: " << *SI->getParent(); 00804 } 00805 00806 return false; 00807 } 00808 00809 // See if we can optimize just this store in isolation. 00810 if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), 00811 StoredVal, SI, StoreEv, BECount)) 00812 return true; 00813 00814 // If the stored value is a strided load in the same loop with the same stride 00815 // this this may be transformable into a memcpy. This kicks in for stuff like 00816 // for (i) A[i] = B[i]; 00817 if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { 00818 const SCEVAddRecExpr *LoadEv = 00819 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); 00820 if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && 00821 StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) 00822 if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) 00823 return true; 00824 } 00825 //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n"; 00826 00827 return false; 00828 } 00829 00830 /// processLoopMemSet - See if this memset can be promoted to a large memset. 00831 bool LoopIdiomRecognize:: 00832 processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { 00833 // We can only handle non-volatile memsets with a constant size. 00834 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false; 00835 00836 // If we're not allowed to hack on memset, we fail. 00837 if (!TLI->has(LibFunc::memset)) 00838 return false; 00839 00840 Value *Pointer = MSI->getDest(); 00841 00842 // See if the pointer expression is an AddRec like {base,+,1} on the current 00843 // loop, which indicates a strided store. If we have something else, it's a 00844 // random store we can't handle. 00845 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer)); 00846 if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine()) 00847 return false; 00848 00849 // Reject memsets that are so large that they overflow an unsigned. 00850 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); 00851 if ((SizeInBytes >> 32) != 0) 00852 return false; 00853 00854 // Check to see if the stride matches the size of the memset. If so, then we 00855 // know that every byte is touched in the loop. 00856 const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); 00857 00858 // TODO: Could also handle negative stride here someday, that will require the 00859 // validity check in mayLoopAccessLocation to be updated though. 00860 if (Stride == 0 || MSI->getLength() != Stride->getValue()) 00861 return false; 00862 00863 return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, 00864 MSI->getAlignment(), MSI->getValue(), 00865 MSI, Ev, BECount); 00866 } 00867 00868 00869 /// mayLoopAccessLocation - Return true if the specified loop might access the 00870 /// specified pointer location, which is a loop-strided access. The 'Access' 00871 /// argument specifies what the verboten forms of access are (read or write). 00872 static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, 00873 Loop *L, const SCEV *BECount, 00874 unsigned StoreSize, AliasAnalysis &AA, 00875 Instruction *IgnoredStore) { 00876 // Get the location that may be stored across the loop. Since the access is 00877 // strided positively through memory, we say that the modified location starts 00878 // at the pointer and has infinite size. 00879 uint64_t AccessSize = AliasAnalysis::UnknownSize; 00880 00881 // If the loop iterates a fixed number of times, we can refine the access size 00882 // to be exactly the size of the memset, which is (BECount+1)*StoreSize 00883 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) 00884 AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; 00885 00886 // TODO: For this to be really effective, we have to dive into the pointer 00887 // operand in the store. Store to &A[i] of 100 will always return may alias 00888 // with store of &A[100], we need to StoreLoc to be "A" with size of 100, 00889 // which will then no-alias a store to &A[100]. 00890 AliasAnalysis::Location StoreLoc(Ptr, AccessSize); 00891 00892 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; 00893 ++BI) 00894 for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) 00895 if (&*I != IgnoredStore && 00896 (AA.getModRefInfo(I, StoreLoc) & Access)) 00897 return true; 00898 00899 return false; 00900 } 00901 00902 /// getMemSetPatternValue - If a strided store of the specified value is safe to 00903 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should 00904 /// be passed in. Otherwise, return null. 00905 /// 00906 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these 00907 /// just replicate their input array and then pass on to memset_pattern16. 00908 static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) { 00909 // If the value isn't a constant, we can't promote it to being in a constant 00910 // array. We could theoretically do a store to an alloca or something, but 00911 // that doesn't seem worthwhile. 00912 Constant *C = dyn_cast<Constant>(V); 00913 if (C == 0) return 0; 00914 00915 // Only handle simple values that are a power of two bytes in size. 00916 uint64_t Size = TD.getTypeSizeInBits(V->getType()); 00917 if (Size == 0 || (Size & 7) || (Size & (Size-1))) 00918 return 0; 00919 00920 // Don't care enough about darwin/ppc to implement this. 00921 if (TD.isBigEndian()) 00922 return 0; 00923 00924 // Convert to size in bytes. 00925 Size /= 8; 00926 00927 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see 00928 // if the top and bottom are the same (e.g. for vectors and large integers). 00929 if (Size > 16) return 0; 00930 00931 // If the constant is exactly 16 bytes, just use it. 00932 if (Size == 16) return C; 00933 00934 // Otherwise, we'll use an array of the constants. 00935 unsigned ArraySize = 16/Size; 00936 ArrayType *AT = ArrayType::get(V->getType(), ArraySize); 00937 return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C)); 00938 } 00939 00940 00941 /// processLoopStridedStore - We see a strided store of some value. If we can 00942 /// transform this into a memset or memset_pattern in the loop preheader, do so. 00943 bool LoopIdiomRecognize:: 00944 processLoopStridedStore(Value *DestPtr, unsigned StoreSize, 00945 unsigned StoreAlignment, Value *StoredVal, 00946 Instruction *TheStore, const SCEVAddRecExpr *Ev, 00947 const SCEV *BECount) { 00948 00949 // If the stored value is a byte-wise value (like i32 -1), then it may be 00950 // turned into a memset of i8 -1, assuming that all the consecutive bytes 00951 // are stored. A store of i32 0x01020304 can never be turned into a memset, 00952 // but it can be turned into memset_pattern if the target supports it. 00953 Value *SplatValue = isBytewiseValue(StoredVal); 00954 Constant *PatternValue = 0; 00955 00956 // If we're allowed to form a memset, and the stored value would be acceptable 00957 // for memset, use it. 00958 if (SplatValue && TLI->has(LibFunc::memset) && 00959 // Verify that the stored value is loop invariant. If not, we can't 00960 // promote the memset. 00961 CurLoop->isLoopInvariant(SplatValue)) { 00962 // Keep and use SplatValue. 00963 PatternValue = 0; 00964 } else if (TLI->has(LibFunc::memset_pattern16) && 00965 (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { 00966 // It looks like we can use PatternValue! 00967 SplatValue = 0; 00968 } else { 00969 // Otherwise, this isn't an idiom we can transform. For example, we can't 00970 // do anything with a 3-byte store. 00971 return false; 00972 } 00973 00974 // The trip count of the loop and the base pointer of the addrec SCEV is 00975 // guaranteed to be loop invariant, which means that it should dominate the 00976 // header. This allows us to insert code for it in the preheader. 00977 BasicBlock *Preheader = CurLoop->getLoopPreheader(); 00978 IRBuilder<> Builder(Preheader->getTerminator()); 00979 SCEVExpander Expander(*SE, "loop-idiom"); 00980 00981 // Okay, we have a strided store "p[i]" of a splattable value. We can turn 00982 // this into a memset in the loop preheader now if we want. However, this 00983 // would be unsafe to do if there is anything else in the loop that may read 00984 // or write to the aliased location. Check for any overlap by generating the 00985 // base pointer and checking the region. 00986 unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); 00987 Value *BasePtr = 00988 Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), 00989 Preheader->getTerminator()); 00990 00991 00992 if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, 00993 CurLoop, BECount, 00994 StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){ 00995 Expander.clear(); 00996 // If we generated new code for the base pointer, clean up. 00997 deleteIfDeadInstruction(BasePtr, *SE, TLI); 00998 return false; 00999 } 01000 01001 // Okay, everything looks good, insert the memset. 01002 01003 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to 01004 // pointer size if it isn't already. 01005 Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); 01006 BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); 01007 01008 const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), 01009 SCEV::FlagNUW); 01010 if (StoreSize != 1) 01011 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), 01012 SCEV::FlagNUW); 01013 01014 Value *NumBytes = 01015 Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); 01016 01017 CallInst *NewCall; 01018 if (SplatValue) 01019 NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment); 01020 else { 01021 Module *M = TheStore->getParent()->getParent()->getParent(); 01022 Value *MSP = M->getOrInsertFunction("memset_pattern16", 01023 Builder.getVoidTy(), 01024 Builder.getInt8PtrTy(), 01025 Builder.getInt8PtrTy(), IntPtr, 01026 (void*)0); 01027 01028 // Otherwise we should form a memset_pattern16. PatternValue is known to be 01029 // an constant array of 16-bytes. Plop the value into a mergable global. 01030 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, 01031 GlobalValue::InternalLinkage, 01032 PatternValue, ".memset_pattern"); 01033 GV->setUnnamedAddr(true); // Ok to merge these. 01034 GV->setAlignment(16); 01035 Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy()); 01036 NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); 01037 } 01038 01039 DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" 01040 << " from store to: " << *Ev << " at: " << *TheStore << "\n"); 01041 NewCall->setDebugLoc(TheStore->getDebugLoc()); 01042 01043 // Okay, the memset has been formed. Zap the original store and anything that 01044 // feeds into it. 01045 deleteDeadInstruction(TheStore, *SE, TLI); 01046 ++NumMemSet; 01047 return true; 01048 } 01049 01050 /// processLoopStoreOfLoopLoad - We see a strided store whose value is a 01051 /// same-strided load. 01052 bool LoopIdiomRecognize:: 01053 processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, 01054 const SCEVAddRecExpr *StoreEv, 01055 const SCEVAddRecExpr *LoadEv, 01056 const SCEV *BECount) { 01057 // If we're not allowed to form memcpy, we fail. 01058 if (!TLI->has(LibFunc::memcpy)) 01059 return false; 01060 01061 LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); 01062 01063 // The trip count of the loop and the base pointer of the addrec SCEV is 01064 // guaranteed to be loop invariant, which means that it should dominate the 01065 // header. This allows us to insert code for it in the preheader. 01066 BasicBlock *Preheader = CurLoop->getLoopPreheader(); 01067 IRBuilder<> Builder(Preheader->getTerminator()); 01068 SCEVExpander Expander(*SE, "loop-idiom"); 01069 01070 // Okay, we have a strided store "p[i]" of a loaded value. We can turn 01071 // this into a memcpy in the loop preheader now if we want. However, this 01072 // would be unsafe to do if there is anything else in the loop that may read 01073 // or write the memory region we're storing to. This includes the load that 01074 // feeds the stores. Check for an alias by generating the base address and 01075 // checking everything. 01076 Value *StoreBasePtr = 01077 Expander.expandCodeFor(StoreEv->getStart(), 01078 Builder.getInt8PtrTy(SI->getPointerAddressSpace()), 01079 Preheader->getTerminator()); 01080 01081 if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef, 01082 CurLoop, BECount, StoreSize, 01083 getAnalysis<AliasAnalysis>(), SI)) { 01084 Expander.clear(); 01085 // If we generated new code for the base pointer, clean up. 01086 deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); 01087 return false; 01088 } 01089 01090 // For a memcpy, we have to make sure that the input array is not being 01091 // mutated by the loop. 01092 Value *LoadBasePtr = 01093 Expander.expandCodeFor(LoadEv->getStart(), 01094 Builder.getInt8PtrTy(LI->getPointerAddressSpace()), 01095 Preheader->getTerminator()); 01096 01097 if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount, 01098 StoreSize, getAnalysis<AliasAnalysis>(), SI)) { 01099 Expander.clear(); 01100 // If we generated new code for the base pointer, clean up. 01101 deleteIfDeadInstruction(LoadBasePtr, *SE, TLI); 01102 deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); 01103 return false; 01104 } 01105 01106 // Okay, everything is safe, we can transform this! 01107 01108 01109 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to 01110 // pointer size if it isn't already. 01111 Type *IntPtr = TD->getIntPtrType(SI->getContext()); 01112 BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); 01113 01114 const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), 01115 SCEV::FlagNUW); 01116 if (StoreSize != 1) 01117 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), 01118 SCEV::FlagNUW); 01119 01120 Value *NumBytes = 01121 Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); 01122 01123 CallInst *NewCall = 01124 Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, 01125 std::min(SI->getAlignment(), LI->getAlignment())); 01126 NewCall->setDebugLoc(SI->getDebugLoc()); 01127 01128 DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" 01129 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" 01130 << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); 01131 01132 01133 // Okay, the memset has been formed. Zap the original store and anything that 01134 // feeds into it. 01135 deleteDeadInstruction(SI, *SE, TLI); 01136 ++NumMemCpy; 01137 return true; 01138 }