Line data Source code
1 : //===----- ARMCodeGenPrepare.cpp ------------------------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// This pass inserts intrinsics to handle small types that would otherwise be
12 : /// promoted during legalization. Here we can manually promote types or insert
13 : /// intrinsics which can handle narrow types that aren't supported by the
14 : /// register classes.
15 : //
16 : //===----------------------------------------------------------------------===//
17 :
18 : #include "ARM.h"
19 : #include "ARMSubtarget.h"
20 : #include "ARMTargetMachine.h"
21 : #include "llvm/ADT/StringRef.h"
22 : #include "llvm/CodeGen/Passes.h"
23 : #include "llvm/CodeGen/TargetPassConfig.h"
24 : #include "llvm/IR/Attributes.h"
25 : #include "llvm/IR/BasicBlock.h"
26 : #include "llvm/IR/IRBuilder.h"
27 : #include "llvm/IR/Constants.h"
28 : #include "llvm/IR/InstrTypes.h"
29 : #include "llvm/IR/Instruction.h"
30 : #include "llvm/IR/Instructions.h"
31 : #include "llvm/IR/IntrinsicInst.h"
32 : #include "llvm/IR/Intrinsics.h"
33 : #include "llvm/IR/Type.h"
34 : #include "llvm/IR/Value.h"
35 : #include "llvm/IR/Verifier.h"
36 : #include "llvm/Pass.h"
37 : #include "llvm/Support/Casting.h"
38 : #include "llvm/Support/CommandLine.h"
39 :
40 : #define DEBUG_TYPE "arm-codegenprepare"
41 :
42 : using namespace llvm;
43 :
44 : static cl::opt<bool>
45 : DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(true),
46 : cl::desc("Disable ARM specific CodeGenPrepare pass"));
47 :
48 : static cl::opt<bool>
49 : EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false),
50 : cl::desc("Use DSP instructions for scalar operations"));
51 :
52 : static cl::opt<bool>
53 : EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false),
54 :            cl::desc("Use DSP instructions for scalar operations "
55 :                     "with immediate operands"));
56 :
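: // Since these are cl::opts, they can be toggled from the llc command line. As
: // an illustrative sketch (the triple is only an example), the pass and its DSP
: // support could be exercised with:
: //
: //   llc -mtriple=thumbv7em-none-eabi -arm-disable-cgp=false \
: //       -arm-enable-scalar-dsp -arm-enable-scalar-dsp-imms input.ll
: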
57 : // The goal of this pass is to enable more efficient code generation for
58 : // operations on narrow types (i.e. types narrower than 32 bits). Consider
59 : // this motivating IR example:
60 : //
61 : // define hidden i32 @cmp(i8 zeroext) {
62 : // %2 = add i8 %0, -49
63 : // %3 = icmp ult i8 %2, 3
64 : // ..
65 : // }
66 : //
67 : // The issue here is that i8 is type-legalized to i32 because i8 is not a
68 : // legal type. Thus, the arithmetic is done at 32-bit precision, but then the
69 : // byte value is masked out as follows:
70 : //
71 : // t19: i32 = add t4, Constant:i32<-49>
72 : // t24: i32 = and t19, Constant:i32<255>
73 : //
74 : // Consequently, we generate code like this:
75 : //
76 : // subs r0, #49
77 : // uxtb r1, r0
78 : // cmp r1, #3
79 : //
80 : // This shows that masking out the byte value results in generation of
81 : // the UXTB instruction. This is not optimal as r0 already contains the byte
82 : // value we need, and so instead we can just generate:
83 : //
84 : // sub.w r1, r0, #49
85 : // cmp r1, #3
86 : //
87 : // We achieve this by promoting the IR to i32, so that the example becomes:
88 : //
89 : // define i32 @cmp(i8 zeroext %c) {
90 : // %0 = zext i8 %c to i32
91 : // %c.off = add i32 %0, -49
92 : // %1 = icmp ult i32 %c.off, 3
93 : // ..
94 : // }
95 : //
96 : // For this to be valid and legal, we need to prove that the i32 add is
97 : // producing the same value as the i8 addition, and that e.g. no overflow
98 : // happens.
99 : //
100 : // A brief sketch of the algorithm and some terminology.
101 : // We pattern match interesting IR patterns which:
102 : // - have "sources": instructions producing narrow values (i8, i16), and
103 : // - have "sinks": instructions consuming these narrow values.
104 : //
105 : // We collect all the instructions connecting sources and sinks in a worklist,
106 : // so that we can mutate these instructions and perform type promotion when it
107 : // is legal to do so.
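: //
: // As a rough illustration of this terminology on the @cmp example above: the
: // zeroext i8 argument is a source, the add is an instruction in between that
: // gets mutated to i32, and the unsigned icmp simply consumes the promoted
: // values. A store or return of the narrow value would be a sink, requiring a
: // trunc back to i8, e.g.:
: //
: //   store i8 %narrow, i8* %ptr   ; sink: fixed up with "trunc i32 ... to i8"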
108 :
109 : namespace {
110 : class IRPromoter {
111 : SmallPtrSet<Value*, 8> NewInsts;
112 : SmallVector<Instruction*, 4> InstsToRemove;
113 : Module *M = nullptr;
114 : LLVMContext &Ctx;
115 :
116 : public:
117 5130 : IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
118 :
119 384 : void Cleanup() {
120 408 : for (auto *I : InstsToRemove) {
121 : LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
122 24 : I->dropAllReferences();
123 24 : I->eraseFromParent();
124 : }
125 : InstsToRemove.clear();
126 384 : NewInsts.clear();
127 384 : }
128 :
129 : void Mutate(Type *OrigTy,
130 : SmallPtrSetImpl<Value*> &Visited,
131 : SmallPtrSetImpl<Value*> &Sources,
132 : SmallPtrSetImpl<Instruction*> &Sinks);
133 : };
134 :
135 : class ARMCodeGenPrepare : public FunctionPass {
136 : const ARMSubtarget *ST = nullptr;
137 : IRPromoter *Promoter = nullptr;
138 : std::set<Value*> AllVisited;
139 :
140 : bool isSupportedValue(Value *V);
141 : bool isLegalToPromote(Value *V);
142 : bool TryToPromote(Value *V);
143 :
144 : public:
145 : static char ID;
146 : static unsigned TypeSize;
147 : Type *OrigTy = nullptr;
148 :
149 7707 : ARMCodeGenPrepare() : FunctionPass(ID) {}
150 :
151 2564 : void getAnalysisUsage(AnalysisUsage &AU) const override {
152 : AU.addRequired<TargetPassConfig>();
153 2564 : }
154 :
155 0 : StringRef getPassName() const override { return "ARM IR optimizations"; }
156 :
157 : bool doInitialization(Module &M) override;
158 : bool runOnFunction(Function &F) override;
159 : bool doFinalization(Module &M) override;
160 : };
161 :
162 : }
163 :
164 : static bool generateSignBits(Value *V) {
165 : if (!isa<Instruction>(V))
166 : return false;
167 :
168 : unsigned Opc = cast<Instruction>(V)->getOpcode();
169 740 : return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
170 : Opc == Instruction::SRem;
171 : }
172 :
173 : /// Some instructions can use 8- and 16-bit operands, and we don't need to
174 : /// promote anything larger. We disallow booleans to make life easier when
175 : /// dealing with icmps but allow any other integer whose width matches the
176 : /// TypeSize being promoted. Void types are accepted so we can handle switches.
177 1116 : static bool isSupportedType(Value *V) {
178 1116 : Type *Ty = V->getType();
179 :
180 : // Allow voids and pointers, these won't be promoted.
181 1116 : if (Ty->isVoidTy() || Ty->isPointerTy())
182 : return true;
183 :
184 : if (auto *Ld = dyn_cast<LoadInst>(V))
185 67 : Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();
186 :
187 : const IntegerType *IntTy = dyn_cast<IntegerType>(Ty);
188 : if (!IntTy)
189 : return false;
190 :
191 1082 : return IntTy->getBitWidth() == ARMCodeGenPrepare::TypeSize;
192 : }
193 :
194 : /// Return true if the given value is a source in the use-def chain, producing
195 : /// a narrow (i8, i16) value. These values will be zero extended to start the
196 : /// promotion of the tree to i32, guaranteeing that they won't populate the
197 : /// upper bits of the register. The zext on loads will be free, and likewise
198 : /// for call return values, because we only accept calls that guarantee a
199 : /// zeroext return value. Many arguments will have the zeroext attribute too,
200 : /// so those would also be free.
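: ///
: /// A sketch of typical sources (illustrative IR, not taken from a real test):
: ///
: ///   %x = load i8, i8* %p            ; narrow load
: ///   %y = call zeroext i8 @get()     ; call with a zeroext return value
: ///   %t = trunc i32 %w to i8         ; trunc to a supported narrow type
: ///
: /// as well as (preferably zeroext) i8/i16 function arguments.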
201 4168 : static bool isSource(Value *V) {
202 8336 : if (!isa<IntegerType>(V->getType()))
203 : return false;
204 : // TODO Allow zext to be sources.
205 4080 : if (isa<Argument>(V))
206 : return true;
207 : else if (isa<LoadInst>(V))
208 : return true;
209 : else if (isa<BitCastInst>(V))
210 : return true;
211 : else if (auto *Call = dyn_cast<CallInst>(V))
212 12 : return Call->hasRetAttr(Attribute::AttrKind::ZExt);
213 : else if (auto *Trunc = dyn_cast<TruncInst>(V))
214 82 : return isSupportedType(Trunc);
215 : return false;
216 : }
217 :
218 : /// Return true if V will require any promoted values to be truncated for
219 : /// the IR to remain valid. We can't mutate the value type of these
220 : /// instructions.
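: ///
: /// A sketch of typical sinks (illustrative IR):
: ///
: ///   store i8 %v, i8* %p          ; stores a narrow value
: ///   ret i8 %v                    ; returns a narrow value
: ///   icmp slt i8 %a, %b           ; signed compares need the original width
: ///   call void @use(i8 %v)        ; call arguments keep their declared types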
221 3619 : static bool isSink(Value *V) {
222 : // TODO The truncate also isn't actually necessary because we would have
223 : // already proved that the data value is kept within the range of the original
224 : // data type.
225 : auto UsesNarrowValue = [](Value *V) {
226 209 : return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
227 : };
228 :
229 : if (auto *Store = dyn_cast<StoreInst>(V))
230 8 : return UsesNarrowValue(Store->getValueOperand());
231 : if (auto *Return = dyn_cast<ReturnInst>(V))
232 16 : return UsesNarrowValue(Return->getReturnValue());
233 : if (auto *Trunc = dyn_cast<TruncInst>(V))
234 112 : return UsesNarrowValue(Trunc->getOperand(0));
235 : if (auto *ZExt = dyn_cast<ZExtInst>(V))
236 73 : return UsesNarrowValue(ZExt->getOperand(0));
237 : if (auto *ICmp = dyn_cast<ICmpInst>(V))
238 813 : return ICmp->isSigned();
239 :
240 : return isa<CallInst>(V);
241 : }
242 :
243 : /// Return whether the instruction can be promoted without any modifications
244 : /// to its operands or result.
245 579 : static bool isSafeOverflow(Instruction *I) {
246 : // FIXME Do we need NSW too?
247 374 : if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
248 : return true;
249 :
250 : // We can support a, potentially, overflowing instruction (I) if:
251 : // - It is only used by an unsigned icmp.
252 : // - The icmp uses a constant.
253 : // - The overflowing value (I) is decreasing, i.e. it would underflow, wrapping
254 : // around zero to become a larger number than before.
255 : // - The underflowing instruction (I) also uses a constant.
256 : //
257 : // We can then use the two constants to calculate whether the result would
258 : // wrap with respect to itself in the original bitwidth. If it doesn't wrap,
259 : // but just underflows the range, the icmp would give the same result whether
260 : // the result has been truncated or not. We calculate this by:
261 : // - Zero extending both constants, if needed, to 32-bits.
262 : // - Taking the absolute value of I's constant and adding it to the icmp const.
263 : // - Check that this value is not out of range for the small type. If it is,
264 : // it means that it has underflowed enough to wrap around the icmp constant.
265 : //
266 : // For example:
267 : //
268 : // %sub = sub i8 %a, 2
269 : // %cmp = icmp ule i8 %sub, 254
270 : //
271 : // If %a = 0, %sub = -2 == FE == 254
272 : // But if this is evaluated as an i32
273 : // %sub = -2 == FF FF FF FE == 4294967294
274 : // So the unsigned compares (i8 and i32) would not yield the same result.
275 : //
276 : // Another way to look at it is:
277 : // %a - 2 <= 254
278 : // %a + 2 <= 254 + 2
279 : // %a <= 256
280 : // And we can't represent 256 in the i8 format, so we don't support it.
281 : //
282 : // Whereas:
283 : //
284 : // %sub = sub i8 %a, 1
285 : // %cmp = icmp ule i8 %sub, 254
286 : //
287 : // If %a = 0, %sub = -1 == FF == 255
288 : // As i32:
289 : // %sub = -1 == FF FF FF FF == 4294967295
290 : //
291 : // In this case, the unsigned compare results would be the same and this
292 : // would also be true for ult, uge and ugt:
293 : // - (255 < 254) == (0xFFFFFFFF < 254) == false
294 : // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
295 : // - (255 > 254) == (0xFFFFFFFF > 254) == true
296 : // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
297 : //
298 : // To demonstrate why we can't handle increasing values:
299 : //
300 : // %add = add i8 %a, 2
301 : // %cmp = icmp ult i8 %add, 127
302 : //
303 : // If %a = 254, %add = 256 == (i8 1)
304 : // As i32:
305 : // %add = 256
306 : //
307 : // (1 < 127) != (256 < 127)
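: //
: // In terms of the check below (TypeSize = 8, so Max = 255), the two sub/icmp
: // examples above work out roughly as:
: //
: //   sub i8 %a, 2; icmp ule i8 %sub, 254 -> Total = 254 + 2 = 256 > 255: reject
: //   sub i8 %a, 1; icmp ule i8 %sub, 254 -> Total = 254 + 1 = 255 <= 255: allow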
308 :
309 : unsigned Opc = I->getOpcode();
310 357 : if (Opc != Instruction::Add && Opc != Instruction::Sub)
311 : return false;
312 :
313 : if (!I->hasOneUse() ||
314 253 : !isa<ICmpInst>(*I->user_begin()) ||
315 103 : !isa<ConstantInt>(I->getOperand(1)))
316 : return false;
317 :
318 : ConstantInt *OverflowConst = cast<ConstantInt>(I->getOperand(1));
319 : bool NegImm = OverflowConst->isNegative();
320 79 : bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
321 54 : ((Opc == Instruction::Add) && NegImm);
322 : if (!IsDecreasing)
323 : return false;
324 :
325 : // Don't support an icmp that deals with sign bits.
326 : auto *CI = cast<ICmpInst>(*I->user_begin());
327 66 : if (CI->isSigned() || CI->isEquality())
328 : return false;
329 :
330 : ConstantInt *ICmpConst = nullptr;
331 : if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0)))
332 : ICmpConst = Const;
333 : else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1)))
334 : ICmpConst = Const;
335 : else
336 : return false;
337 :
338 : // Now check that the result can't wrap on itself.
339 61 : APInt Total = ICmpConst->getValue().getBitWidth() < 32 ?
340 61 : ICmpConst->getValue().zext(32) : ICmpConst->getValue();
341 :
342 104 : Total += OverflowConst->getValue().getBitWidth() < 32 ?
343 165 : OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs();
344 :
345 61 : APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize);
346 :
347 61 : if (Total.getBitWidth() > Max.getBitWidth()) {
348 122 : if (Total.ugt(Max.zext(Total.getBitWidth())))
349 7 : return false;
350 0 : } else if (Max.getBitWidth() > Total.getBitWidth()) {
351 0 : if (Total.zext(Max.getBitWidth()).ugt(Max))
352 0 : return false;
353 0 : } else if (Total.ugt(Max))
354 0 : return false;
355 :
356 : LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
357 : return true;
358 : }
359 :
360 2562 : static bool shouldPromote(Value *V) {
361 5124 : if (!isa<IntegerType>(V->getType()) || isSink(V))
362 220 : return false;
363 :
364 2342 : if (isSource(V))
365 : return true;
366 :
367 : auto *I = dyn_cast<Instruction>(V);
368 : if (!I)
369 : return false;
370 :
371 1845 : if (isa<ICmpInst>(I))
372 640 : return false;
373 :
374 : return true;
375 : }
376 :
377 : /// Return whether we can safely mutate V's type to ExtTy without having to be
378 : /// concerned with zero extending or truncation.
379 862 : static bool isPromotedResultSafe(Value *V) {
380 862 : if (!isa<Instruction>(V))
381 : return true;
382 :
383 : if (generateSignBits(V))
384 : return false;
385 :
386 : // If I is only being used by something that will require its value to be
387 : // truncated, then we don't care about the promoted result.
388 : auto *I = cast<Instruction>(V);
389 740 : if (I->hasOneUse() && isSink(*I->use_begin()))
390 : return true;
391 :
392 : if (isa<OverflowingBinaryOperator>(I))
393 291 : return isSafeOverflow(I);
394 : return true;
395 : }
396 :
397 : /// Return the intrinsic for the instruction that can perform the same
398 : /// operation but on a narrow type. This uses the parallel DSP intrinsics
399 : /// on scalar values.
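: ///
: /// As a sketch, assuming TypeSize == 16, an add that is not safe to promote
: /// would be replaced in the promoted chain with something like:
: ///
: ///   %r = call i32 @llvm.arm.uadd16(i32 %a, i32 %b)
: ///
: /// with the declaration obtained through Intrinsic::getDeclaration().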
400 : static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
401 : // Whether we use the signed or unsigned versions of these intrinsics
402 : // doesn't matter because we're not using the GE bits that they set in
403 : // the APSR.
404 : switch(I->getOpcode()) {
405 : default:
406 : break;
407 : case Instruction::Add:
408 : return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 :
409 : Intrinsic::arm_uadd8;
410 : case Instruction::Sub:
411 : return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_usub16 :
412 : Intrinsic::arm_usub8;
413 : }
414 : llvm_unreachable("unhandled opcode for narrow intrinsic");
415 : }
416 :
417 114 : void IRPromoter::Mutate(Type *OrigTy,
418 : SmallPtrSetImpl<Value*> &Visited,
419 : SmallPtrSetImpl<Value*> &Sources,
420 : SmallPtrSetImpl<Instruction*> &Sinks) {
421 114 : IRBuilder<> Builder{Ctx};
422 114 : Type *ExtTy = Type::getInt32Ty(M->getContext());
423 : SmallPtrSet<Value*, 8> Promoted;
424 : LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
425 : << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
426 :
427 : // Cache original types.
428 : DenseMap<Value*, Type*> TruncTysMap;
429 759 : for (auto *V : Visited)
430 645 : TruncTysMap[V] = V->getType();
431 :
432 : auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
433 : SmallVector<Instruction*, 4> Users;
434 : Instruction *InstTo = dyn_cast<Instruction>(To);
435 : for (Use &U : From->uses()) {
436 : auto *User = cast<Instruction>(U.getUser());
437 : if (InstTo && User->isIdenticalTo(InstTo))
438 : continue;
439 : Users.push_back(User);
440 : }
441 :
442 : for (auto *U : Users)
443 : U->replaceUsesOfWith(From, To);
444 : };
445 :
446 : auto FixConst = [&](ConstantInt *Const, Instruction *I) {
447 : Constant *NewConst = isSafeOverflow(I) && Const->isNegative() ?
448 : ConstantExpr::getSExt(Const, ExtTy) :
449 : ConstantExpr::getZExt(Const, ExtTy);
450 : I->replaceUsesOfWith(Const, NewConst);
451 114 : };
452 :
453 : auto InsertDSPIntrinsic = [&](Instruction *I) {
454 : LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
455 : << *I << "\n");
456 : Function *DSPInst =
457 : Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
458 : Builder.SetInsertPoint(I);
459 : Builder.SetCurrentDebugLocation(I->getDebugLoc());
460 : Value *Args[] = { I->getOperand(0), I->getOperand(1) };
461 : CallInst *Call = Builder.CreateCall(DSPInst, Args);
462 : ReplaceAllUsersOfWith(I, Call);
463 : InstsToRemove.push_back(I);
464 : NewInsts.insert(Call);
465 : TruncTysMap[Call] = OrigTy;
466 114 : };
467 :
468 : auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
469 : LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
470 : Builder.SetInsertPoint(InsertPt);
471 : if (auto *I = dyn_cast<Instruction>(V))
472 : Builder.SetCurrentDebugLocation(I->getDebugLoc());
473 : auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy));
474 : if (isa<Argument>(V))
475 : ZExt->moveBefore(InsertPt);
476 : else
477 : ZExt->moveAfter(InsertPt);
478 : ReplaceAllUsersOfWith(V, ZExt);
479 : NewInsts.insert(ZExt);
480 : TruncTysMap[ZExt] = TruncTysMap[V];
481 114 : };
482 :
483 : // First, insert extending instructions between the sources and their users.
484 : LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
485 288 : for (auto V : Sources) {
486 : LLVM_DEBUG(dbgs() << " - " << *V << "\n");
487 : if (auto *I = dyn_cast<Instruction>(V))
488 57 : InsertZExt(I, I);
489 : else if (auto *Arg = dyn_cast<Argument>(V)) {
490 117 : BasicBlock &BB = Arg->getParent()->front();
491 117 : InsertZExt(Arg, &*BB.getFirstInsertionPt());
492 : } else {
493 0 : llvm_unreachable("unhandled source that needs extending");
494 : }
495 174 : Promoted.insert(V);
496 : }
497 :
498 : LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
499 : // Then mutate the types of the instructions within the tree. Here we handle
500 : // constant operands.
501 759 : for (auto *V : Visited) {
502 645 : if (Sources.count(V))
503 : continue;
504 :
505 : auto *I = cast<Instruction>(V);
506 471 : if (Sinks.count(I))
507 : continue;
508 :
509 1331 : for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
510 913 : Value *Op = I->getOperand(i);
511 913 : if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType()))
512 : continue;
513 :
514 : if (auto *Const = dyn_cast<ConstantInt>(Op))
515 288 : FixConst(Const, I);
516 146 : else if (isa<UndefValue>(Op))
517 4 : I->setOperand(i, UndefValue::get(ExtTy));
518 : }
519 :
520 418 : if (shouldPromote(I)) {
521 262 : I->mutateType(ExtTy);
522 262 : Promoted.insert(I);
523 : }
524 : }
525 :
526 : // Now replace any instructions whose promoted result would not be safe with
527 : // calls to the DSP intrinsics, which operate natively on the narrow type.
528 759 : for (auto *V : Visited) {
529 645 : if (Sources.count(V))
530 : continue;
531 :
532 471 : if (!shouldPromote(V) || isPromotedResultSafe(V))
533 447 : continue;
534 :
535 : assert(EnableDSP && "DSP intrinsic insertion not enabled!");
536 :
537 : // Replace unsafe instructions with appropriate intrinsic calls.
538 24 : InsertDSPIntrinsic(cast<Instruction>(V));
539 : }
540 :
541 : auto InsertTrunc = [&](Value *V) -> Instruction* {
542 : if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
543 : return nullptr;
544 :
545 : if ((!Promoted.count(V) && !NewInsts.count(V)) || !TruncTysMap.count(V) ||
546 : Sources.count(V))
547 : return nullptr;
548 :
549 : Type *TruncTy = TruncTysMap[V];
550 : if (TruncTy == ExtTy)
551 : return nullptr;
552 :
553 : LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for "
554 : << *V << "\n");
555 : Builder.SetInsertPoint(cast<Instruction>(V));
556 : auto *Trunc = cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
557 : NewInsts.insert(Trunc);
558 : return Trunc;
559 114 : };
560 :
561 : LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
562 : // Fix up any stores or returns that use the results of the promoted
563 : // chain.
564 173 : for (auto I : Sinks) {
565 : LLVM_DEBUG(dbgs() << " - " << *I << "\n");
566 :
567 : // Handle calls separately as we need to iterate over arg operands.
568 : if (auto *Call = dyn_cast<CallInst>(I)) {
569 14 : for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
570 : Value *Arg = Call->getArgOperand(i);
571 8 : if (Instruction *Trunc = InsertTrunc(Arg)) {
572 2 : Trunc->moveBefore(Call);
573 : Call->setArgOperand(i, Trunc);
574 : }
575 : }
576 : continue;
577 : }
578 :
579 : // Now handle the others.
580 116 : for (unsigned i = 0; i < I->getNumOperands(); ++i) {
581 126 : if (Instruction *Trunc = InsertTrunc(I->getOperand(i))) {
582 50 : Trunc->moveBefore(I);
583 50 : I->setOperand(i, Trunc);
584 : }
585 : }
586 : }
587 : LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
588 : LLVM_DEBUG(dbgs();
589 : for (auto *V : Sources)
590 : V->dump();
591 : for (auto *I : NewInsts)
592 : I->dump();
593 : for (auto *V : Visited) {
594 : if (!Sources.count(V))
595 : V->dump();
596 : });
597 114 : }
598 :
599 : /// We accept most instructions, as well as Arguments and ConstantInts. We
600 : /// disallow casts other than zext and truncs and only allow calls if their
601 : /// return value is zeroext. We don't allow opcodes that can introduce sign
602 : /// bits.
603 0 : bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
604 : if (isa<ICmpInst>(V))
605 0 : return true;
606 :
607 : // Memory instructions
608 : if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
609 0 : return true;
610 :
611 : // Branches and targets.
612 0 : if (isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
613 0 : return true;
614 :
615 : // Non-instruction values that we can handle.
616 0 : if ((isa<Constant>(V) && !isa<ConstantExpr>(V)) || isa<Argument>(V))
617 0 : return isSupportedType(V);
618 :
619 : if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) ||
620 : isa<LoadInst>(V))
621 0 : return isSupportedType(V);
622 :
623 : // Truncs can be either sources or sinks.
624 : if (auto *Trunc = dyn_cast<TruncInst>(V))
625 0 : return isSupportedType(Trunc) || isSupportedType(Trunc->getOperand(0));
626 :
627 : if (isa<CastInst>(V) && !isa<SExtInst>(V))
628 0 : return isSupportedType(cast<CastInst>(V)->getOperand(0));
629 :
630 : // Special cases for calls as we need to check for zeroext
631 : // TODO We should accept calls even if they don't have zeroext, as they can
632 : // still be sinks.
633 : if (auto *Call = dyn_cast<CallInst>(V))
634 0 : return isSupportedType(Call) &&
635 0 : Call->hasRetAttr(Attribute::AttrKind::ZExt);
636 :
637 : if (!isa<BinaryOperator>(V))
638 0 : return false;
639 :
640 0 : if (!isSupportedType(V))
641 0 : return false;
642 :
643 : if (generateSignBits(V)) {
644 : LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n");
645 0 : return false;
646 : }
647 : return true;
648 : }
649 :
650 : /// Check that it is safe to promote V: either its promoted result is known to
651 : /// be safe, e.g. because any overflow is benign, or the narrow operation can
652 : /// instead be performed natively by a DSP instruction.
653 0 : bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
654 0 : if (isPromotedResultSafe(V))
655 0 : return true;
656 :
657 : auto *I = dyn_cast<Instruction>(V);
658 : if (!I)
659 0 : return false;
660 :
661 : // If promotion is not safe, can we use a DSP instruction to natively
662 : // handle the narrow type?
663 0 : if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
664 0 : return false;
665 :
666 0 : if (ST->isThumb() && !ST->hasThumb2())
667 0 : return false;
668 :
669 0 : if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
670 0 : return false;
671 :
672 : // TODO
673 : // Would it be profitable? For Thumb code, these parallel DSP instructions
674 : // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
675 : // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
676 : // halved. They also do not take immediates as operands.
677 0 : for (auto &Op : I->operands()) {
678 0 : if (isa<Constant>(Op)) {
679 0 : if (!EnableDSPWithImms)
680 0 : return false;
681 : }
682 : }
683 : return true;
684 : }
685 :
686 262 : bool ARMCodeGenPrepare::TryToPromote(Value *V) {
687 262 : OrigTy = V->getType();
688 262 : TypeSize = OrigTy->getPrimitiveSizeInBits();
689 262 : if (TypeSize > 16 || TypeSize < 8)
690 : return false;
691 :
692 180 : if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
693 44 : return false;
694 :
695 : LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = "
696 : << TypeSize << "\n");
697 :
698 136 : SetVector<Value*> WorkList;
699 : SmallPtrSet<Value*, 8> Sources;
700 : SmallPtrSet<Instruction*, 4> Sinks;
701 136 : WorkList.insert(V);
702 : SmallPtrSet<Value*, 16> CurrentVisited;
703 136 : CurrentVisited.clear();
704 :
705 : // Return true if V was added to the worklist as a supported instruction,
706 : // if it was already visited, or if we don't need to explore it (e.g.
707 : // pointer values and GEPs), and false otherwise.
708 : auto AddLegalInst = [&](Value *V) {
709 : if (CurrentVisited.count(V))
710 : return true;
711 :
712 : // Ignore GEPs because they don't need promoting and the constant indices
713 : // will prevent the transformation.
714 : if (isa<GetElementPtrInst>(V))
715 : return true;
716 :
717 : if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
718 : LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
719 : return false;
720 : }
721 :
722 : WorkList.insert(V);
723 : return true;
724 136 : };
725 :
726 : // Iterate through, and add to, a tree of operands and users in the use-def.
727 1141 : while (!WorkList.empty()) {
728 1022 : Value *V = WorkList.back();
729 : WorkList.pop_back();
730 1022 : if (CurrentVisited.count(V))
731 322 : continue;
732 :
733 : // Ignore non-instructions, other than arguments.
734 2044 : if (!isa<Instruction>(V) && !isSource(V))
735 : continue;
736 :
737 : // If we've already visited this value from somewhere, bail now because
738 : // the tree has already been explored.
739 : // TODO: This could limit the transform, i.e. if we try to promote something
740 : // from an i8 and fail first, before trying an i16.
741 : if (AllVisited.count(V))
742 17 : return false;
743 :
744 696 : CurrentVisited.insert(V);
745 : AllVisited.insert(V);
746 :
747 : // Calls can be both sources and sinks.
748 696 : if (isSink(V))
749 65 : Sinks.insert(cast<Instruction>(V));
750 696 : if (isSource(V))
751 187 : Sources.insert(V);
752 509 : else if (auto *I = dyn_cast<Instruction>(V)) {
753 : // Visit operands of any instruction visited.
754 2052 : for (auto &U : I->operands()) {
755 1043 : if (!AddLegalInst(U))
756 : return false;
757 : }
758 : }
759 :
760 : // Don't visit users of a node which isn't going to be mutated unless it's a
761 : // source.
762 687 : if (isSource(V) || shouldPromote(V)) {
763 1100 : for (Use &U : V->uses()) {
764 645 : if (!AddLegalInst(U.getUser()))
765 : return false;
766 : }
767 : }
768 : }
769 :
770 : LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
771 : for (auto *I : CurrentVisited)
772 : I->dump();
773 : );
774 : unsigned ToPromote = 0;
775 779 : for (auto *V : CurrentVisited) {
776 660 : if (Sources.count(V))
777 : continue;
778 481 : if (Sinks.count(cast<Instruction>(V)))
779 : continue;
780 423 : ++ToPromote;
781 : }
782 :
783 119 : if (ToPromote < 2)
784 : return false;
785 :
786 114 : Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks);
787 114 : return true;
788 : }
789 :
790 2565 : bool ARMCodeGenPrepare::doInitialization(Module &M) {
791 2565 : Promoter = new IRPromoter(&M);
792 2565 : return false;
793 : }
794 :
795 13501 : bool ARMCodeGenPrepare::runOnFunction(Function &F) {
796 13501 : if (skipFunction(F) || DisableCGP)
797 : return false;
798 :
799 203 : auto *TPC = &getAnalysis<TargetPassConfig>();
800 203 : if (!TPC)
801 : return false;
802 :
803 203 : const TargetMachine &TM = TPC->getTM<TargetMachine>();
804 203 : ST = &TM.getSubtarget<ARMSubtarget>(F);
805 : bool MadeChange = false;
806 : LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n");
807 :
808 : // Search up from icmps to try to promote their operands.
809 587 : for (BasicBlock &BB : F) {
810 : auto &Insts = BB.getInstList();
811 2004 : for (auto &I : Insts) {
812 3056 : if (AllVisited.count(&I))
813 184 : continue;
814 :
815 1436 : if (isa<ICmpInst>(I)) {
816 : auto &CI = cast<ICmpInst>(I);
817 :
818 : // Skip signed or pointer compares
819 697 : if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
820 : continue;
821 :
822 880 : for (auto &Op : CI.operands()) {
823 : if (auto *I = dyn_cast<Instruction>(Op))
824 262 : MadeChange |= TryToPromote(I);
825 : }
826 : }
827 : }
828 384 : Promoter->Cleanup();
829 : LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
830 : dbgs();
831 : report_fatal_error("Broken function after type promotion");
832 : });
833 : }
834 : if (MadeChange)
835 : LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n");
836 :
837 : return MadeChange;
838 : }
839 :
840 2538 : bool ARMCodeGenPrepare::doFinalization(Module &M) {
841 2538 : delete Promoter;
842 2538 : return false;
843 : }
844 :
845 85105 : INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE,
846 : "ARM IR optimizations", false, false)
847 199024 : INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations",
848 : false, false)
849 :
850 : char ARMCodeGenPrepare::ID = 0;
851 : unsigned ARMCodeGenPrepare::TypeSize = 0;
852 :
853 2569 : FunctionPass *llvm::createARMCodeGenPreparePass() {
854 2569 : return new ARMCodeGenPrepare();
855 : }