LLVM  16.0.0git
ARMParallelDSP.cpp
Go to the documentation of this file.
1 //===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
/// This pass runs only when unaligned accesses are supported/enabled.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "ARM.h"
18 #include "ARMSubtarget.h"
19 #include "llvm/ADT/SmallPtrSet.h"
20 #include "llvm/ADT/Statistic.h"
27 #include "llvm/IR/Instructions.h"
28 #include "llvm/IR/IntrinsicsARM.h"
29 #include "llvm/IR/NoFolder.h"
30 #include "llvm/IR/PatternMatch.h"
31 #include "llvm/Pass.h"
32 #include "llvm/PassRegistry.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Transforms/Scalar.h"
36 
37 using namespace llvm;
38 using namespace PatternMatch;
39 
40 #define DEBUG_TYPE "arm-parallel-dsp"
41 
// Counts how many smlad/smlald (and exchanged variants) calls this pass
// created; incremented in InsertParallelMACs.
STATISTIC(NumSMLAD , "Number of smlad instructions generated");

// Command-line escape hatch to switch the transform off entirely.
static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                   cl::desc("Disable the ARM Parallel DSP pass"));

// Caps the number of candidate loads collected per basic block; the pairing
// search in RecordMemoryOps is quadratic in this number.
static cl::opt<unsigned>
NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
             cl::desc("Limit the number of loads analysed"));
51 
52 namespace {
53  struct MulCandidate;
54  class Reduction;
55 
56  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
57  using MemInstList = SmallVectorImpl<LoadInst*>;
59 
60  // 'MulCandidate' holds the multiplication instructions that are candidates
61  // for parallel execution.
62  struct MulCandidate {
63  Instruction *Root;
64  Value* LHS;
65  Value* RHS;
66  bool Exchange = false;
67  bool ReadOnly = true;
68  bool Paired = false;
69  SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.
70 
71  MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
72  Root(I), LHS(lhs), RHS(rhs) { }
73 
74  bool HasTwoLoadInputs() const {
75  return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
76  }
77 
78  LoadInst *getBaseLoad() const {
79  return VecLd.front();
80  }
81  };
82 
83  /// Represent a sequence of multiply-accumulate operations with the aim to
84  /// perform the multiplications in parallel.
85  class Reduction {
86  Instruction *Root = nullptr;
87  Value *Acc = nullptr;
88  MulCandList Muls;
89  MulPairList MulPairs;
91 
92  public:
93  Reduction() = delete;
94 
95  Reduction (Instruction *Add) : Root(Add) { }
96 
97  /// Record an Add instruction that is a part of the this reduction.
98  void InsertAdd(Instruction *I) { Adds.insert(I); }
99 
100  /// Create MulCandidates, each rooted at a Mul instruction, that is a part
101  /// of this reduction.
102  void InsertMuls() {
103  auto GetMulOperand = [](Value *V) -> Instruction* {
104  if (auto *SExt = dyn_cast<SExtInst>(V)) {
105  if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
106  if (I->getOpcode() == Instruction::Mul)
107  return I;
108  } else if (auto *I = dyn_cast<Instruction>(V)) {
109  if (I->getOpcode() == Instruction::Mul)
110  return I;
111  }
112  return nullptr;
113  };
114 
115  auto InsertMul = [this](Instruction *I) {
116  Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
117  Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
118  Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
119  };
120 
121  for (auto *Add : Adds) {
122  if (Add == Acc)
123  continue;
124  if (auto *Mul = GetMulOperand(Add->getOperand(0)))
125  InsertMul(Mul);
126  if (auto *Mul = GetMulOperand(Add->getOperand(1)))
127  InsertMul(Mul);
128  }
129  }
130 
131  /// Add the incoming accumulator value, returns true if a value had not
132  /// already been added. Returning false signals to the user that this
133  /// reduction already has a value to initialise the accumulator.
134  bool InsertAcc(Value *V) {
135  if (Acc)
136  return false;
137  Acc = V;
138  return true;
139  }
140 
141  /// Set two MulCandidates, rooted at muls, that can be executed as a single
142  /// parallel operation.
143  void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
144  bool Exchange = false) {
145  LLVM_DEBUG(dbgs() << "Pairing:\n"
146  << *Mul0->Root << "\n"
147  << *Mul1->Root << "\n");
148  Mul0->Paired = true;
149  Mul1->Paired = true;
150  if (Exchange)
151  Mul1->Exchange = true;
152  MulPairs.push_back(std::make_pair(Mul0, Mul1));
153  }
154 
155  /// Return true if enough mul operations are found that can be executed in
156  /// parallel.
157  bool CreateParallelPairs();
158 
159  /// Return the add instruction which is the root of the reduction.
160  Instruction *getRoot() { return Root; }
161 
162  bool is64Bit() const { return Root->getType()->isIntegerTy(64); }
163 
164  Type *getType() const { return Root->getType(); }
165 
166  /// Return the incoming value to be accumulated. This maybe null.
167  Value *getAccumulator() { return Acc; }
168 
169  /// Return the set of adds that comprise the reduction.
170  SetVector<Instruction*> &getAdds() { return Adds; }
171 
172  /// Return the MulCandidate, rooted at mul instruction, that comprise the
173  /// the reduction.
174  MulCandList &getMuls() { return Muls; }
175 
176  /// Return the MulCandidate, rooted at mul instructions, that have been
177  /// paired for parallel execution.
178  MulPairList &getMulPairs() { return MulPairs; }
179 
180  /// To finalise, replace the uses of the root with the intrinsic call.
181  void UpdateRoot(Instruction *SMLAD) {
182  Root->replaceAllUsesWith(SMLAD);
183  }
184 
185  void dump() {
186  LLVM_DEBUG(dbgs() << "Reduction:\n";
187  for (auto *Add : Adds)
188  LLVM_DEBUG(dbgs() << *Add << "\n");
189  for (auto &Mul : Muls)
190  LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
191  << " " << *Mul->LHS << "\n"
192  << " " << *Mul->RHS << "\n");
193  LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
194  );
195  }
196  };
197 
198  class WidenedLoad {
199  LoadInst *NewLd = nullptr;
201 
202  public:
203  WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
204  : NewLd(Wide) {
205  append_range(Loads, Lds);
206  }
207  LoadInst *getLoad() {
208  return NewLd;
209  }
210  };
211 
212  class ARMParallelDSP : public FunctionPass {
213  ScalarEvolution *SE;
214  AliasAnalysis *AA;
215  TargetLibraryInfo *TLI;
216  DominatorTree *DT;
217  const DataLayout *DL;
218  Module *M;
219  std::map<LoadInst*, LoadInst*> LoadPairs;
220  SmallPtrSet<LoadInst*, 4> OffsetLoads;
221  std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
222 
223  template<unsigned>
224  bool IsNarrowSequence(Value *V);
225  bool Search(Value *V, BasicBlock *BB, Reduction &R);
226  bool RecordMemoryOps(BasicBlock *BB);
227  void InsertParallelMACs(Reduction &Reduction);
228  bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
229  LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
230  bool CreateParallelPairs(Reduction &R);
231 
232  /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
233  /// Dual performs two signed 16x16-bit multiplications. It adds the
234  /// products to a 32-bit accumulate operand. Optionally, the instruction can
235  /// exchange the halfwords of the second operand before performing the
236  /// arithmetic.
237  bool MatchSMLAD(Function &F);
238 
239  public:
240  static char ID;
241 
242  ARMParallelDSP() : FunctionPass(ID) { }
243 
244  void getAnalysisUsage(AnalysisUsage &AU) const override {
254  AU.setPreservesCFG();
255  }
256 
257  bool runOnFunction(Function &F) override {
258  if (DisableParallelDSP)
259  return false;
260  if (skipFunction(F))
261  return false;
262 
263  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
264  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
265  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
266  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
267  auto &TPC = getAnalysis<TargetPassConfig>();
268 
269  M = F.getParent();
270  DL = &M->getDataLayout();
271 
272  auto &TM = TPC.getTM<TargetMachine>();
273  auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
274 
275  if (!ST->allowsUnalignedMem()) {
276  LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
277  "running pass ARMParallelDSP\n");
278  return false;
279  }
280 
281  if (!ST->hasDSP()) {
282  LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
283  "ARMParallelDSP\n");
284  return false;
285  }
286 
287  if (!ST->isLittle()) {
288  LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
289  << "ARMParallelDSP\n");
290  return false;
291  }
292 
293  LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
294  LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
295 
296  bool Changes = MatchSMLAD(F);
297  return Changes;
298  }
299  };
300 }
301 
302 bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
303  MemInstList &VecMem) {
304  if (!Ld0 || !Ld1)
305  return false;
306 
307  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
308  return false;
309 
310  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
311  dbgs() << "Ld0:"; Ld0->dump();
312  dbgs() << "Ld1:"; Ld1->dump();
313  );
314 
315  VecMem.clear();
316  VecMem.push_back(Ld0);
317  VecMem.push_back(Ld1);
318  return true;
319 }
320 
// MaxBitWidth: the maximum supported bit width of the elements in the DSP
// instructions, which is set to 16. So here we should collect all i8 and i16
// narrow operations.
// TODO: we currently only collect i16, and will support i8 later, so that's
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
326 template<unsigned MaxBitWidth>
327 bool ARMParallelDSP::IsNarrowSequence(Value *V) {
328  if (auto *SExt = dyn_cast<SExtInst>(V)) {
329  if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
330  return false;
331 
332  if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
333  // Check that this load could be paired.
334  return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
335  }
336  }
337  return false;
338 }
339 
340 /// Iterate through the block and record base, offset pairs of loads which can
341 /// be widened into a single load.
342 bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
345  LoadPairs.clear();
346  WideLoads.clear();
347 
348  // Collect loads and instruction that may write to memory. For now we only
349  // record loads which are simple, sign-extended and have a single user.
350  // TODO: Allow zero-extended loads.
351  for (auto &I : *BB) {
352  if (I.mayWriteToMemory())
353  Writes.push_back(&I);
354  auto *Ld = dyn_cast<LoadInst>(&I);
355  if (!Ld || !Ld->isSimple() ||
356  !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
357  continue;
358  Loads.push_back(Ld);
359  }
360 
361  if (Loads.empty() || Loads.size() > NumLoadLimit)
362  return false;
363 
364  using InstSet = std::set<Instruction*>;
365  using DepMap = std::map<Instruction*, InstSet>;
366  DepMap RAWDeps;
367 
368  // Record any writes that may alias a load.
370  for (auto Write : Writes) {
371  for (auto Read : Loads) {
372  MemoryLocation ReadLoc =
373  MemoryLocation(Read->getPointerOperand(), Size);
374 
375  if (!isModOrRefSet(AA->getModRefInfo(Write, ReadLoc)))
376  continue;
377  if (Write->comesBefore(Read))
378  RAWDeps[Read].insert(Write);
379  }
380  }
381 
382  // Check whether there's not a write between the two loads which would
383  // prevent them from being safely merged.
384  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
385  bool BaseFirst = Base->comesBefore(Offset);
386  LoadInst *Dominator = BaseFirst ? Base : Offset;
387  LoadInst *Dominated = BaseFirst ? Offset : Base;
388 
389  if (RAWDeps.count(Dominated)) {
390  InstSet &WritesBefore = RAWDeps[Dominated];
391 
392  for (auto Before : WritesBefore) {
393  // We can't move the second load backward, past a write, to merge
394  // with the first load.
395  if (Dominator->comesBefore(Before))
396  return false;
397  }
398  }
399  return true;
400  };
401 
402  // Record base, offset load pairs.
403  for (auto *Base : Loads) {
404  for (auto *Offset : Loads) {
405  if (Base == Offset || OffsetLoads.count(Offset))
406  continue;
407 
408  if (isConsecutiveAccess(Base, Offset, *DL, *SE) &&
409  SafeToPair(Base, Offset)) {
410  LoadPairs[Base] = Offset;
411  OffsetLoads.insert(Offset);
412  break;
413  }
414  }
415  }
416 
417  LLVM_DEBUG(if (!LoadPairs.empty()) {
418  dbgs() << "Consecutive load pairs:\n";
419  for (auto &MapIt : LoadPairs) {
420  LLVM_DEBUG(dbgs() << *MapIt.first << ", "
421  << *MapIt.second << "\n");
422  }
423  });
424  return LoadPairs.size() > 1;
425 }
426 
// Search recursively back through the operands to find a tree of values that
// form a multiply-accumulate chain. The search records the Add and Mul
// instructions that form the reduction and allows us to find a single value
// to be used as the initial input to the accumulator.
431 bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
432  // If we find a non-instruction, try to use it as the initial accumulator
433  // value. This may have already been found during the search in which case
434  // this function will return false, signaling a search fail.
435  auto *I = dyn_cast<Instruction>(V);
436  if (!I)
437  return R.InsertAcc(V);
438 
439  if (I->getParent() != BB)
440  return false;
441 
442  switch (I->getOpcode()) {
443  default:
444  break;
445  case Instruction::PHI:
446  // Could be the accumulator value.
447  return R.InsertAcc(V);
448  case Instruction::Add: {
449  // Adds should be adding together two muls, or another add and a mul to
450  // be within the mac chain. One of the operands may also be the
451  // accumulator value at which point we should stop searching.
452  R.InsertAdd(I);
453  Value *LHS = I->getOperand(0);
454  Value *RHS = I->getOperand(1);
455  bool ValidLHS = Search(LHS, BB, R);
456  bool ValidRHS = Search(RHS, BB, R);
457 
458  if (ValidLHS && ValidRHS)
459  return true;
460 
461  // Ensure we don't add the root as the incoming accumulator.
462  if (R.getRoot() == I)
463  return false;
464 
465  return R.InsertAcc(I);
466  }
467  case Instruction::Mul: {
468  Value *MulOp0 = I->getOperand(0);
469  Value *MulOp1 = I->getOperand(1);
470  return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
471  }
472  case Instruction::SExt:
473  return Search(I->getOperand(0), BB, R);
474  }
475  return false;
476 }
477 
478 // The pass needs to identify integer add/sub reductions of 16-bit vector
479 // multiplications.
480 // To use SMLAD:
481 // 1) we first need to find integer add then look for this pattern:
482 //
483 // acc0 = ...
484 // ld0 = load i16
485 // sext0 = sext i16 %ld0 to i32
486 // ld1 = load i16
487 // sext1 = sext i16 %ld1 to i32
488 // mul0 = mul %sext0, %sext1
489 // ld2 = load i16
490 // sext2 = sext i16 %ld2 to i32
491 // ld3 = load i16
492 // sext3 = sext i16 %ld3 to i32
493 // mul1 = mul i32 %sext2, %sext3
494 // add0 = add i32 %mul0, %acc0
495 // acc1 = add i32 %add0, %mul1
496 //
497 // Which can be selected to:
498 //
499 // ldr r0
500 // ldr r1
501 // smlad r2, r0, r1, r2
502 //
503 // If constants are used instead of loads, these will need to be hoisted
504 // out and into a register.
505 //
506 // If loop invariants are used instead of loads, these need to be packed
507 // before the loop begins.
508 //
509 bool ARMParallelDSP::MatchSMLAD(Function &F) {
510  bool Changed = false;
511 
512  for (auto &BB : F) {
514  if (!RecordMemoryOps(&BB))
515  continue;
516 
517  for (Instruction &I : reverse(BB)) {
518  if (I.getOpcode() != Instruction::Add)
519  continue;
520 
521  if (AllAdds.count(&I))
522  continue;
523 
524  const auto *Ty = I.getType();
525  if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
526  continue;
527 
528  Reduction R(&I);
529  if (!Search(&I, &BB, R))
530  continue;
531 
532  R.InsertMuls();
533  LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
534 
535  if (!CreateParallelPairs(R))
536  continue;
537 
538  InsertParallelMACs(R);
539  Changed = true;
540  AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
541  LLVM_DEBUG(dbgs() << "BB after inserting parallel MACs:\n" << BB);
542  }
543  }
544 
545  return Changed;
546 }
547 
// Try to pair up the MulCandidates of R so that each pair reads two pairs of
// consecutive loads and can be replaced by a single dual-multiply intrinsic.
// Returns true when at least one pair was formed.
bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {

  // Not enough mul operations to make a pair.
  if (R.getMuls().size() < 2)
    return false;

  // Check that the muls operate directly upon sign extended loads.
  for (auto &MulCand : R.getMuls()) {
    if (!MulCand->HasTwoLoadInputs())
      return false;
  }

  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
    // The first elements of each vector should be loads with sexts. If we
    // find that its two pairs of consecutive loads, then these can be
    // transformed into two wider loads and the users can be replaced with
    // DSP intrinsics.
    // The static_casts are justified by the HasTwoLoadInputs check above.
    auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
    auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
    auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
    auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);

    // Check that each mul is operating on two different loads.
    if (Ld0 == Ld2 || Ld1 == Ld3)
      return false;

    if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
      if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        R.AddMulPair(PMul0, PMul1);
        return true;
      } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
        // The second pair is consecutive in the reversed order: use the
        // halfword-exchanging form of the intrinsic.
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
        R.AddMulPair(PMul0, PMul1, true);
        return true;
      }
    } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
               AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
      LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
      LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
      LLVM_DEBUG(dbgs() << "    and swapping muls\n");
      // Only the second operand can be exchanged, so swap the muls.
      R.AddMulPair(PMul1, PMul0, true);
      return true;
    }
    return false;
  };

  // Greedy quadratic search: pair each unpaired mul with the first unpaired
  // partner that CanPair accepts.
  MulCandList &Muls = R.getMuls();
  const unsigned Elems = Muls.size();
  for (unsigned i = 0; i < Elems; ++i) {
    MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
    if (PMul0->Paired)
      continue;

    for (unsigned j = 0; j < Elems; ++j) {
      if (i == j)
        continue;

      MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
      if (PMul1->Paired)
        continue;

      const Instruction *Mul0 = PMul0->Root;
      const Instruction *Mul1 = PMul1->Root;
      if (Mul0 == Mul1)
        continue;

      assert(PMul0 != PMul1 && "expected different chains");

      if (CanPair(R, PMul0, PMul1))
        break;
    }
  }
  return !R.getMulPairs().empty();
}
625 
// Emit the rewritten reduction: accumulate any unpaired muls as plain adds,
// then chain an smlad/smlald (or exchanging variant) call for each mul pair,
// and finally replace all uses of the reduction root with the result.
void ARMParallelDSP::InsertParallelMACs(Reduction &R) {

  auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
                         Value *Acc, bool Exchange,
                         Instruction *InsertAfter) {
    // Replace the reduction chain with an intrinsic call

    Value* Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
    // Pick the 32-bit (smlad*) or 64-bit (smlald*) accumulating intrinsic,
    // and the halfword-exchanging form when Exchange is set.
    if (Exchange)
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
    else
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);

    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
    NumSMLAD++;
    return Call;
  };

  // Return the instruction after the dominated instruction.
  auto GetInsertPoint = [this](Value *A, Value *B) {
    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
           "expected at least one instruction");

    Value *V = nullptr;
    if (!isa<Instruction>(A))
      V = B;
    else if (!isa<Instruction>(B))
      V = A;
    else
      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;

    return &*++BasicBlock::iterator(cast<Instruction>(V));
  };

  Value *Acc = R.getAccumulator();

  // For any muls that were discovered but not paired, accumulate their values
  // as before.
  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
  MulCandList &MulCands = R.getMuls();
  for (auto &MulCand : MulCands) {
    if (MulCand->Paired)
      continue;

    Instruction *Mul = cast<Instruction>(MulCand->Root);
    LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");

    // A 64-bit reduction needs the 32-bit mul widened before it can be
    // accumulated.
    if (R.getType() != Mul->getType()) {
      assert(R.is64Bit() && "expected 64-bit result");
      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
    }

    if (!Acc) {
      Acc = Mul;
      continue;
    }

    // If Acc is the original incoming value to the reduction, it could be a
    // phi. But the phi will dominate Mul, meaning that Mul will be the
    // insertion point.
    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
    Acc = Builder.CreateAdd(Mul, Acc);
  }

  // Ensure the accumulator exists and has the reduction's type before the
  // intrinsic chain is built.
  if (!Acc) {
    Acc = R.is64Bit() ?
      ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
      ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
  } else if (Acc->getType() != R.getType()) {
    Builder.SetInsertPoint(R.getRoot());
    Acc = Builder.CreateSExt(Acc, R.getType());
  }

  // Roughly sort the mul pairs in their program order.
  llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
    const Instruction *A = PairA.first->Root;
    const Instruction *B = PairB.first->Root;
    return A->comesBefore(B);
  });

  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  for (auto &Pair : R.getMulPairs()) {
    MulCandidate *LHSMul = Pair.first;
    MulCandidate *RHSMul = Pair.second;
    LoadInst *BaseLHS = LHSMul->getBaseLoad();
    LoadInst *BaseRHS = RHSMul->getBaseLoad();
    // Reuse a wide load if one was already created for this base load.
    LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
      WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
    LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
      WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);

    // The call must be inserted after both wide loads and the accumulator.
    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
    InsertAfter = GetInsertPoint(InsertAfter, Acc);
    Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
  }
  R.UpdateRoot(cast<Instruction>(Acc));
}
731 
// Replace two consecutive narrow loads with one load of LoadTy, then rebuild
// the two original sign-extended values from the wide load via trunc/lshr so
// existing users remain valid. Returns the new wide load.
LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
                                         IntegerType *LoadTy) {
  assert(Loads.size() == 2 && "currently only support widening two loads");

  LoadInst *Base = Loads[0];
  LoadInst *Offset = Loads[1];

  // RecordMemoryOps only collected loads whose single user is a sext.
  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());

  assert((BaseSExt && OffsetSExt)
         && "Loads should have a single, extending, user");

  // Recursively hoist an instruction (and its operands) up to just before
  // its sink so that program order stays valid after widening. Explicit
  // std::function because the lambda recurses into itself.
  std::function<void(Value*, Value*)> MoveBefore =
    [&](Value *A, Value *B) -> void {
      if (!isa<Instruction>(A) || !isa<Instruction>(B))
        return;

      auto *Source = cast<Instruction>(A);
      auto *Sink = cast<Instruction>(B);

      // Nothing to do if Source already precedes Sink; never move across
      // blocks or move/hoist past phi nodes.
      if (DT->dominates(Source, Sink) ||
          Source->getParent() != Sink->getParent() ||
          isa<PHINode>(Source) || isa<PHINode>(Sink))
        return;

      Source->moveBefore(Sink);
      for (auto &Op : Source->operands())
        MoveBefore(Op, Source);
    };

  // Insert the load at the point of the original dominating load.
  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
                          ++BasicBlock::iterator(DomLoad));

  // Bitcast the pointer to a wider type and create the wide load, while making
  // sure to maintain the original alignment as this prevents ldrd from being
  // generated when it could be illegal due to memory alignment.
  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
                                    LoadTy->getPointerTo(AddrSpace));
  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());

  // Make sure everything is in the correct order in the basic block.
  MoveBefore(Base->getPointerOperand(), VecPtr);
  MoveBefore(VecPtr, WideLoad);

  // From the wide load, create two values that equal the original two loads.
  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
  // TODO: Support big-endian as well.
  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
  BaseSExt->replaceAllUsesWith(NewBaseSExt);

  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);

  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
             << *Base << "\n" << *Offset << "\n"
             << "Created Wide Load:\n"
             << *WideLoad << "\n"
             << *Bottom << "\n"
             << *NewBaseSExt << "\n"
             << *Top << "\n"
             << *Trunc << "\n"
             << *NewOffsetSExt << "\n");
  // Cache the widened load so later pairs sharing this base can reuse it.
  WideLoads.emplace(std::make_pair(Base,
                                   std::make_unique<WidenedLoad>(Loads, WideLoad)));
  return WideLoad;
}
807 
809  return new ARMParallelDSP();
810 }
811 
// Unique address used by the legacy pass manager to identify this pass.
char ARMParallelDSP::ID = 0;

// Register the pass with the legacy pass registry under "arm-parallel-dsp".
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
                "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
                "Transform functions to use DSP intrinsics", false, false)
llvm::Check::Size
@ Size
Definition: FileCheck.h:77
i
i
Definition: README.txt:29
ARMSubtarget.h
use
Move duplicate certain instructions close to their use
Definition: Localizer.cpp:32
functions
amdgpu propagate attributes Late propagate attributes from kernels to functions
Definition: AMDGPUPropagateAttributes.cpp:196
AssumptionCache.h
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
NoFolder.h
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1421
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
llvm::ARMSubtarget
Definition: ARMSubtarget.h:47
Scalar.h
llvm::Function
Definition: Function.h:60
llvm::Value::dump
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4831
Pass.h
is64Bit
static bool is64Bit(const char *name)
Definition: X86Disassembler.cpp:1018
NumLoadLimit
static cl::opt< unsigned > NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), cl::desc("Limit the number of loads analysed"))
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1183
Statistic.h
LoopAccessAnalysis.h
llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2507
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
GlobalsModRef.h
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
to
Should compile to
Definition: README.txt:449
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:381
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition: Instruction.cpp:110
llvm::isModOrRefSet
bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:161
dsp
arm parallel dsp
Definition: ARMParallelDSP.cpp:816
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1538
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:877
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
PassRegistry.h
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:241
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::AAResults
Definition: AliasAnalysis.h:469
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
TargetLibraryInfo.h
false
Definition: StackSlotColoring.cpp:141
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::Instruction
Definition: Instruction.h:42
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:302
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:879
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", "Transform functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2166
SmallPtrSet.h
PatternMatch.h
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:155
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::codeview::ClassOptions::Intrinsic
@ Intrinsic
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:196
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::cl::opt< bool >
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:267
llvm::pdb::OMFSegDescFlags::Read
@ Read
llvm::AMDGPU::Hwreg::Offset
Offset
Definition: SIDefines.h:416
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:468
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:439
intrinsics
arm parallel Transform functions to use DSP intrinsics
Definition: ARMParallelDSP.cpp:817
TargetPassConfig.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:82
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
ARM.h
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
getType
static M68kRelType getType(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel)
Definition: M68kELFObjectWriter.cpp:48
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:70
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:263
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::cl::Sink
@ Sink
Definition: CommandLine.h:167
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:1829
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::createARMParallelDSPPass
Pass * createARMParallelDSPPass()
Definition: ARMParallelDSP.cpp:808
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
j
return j(j<< 16)
get
Should compile to something r4 addze r3 instead we get
Definition: README.txt:24
llvm::LocationSize::beforeOrAfterPointer
constexpr static LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
Definition: MemoryLocation.h:130
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:348
llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1563
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:774
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:222
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:598
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
AA
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:1307
DisableParallelDSP
static cl::opt< bool > DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass"))
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::GlobalsAAWrapperPass
Legacy wrapper pass to provide the GlobalsAAResult object.
Definition: GlobalsModRef.h:144
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
llvm::GlobalValue::getType
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:276
llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:97
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::pdb::OMFSegDescFlags::Write
@ Write
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::cl::desc
Definition: CommandLine.h:412
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
BasicBlockUtils.h
Reduction
loop Loop Strength Reduction
Definition: LoopStrengthReduce.cpp:6701
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
Debug.h
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:210
llvm::sampleprof::Base
@ Base
Definition: Discriminator.h:58
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38