//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
/// This pass runs only when unaligned accesses are supported/enabled.
//
//===----------------------------------------------------------------------===//

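// As an illustrative example (not part of the original source), a C kernel
// such as
//
//   int16_t a[N], b[N];
//   int32_t acc = 0;
//   for (unsigned i = 0; i < N; ++i)
//     acc += a[i] * b[i];
//
// produces chains of sign-extended i16 loads feeding 32-bit multiplies and
// adds; each smlad consumes two lanes of such a chain per iteration.
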
#include "ARM.h"
#include "ARMSubtarget.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "arm-parallel-dsp"

STATISTIC(NumSMLAD, "Number of smlad instructions generated");

static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                   cl::desc("Disable the ARM Parallel DSP pass"));

static cl::opt<unsigned>
NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
             cl::desc("Limit the number of loads analysed"));

namespace {
  struct MulCandidate;
  class Reduction;

  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
  using MemInstList = SmallVectorImpl<LoadInst*>;
  using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>;

  // 'MulCandidate' holds the multiplication instructions that are candidates
  // for parallel execution.
  struct MulCandidate {
    Instruction *Root;
    Value *LHS;
    Value *RHS;
    bool Exchange = false;
    bool Paired = false;
    SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.

    MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
      Root(I), LHS(lhs), RHS(rhs) { }

    bool HasTwoLoadInputs() const {
      return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
    }

    LoadInst *getBaseLoad() const {
      return VecLd.front();
    }
  };

  /// Represent a sequence of multiply-accumulate operations with the aim to
  /// perform the multiplications in parallel.
  class Reduction {
    Instruction *Root = nullptr;
    Value *Acc = nullptr;
    MulCandList Muls;
    MulPairList MulPairs;
    SetVector<Instruction*> Adds;

  public:
    Reduction() = delete;

    Reduction(Instruction *Add) : Root(Add) { }

    /// Record an Add instruction that is a part of this reduction.
    void InsertAdd(Instruction *I) { Adds.insert(I); }

    /// Create MulCandidates, each rooted at a Mul instruction, that are a
    /// part of this reduction.
    void InsertMuls() {
      auto GetMulOperand = [](Value *V) -> Instruction* {
        if (auto *SExt = dyn_cast<SExtInst>(V)) {
          if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
            if (I->getOpcode() == Instruction::Mul)
              return I;
        } else if (auto *I = dyn_cast<Instruction>(V)) {
          if (I->getOpcode() == Instruction::Mul)
            return I;
        }
        return nullptr;
      };

      auto InsertMul = [this](Instruction *I) {
        Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
        Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
        Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
      };

      for (auto *Add : Adds) {
        if (Add == Acc)
          continue;
        if (auto *Mul = GetMulOperand(Add->getOperand(0)))
          InsertMul(Mul);
        if (auto *Mul = GetMulOperand(Add->getOperand(1)))
          InsertMul(Mul);
      }
    }

    /// Add the incoming accumulator value; returns true if a value had not
    /// already been added. Returning false signals to the user that this
    /// reduction already has a value to initialise the accumulator.
    bool InsertAcc(Value *V) {
      if (Acc)
        return false;
      Acc = V;
      return true;
    }

    /// Set two MulCandidates, rooted at muls, that can be executed as a single
    /// parallel operation.
    void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
                    bool Exchange = false) {
      LLVM_DEBUG(dbgs() << "Pairing:\n"
                 << *Mul0->Root << "\n"
                 << *Mul1->Root << "\n");
      Mul0->Paired = true;
      Mul1->Paired = true;
      if (Exchange)
        Mul1->Exchange = true;
      MulPairs.push_back(std::make_pair(Mul0, Mul1));
    }

    /// Return the add instruction which is the root of the reduction.
    Instruction *getRoot() { return Root; }

    bool is64Bit() const { return Root->getType()->isIntegerTy(64); }

    Type *getType() const { return Root->getType(); }

    /// Return the incoming value to be accumulated. This may be null.
    Value *getAccumulator() { return Acc; }

    /// Return the set of adds that comprise the reduction.
    SetVector<Instruction*> &getAdds() { return Adds; }

    /// Return the MulCandidates, each rooted at a mul instruction, that
    /// comprise the reduction.
    MulCandList &getMuls() { return Muls; }

    /// Return the MulCandidate pairs, rooted at mul instructions, that have
    /// been paired for parallel execution.
    MulPairList &getMulPairs() { return MulPairs; }

    /// To finalise, replace the uses of the root with the intrinsic call.
    void UpdateRoot(Instruction *SMLAD) {
      Root->replaceAllUsesWith(SMLAD);
    }

    void dump() {
      LLVM_DEBUG(dbgs() << "Reduction:\n";
        for (auto *Add : Adds)
          LLVM_DEBUG(dbgs() << *Add << "\n");
        for (auto &Mul : Muls)
          LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
                     << "  " << *Mul->LHS << "\n"
                     << "  " << *Mul->RHS << "\n");
        LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
      );
    }
  };

  class WidenedLoad {
    LoadInst *NewLd = nullptr;
    SmallVector<LoadInst*, 4> Loads;

  public:
    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
      : NewLd(Wide) {
      append_range(Loads, Lds);
    }
    LoadInst *getLoad() {
      return NewLd;
    }
  };

  class ARMParallelDSP : public FunctionPass {
    ScalarEvolution *SE;
    AliasAnalysis *AA;
    TargetLibraryInfo *TLI;
    DominatorTree *DT;
    const DataLayout *DL;
    Module *M;
    std::map<LoadInst*, LoadInst*> LoadPairs;
    SmallPtrSet<LoadInst*, 4> OffsetLoads;
    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;

    template<unsigned>
    bool IsNarrowSequence(Value *V);
    bool Search(Value *V, BasicBlock *BB, Reduction &R);
    bool RecordMemoryOps(BasicBlock *BB);
    void InsertParallelMACs(Reduction &Reduction);
    bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
    LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
    bool CreateParallelPairs(Reduction &R);

    /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
    /// Dual performs two signed 16x16-bit multiplications. It adds the
    /// products to a 32-bit accumulate operand. Optionally, the instruction
    /// can exchange the halfwords of the second operand before performing the
    /// arithmetic.
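    ///
    /// Illustrative semantics, as a sketch (the halfword selection notation
    /// below is for exposition only, not code from this pass):
    ///
    ///   smlad(a, b, acc)  = acc + sext(a[15:0])  * sext(b[15:0])
    ///                           + sext(a[31:16]) * sext(b[31:16])
    ///   smladx(a, b, acc) = acc + sext(a[15:0])  * sext(b[31:16])
    ///                           + sext(a[31:16]) * sext(b[15:0])
    ///
    /// The 64-bit accumulating forms, smlald and smlaldx, are analogous.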
    bool MatchSMLAD(Function &F);

  public:
    static char ID;

    ARMParallelDSP() : FunctionPass(ID) { }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      FunctionPass::getAnalysisUsage(AU);
      AU.addRequired<AssumptionCacheTracker>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<AAResultsWrapperPass>();
      AU.addRequired<TargetLibraryInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<TargetPassConfig>();
      AU.addPreserved<ScalarEvolutionWrapperPass>();
      AU.addPreserved<GlobalsAAWrapperPass>();
      AU.setPreservesCFG();
    }

    bool runOnFunction(Function &F) override {
      if (DisableParallelDSP)
        return false;
      if (skipFunction(F))
        return false;

      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      auto &TPC = getAnalysis<TargetPassConfig>();

      M = F.getParent();
      DL = &M->getDataLayout();

      auto &TM = TPC.getTM<TargetMachine>();
      auto *ST = &TM.getSubtarget<ARMSubtarget>(F);

      if (!ST->allowsUnalignedMem()) {
        LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
                             "running pass ARMParallelDSP\n");
        return false;
      }

      if (!ST->hasDSP()) {
        LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
                             "ARMParallelDSP\n");
        return false;
      }

      if (!ST->isLittle()) {
        LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
                          << "ARMParallelDSP\n");
        return false;
      }

      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
      LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

      bool Changes = MatchSMLAD(F);
      return Changes;
    }
  };
}

bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
                                        MemInstList &VecMem) {
  if (!Ld0 || !Ld1)
    return false;

  auto It = LoadPairs.find(Ld0);
  if (It == LoadPairs.end() || It->second != Ld1)
    return false;

  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
             dbgs() << "Ld0:"; Ld0->dump();
             dbgs() << "Ld1:"; Ld1->dump();
            );

  VecMem.clear();
  VecMem.push_back(Ld0);
  VecMem.push_back(Ld1);
  return true;
}

// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
// instructions, which is set to 16. So here we should collect all i8 and i16
// narrow operations.
// TODO: we currently only collect i16, and will support i8 later, so that's
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
template<unsigned MaxBitWidth>
bool ARMParallelDSP::IsNarrowSequence(Value *V) {
  if (auto *SExt = dyn_cast<SExtInst>(V)) {
    if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
      return false;

    if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
      // Check that this load could be paired.
      return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
    }
  }
  return false;
}

/// Iterate through the block and record base, offset pairs of loads which can
/// be widened into a single load.
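/// For instance (an illustrative sketch, not code from this pass): two
/// adjacent i16 loads from %p and %p+2 would be recorded as a (base, offset)
/// pair, which CreateWideLoad can later turn into one i32 load whose low and
/// high halfwords replace the original loads.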
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
  SmallVector<LoadInst*, 8> Loads;
  SmallVector<Instruction*, 8> Writes;
  LoadPairs.clear();
  WideLoads.clear();

  // Collect loads and instructions that may write to memory. For now we only
  // record loads which are simple, sign-extended and have a single user.
  // TODO: Allow zero-extended loads.
  for (auto &I : *BB) {
    if (I.mayWriteToMemory())
      Writes.push_back(&I);
    auto *Ld = dyn_cast<LoadInst>(&I);
    if (!Ld || !Ld->isSimple() ||
        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
      continue;
    Loads.push_back(Ld);
  }

  if (Loads.empty() || Loads.size() > NumLoadLimit)
    return false;

  using InstSet = std::set<Instruction*>;
  using DepMap = std::map<Instruction*, InstSet>;
  DepMap RAWDeps;

  // Record any writes that may alias a load.
  const auto Size = LocationSize::beforeOrAfterPointer();
  for (auto *Write : Writes) {
    for (auto *Read : Loads) {
      MemoryLocation ReadLoc =
        MemoryLocation(Read->getPointerOperand(), Size);

      if (!isModOrRefSet(AA->getModRefInfo(Write, ReadLoc)))
        continue;
      if (Write->comesBefore(Read))
        RAWDeps[Read].insert(Write);
    }
  }

  // Check that there isn't a write between the two loads which would prevent
  // them from being safely merged.
  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
    bool BaseFirst = Base->comesBefore(Offset);
    LoadInst *Dominator = BaseFirst ? Base : Offset;
    LoadInst *Dominated = BaseFirst ? Offset : Base;

    if (auto It = RAWDeps.find(Dominated); It != RAWDeps.end()) {
      InstSet &WritesBefore = It->second;

      for (auto *Before : WritesBefore) {
        // We can't move the second load backward, past a write, to merge
        // with the first load.
        if (Dominator->comesBefore(Before))
          return false;
      }
    }
    return true;
  };

  // Record base, offset load pairs.
  for (auto *Base : Loads) {
    for (auto *Offset : Loads) {
      if (Base == Offset || OffsetLoads.count(Offset))
        continue;

      if (isConsecutiveAccess(Base, Offset, *DL, *SE) &&
          SafeToPair(Base, Offset)) {
        LoadPairs[Base] = Offset;
        OffsetLoads.insert(Offset);
        break;
      }
    }
  }

  LLVM_DEBUG(if (!LoadPairs.empty()) {
               dbgs() << "Consecutive load pairs:\n";
               for (auto &MapIt : LoadPairs) {
                 LLVM_DEBUG(dbgs() << *MapIt.first << ", "
                            << *MapIt.second << "\n");
               }
             });
  return LoadPairs.size() > 1;
}

// Search recursively back through the operands to find a tree of values that
// form a multiply-accumulate chain. The search records the Add and Mul
// instructions that form the reduction and allows us to find a single value
// to be used as the initial input to the accumulator.
bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
  // If we find a non-instruction, try to use it as the initial accumulator
  // value. This may have already been found during the search in which case
  // this function will return false, signaling a search fail.
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return R.InsertAcc(V);

  if (I->getParent() != BB)
    return false;

  switch (I->getOpcode()) {
  default:
    break;
  case Instruction::PHI:
    // Could be the accumulator value.
    return R.InsertAcc(V);
  case Instruction::Add: {
    // Adds should be adding together two muls, or another add and a mul to
    // be within the mac chain. One of the operands may also be the
    // accumulator value at which point we should stop searching.
    R.InsertAdd(I);
    Value *LHS = I->getOperand(0);
    Value *RHS = I->getOperand(1);
    bool ValidLHS = Search(LHS, BB, R);
    bool ValidRHS = Search(RHS, BB, R);

    if (ValidLHS && ValidRHS)
      return true;

    // Ensure we don't add the root as the incoming accumulator.
    if (R.getRoot() == I)
      return false;

    return R.InsertAcc(I);
  }
  case Instruction::Mul: {
    Value *MulOp0 = I->getOperand(0);
    Value *MulOp1 = I->getOperand(1);
    return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
  }
  case Instruction::SExt:
    return Search(I->getOperand(0), BB, R);
  }
  return false;
}

// The pass needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find an integer add, then look for this pattern:
//
// acc0 = ...
// ld0 = load i16
// sext0 = sext i16 %ld0 to i32
// ld1 = load i16
// sext1 = sext i16 %ld1 to i32
// mul0 = mul %sext0, %sext1
// ld2 = load i16
// sext2 = sext i16 %ld2 to i32
// ld3 = load i16
// sext3 = sext i16 %ld3 to i32
// mul1 = mul i32 %sext2, %sext3
// add0 = add i32 %mul0, %acc0
// acc1 = add i32 %add0, %mul1
//
// Which can be selected to:
//
// ldr r0
// ldr r1
// smlad r2, r0, r1, r2
//
// If constants are used instead of loads, these will need to be hoisted
// out and into a register.
//
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
bool ARMParallelDSP::MatchSMLAD(Function &F) {
  bool Changed = false;

  for (auto &BB : F) {
    SmallPtrSet<Instruction*, 4> AllAdds;
    if (!RecordMemoryOps(&BB))
      continue;

    for (Instruction &I : reverse(BB)) {
      if (I.getOpcode() != Instruction::Add)
        continue;

      if (AllAdds.count(&I))
        continue;

      const auto *Ty = I.getType();
      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
        continue;

      Reduction R(&I);
      if (!Search(&I, &BB, R))
        continue;

      R.InsertMuls();
      LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());

      if (!CreateParallelPairs(R))
        continue;

      InsertParallelMACs(R);
      Changed = true;
      AllAdds.insert_range(R.getAdds());
      LLVM_DEBUG(dbgs() << "BB after inserting parallel MACs:\n" << BB);
    }
  }

  return Changed;
}

bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {

  // Not enough mul operations to make a pair.
  if (R.getMuls().size() < 2)
    return false;

  // Check that the muls operate directly upon sign extended loads.
  for (auto &MulCand : R.getMuls()) {
    if (!MulCand->HasTwoLoadInputs())
      return false;
  }

  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
    // The first elements of each vector should be loads with sexts. If we
    // find that it's two pairs of consecutive loads, then these can be
    // transformed into two wider loads and the users can be replaced with
    // DSP intrinsics.
    auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
    auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
    auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
    auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);

    // Check that each mul is operating on two different loads.
    if (Ld0 == Ld2 || Ld1 == Ld3)
      return false;

    if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
      if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        R.AddMulPair(PMul0, PMul1);
        return true;
      } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
        R.AddMulPair(PMul0, PMul1, true);
        return true;
      }
    } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
               AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
      LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
      LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
      LLVM_DEBUG(dbgs() << "    and swapping muls\n");
      // Only the second operand can be exchanged, so swap the muls.
      R.AddMulPair(PMul1, PMul0, true);
      return true;
    }
    return false;
  };

  MulCandList &Muls = R.getMuls();
  const unsigned Elems = Muls.size();
  for (unsigned i = 0; i < Elems; ++i) {
    MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
    if (PMul0->Paired)
      continue;

    for (unsigned j = 0; j < Elems; ++j) {
      if (i == j)
        continue;

      MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
      if (PMul1->Paired)
        continue;

      const Instruction *Mul0 = PMul0->Root;
      const Instruction *Mul1 = PMul1->Root;
      if (Mul0 == Mul1)
        continue;

      assert(PMul0 != PMul1 && "expected different chains");

      if (CanPair(R, PMul0, PMul1))
        break;
    }
  }
  return !R.getMulPairs().empty();
}

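// Build the intrinsic calls for the reduction: each paired MulCandidate
// becomes one smlad/smlald call (possibly the exchanging variant), while any
// muls left unpaired are accumulated with ordinary adds, as before.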
void ARMParallelDSP::InsertParallelMACs(Reduction &R) {

  auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
                         Value *Acc, bool Exchange,
                         Instruction *InsertAfter) {
    // Replace the reduction chain with an intrinsic call

    Value* Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
    if (Exchange)
      SMLAD =
          Acc->getType()->isIntegerTy(32)
              ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smladx)
              : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlaldx);
    else
      SMLAD = Acc->getType()->isIntegerTy(32)
                  ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlad)
                  : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlald);

    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
    NumSMLAD++;
    return Call;
  };

  // Return the instruction after the dominated instruction.
  auto GetInsertPoint = [this](Value *A, Value *B) {
    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
           "expected at least one instruction");

    Value *V = nullptr;
    if (!isa<Instruction>(A))
      V = B;
    else if (!isa<Instruction>(B))
      V = A;
    else
      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;

    return &*++BasicBlock::iterator(cast<Instruction>(V));
  };

  Value *Acc = R.getAccumulator();

  // For any muls that were discovered but not paired, accumulate their values
  // as before.
  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
  MulCandList &MulCands = R.getMuls();
  for (auto &MulCand : MulCands) {
    if (MulCand->Paired)
      continue;

    Instruction *Mul = cast<Instruction>(MulCand->Root);
    LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");

    if (R.getType() != Mul->getType()) {
      assert(R.is64Bit() && "expected 64-bit result");
      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
    }

    if (!Acc) {
      Acc = Mul;
      continue;
    }

    // If Acc is the original incoming value to the reduction, it could be a
    // phi. But the phi will dominate Mul, meaning that Mul will be the
    // insertion point.
    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
    Acc = Builder.CreateAdd(Mul, Acc);
  }

  if (!Acc) {
    Acc = R.is64Bit() ?
      ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
      ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
  } else if (Acc->getType() != R.getType()) {
    Builder.SetInsertPoint(R.getRoot());
    Acc = Builder.CreateSExt(Acc, R.getType());
  }

  // Roughly sort the mul pairs in their program order.
  llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
    const Instruction *A = PairA.first->Root;
    const Instruction *B = PairB.first->Root;
    return A->comesBefore(B);
  });

  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  for (auto &Pair : R.getMulPairs()) {
    MulCandidate *LHSMul = Pair.first;
    MulCandidate *RHSMul = Pair.second;
    LoadInst *BaseLHS = LHSMul->getBaseLoad();
    LoadInst *BaseRHS = RHSMul->getBaseLoad();
    auto LIt = WideLoads.find(BaseLHS);
    LoadInst *WideLHS = LIt != WideLoads.end()
                            ? LIt->second->getLoad()
                            : CreateWideLoad(LHSMul->VecLd, Ty);
    auto RIt = WideLoads.find(BaseRHS);
    LoadInst *WideRHS = RIt != WideLoads.end()
                            ? RIt->second->getLoad()
                            : CreateWideLoad(RHSMul->VecLd, Ty);

    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
    InsertAfter = GetInsertPoint(InsertAfter, Acc);
    Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
  }
  R.UpdateRoot(cast<Instruction>(Acc));
}

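// A sketch of the rewrite performed below, for a little-endian target (the
// only layout this pass runs on); the IR names are purely illustrative:
//
//   %ld0 = load i16, ptr %p        %wide = load i32, ptr %p
//   %ld1 = load i16, ptr %p+2  =>  %lo   = trunc i32 %wide to i16
//                                  %hi   = trunc i32 (lshr i32 %wide, 16)
//
// The original sext users are then rerouted to sexts of %lo and %hi.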
LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
                                         IntegerType *LoadTy) {
  assert(Loads.size() == 2 && "currently only support widening two loads");

  LoadInst *Base = Loads[0];
  LoadInst *Offset = Loads[1];

  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());

  assert((BaseSExt && OffsetSExt)
         && "Loads should have a single, extending, user");

  std::function<void(Value*, Value*)> MoveBefore =
    [&](Value *A, Value *B) -> void {
      if (!isa<Instruction>(A) || !isa<Instruction>(B))
        return;

      auto *Source = cast<Instruction>(A);
      auto *Sink = cast<Instruction>(B);

      if (DT->dominates(Source, Sink) ||
          Source->getParent() != Sink->getParent() ||
          isa<PHINode>(Source) || isa<PHINode>(Sink))
        return;

      Source->moveBefore(Sink->getIterator());
      for (auto &Op : Source->operands())
        MoveBefore(Op, Source);
    };

  // Insert the load at the point of the original dominating load.
  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
                          ++BasicBlock::iterator(DomLoad));

  // Create the wide load, while making sure to maintain the original alignment
  // as this prevents ldrd from being generated when it could be illegal due to
  // memory alignment.
  Value *VecPtr = Base->getPointerOperand();
  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());

  // Make sure everything is in the correct order in the basic block.
  MoveBefore(Base->getPointerOperand(), VecPtr);
  MoveBefore(VecPtr, WideLoad);

  // From the wide load, create two values that equal the original two loads.
  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
  // TODO: Support big-endian as well.
  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
  BaseSExt->replaceAllUsesWith(NewBaseSExt);

  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);

  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
             << *Base << "\n" << *Offset << "\n"
             << "Created Wide Load:\n"
             << *WideLoad << "\n"
             << *Bottom << "\n"
             << *NewBaseSExt << "\n"
             << *Top << "\n"
             << *Trunc << "\n"
             << *NewOffsetSExt << "\n");
  WideLoads.emplace(std::make_pair(Base,
                        std::make_unique<WidenedLoad>(Loads, WideLoad)));
  return WideLoad;
}

Pass *llvm::createARMParallelDSPPass() {
  return new ARMParallelDSP();
}

char ARMParallelDSP::ID = 0;

INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
                      "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
                    "Transform functions to use DSP intrinsics", false, false)