//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(false),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

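// Example (sketch, not part of the original source): the cl::opt flags above
// are ordinary LLVM command-line options, so any tool that links this backend
// can toggle them, e.g. "llc -march=amdgcn -amdgpu-sdwa-peephole=0 foo.ll".
// A standalone driver only needs to parse its argv; the function name below is
// hypothetical.
static int parseBackendFlags(int argc, char **argv) {
  // Parses every registered cl::opt, including the static options above,
  // as long as the AMDGPU backend is linked into the binary.
  cl::ParseCommandLineOptions(argc, argv, "AMDGPU backend flag demo\n");
  return 0;
}
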
extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return llvm::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);

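// Example (sketch, not part of the original source): a downstream target can
// expose an additional scheduler under its own -misched=<name> key by
// following the same pattern as the registrations above. The factory and
// registry names below are hypothetical; GCNMaxOccupancySchedStrategy is the
// real strategy reused from above.
static ScheduleDAGInstrs *createExampleScheduler(MachineSchedContext *C) {
  auto *DAG = new ScheduleDAGMILive(
      C, llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
  // Cluster neighbouring memory operations, as the GCN schedulers above do.
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static MachineSchedRegistry
ExampleSchedRegistry("gcn-example", "Hypothetical example scheduler",
                     createExampleScheduler);
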
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}
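
// Example (sketch, not part of the original source) of what the amdgcn layout
// string above implies once parsed into a DataLayout: generic/flat and global
// pointers (address spaces 0 and 1) are 64-bit, while private scratch pointers
// (address space 5, also the alloca address space per the "A5" component) are
// 32-bit.
#include "llvm/IR/DataLayout.h"
#include <cassert>

static void checkAMDGCNPointerSizes() {
  DataLayout DL(
      "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
      "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
      "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5");
  assert(DL.getPointerSizeInBits(0) == 64); // flat/generic
  assert(DL.getPointerSizeInBits(1) == 64); // global
  assert(DL.getPointerSizeInBits(5) == 32); // private (scratch)
  assert(DL.getAllocaAddrSpace() == 5);     // "A5" above
  (void)DL;
}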

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  if (TT.getArch() == Triple::amdgcn)
    return "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
  if (CM)
    return *CM;
  return CodeModel::Small;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM),
                      getEffectiveCodeModel(CM), OptLevel),
    TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}
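
// Example (sketch, not part of the original source): getGPUName and
// getFeatureString read the standard "target-cpu" / "target-features"
// function attributes, so IR can request a per-function subtarget like this.
// The function name and feature string below are purely illustrative.
#include "llvm/IR/Module.h"

static void tagFunctionForGfx906(Module &M) {
  if (Function *F = M.getFunction("example_kernel")) { // hypothetical name
    F->addFnAttr("target-cpu", "gfx906");
    F->addFnAttr("target-features", "+fp32-denormals"); // illustrative
  }
}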

static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
    if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
      AAR.addAAResult(WrapperPass->getResult());
  });
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}
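
// Example (sketch, not part of the original source): this predicate is what
// adjustPassManager (below) hands to createInternalizePass. Run on its own it
// looks like this; pairing it with GlobalDCE mirrors that setup, so everything
// the predicate rejects is internalized and then dead-stripped.
#include "llvm/IR/Module.h"

static void internalizeNonKernels(Module &M) {
  legacy::PassManager PM;
  PM.add(createInternalizePass(mustPreserveGV)); // keep kernels and declarations
  PM.add(createGlobalDCEPass());                 // drop what is now dead
  PM.run(M);
}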

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableAMDGPUFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
                                         legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
                                      legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(createAMDGPUSimplifyLibCallsPass(Opt));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}
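
// Example (sketch, not part of the original source): how a frontend-style
// driver gives the hook above a chance to customize the middle-end pipeline.
// The TargetMachine is assumed to already exist (e.g. a GCNTargetMachine).
static void populateWithTargetExtensions(TargetMachine &TM,
                                         legacy::PassManager &MPM) {
  PassManagerBuilder Builder;
  Builder.OptLevel = 2;
  TM.adjustPassManager(Builder);          // installs the extensions above
  Builder.populateModulePassManager(MPM); // extensions fire at their EP points
}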

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}
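
// Example (sketch, not part of the original source): the per-function
// subtargets cached above are reached through TargetMachine::getSubtarget<>(F),
// so two functions with different "target-cpu" / "target-features" attributes
// get distinct GCNSubtarget instances.
static bool siSchedulerRequested(const GCNTargetMachine &TM, const Function &F) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  return ST.enableSIScheduler();
}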

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPULowerIntrinsicsPass());

  if (TM.getTargetTriple().getArch() == Triple::r600 ||
      !EnableAMDGPUFunctionCalls) {
    // Function calls are not supported, so make sure we inline everything.
    addPass(createAMDGPUAlwaysInlinePass());
    addPass(createAlwaysInlinerLegacyPass());
    // We need to add the barrier noop pass, otherwise adding the function
    // inlining pass will cause all of the PassConfigs passes to be run
    // one function at a time, which means if we have a module with two
    // functions, then we will generate code for the first function
    // without ever running any passes on the second.
    addPass(createBarrierNoopPass());
  }

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.

    addPass(createAMDGPUCodeGenPreparePass());
  }

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(createAMDGPUAnnotateKernelFeaturesPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  //
  // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
  // be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}
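
// Example (sketch, not part of the original source; error handling minimal):
// how the machines defined in this file are normally reached. Initialize the
// registered targets, look up the GCN target by triple, and ask the registry
// for a TargetMachine; the triple and CPU strings are illustrative.
#include "llvm/Support/TargetSelect.h"

static TargetMachine *createExampleGCNTargetMachine() {
  InitializeAllTargetInfos();
  InitializeAllTargets();   // runs LLVMInitializeAMDGPUTarget() above
  InitializeAllTargetMCs();

  std::string Error;
  const Target *T = TargetRegistry::lookupTarget("amdgcn--amdhsa", Error);
  if (!T)
    return nullptr;

  TargetOptions Opts;
  return T->createTargetMachine("amdgcn--amdhsa", "gfx900", /*Features=*/"",
                                Opts, Reloc::PIC_);
}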