LLVM  12.0.0git
AMDGPUTargetMachine.cpp
Go to the documentation of this file.
1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// The AMDGPU target machine contains all of the hardware specific
11 /// information needed to emit code for R600 and SI GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUExportClustering.h"
21 #include "AMDGPULegalizerInfo.h"
22 #include "AMDGPUMacroFusion.h"
23 #include "AMDGPUTargetObjectFile.h"
25 #include "GCNIterativeScheduler.h"
26 #include "GCNSchedStrategy.h"
28 #include "R600MachineScheduler.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "SIMachineScheduler.h"
38 #include "llvm/CodeGen/Passes.h"
40 #include "llvm/IR/Attributes.h"
41 #include "llvm/IR/Function.h"
43 #include "llvm/InitializePasses.h"
44 #include "llvm/Pass.h"
46 #include "llvm/Support/Compiler.h"
49 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Scalar.h"
54 #include "llvm/Transforms/Utils.h"
56 #include <memory>
57 
58 using namespace llvm;
59 
// NOTE(review): this region was mangled during extraction. The declarator
// lines of most cl::opt definitions (e.g. "static cl::opt<bool> EnableSROA(")
// and the body of the target-registration function (apparently
// LLVMInitializeAMDGPUTarget, per the trailing index dump) are missing —
// recover them from upstream LLVM before compiling. Only the option strings,
// descriptions, and defaults visible below are grounded.
61  "r600-ir-structurize",
62  cl::desc("Use StructurizeCFG IR pass"),
63  cl::init(true));
64 
66  "amdgpu-sroa",
67  cl::desc("Run SROA after promote alloca pass"),
69  cl::init(true));
70 
71 static cl::opt<bool>
72 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
73  cl::desc("Run early if-conversion"),
74  cl::init(false));
75 
76 static cl::opt<bool>
77 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
78  cl::desc("Run pre-RA exec mask optimizations"),
79  cl::init(true));
80 
82  "r600-if-convert",
83  cl::desc("Use if conversion pass"),
85  cl::init(true));
86 
87 // Option to disable vectorizer for tests.
89  "amdgpu-load-store-vectorizer",
90  cl::desc("Enable load store vectorizer"),
91  cl::init(true),
92  cl::Hidden);
93 
94 // Option to control global loads scalarization
96  "amdgpu-scalarize-global-loads",
97  cl::desc("Enable global load scalarization"),
98  cl::init(true),
99  cl::Hidden);
100 
101 // Option to run internalize pass.
103  "amdgpu-internalize-symbols",
104  cl::desc("Enable elimination of non-kernel functions and unused globals"),
105  cl::init(false),
106  cl::Hidden);
107 
108 // Option to inline all early.
110  "amdgpu-early-inline-all",
111  cl::desc("Inline all functions early"),
112  cl::init(false),
113  cl::Hidden);
114 
116  "amdgpu-sdwa-peephole",
117  cl::desc("Enable SDWA peepholer"),
118  cl::init(true));
119 
121  "amdgpu-dpp-combine",
122  cl::desc("Enable DPP combiner"),
123  cl::init(true));
124 
125 // Enable address space based alias analysis
126 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
127  cl::desc("Enable AMDGPU Alias Analysis"),
128  cl::init(true));
129 
130 // Option to run late CFG structurizer
132  "amdgpu-late-structurize",
133  cl::desc("Enable late CFG structurization"),
135  cl::Hidden);
136 
138  "amdgpu-function-calls",
139  cl::desc("Enable AMDGPU function call support"),
141  cl::init(true),
142  cl::Hidden);
143 
145  "amdgpu-fixed-function-abi",
146  cl::desc("Enable all implicit function arguments"),
148  cl::init(false),
149  cl::Hidden);
150 
151 // Enable lib calls simplifications
153  "amdgpu-simplify-libcall",
154  cl::desc("Enable amdgpu library simplifications"),
155  cl::init(true),
156  cl::Hidden);
157 
159  "amdgpu-ir-lower-kernel-arguments",
160  cl::desc("Lower kernel argument loads in IR pass"),
161  cl::init(true),
162  cl::Hidden);
163 
165  "amdgpu-reassign-regs",
166  cl::desc("Enable register reassign optimizations on gfx10+"),
167  cl::init(true),
168  cl::Hidden);
169 
170 // Enable atomic optimization
172  "amdgpu-atomic-optimizations",
173  cl::desc("Enable atomic optimizations"),
174  cl::init(false),
175  cl::Hidden);
176 
177 // Enable Mode register optimization
179  "amdgpu-mode-register",
180  cl::desc("Enable mode register pass"),
181  cl::init(true),
182  cl::Hidden);
183 
184 // Option is used in lit tests to prevent deadcoding of patterns inspected.
185 static cl::opt<bool>
186 EnableDCEInRA("amdgpu-dce-in-ra",
187  cl::init(true), cl::Hidden,
188  cl::desc("Enable machine DCE inside regalloc"));
189 
191  "amdgpu-scalar-ir-passes",
192  cl::desc("Enable scalar IR passes"),
193  cl::init(true),
194  cl::Hidden);
195 
197  "amdgpu-enable-structurizer-workarounds",
198  cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
199  cl::Hidden);
200 
// NOTE(review): lines 201-267 of the original (the registration function's
// signature and body) were lost in extraction; only the comment and the
// closing brace survive.
202  // Register the target
205 
268 }
269 
270 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
271  return std::make_unique<AMDGPUTargetObjectFile>();
272 }
273 
// NOTE(review): the factory signatures and several argument lines below were
// lost in extraction (e.g. each "static ScheduleDAGInstrs *createXxx(
// MachineSchedContext *C) {" header, the strategy arguments to
// GCNIterativeScheduler, and the MachineSchedRegistry declarator/ctor lines).
// Recover them from upstream LLVM before compiling.
275  return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
276 }
277 
279  return new SIScheduleDAGMI(C);
280 }
281 
282 static ScheduleDAGInstrs *
284  ScheduleDAGMILive *DAG =
285  new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
289  return DAG;
290 }
291 
292 static ScheduleDAGInstrs *
294  auto DAG = new GCNIterativeScheduler(C,
// Load-clustering mutation is attached so the scheduler can group loads.
296  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
297  return DAG;
298 }
299 
301  return new GCNIterativeScheduler(C,
303 }
304 
305 static ScheduleDAGInstrs *
307  auto DAG = new GCNIterativeScheduler(C,
309  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
310  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
311  return DAG;
312 }
313 
// Scheduler registrations: these make the factories above selectable by name
// (e.g. -misched=gcn-max-occupancy). The variable declarations and factory
// references were lost in extraction.
315 R600SchedRegistry("r600", "Run R600's custom scheduler",
317 
319 SISchedRegistry("si", "Run SI's custom scheduler",
321 
323 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
324  "Run GCN scheduler to maximize occupancy",
326 
328 IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
329  "Run GCN scheduler to maximize occupancy (experimental)",
331 
333 GCNMinRegSchedRegistry("gcn-minreg",
334  "Run GCN iterative scheduler for minimal register usage (experimental)",
336 
338 GCNILPSchedRegistry("gcn-ilp",
339  "Run GCN iterative scheduler for ILP scheduling (experimental)",
341 
342 static StringRef computeDataLayout(const Triple &TT) {
343  if (TT.getArch() == Triple::r600) {
344  // 32-bit pointers.
345  return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
346  "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
347  }
348 
349  // 32-bit private, local, and region pointers. 64-bit global, constant and
350  // flat, non-integral buffer fat pointers.
351  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
352  "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
353  "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
354  "-ni:7";
355 }
356 
358 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
359  if (!GPU.empty())
360  return GPU;
361 
362  // Need to default to a target with flat support for HSA.
363  if (TT.getArch() == Triple::amdgcn)
364  return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
365 
366  return "r600";
367 }
368 
// NOTE(review): several original lines are missing from this region — the
// getEffectiveRelocModel signature (line 369), parts of the constructor's
// parameter list and the wavefront-size feature assignments inside the
// `if`/`else if` (lines 375, 378-379, 388, 390), the destructor and the
// getGPUName declaration (lines 394-400), and the fallback operand of the
// feature-string conditional (line 409). Recover from upstream LLVM.
370  // The AMDGPU toolchain only supports generating shared objects, so we
371  // must always use PIC.
372  return Reloc::PIC_;
373 }
374 
// AMDGPUTargetMachine constructor: forwards the computed data layout, default
// GPU, reloc/code models to LLVMTargetMachine, then creates the TLOF.
376  StringRef CPU, StringRef FS,
377  TargetOptions Options,
380  CodeGenOpt::Level OptLevel)
381  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
382  FS, Options, getEffectiveRelocModel(RM),
383  getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
384  TLOF(createTLOF(getTargetTriple())) {
385  initAsmInfo();
386  if (TT.getArch() == Triple::amdgcn) {
// Wavefront-size bookkeeping — the assignments performed in each branch were
// lost in extraction; only the feature checks remain.
387  if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
389  else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
391  }
392 }
393 
397 
399 
// getGPUName: prefer the function's "target-cpu" attribute, else the TM CPU.
401  Attribute GPUAttr = F.getFnAttribute("target-cpu");
402  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
403 }
404 
// getFeatureString: prefer the function's "target-features" attribute; the
// fallback expression (original line 409) is missing here.
406  Attribute FSAttr = F.getFnAttribute("target-features");
407 
408  return FSAttr.isValid() ? FSAttr.getValueAsString()
410 }
411 
412 /// Predicate for Internalize pass.
413 static bool mustPreserveGV(const GlobalValue &GV) {
414  if (const Function *F = dyn_cast<Function>(&GV))
415  return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
416 
417  return !GV.use_empty();
418 }
419 
// NOTE(review): the adjustPassManager signature (line 420) and many statement
// lines inside the extension lambdas (e.g. the inliner replacement on line
// 431, the PM.add(...) calls on lines 439-446, 450, 458-464, 472, 476, 481,
// and the extension-point enum arguments) are missing. The surviving skeleton
// below shows only the option gating.
421  Builder.DivergentTarget = true;
422 
423  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
424  bool Internalize = InternalizeSymbols;
425  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
426  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
427  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
428 
429  if (EnableFunctionCalls) {
430  delete Builder.Inliner;
432  }
433 
434  Builder.addExtension(
436  [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
438  if (AMDGPUAA) {
441  }
444  if (Internalize)
447  if (Internalize)
448  PM.add(createGlobalDCEPass());
449  if (EarlyInline)
451  });
452 
453  Builder.addExtension(
455  [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
457  if (AMDGPUAA) {
460  }
463  if (LibCallSimplify)
465  });
466 
467  Builder.addExtension(
469  [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
470  // Add infer address spaces pass to the opt pipeline after inlining
471  // but before SROA to increase SROA opportunities.
473 
474  // This should run after inlining to have any chance of doing anything,
475  // and before other cleanup optimizations.
477 
478  // Promote alloca to vector before SROA and loop unroll. If we manage
479  // to eliminate allocas before unroll we may choose to unroll less.
480  if (EnableOpt)
482  });
483 }
484 
485 //===----------------------------------------------------------------------===//
486 // R600 Target Machine (R600 -> Cayman)
487 //===----------------------------------------------------------------------===//
488 
// NOTE(review): R600TargetMachine member definitions. Missing from the
// extraction: the constructor signature start (line 489) and several of its
// parameters (491-493), a statement at line 496, the getSubtargetImpl header
// (504) and feature-string line (507, 517), the isNoopAddrSpaceCast header
// (524) and second operand (527), and the early-return/assert-message lines
// of the constant-memory helper (530, 533, 537, 541, 546) plus the
// getTargetTransformInfo header (549-550). Recover from upstream LLVM.
490  StringRef CPU, StringRef FS,
494  CodeGenOpt::Level OL, bool JIT)
495  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
497 
498  // Override the default since calls aren't supported for r600.
499  if (EnableFunctionCalls &&
500  EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
501  EnableFunctionCalls = false;
502 }
503 
// getSubtargetImpl: caches one R600Subtarget per (GPU, feature-string) key.
505  const Function &F) const {
506  StringRef GPU = getGPUName(F);
508 
509  SmallString<128> SubtargetKey(GPU);
510  SubtargetKey.append(FS);
511 
512  auto &I = SubtargetMap[SubtargetKey];
513  if (!I) {
514  // This needs to be done before we create a new subtarget since any
515  // creation will depend on the TM and the code generation flags on the
516  // function that reside in TargetOptions.
518  I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
519  }
520 
521  return I.get();
522 }
523 
// Address-space cast is a no-op only between flat/global-compatible spaces;
// the second conjunct (original line 527) is missing here.
525  unsigned DestAS) const {
526  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
528 }
529 
531  const auto *LD = dyn_cast<LoadInst>(V);
532  if (!LD)
534 
535  // It must be a generic pointer loaded.
536  assert(V->getType()->isPointerTy() &&
538 
539  const auto *Ptr = LD->getPointerOperand();
540  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
542  // For a generic pointer loaded from the constant memory, it could be assumed
543  // as a global pointer since the constant memory is only populated on the
544  // host side. As implied by the offload programming model, only global
545  // pointers could be referenced on the host side.
547 }
548 
551  return TargetTransformInfo(R600TTIImpl(this, F));
552 }
553 
554 //===----------------------------------------------------------------------===//
555 // GCN Target Machine (SI+)
556 //===----------------------------------------------------------------------===//
557 
// NOTE(review): GCNTargetMachine member definitions. Missing from the
// extraction: the constructor signature start (line 558) and middle
// parameters (560-562), the getSubtargetImpl header (566) and feature-string
// line (568, 578), and the getTargetTransformInfo header (587-588).
559  StringRef CPU, StringRef FS,
563  CodeGenOpt::Level OL, bool JIT)
564  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
565 
// getSubtargetImpl: caches one GCNSubtarget per (GPU, feature-string) key and
// refreshes the scalarize-global flag on every query.
567  StringRef GPU = getGPUName(F);
569 
570  SmallString<128> SubtargetKey(GPU);
571  SubtargetKey.append(FS);
572 
573  auto &I = SubtargetMap[SubtargetKey];
574  if (!I) {
575  // This needs to be done before we create a new subtarget since any
576  // creation will depend on the TM and the code generation flags on the
577  // function that reside in TargetOptions.
579  I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
580  }
581 
// Propagate the current -amdgpu-scalarize-global-loads setting even for a
// cached subtarget.
582  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
583 
584  return I.get();
585 }
586 
589  return TargetTransformInfo(GCNTTIImpl(this, F));
590 }
591 
592 //===----------------------------------------------------------------------===//
593 // AMDGPU Pass Setup
594 //===----------------------------------------------------------------------===//
595 
596 namespace {
597 
// Base pass configuration shared by the R600 and GCN targets.
598 class AMDGPUPassConfig : public TargetPassConfig {
599 public:
600  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
601  : TargetPassConfig(TM, PM) {
602  // Exceptions and StackMaps are not supported, so these passes will never do
603  // anything.
604  disablePass(&StackMapLivenessID);
605  disablePass(&FuncletLayoutID);
606  }
607 
608  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
609  return getTM<AMDGPUTargetMachine>();
610  }
611 
// NOTE(review): the return type (line 612) and the DAG-construction
// statements (lines 614-615) of this scheduler factory were lost in
// extraction; only the signature tail and the return remain.
613  createMachineScheduler(MachineSchedContext *C) const override {
616  return DAG;
617  }
618 
619  void addEarlyCSEOrGVNPass();
620  void addStraightLineScalarOptimizationPasses();
621  void addIRPasses() override;
622  void addCodeGenPrepare() override;
623  bool addPreISel() override;
624  bool addInstSelector() override;
625  bool addGCPasses() override;
626 
627  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
628 };
629 
630 std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
631  return getStandardCSEConfigForOpt(TM->getOptLevel());
632 }
633 
634 class R600PassConfig final : public AMDGPUPassConfig {
635 public:
636  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
637  : AMDGPUPassConfig(TM, PM) {}
638 
639  ScheduleDAGInstrs *createMachineScheduler(
640  MachineSchedContext *C) const override {
641  return createR600MachineScheduler(C);
642  }
643 
644  bool addPreISel() override;
645  bool addInstSelector() override;
646  void addPreRegAlloc() override;
647  void addPreSched2() override;
648  void addPreEmitPass() override;
649 };
650 
// Pass configuration for GCN targets (SI and newer).
651 class GCNPassConfig final : public AMDGPUPassConfig {
652 public:
653  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
654  : AMDGPUPassConfig(TM, PM) {
655  // It is necessary to know the register usage of the entire call graph. We
656  // allow calls without EnableAMDGPUFunctionCalls if they are marked
657  // noinline, so this is always required.
658  setRequiresCodeGenSCCOrder(true);
659  }
660 
661  GCNTargetMachine &getGCNTargetMachine() const {
662  return getTM<GCNTargetMachine>();
663  }
664 
// NOTE(review): the return type of this declaration (original line 665,
// presumably "ScheduleDAGInstrs *") was lost in extraction.
666  createMachineScheduler(MachineSchedContext *C) const override;
667 
668  bool addPreISel() override;
669  void addMachineSSAOptimization() override;
670  bool addILPOpts() override;
671  bool addInstSelector() override;
672  bool addIRTranslator() override;
673  void addPreLegalizeMachineIR() override;
674  bool addLegalizeMachineIR() override;
675  void addPreRegBankSelect() override;
676  bool addRegBankSelect() override;
677  bool addGlobalInstructionSelect() override;
678  void addFastRegAlloc() override;
679  void addOptimizedRegAlloc() override;
680  void addPreRegAlloc() override;
681  bool addPreRewrite() override;
682  void addPostRegAlloc() override;
683  void addPreSched2() override;
684  void addPreEmitPass() override;
685 };
686 
687 } // end anonymous namespace
688 
// Chooses between GVN and EarlyCSE; the condition line (original 690,
// presumably an opt-level check) was lost in extraction.
689 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
691  addPass(createGVNPass());
692  else
693  addPass(createEarlyCSEPass());
694 }
695 
// Straight-line scalar optimizations; lines 698-699 and 702 (additional
// addPass calls, likely SeparateConstOffsetFromGEP/SLSR per the comments)
// are missing from the extraction.
696 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
697  addPass(createLICMPass());
700  // ReassociateGEPs exposes more opportunities for SLSR. See
701  // the example in reassociate-geps-and-slsr.ll.
703  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
704  // EarlyCSE can reuse.
705  addEarlyCSEOrGVNPass();
706  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
707  addPass(createNaryReassociatePass());
708  // NaryReassociate on GEPs creates redundant common expressions, so run
709  // EarlyCSE after it.
710  addPass(createEarlyCSEPass());
711 }
712 
// NOTE(review): many addPass statements in these two functions were lost in
// extraction (e.g. originals 721, 725, 728, 733, 737, 747, 750, 759, 762,
// 764, 774, 777, 791, 797, 800-801, 805, 807-808); the comments that
// described them survive. Recover the full bodies from upstream LLVM.
713 void AMDGPUPassConfig::addIRPasses() {
714  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
715 
716  // There is no reason to run these.
717  disablePass(&StackMapLivenessID);
718  disablePass(&FuncletLayoutID);
719  disablePass(&PatchableFunctionID);
720 
722 
723  // This must occur before inlining, as the inliner will not look through
724  // bitcast calls.
726 
727  // A call to propagate attributes pass in the backend in case opt was not run.
729 
730  addPass(createAtomicExpandPass());
731 
732 
734 
735  // Function calls are not supported, so make sure we inline everything.
736  addPass(createAMDGPUAlwaysInlinePass());
738  // We need to add the barrier noop pass, otherwise adding the function
739  // inlining pass will cause all of the PassConfigs passes to be run
740  // one function at a time, which means if we have a module with two
741  // functions, then we will generate code for the first function
742  // without ever running any passes on the second.
743  addPass(createBarrierNoopPass());
744 
745  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
746  if (TM.getTargetTriple().getArch() == Triple::r600)
748 
749  // Replace OpenCL enqueued block function pointers with global variables.
751 
752  if (TM.getOptLevel() > CodeGenOpt::None) {
753  addPass(createInferAddressSpacesPass());
754  addPass(createAMDGPUPromoteAlloca());
755 
756  if (EnableSROA)
757  addPass(createSROAPass());
758 
760  addStraightLineScalarOptimizationPasses();
761 
763  addPass(createAMDGPUAAWrapperPass());
765  AAResults &AAR) {
766  if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
767  AAR.addAAResult(WrapperPass->getResult());
768  }));
769  }
770  }
771 
772  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
773  // TODO: May want to move later or split into an early and late one.
775  }
776 
778 
779  // EarlyCSE is not always strong enough to clean up what LSR produces. For
780  // example, GVN can combine
781  //
782  // %0 = add %a, %b
783  // %1 = add %b, %a
784  //
785  // and
786  //
787  // %0 = shl nsw %a, 2
788  // %1 = shl %a, 2
789  //
790  // but EarlyCSE can do neither of them.
792  addEarlyCSEOrGVNPass();
793 }
794 
795 void AMDGPUPassConfig::addCodeGenPrepare() {
796  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
798 
799  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
802 
803  addPass(&AMDGPUPerfHintAnalysisID);
804 
806 
809 
810  // LowerSwitch pass may introduce unreachable blocks that can
811  // cause unexpected behavior for subsequent passes. Placing it
812  // here seems better that these blocks would get cleaned up by
813  // UnreachableBlockElim inserted next in the pass flow.
814  addPass(createLowerSwitchPass());
815 }
816 
817 bool AMDGPUPassConfig::addPreISel() {
818  addPass(createFlattenCFGPass());
819  return false;
820 }
821 
822 bool AMDGPUPassConfig::addInstSelector() {
823  // Defer the verifier until FinalizeISel.
824  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
825  return false;
826 }
827 
828 bool AMDGPUPassConfig::addGCPasses() {
829  // Do nothing. GC is not supported.
830  return false;
831 }
832 
833 //===----------------------------------------------------------------------===//
834 // R600 Pass Setup
835 //===----------------------------------------------------------------------===//
836 
837 bool R600PassConfig::addPreISel() {
838  AMDGPUPassConfig::addPreISel();
839 
// NOTE(review): the guard on original line 840 (presumably testing the
// r600-ir-structurize option) was lost in extraction.
841  addPass(createStructurizeCFGPass());
842  return false;
843 }
844 
845 bool R600PassConfig::addInstSelector() {
846  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
847  return false;
848 }
849 
850 void R600PassConfig::addPreRegAlloc() {
851  addPass(createR600VectorRegMerger());
852 }
853 
854 void R600PassConfig::addPreSched2() {
// The second argument 'false' suppresses verification after each pass.
855  addPass(createR600EmitClauseMarkers(), false);
// NOTE(review): the guard on original line 856 (presumably testing the
// r600-if-convert option) was lost in extraction.
857  addPass(&IfConverterID, false);
858  addPass(createR600ClauseMergePass(), false);
859 }
860 
861 void R600PassConfig::addPreEmitPass() {
862  addPass(createAMDGPUCFGStructurizerPass(), false);
863  addPass(createR600ExpandSpecialInstrsPass(), false);
864  addPass(&FinalizeMachineBundlesID, false);
865  addPass(createR600Packetizer(), false);
866  addPass(createR600ControlFlowFinalizer(), false);
867 }
868 
// NOTE(review): the signature line (original 869, the R600TargetMachine
// createPassConfig declaration) was lost in extraction.
870  return new R600PassConfig(*this, PM);
871 }
872 
873 //===----------------------------------------------------------------------===//
874 // GCN Pass Setup
875 //===----------------------------------------------------------------------===//
876 
// Selects the SI scheduler when the subtarget requests it; the fallback
// return (original line 882) was lost in extraction.
877 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
878  MachineSchedContext *C) const {
879  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
880  if (ST.enableSIScheduler())
881  return createSIMachineScheduler(C);
883 }
884 
// NOTE(review): several statements were lost in extraction — the opening of
// the conditional block ending at original line 891, the guard/opening lines
// 898, 900, 907, 909, the call to the base addMachineSSAOptimization (917),
// conditional guards 929, 936, 938, and the guards/statements 942, 945 of
// addILPOpts. Recover from upstream LLVM before compiling.
885 bool GCNPassConfig::addPreISel() {
886  AMDGPUPassConfig::addPreISel();
887 
891  }
892 
893  // FIXME: We need to run a pass to propagate the attributes when calls are
894  // supported.
895 
896  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
897  // regions formed by them.
899  if (!LateCFGStructurize) {
901  addPass(createFixIrreduciblePass());
902  addPass(createUnifyLoopExitsPass());
903  }
904  addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
905  }
906  addPass(createSinkingPass());
908  if (!LateCFGStructurize) {
910  }
911  addPass(createLCSSAPass());
912 
913  return false;
914 }
915 
916 void GCNPassConfig::addMachineSSAOptimization() {
918 
919  // We want to fold operands after PeepholeOptimizer has run (or as part of
920  // it), because it will eliminate extra copies making it easier to fold the
921  // real source operand. We want to eliminate dead instructions after, so that
922  // we see fewer uses of the copies. We then need to clean up the dead
923  // instructions leftover after the operands are folded as well.
924  //
925  // XXX - Can we get away without running DeadMachineInstructionElim again?
926  addPass(&SIFoldOperandsID);
927  if (EnableDPPCombine)
928  addPass(&GCNDPPCombineID);
930  addPass(&SILoadStoreOptimizerID);
931  if (EnableSDWAPeephole) {
932  addPass(&SIPeepholeSDWAID);
933  addPass(&EarlyMachineLICMID);
934  addPass(&MachineCSEID);
935  addPass(&SIFoldOperandsID);
937  }
939 }
940 
941 bool GCNPassConfig::addILPOpts() {
943  addPass(&EarlyIfConverterID);
944 
946  return false;
947 }
948 
949 bool GCNPassConfig::addInstSelector() {
950  AMDGPUPassConfig::addInstSelector();
951  addPass(&SIFixSGPRCopiesID);
952  addPass(createSILowerI1CopiesPass());
953  addPass(createSIAddIMGInitPass());
954  return false;
955 }
956 
957 bool GCNPassConfig::addIRTranslator() {
958  addPass(new IRTranslator(getOptLevel()));
959  return false;
960 }
961 
962 void GCNPassConfig::addPreLegalizeMachineIR() {
963  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
964  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
965  addPass(new Localizer());
966 }
967 
968 bool GCNPassConfig::addLegalizeMachineIR() {
969  addPass(new Legalizer());
970  return false;
971 }
972 
973 void GCNPassConfig::addPreRegBankSelect() {
974  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
975  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
976 }
977 
978 bool GCNPassConfig::addRegBankSelect() {
979  addPass(new RegBankSelect());
980  return false;
981 }
982 
983 bool GCNPassConfig::addGlobalInstructionSelect() {
984  addPass(new InstructionSelect());
985  return false;
986 }
987 
// NOTE(review): missing from extraction — the body of the
// LateCFGStructurize branch (original line 990), the insertPass calls of
// addFastRegAlloc (1003-1004, 1006), the SIFormMemoryClauses insertion and
// exec-mask/DCE insertPass lines of addOptimizedRegAlloc (1013, 1016-1017,
// 1025, 1027). Recover from upstream LLVM before compiling.
988 void GCNPassConfig::addPreRegAlloc() {
989  if (LateCFGStructurize) {
991  }
992 }
993 
994 void GCNPassConfig::addFastRegAlloc() {
995  // FIXME: We have to disable the verifier here because of PHIElimination +
996  // TwoAddressInstructions disabling it.
997 
998  // This must be run immediately after phi elimination and before
999  // TwoAddressInstructions, otherwise the processing of the tied operand of
1000  // SI_ELSE will introduce a copy of the tied operand source after the else.
1001  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1002 
1005 
1007 }
1008 
1009 void GCNPassConfig::addOptimizedRegAlloc() {
1010  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1011  // instructions that cause scheduling barriers.
1012  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1014 
1015  if (OptExecMaskPreRA)
1018 
1019  // This must be run immediately after phi elimination and before
1020  // TwoAddressInstructions, otherwise the processing of the tied operand of
1021  // SI_ELSE will introduce a copy of the tied operand source after the else.
1022  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1023 
1024  if (EnableDCEInRA)
1026 
1028 }
1029 
1030 bool GCNPassConfig::addPreRewrite() {
1031  if (EnableRegReassign) {
1032  addPass(&GCNNSAReassignID);
1033  addPass(&GCNRegBankReassignID);
1034  }
1035  return true;
1036 }
1037 
// NOTE(review): the call to the base-class addPostRegAlloc (original line
// 1042) was lost in extraction.
1038 void GCNPassConfig::addPostRegAlloc() {
1039  addPass(&SIFixVGPRCopiesID);
1040  if (getOptLevel() > CodeGenOpt::None)
1041  addPass(&SIOptimizeExecMaskingID);
1043 
1044  // Equivalent of PEI for SGPRs.
1045  addPass(&SILowerSGPRSpillsID);
1046 }
1047 
1048 void GCNPassConfig::addPreSched2() {
1049  addPass(&SIPostRABundlerID);
1050 }
1051 
1052 void GCNPassConfig::addPreEmitPass() {
1053  addPass(createSIMemoryLegalizerPass());
1054  addPass(createSIInsertWaitcntsPass());
1055  addPass(createSIShrinkInstructionsPass());
1056  addPass(createSIModeRegisterPass());
1057 
1058  if (getOptLevel() > CodeGenOpt::None)
1059  addPass(&SIInsertHardClausesID);
1060 
1061  addPass(&SIRemoveShortExecBranchesID);
1062  addPass(&SIInsertSkipsPassID);
1063  addPass(&SIPreEmitPeepholeID);
1064  // The hazard recognizer that runs as part of the post-ra scheduler does not
1065  // guarantee to be able handle all hazards correctly. This is because if there
1066  // are multiple scheduling regions in a basic block, the regions are scheduled
1067  // bottom up, so when we begin to schedule a region we don't know what
1068  // instructions were emitted directly before it.
1069  //
1070  // Here we add a stand-alone hazard recognizer pass which can handle all
1071  // cases.
1072  addPass(&PostRAHazardRecognizerID);
1073  addPass(&BranchRelaxationPassID);
1074 }
1075 
// NOTE(review): the signatures of these three functions (original lines
// 1076, 1080, 1084-1086 — createPassConfig, createDefaultFuncInfoYAML, and
// convertFuncInfoToYAML, including the MFI local) were lost in extraction.
1077  return new GCNPassConfig(*this, PM);
1078 }
1079 
1081  return new yaml::SIMachineFunctionInfo();
1082 }
1083 
1087  return new yaml::SIMachineFunctionInfo(*MFI,
1088  *MF.getSubtarget().getRegisterInfo());
1089 }
1090 
// Deserializes SIMachineFunctionInfo from parsed MIR YAML: reads the base
// fields, resolves named registers, validates their register classes, and
// populates the kernel-argument descriptors.
// NOTE(review): missing from extraction — the function signature start
// (original lines 1091-1092), the MFI local declaration (1097), the
// else-branch body of parseAndCheckArgument (1162, the stack-offset case),
// and the remaining Mode field copies (1227-1230). Recover from upstream.
1093  SMDiagnostic &Error, SMRange &SourceRange) const {
1094  const yaml::SIMachineFunctionInfo &YamlMFI =
1095  reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1096  MachineFunction &MF = PFS.MF;
1098 
1099  MFI->initializeBaseYamlFields(YamlMFI);
1100 
// Resolves a named-register string; on failure records the source range and
// returns true (error).
1101  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1102  Register TempReg;
1103  if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1104  SourceRange = RegName.SourceRange;
1105  return true;
1106  }
1107  RegVal = TempReg;
1108 
1109  return false;
1110  };
1111 
// Emits a "wrong register class" diagnostic pointing at the register string
// literal; always returns true (error).
1112  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
1113  // Create a diagnostic for the register string literal.
1114  const MemoryBuffer &Buffer =
1115  *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1116  Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1117  RegName.Value.size(), SourceMgr::DK_Error,
1118  "incorrect register class for field", RegName.Value,
1119  None, None);
1120  SourceRange = RegName.SourceRange;
1121  return true;
1122  };
1123 
1124  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1125  parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1126  parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1127  return true;
1128 
// Class checks: scratch rsrc must be SGPR_128; frame/stack pointers SGPR_32
// (unless left as their sentinel defaults).
1129  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1130  !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1131  return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1132  }
1133 
1134  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1135  !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1136  return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1137  }
1138 
1139  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1140  !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1141  return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1142  }
1143 
// Parses one optional argument descriptor, checks its register class, and
// accumulates SGPR usage counters; returns true on error.
1144  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
1145  const TargetRegisterClass &RC,
1146  ArgDescriptor &Arg, unsigned UserSGPRs,
1147  unsigned SystemSGPRs) {
1148  // Skip parsing if it's not present.
1149  if (!A)
1150  return false;
1151 
1152  if (A->IsRegister) {
1153  Register Reg;
1154  if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1155  SourceRange = A->RegisterName.SourceRange;
1156  return true;
1157  }
1158  if (!RC.contains(Reg))
1159  return diagnoseRegisterClass(A->RegisterName);
1160  Arg = ArgDescriptor::createRegister(Reg);
1161  } else
1163  // Check and apply the optional mask.
1164  if (A->Mask)
1165  Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
1166 
1167  MFI->NumUserSGPRs += UserSGPRs;
1168  MFI->NumSystemSGPRs += SystemSGPRs;
1169  return false;
1170  };
1171 
1172  if (YamlMFI.ArgInfo &&
1173  (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1174  AMDGPU::SGPR_128RegClass,
1175  MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1176  parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1177  AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1178  2, 0) ||
1179  parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1180  MFI->ArgInfo.QueuePtr, 2, 0) ||
1181  parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1182  AMDGPU::SReg_64RegClass,
1183  MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1184  parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1185  AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1186  2, 0) ||
1187  parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1188  AMDGPU::SReg_64RegClass,
1189  MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1190  parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1191  AMDGPU::SGPR_32RegClass,
1192  MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1193  parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1194  AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1195  0, 1) ||
1196  parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1197  AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1198  0, 1) ||
1199  parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1200  AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1201  0, 1) ||
1202  parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1203  AMDGPU::SGPR_32RegClass,
1204  MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1205  parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1206  AMDGPU::SGPR_32RegClass,
1207  MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1208  parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1209  AMDGPU::SReg_64RegClass,
1210  MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1211  parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1212  AMDGPU::SReg_64RegClass,
1213  MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1214  parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1215  AMDGPU::VGPR_32RegClass,
1216  MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1217  parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1218  AMDGPU::VGPR_32RegClass,
1219  MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1220  parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1221  AMDGPU::VGPR_32RegClass,
1222  MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1223  return true;
1224 
1225  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1226  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1231 
1232  return false;
1233 }
FunctionPass * createSpeculativeExecutionPass()
char & SIFormMemoryClausesID
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:77
FunctionPass * createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *)
FunctionPass * createStraightLineStrengthReducePass()
uint64_t CallInst * C
virtual StringRef getBufferIdentifier() const
Return an identifier for this buffer, typically the filename it was read from.
Definition: MemoryBuffer.h:75
Represents a range in source code.
Definition: SMLoc.h:48
StringRef getTargetFeatureString() const
static cl::opt< bool > EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc"))
Target & getTheGCNTarget()
The target for GCN GPUs.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
void initializeAMDGPUDAGToDAGISelPass(PassRegistry &)
void addAAResult(AAResultT &AAResult)
Register a specific AA result.
char & SIWholeQuadModeID
LLVM_NODISCARD std::enable_if_t< !is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type > dyn_cast(const Y &Val)
Definition: Casting.h:334
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine...
FunctionPass * createSIAnnotateControlFlowPass()
Create the annotation pass.
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget()
CodeModel::Model getEffectiveCodeModel(Optional< CodeModel::Model > CM, CodeModel::Model Default)
Helper method for getting the code model, returning Default if CM does not have a value...
ModulePass * createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)
static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT, StringRef GPU)
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
PassManagerBuilder - This class is used to set up a standard optimization sequence for languages like...
bool FP32InputDenormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
static constexpr ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional< Reloc::Model > RM, Optional< CodeModel::Model > CM, CodeGenOpt::Level OL, bool JIT)
This class represents lattice values for constants.
Definition: AllocatorList.h:23
void initializeSIFixVGPRCopiesPass(PassRegistry &)
virtual void addIRPasses()
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
static cl::opt< bool > EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer", cl::desc("Enable load store vectorizer"), cl::init(true), cl::Hidden)
FunctionPass * createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel)
This pass converts a legalized DAG into a R600-specific.
void initializeSIInsertWaitcntsPass(PassRegistry &)
char & GCNNSAReassignID
void initializeSIFormMemoryClausesPass(PassRegistry &)
ModulePass * createR600OpenCLImageTypeLoweringPass()
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &)
Triple TargetTriple
Triple string, CPU name, and target feature strings the TargetMachine instance is created with...
Definition: TargetMachine.h:85
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
getOS - Get the parsed operating system type of this triple.
Definition: Triple.h:309
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &)
char & SILoadStoreOptimizerID
Target & getTheAMDGPUTarget()
The target which supports all AMD GPUs.
Address space for constant memory (VTX2).
Definition: AMDGPU.h:299
char & SIPeepholeSDWAID
void initializeSIModeRegisterPass(PassRegistry &)
This pass implements the localization mechanism described at the top of this file.
Definition: Localizer.h:40
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &)
This file describes how to lower LLVM calls to machine code calls.
char & FuncletLayoutID
This pass lays out funclets contiguously.
unsigned Reg
char & SIInsertSkipsPassID
FunctionPass * createAMDGPUSimplifyLibCallsPass(const TargetMachine *)
FunctionPass * createLowerSwitchPass()
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &)
char & EarlyIfConverterID
EarlyIfConverter - This pass performs if-conversion on SSA form by inserting cmov instructions...
void initializeR600ControlFlowFinalizerPass(PassRegistry &)
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry &)
static constexpr ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
Pass * createLCSSAPass()
Definition: LCSSA.cpp:492
FunctionPass * createAMDGPUPromoteAlloca()
virtual void add(Pass *P)=0
Add a pass to the queue of passes to run.
ModulePass * createAMDGPULowerKernelAttributesPass()
bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const override
Parse out the target's MachineFunctionInfo from the YAML representation.
char & BranchRelaxationPassID
BranchRelaxation - This pass replaces branches that need to jump further than is supported by a branc...
FunctionPass * createAMDGPUCodeGenPreparePass()
F(f)
R600 Machine Scheduler interface.
char & MachineSchedulerID
MachineScheduler - This pass schedules machine instructions.
static cl::opt< bool > EnableLowerKernelArguments("amdgpu-ir-lower-kernel-arguments", cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), cl::Hidden)
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:713
An instruction for reading from memory.
Definition: Instructions.h:174
FunctionPass * createAMDGPUCFGStructurizerPass()
MachineSchedRegistry provides a selection of available machine instruction schedulers.
void initializeSIPostRABundlerPass(PassRegistry &)
static cl::opt< bool, true > EnableAMDGPUFunctionCallsOpt("amdgpu-function-calls", cl::desc("Enable AMDGPU function call support"), cl::location(AMDGPUTargetMachine::EnableFunctionCalls), cl::init(true), cl::Hidden)
virtual void addMachineSSAOptimization()
addMachineSSAOptimization - Add standard passes that optimize machine instructions in SSA form...
void initializeAMDGPUAAWrapperPassPass(PassRegistry &)
static std::unique_ptr< TargetLoweringObjectFile > createTLOF(const Triple &TT)
char & DetectDeadLanesID
This pass adds dead/undef flags after analyzing subregister lanes.
void initializeAMDGPUPromoteAllocaPass(PassRegistry &)
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &)
FunctionPass * createSIAddIMGInitPass()
FunctionPass * createSIMemoryLegalizerPass()
Pass * Inliner
Inliner - Specifies the inliner to use.
std::unique_ptr< ScheduleDAGMutation > createAMDGPUExportClusteringDAGMutation()
FunctionPass * createAMDGPUMachineCFGStructurizerPass()
FunctionPass * createSIInsertWaitcntsPass()
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while...
StringRef getFeatureString(const Function &F) const
ModulePass * createAMDGPUPrintfRuntimeBinding()
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
void resetTargetOptions(const Function &F) const
Reset the target options based on the function&#39;s attributes.
yaml::MachineFunctionInfo * createDefaultFuncInfoYAML() const override
Allocate and return a default initialized instance of the YAML representation for the MachineFunction...
This file declares the targeting of the InstructionSelector class for AMDGPU.
Pass * createAMDGPUFunctionInliningPass()
static cl::opt< bool > EnableSDWAPeephole("amdgpu-sdwa-peephole", cl::desc("Enable SDWA peepholer"), cl::init(true))
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
This file declares the AMDGPU-specific subclass of TargetLoweringObjectFile.
Pass * createAMDGPUAnnotateKernelFeaturesPass()
static cl::opt< bool > EnableSIModeRegisterPass("amdgpu-mode-register", cl::desc("Enable mode register pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableRegReassign("amdgpu-reassign-regs", cl::desc("Enable register reassign optimizations on gfx10+"), cl::init(true), cl::Hidden)
void initializeAMDGPUCodeGenPreparePass(PassRegistry &)
This file contains the simple types necessary to represent the attributes associated with functions a...
ModulePass * createAMDGPUOpenCLEnqueuedBlockLoweringPass()
void initializeGCNNSAReassignPass(PassRegistry &)
virtual void addFastRegAlloc()
addFastRegAlloc - Add the minimum set of target-independent passes that are required for fast registe...
void initializeAMDGPUInlinerPass(PassRegistry &)
FunctionPass * createSinkingPass()
Definition: Sink.cpp:284
static MachineSchedRegistry GCNILPSchedRegistry("gcn-ilp", "Run GCN iterative scheduler for ILP scheduling (experimental)", createIterativeILPMachineScheduler)
char & SIOptimizeExecMaskingPreRAID
EP_ModuleOptimizerEarly - This extension point allows adding passes just before the main module-level...
FunctionPass * createAMDGPUPromoteAllocaToVector()
char & FinalizeMachineBundlesID
FinalizeMachineBundles - This pass finalize machine instruction bundles (created earlier, e.g.
char & SIPreEmitPeepholeID
bool checkFeatures(StringRef FS) const
Check whether the subtarget features are enabled/disabled as per the provided string, ignoring all other features.
LLVM_NODISCARD bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:156
Target-Independent Code Generator Pass Configuration Options.
static StringRef computeDataLayout(const Triple &TT)
void initializeSIRemoveShortExecBranchesPass(PassRegistry &)
static cl::opt< bool, true > LateCFGStructurize("amdgpu-late-structurize", cl::desc("Enable late CFG structurization"), cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden)
void initializeSIInsertHardClausesPass(PassRegistry &)
const MemoryBuffer * getMemoryBuffer(unsigned i) const
Definition: SourceMgr.h:122
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:343
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(true), cl::Hidden)
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:246
FunctionPass * createR600ExpandSpecialInstrsPass()
static MachineSchedRegistry GCNMinRegSchedRegistry("gcn-minreg", "Run GCN iterative scheduler for minimal register usage (experimental)", createMinRegScheduler)
RegisterTargetMachine - Helper template for registering a target machine implementation, for use in the target machine initialization function.
char & MachineCSEID
MachineCSE - This pass performs global CSE on machine instructions.
Definition: MachineCSE.cpp:153
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI)
static cl::opt< bool, true > EnableAMDGPUFixedFunctionABIOpt("amdgpu-fixed-function-abi", cl::desc("Enable all implicit function arguments"), cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), cl::init(false), cl::Hidden)
ArchType getArch() const
getArch - Get the parsed architecture type of this triple.
Definition: Triple.h:300
TargetTransformInfo getTargetTransformInfo(const Function &F) override
Get a TargetTransformInfo implementation for the target.
FunctionPass * createSILowerI1CopiesPass()
unsigned getAssumedAddrSpace(const Value *V) const override
If the specified generic pointer could be assumed as a pointer to a specific address space...
void initializeSIOptimizeExecMaskingPass(PassRegistry &)
char & DeadMachineInstructionElimID
DeadMachineInstructionElim - This pass removes dead machine instructions.
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry &)
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
ScheduleDAGMILive * createGenericSchedLive(MachineSchedContext *C)
Create the standard converging machine scheduler.
StringRef getTargetCPU() const
virtual bool addILPOpts()
Add passes that optimize instruction level parallelism for out-of-order targets.
void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &)
void initializeSIFixSGPRCopiesPass(PassRegistry &)
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &)
ModulePass * createGlobalDCEPass()
createGlobalDCEPass - This transform is designed to eliminate unreachable internal globals (functions...
FunctionPass * createR600VectorRegMerger()
bool parseNamedRegisterReference(PerFunctionMIParsingState &PFS, Register &Reg, StringRef Src, SMDiagnostic &Error)
Definition: MIParser.cpp:3232
static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > InternalizeSymbols("amdgpu-internalize-symbols", cl::desc("Enable elimination of non-kernel functions and unused globals"), cl::init(false), cl::Hidden)
Optional< SIArgumentInfo > ArgInfo
SI Machine Scheduler interface.
StringRef getGPUName(const Function &F) const
unsigned getMainFileID() const
Definition: SourceMgr.h:129
void append(in_iter S, in_iter E)
Append from an iterator pair.
Definition: SmallString.h:86
void initializeAMDGPUExternalAAWrapperPass(PassRegistry &)
char & PHIEliminationID
PHIElimination - This pass eliminates machine instruction PHI nodes by inserting copy instructions...
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition: Attributes.h:151
FunctionPass * createInferAddressSpacesPass(unsigned AddressSpace=~0u)
FunctionPass * createAMDGPUISelDag(TargetMachine *TM=nullptr, CodeGenOpt::Level OptLevel=CodeGenOpt::Default)
This pass converts a legalized DAG into a AMDGPU-specific.
ImmutablePass * createExternalAAWrapperPass(std::function< void(Pass &, Function &, AAResults &)> Callback)
A wrapper pass around a callback which can be used to populate the AAResults in the AAResultsWrapperP...
#define P(N)
std::unique_ptr< CSEConfigBase > getStandardCSEConfigForOpt(CodeGenOpt::Level Level)
Definition: CSEInfo.cpp:73
char & GCNDPPCombineID
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:427
FunctionPass * createAMDGPULowerKernelArgumentsPass()
char & SIInsertHardClausesID
CodeGenOpt::Level getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
FunctionPass * createAMDGPULateCodeGenPreparePass()
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
static Reloc::Model getEffectiveRelocModel(Optional< Reloc::Model > RM)
This pass implements the reg bank selector pass used in the GlobalISel pipeline.
Definition: RegBankSelect.h:90
FunctionPass * createFlattenCFGPass()
static cl::opt< bool > EnableAtomicOptimizations("amdgpu-atomic-optimizations", cl::desc("Enable atomic optimizations"), cl::init(false), cl::Hidden)
This file provides the interface for LLVM's Global Value Numbering pass which eliminates fully redund...
static cl::opt< bool > EarlyInlineAll("amdgpu-early-inline-all", cl::desc("Inline all functions early"), cl::init(false), cl::Hidden)
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry &)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:225
static MachineSchedRegistry GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler)
void adjustPassManager(PassManagerBuilder &) override
Allow the target to modify the pass manager, e.g.
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:296
bool isEntryFunctionCC(CallingConv::ID CC)
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void initializeSIPeepholeSDWAPass(PassRegistry &)
void initializeSIPreEmitPeepholePass(PassRegistry &)
Pass * createLICMPass()
Definition: LICM.cpp:296
static cl::opt< bool > EnableSROA("amdgpu-sroa", cl::desc("Run SROA after promote alloca pass"), cl::ReallyHidden, cl::init(true))
TargetPassConfig * createPassConfig(PassManagerBase &PM) override
Create a pass configuration object to be used by addPassToEmitX methods for generating a pipeline of ...
char & AMDGPUPerfHintAnalysisID
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
FunctionPass * createR600ControlFlowFinalizer()
Legacy wrapper pass to provide the AMDGPUAAResult object.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise...
R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional< Reloc::Model > RM, Optional< CodeModel::Model > CM, CodeGenOpt::Level OL, bool JIT)
virtual void addOptimizedRegAlloc()
addOptimizedRegAlloc - Add passes related to register allocation.
FunctionPass * createAMDGPUPreLegalizeCombiner(bool IsOptNone)
This class describes a target machine that is implemented with the LLVM target-independent code gener...
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
ModulePass * createBarrierNoopPass()
createBarrierNoopPass - This pass is purely a module pass barrier in a pass manager.
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Pass * createAlwaysInlinerLegacyPass(bool InsertLifetime=true)
Create a legacy pass manager instance of a pass to inline and remove functions marked as "always_inli...
const Triple & getTargetTriple() const
static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler)
static cl::opt< bool > EnableScalarIRPasses("amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), cl::init(true), cl::Hidden)
AMDGPU::SIModeRegisterDefaults Mode
void initializeSILowerControlFlowPass(PassRegistry &)
static ScheduleDAGInstrs * createMinRegScheduler(MachineSchedContext *C)
This file defines the class GCNIterativeScheduler, which uses an iterative approach to find a best sc...
ModulePass * createAMDGPULowerIntrinsicsPass()
virtual void addCodeGenPrepare()
Add pass to prepare the LLVM IR for code generation.
FunctionPass * createSIModeRegisterPass()
const TargetSubtargetInfo * getSubtargetImpl() const
bool FP64FP16InputDenormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
FunctionPass * createR600ClauseMergePass()
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< bool > EnableR600IfConvert("r600-if-convert", cl::desc("Use if conversion pass"), cl::ReallyHidden, cl::init(true))
static ScheduleDAGInstrs * createR600MachineScheduler(MachineSchedContext *C)
std::unique_ptr< ScheduleDAGMutation > createAMDGPUMacroFusionDAGMutation()
Note that you have to add: DAG.addMutation(createAMDGPUMacroFusionDAGMutation()); to AMDGPUPassConfig...
assume Assume Builder
void initializeSIShrinkInstructionsPass(PassRegistry &)
char & TwoAddressInstructionPassID
TwoAddressInstruction - This pass reduces two-address instructions to use two operands.
void initializeAMDGPUUseNativeCallsPass(PassRegistry &)
void initializeSIAddIMGInitPass(PassRegistry &)
Analysis pass providing a never-invalidated alias analysis result.
EP_EarlyAsPossible - This extension point allows adding passes before any other transformations, allowing them to see the code as it is coming out of the frontend.
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional< Reloc::Model > RM, Optional< CodeModel::Model > CM, CodeGenOpt::Level OL)
void initializeSIInsertSkipsPass(PassRegistry &)
void initializeR600PacketizerPass(PassRegistry &)
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:45
FunctionPass * createAMDGPUAnnotateUniformValues()
void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry &)
This is the AMDGPU address space based alias analysis pass.
Provides passes to inlining "always_inline" functions.
char & SIOptimizeExecMaskingID
EP_CGSCCOptimizerLate - This extension point allows adding CallGraphSCC passes at the end of the main...
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", "Run GCN scheduler to maximize occupancy (experimental)", createIterativeGCNMaxOccupancyMachineScheduler)
void initializeGCNRegBankReassignPass(PassRegistry &)
MCRegisterInfo * createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour)
std::unique_ptr< ScheduleDAGMutation > createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &)
char & AMDGPUUnifyDivergentExitNodesID
bool enableSIScheduler() const
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
void initializeSIMemoryLegalizerPass(PassRegistry &)
PassManagerBase - An abstract interface to allow code to add passes to a pass manager without having ...
char & StackMapLivenessID
StackMapLiveness - This pass analyses the register live-out set of stackmap/patchpoint intrinsics and...
ModulePass * createInternalizePass(std::function< bool(const GlobalValue &)> MustPreserveGV)
createInternalizePass - This pass loops over all of the functions in the input module, internalizing all globals (functions and variables) it can.
char & SIPreAllocateWWMRegsID
Optional< unsigned > Mask
constexpr const T & getValue() const LLVM_LVALUE_FUNCTION
Definition: Optional.h:256
void initializeSIWholeQuadModePass(PassRegistry &)
void setRequiresStructuredCFG(bool Value)
char & SIPostRABundlerID
FunctionPass * createAMDGPUAtomicOptimizerPass()
void initializeR600VectorRegMergerPass(PassRegistry &)
const MCSubtargetInfo * getMCSubtargetInfo() const
char & SIFixVGPRCopiesID
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &)
char & EarlyMachineLICMID
This pass performs loop invariant code motion on machine instructions.
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:131
#define RegName(no)
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &)
void initializeGCNDPPCombinePass(PassRegistry &)
ImmutablePass * createAMDGPUAAWrapperPass()
FunctionPass * createR600EmitClauseMarkers()
void initializeR600ClauseMergePassPass(PassRegistry &)
This interface provides simple read-only access to a block of memory, and provides simple methods for...
Definition: MemoryBuffer.h:50
This pass is responsible for selecting generic machine instructions to target-specific instructions...
ModulePass * createAMDGPUFixFunctionBitcastsPass()
void initializeAMDGPUUnifyMetadataPass(PassRegistry &)
FunctionPass * createSeparateConstOffsetFromGEPPass(bool LowerGEP=false)
Address space for flat memory.
Definition: AMDGPU.h:295
Target - Wrapper for Target specific information.
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &)
virtual void addPostRegAlloc()
This method may be implemented by targets that want to run passes after register allocation pass pipe...
static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
This file declares the targeting of the Machinelegalizer class for AMDGPU.
A wrapper around std::string which contains a source range that's being set during parsing...
char & SILowerSGPRSpillsID
FunctionPass * createR600Packetizer()
void initializeSILoadStoreOptimizerPass(PassRegistry &)
char & SILowerControlFlowID
ModulePass * createAMDGPUUnifyMetadataPass()
void initializeSIAnnotateControlFlowPass(PassRegistry &)
Targets should override this in a way that mirrors the implementation of llvm::MachineFunctionInfo.
Provides AMDGPU specific target descriptions.
A ScheduleDAG for scheduling lists of MachineInstr.
char & PatchableFunctionID
This pass implements the "patchable-function" attribute.
void initializeSIFoldOperandsPass(PassRegistry &)
char & SIFoldOperandsID
void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:559
FunctionPass * createSIShrinkInstructionsPass()
FunctionPass * createFixIrreduciblePass()
static cl::opt< bool > EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true))
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:275
TargetOptions Options
char & IfConverterID
IfConverter - This pass performs machine code if conversion.
#define LLVM_READNONE
Definition: Compiler.h:204
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
char & SIFixSGPRCopiesID
#define I(x, y, z)
Definition: MD5.cpp:59
void initializeSILowerSGPRSpillsPass(PassRegistry &)
FunctionPass * createSROAPass()
Definition: SROA.cpp:4834
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &)
static MachineSchedRegistry R600SchedRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler)
ImmutablePass * createAMDGPUExternalAAWrapperPass()
static cl::opt< bool > OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, cl::desc("Run pre-RA exec mask optimizations"), cl::init(true))
static bool mustPreserveGV(const GlobalValue &GV)
Predicate for Internalize pass.
ModulePass * createAMDGPUPropagateAttributesLatePass(const TargetMachine *)
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:558
char & SIRemoveShortExecBranchesID
std::unique_ptr< const MCRegisterInfo > MRI
Definition: TargetMachine.h:95
char & GCNRegBankReassignID
This file declares the IRTranslator pass.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void initializeAMDGPULateCodeGenPreparePass(PassRegistry &)
FunctionPass * createAMDGPUUseNativeCallsPass()
yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override
Allocate and initialize an instance of the YAML representation of the MachineFunctionInfo.
LLVM Value Representation.
Definition: Value.h:75
char & PostRAHazardRecognizerID
PostRAHazardRecognizer - This pass runs the post-ra hazard recognizer.
void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &)
AnalysisType * getAnalysisIfAvailable() const
getAnalysisIfAvailable<AnalysisType>() - Subclasses use this function to get analysis information tha...
Lightweight error class with error context and mandatory checking.
Definition: Error.h:157
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.h:349
FunctionPass * createEarlyCSEPass(bool UseMemorySSA=false)
Definition: EarlyCSE.cpp:1687
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:57
PassRegistry - This class manages the registration and initialization of the pass subsystem as appli...
Definition: PassRegistry.h:38
TargetTransformInfo getTargetTransformInfo(const Function &F) override
Get a TargetTransformInfo implementation for the target.
void initializeSILowerI1CopiesPass(PassRegistry &)
static cl::opt< bool > EnableDPPCombine("amdgpu-dpp-combine", cl::desc("Enable DPP combiner"), cl::init(true))
void addExtension(ExtensionPointTy Ty, ExtensionFn Fn)
static ScheduleDAGInstrs * createIterativeILPMachineScheduler(MachineSchedContext *C)
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry &)
Represents a location in source code.
Definition: SMLoc.h:23
static cl::opt< bool > EnableLibCallSimplify("amdgpu-simplify-libcall", cl::desc("Enable amdgpu library simplifications"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableR600StructurizeCFG("r600-ir-structurize", cl::desc("Use StructurizeCFG IR pass"), cl::init(true))
Pass * createStructurizeCFGPass(bool SkipUniformRegions=false)
When SkipUniformRegions is true the structurizer will not structurize regions that only contain uniform...
FunctionPass * createAtomicExpandPass()
void initializeAMDGPUAlwaysInlinePass(PassRegistry &)
TargetPassConfig * createPassConfig(PassManagerBase &PM) override
Create a pass configuration object to be used by addPassToEmitX methods for generating a pipeline of ...
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &)
void initializeSIPreAllocateWWMRegsPass(PassRegistry &)
FunctionPass * createUnifyLoopExitsPass()
void initializeGlobalISel(PassRegistry &)
Initialize all passes linked into the GlobalISel library.
Definition: GlobalISel.cpp:18
bool use_empty() const
Definition: Value.h:343
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:443
FunctionPass * createGVNPass(bool NoMemDepAnalysis=false)
Create a legacy GVN pass.
Definition: GVN.cpp:2939
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static cl::opt< bool > EnableStructurizerWorkarounds("amdgpu-enable-structurizer-workarounds", cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createSIMachineScheduler(MachineSchedContext *C)
static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(false))
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
FunctionPass * createNaryReassociatePass()
Instances of this class encapsulate one diagnostic report, allowing printing to a raw_ostream as a ca...
Definition: SourceMgr.h:251