1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for SI
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifdef _MSC_VER
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #include <cmath>
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "SIDefines.h"
25 #include "SIISelLowering.h"
26 #include "SIInstrInfo.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIRegisterInfo.h"
29 #include "llvm/ADT/BitVector.h"
30 #include "llvm/ADT/StringSwitch.h"
35 #include "llvm/CodeGen/Analysis.h"
36 #include "llvm/IR/DiagnosticInfo.h"
37 #include "llvm/IR/Function.h"
38 
39 using namespace llvm;
40 
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));
45 
46 
47 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
48  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
49  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
50  if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
51  return AMDGPU::SGPR0 + Reg;
52  }
53  }
54  llvm_unreachable("Cannot allocate sgpr");
55 }
56 
SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const SISubtarget &STI)
    : AMDGPUTargetLowering(TM, STI) {
60  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
61  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
62 
63  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
64  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
65 
66  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
67  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
68  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
69 
70  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
71  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
72 
73  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
74  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
75 
76  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
77  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
78 
79  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
80  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
81 
82  if (Subtarget->has16BitInsts()) {
83  addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
84  addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
85  }
86 
88 
89  // We need to custom lower vector stores from local memory
95 
101 
112 
113 
117 
122 
128 
133 
136 
144 
148 
155 
156  // We only support LOAD/STORE and vector manipulation ops for vectors
157  // with > 4 elements.
159  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
160  switch (Op) {
161  case ISD::LOAD:
162  case ISD::STORE:
163  case ISD::BUILD_VECTOR:
164  case ISD::BITCAST:
170  break;
171  case ISD::CONCAT_VECTORS:
173  break;
174  default:
176  break;
177  }
178  }
179  }
180 
181  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
182  // is expanded to avoid having two separate loops in case the index is a VGPR.
183 
184  // Most operations are naturally 32-bit vector operations. We only support
185  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
186  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
189 
192 
195 
198  }
199 
204 
205  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
206  // and output demarshalling
209 
210  // We can't return success/failure, only the old value,
211  // let LLVM add the comparison
214 
215  if (getSubtarget()->hasFlatAddressSpace()) {
218  }
219 
222 
223  // On SI this is s_memtime and s_memrealtime on VI.
226 
229 
234  }
235 
237 
242 
243  if (Subtarget->has16BitInsts()) {
245 
248 
251 
254 
257 
262 
265 
270 
272 
274 
276 
278 
283 
288 
289  // F16 - Constant Actions.
291 
292  // F16 - Load/Store Actions.
297 
298  // F16 - VOP1 Actions.
306 
307  // F16 - VOP2 Actions.
313 
314  // F16 - VOP3 Actions.
316  if (!Subtarget->hasFP16Denormals())
318  }
319 
335 
336  // All memory operations. Some folding on the pointer operand is done to help
337  // matching the constant offsets in the addressing modes.
355 
357 }
358 
const SISubtarget *SITargetLowering::getSubtarget() const {
  return static_cast<const SISubtarget *>(Subtarget);
}
362 
363 //===----------------------------------------------------------------------===//
364 // TargetLowering queries
365 //===----------------------------------------------------------------------===//
366 
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          unsigned IntrID) const {
370  switch (IntrID) {
371  case Intrinsic::amdgcn_atomic_inc:
372  case Intrinsic::amdgcn_atomic_dec:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
375  Info.ptrVal = CI.getOperand(0);
376  Info.align = 0;
377  Info.vol = false;
378  Info.readMem = true;
379  Info.writeMem = true;
380  return true;
381  default:
382  return false;
383  }
384 }
385 
bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
                                          EVT) const {
388  // SI has some legal vector types, but no legal vector operations. Say no
389  // shuffles are legal in order to prefer scalarizing some vector operations.
390  return false;
391 }
392 
393 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
394  // Flat instructions do not have offsets, and only have the register
395  // address.
396  return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
397 }
398 
399 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
400  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
401  // additionally can do r + r + i with addr64. 32-bit has more addressing
402  // mode options. Depending on the resource constant, it can also do
403  // (i64 r0) + (i32 r1) * (i14 i).
404  //
405  // Private arrays end up using a scratch buffer most of the time, so also
406  // assume those use MUBUF instructions. Scratch loads / stores are currently
407  // implemented as mubuf instructions with offen bit set, so slightly
408  // different than the normal addr64.
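  // A few illustrative examples of what this accepts:
  //   r + i  with 0 <= i < 4096   (fits the 12-bit unsigned offset below)
  //   r + r + i                   (addr64 form)
  //   2 * r                       (treated as r + r)
  // while 2 * r together with another base register, or any larger scale,
  // is rejected by the checks below.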
409  if (!isUInt<12>(AM.BaseOffs))
410  return false;
411 
412  // FIXME: Since we can split immediate into soffset and immediate offset,
413  // would it make sense to allow any immediate?
414 
415  switch (AM.Scale) {
416  case 0: // r + i or just i, depending on HasBaseReg.
417  return true;
418  case 1:
419  return true; // We have r + r or r + i.
420  case 2:
421  if (AM.HasBaseReg) {
422  // Reject 2 * r + r.
423  return false;
424  }
425 
426  // Allow 2 * r as r + r
427  // Or 2 * r + i is allowed as r + r + i.
428  return true;
429  default: // Don't allow n * r
430  return false;
431  }
432 }
433 
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS) const {
437  // No global is ever allowed as a base.
438  if (AM.BaseGV)
439  return false;
440 
  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS: {
    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
      // Assume we will use FLAT for all global memory accesses
      // on VI.
446  // FIXME: This assumption is currently wrong. On VI we still use
447  // MUBUF instructions for the r + i addressing mode. As currently
448  // implemented, the MUBUF instructions only work on buffer < 4GB.
449  // It may be possible to support > 4GB buffers with MUBUF instructions,
450  // by setting the stride value in the resource descriptor which would
451  // increase the size limit to (stride * 4GB). However, this is risky,
452  // because it has never been validated.
453  return isLegalFlatAddressingMode(AM);
454  }
455 
456  return isLegalMUBUFAddressingMode(AM);
457  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    // If the offset isn't a multiple of 4, it probably isn't going to be
460  // correctly aligned.
461  // FIXME: Can we get the real alignment here?
462  if (AM.BaseOffs % 4 != 0)
463  return isLegalMUBUFAddressingMode(AM);
464 
465  // There are no SMRD extloads, so if we have to do a small type access we
466  // will use a MUBUF load.
467  // FIXME?: We also need to do this if unaligned, but we don't know the
468  // alignment here.
469  if (DL.getTypeStoreSize(Ty) < 4)
470  return isLegalMUBUFAddressingMode(AM);
471 
    if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
474  if (!isUInt<8>(AM.BaseOffs / 4))
475  return false;
    } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
478  // in 8-bits, it can use a smaller encoding.
479  if (!isUInt<32>(AM.BaseOffs / 4))
480  return false;
    } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
483  if (!isUInt<20>(AM.BaseOffs))
484  return false;
485  } else
486  llvm_unreachable("unhandled generation");
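    // For example, a byte offset of 1020 (dword offset 255) still fits the
    // SI 8-bit encoding, while 1024 does not; CI+ and VI relax these limits
    // as checked above.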
487 
488  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
489  return true;
490 
491  if (AM.Scale == 1 && AM.HasBaseReg)
492  return true;
493 
494  return false;
495  }

  case AMDGPUAS::PRIVATE_ADDRESS:
    return isLegalMUBUFAddressingMode(AM);

  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS: {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
503  // field.
504  // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
505  // an 8-bit dword offset but we don't know the alignment here.
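    // For example, a ds_read_b32 can fold a byte offset up to 65535 directly
    // into the instruction; larger offsets need the address materialized in a
    // register first.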
506  if (!isUInt<16>(AM.BaseOffs))
507  return false;
508 
509  if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
510  return true;
511 
512  if (AM.Scale == 1 && AM.HasBaseReg)
513  return true;
514 
515  return false;
516  }
  case AMDGPUAS::FLAT_ADDRESS:
  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
    // For an unknown address space, this usually means that this is for some
520  // reason being used for pure arithmetic, and not based on some addressing
521  // computation. We don't have instructions that compute pointers with any
522  // addressing modes, so treat them as having no offset like flat
523  // instructions.
524  return isLegalFlatAddressingMode(AM);
525 
526  default:
527  llvm_unreachable("unhandled address space");
528  }
529 }
530 
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
533  unsigned Align,
534  bool *IsFast) const {
535  if (IsFast)
536  *IsFast = false;
537 
538  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
539  // which isn't a simple VT.
540  // Until MVT is extended to handle this, simply check for the size and
541  // rely on the condition below: allow accesses if the size is a multiple of 4.
542  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
543  VT.getStoreSize() > 16)) {
544  return false;
545  }
546 
547  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
548  AddrSpace == AMDGPUAS::REGION_ADDRESS) {
549  // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
550  // aligned, 8 byte access in a single operation using ds_read2/write2_b32
551  // with adjacent offsets.
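    // For example, a 4-byte aligned v2i32 LDS access at byte offset 8 can be
    // emitted as ds_read2_b32 with offset0=2, offset1=3 (offsets are in
    // dwords), so it is still reported as fast below.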
552  bool AlignedBy4 = (Align % 4 == 0);
553  if (IsFast)
554  *IsFast = AlignedBy4;
555 
556  return AlignedBy4;
557  }
558 
559  // FIXME: We have to be conservative here and assume that flat operations
560  // will access scratch. If we had access to the IR function, then we
561  // could determine if any private memory was used in the function.
  if (!Subtarget->hasUnalignedScratchAccess() &&
      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
564  AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
565  return false;
566  }
567 
  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have a uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
571  if (IsFast) {
572  *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
573  (Align % 4 == 0) : true;
574  }
575 
576  return true;
577  }
578 
579  // Smaller than dword value must be aligned.
580  if (VT.bitsLT(MVT::i32))
581  return false;
582 
583  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
584  // byte-address are ignored, thus forcing Dword alignment.
585  // This applies to private, global, and constant memory.
586  if (IsFast)
587  *IsFast = true;
588 
589  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
590 }
591 
592 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
593  unsigned SrcAlign, bool IsMemset,
594  bool ZeroMemset,
595  bool MemcpyStrSrc,
596  MachineFunction &MF) const {
597  // FIXME: Should account for address space here.
598 
599  // The default fallback uses the private pointer size as a guess for a type to
600  // use. Make sure we switch these to 64-bit accesses.
601 
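  // For example, a 64-byte memcpy whose destination is at least 4-byte aligned
  // is lowered with v4i32 (16-byte) accesses instead of a type guessed from
  // the 32-bit private pointer size.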
602  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
603  return MVT::v4i32;
604 
605  if (Size >= 8 && DstAlign >= 4)
606  return MVT::v2i32;
607 
608  // Use the default.
609  return MVT::Other;
610 }
611 
612 static bool isFlatGlobalAddrSpace(unsigned AS) {
613  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::FLAT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS;
}
617 
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
620  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
621 }
622 
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
625  const Value *Ptr = MemNode->getMemOperand()->getValue();
626  const Instruction *I = dyn_cast<Instruction>(Ptr);
627  return I && I->getMetadata("amdgpu.noclobber");
628 }
629 
bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
632  // Flat -> private/local is a simple truncate.
633  // Flat -> global is no-op
634  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
635  return true;
636 
637  return isNoopAddrSpaceCast(SrcAS, DestAS);
638 }
639 
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
642  const Value *Ptr = MemNode->getMemOperand()->getValue();
643 
644  // UndefValue means this is a load of a kernel input. These are uniform.
645  // Sometimes LDS instructions have constant pointers.
646  // If Ptr is null, then that means this mem operand contains a
647  // PseudoSourceValue like GOT.
648  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
649  isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
650  return true;
651 
652  const Instruction *I = dyn_cast<Instruction>(Ptr);
653  return I && I->getMetadata("amdgpu.uniform");
654 }
655 
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
663 
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
666  // FIXME: Could be smarter if called for vector constants.
667  return true;
668 }
669 
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
672  switch (Op) {
673  case ISD::LOAD:
674  case ISD::STORE:
675 
676  // These operations are done with 32-bit instructions anyway.
677  case ISD::AND:
678  case ISD::OR:
679  case ISD::XOR:
680  case ISD::SELECT:
681  // TODO: Extensions?
682  return true;
683  default:
684  return false;
685  }
686  }
687 
688  // SimplifySetCC uses this function to determine whether or not it should
689  // create setcc with i1 operands. We don't have instructions for i1 setcc.
690  if (VT == MVT::i1 && Op == ISD::SETCC)
691  return false;
692 
  return TargetLowering::isTypeDesirableForOp(Op, VT);
}
695 
696 SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
697  const SDLoc &SL, SDValue Chain,
698  unsigned Offset) const {
699  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
702  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
703 
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
707  MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
708  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
709  DAG.getConstant(Offset, SL, PtrVT));
710 }
711 
712 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
713  const SDLoc &SL, SDValue Chain,
714  unsigned Offset, bool Signed,
715  const ISD::InputArg *Arg) const {
716  const DataLayout &DL = DAG.getDataLayout();
717  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
720 
721  unsigned Align = DL.getABITypeAlignment(Ty);
722 
723  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                             MachineMemOperand::MONonTemporal |
                             MachineMemOperand::MODereferenceable |
                             MachineMemOperand::MOInvariant);
728 
729  SDValue Val = Load;
730  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
731  VT.bitsLT(MemVT)) {
732  unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
733  Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
734  }
735 
736  if (MemVT.isFloatingPoint())
737  Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
738  else if (Signed)
739  Val = DAG.getSExtOrTrunc(Val, SL, VT);
740  else
741  Val = DAG.getZExtOrTrunc(Val, SL, VT);
742 
743  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
744 }
745 
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
748  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
749  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
750  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
751 
  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
756 
757  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
758  const Function *Fn = MF.getFunction();
759  DiagnosticInfoUnsupported NoGraphicsHSA(
760  *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
761  DAG.getContext()->diagnose(NoGraphicsHSA);
762  return DAG.getEntryNode();
763  }
764 
765  // Create stack objects that are used for emitting debugger prologue if
766  // "amdgpu-debugger-emit-prologue" attribute was specified.
767  if (ST.debuggerEmitPrologue())
768  createDebuggerPrologueStackObjects(MF);
769 
  SmallVector<ISD::InputArg, 16> Splits;
  BitVector Skipped(Ins.size());
772 
773  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
774  const ISD::InputArg &Arg = Ins[i];
775 
776  // First check if it's a PS input addr
777  if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
778  !Arg.Flags.isByVal() && PSInputNum <= 15) {
779 
780  if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
781  // We can safely skip PS inputs
782  Skipped.set(i);
783  ++PSInputNum;
784  continue;
785  }
786 
787  Info->markPSInputAllocated(PSInputNum);
788  if (Arg.Used)
789  Info->PSInputEna |= 1 << PSInputNum;
790 
791  ++PSInputNum;
792  }
793 
794  if (AMDGPU::isShader(CallConv)) {
795  // Second split vertices into their elements
796  if (Arg.VT.isVector()) {
797  ISD::InputArg NewArg = Arg;
798  NewArg.Flags.setSplit();
799  NewArg.VT = Arg.VT.getVectorElementType();
800 
801  // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
802  // three or five element vertex only needs three or five registers,
803  // NOT four or eight.
804  Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
805  unsigned NumElements = ParamType->getVectorNumElements();
806 
807  for (unsigned j = 0; j != NumElements; ++j) {
808  Splits.push_back(NewArg);
809  NewArg.PartOffset += NewArg.VT.getStoreSize();
810  }
811  } else {
812  Splits.push_back(Arg);
813  }
814  }
815  }
816 
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
819  *DAG.getContext());
820 
821  // At least one interpolation mode must be enabled or else the GPU will hang.
822  //
823  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
824  // PSInputAddr, the user wants to enable some bits after the compilation
825  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
827  // responsibility for the correct programming.
828  //
829  // Otherwise, the following restrictions apply:
830  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
831  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
832  // enabled too.
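  // For example, a pixel shader that reads no interpolants at all would leave
  // PSInputAddr empty, so the code below force-enables the first PERSP_* mode
  // (bit 0) and allocates VGPR0/VGPR1 for it to satisfy the hardware.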
833  if (CallConv == CallingConv::AMDGPU_PS &&
834  ((Info->getPSInputAddr() & 0x7F) == 0 ||
835  ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
836  CCInfo.AllocateReg(AMDGPU::VGPR0);
837  CCInfo.AllocateReg(AMDGPU::VGPR1);
838  Info->markPSInputAllocated(0);
839  Info->PSInputEna |= 1;
840  }
841 
842  if (!AMDGPU::isShader(CallConv)) {
843  assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
844  } else {
845  assert(!Info->hasDispatchPtr() &&
846  !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
847  !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
848  !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
849  !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
850  !Info->hasWorkItemIDZ());
851  }
852 
853  if (Info->hasPrivateMemoryInputPtr()) {
854  unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
855  MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
856  CCInfo.AllocateReg(PrivateMemoryPtrReg);
857  }
858 
859  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
860  if (Info->hasPrivateSegmentBuffer()) {
861  unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
862  MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
863  CCInfo.AllocateReg(PrivateSegmentBufferReg);
864  }
865 
866  if (Info->hasDispatchPtr()) {
867  unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
868  MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
869  CCInfo.AllocateReg(DispatchPtrReg);
870  }
871 
872  if (Info->hasQueuePtr()) {
873  unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
874  MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
875  CCInfo.AllocateReg(QueuePtrReg);
876  }
877 
878  if (Info->hasKernargSegmentPtr()) {
879  unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
880  MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
881  CCInfo.AllocateReg(InputPtrReg);
882  }
883 
884  if (Info->hasDispatchID()) {
885  unsigned DispatchIDReg = Info->addDispatchID(*TRI);
886  MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
887  CCInfo.AllocateReg(DispatchIDReg);
888  }
889 
890  if (Info->hasFlatScratchInit()) {
891  unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
892  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
893  CCInfo.AllocateReg(FlatScratchInitReg);
894  }
895 
896  if (!AMDGPU::isShader(CallConv))
897  analyzeFormalArgumentsCompute(CCInfo, Ins);
898  else
899  AnalyzeFormalArguments(CCInfo, Splits);
900 
  SmallVector<SDValue, 16> Chains;

903  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
904 
905  const ISD::InputArg &Arg = Ins[i];
906  if (Skipped[i]) {
907  InVals.push_back(DAG.getUNDEF(Arg.VT));
908  continue;
909  }
910 
911  CCValAssign &VA = ArgLocs[ArgIdx++];
912  MVT VT = VA.getLocVT();
913 
914  if (VA.isMemLoc()) {
915  VT = Ins[i].VT;
916  EVT MemVT = VA.getLocVT();
917  const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
918  VA.getLocMemOffset();
      // The first 36 bytes of the input buffer contain information about
920  // thread group and global sizes.
921  SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
922  Offset, Ins[i].Flags.isSExt(),
923  &Ins[i]);
924  Chains.push_back(Arg.getValue(1));
925 
926  auto *ParamTy =
927  dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
930  // On SI local pointers are just offsets into LDS, so they are always
931  // less than 16-bits. On CI and newer they could potentially be
932  // real pointers, so we can't guarantee their size.
933  Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
934  DAG.getValueType(MVT::i16));
935  }
936 
937  InVals.push_back(Arg);
938  Info->setABIArgOffset(Offset + MemVT.getStoreSize());
939  continue;
940  }
941  assert(VA.isRegLoc() && "Parameter must be in a register!");
942 
943  unsigned Reg = VA.getLocReg();
944 
945  if (VT == MVT::i64) {
946  // For now assume it is a pointer
947  Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
948  &AMDGPU::SGPR_64RegClass);
949  Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass);
950  SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
951  InVals.push_back(Copy);
952  continue;
953  }
954 
955  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
956 
957  Reg = MF.addLiveIn(Reg, RC);
958  SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
959 
960  if (Arg.VT.isVector()) {
961 
962  // Build a vector from the registers
963  Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
964  unsigned NumElements = ParamType->getVectorNumElements();
965 
      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
968  for (unsigned j = 1; j != NumElements; ++j) {
969  Reg = ArgLocs[ArgIdx++].getLocReg();
970  Reg = MF.addLiveIn(Reg, RC);
971 
972  SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
973  Regs.push_back(Copy);
974  }
975 
976  // Fill up the missing vector elements
977  NumElements = Arg.VT.getVectorNumElements() - NumElements;
978  Regs.append(NumElements, DAG.getUNDEF(VT));
979 
980  InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
981  continue;
982  }
983 
984  InVals.push_back(Val);
985  }
986 
987  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
988  // these from the dispatch pointer.
989 
990  // Start adding system SGPRs.
991  if (Info->hasWorkGroupIDX()) {
992  unsigned Reg = Info->addWorkGroupIDX();
993  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
994  CCInfo.AllocateReg(Reg);
995  }
996 
997  if (Info->hasWorkGroupIDY()) {
998  unsigned Reg = Info->addWorkGroupIDY();
999  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1000  CCInfo.AllocateReg(Reg);
1001  }
1002 
1003  if (Info->hasWorkGroupIDZ()) {
1004  unsigned Reg = Info->addWorkGroupIDZ();
1005  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1006  CCInfo.AllocateReg(Reg);
1007  }
1008 
1009  if (Info->hasWorkGroupInfo()) {
1010  unsigned Reg = Info->addWorkGroupInfo();
1011  MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1012  CCInfo.AllocateReg(Reg);
1013  }
1014 
1015  if (Info->hasPrivateSegmentWaveByteOffset()) {
1016  // Scratch wave offset passed in system SGPR.
1017  unsigned PrivateSegmentWaveByteOffsetReg;
1018 
1019  if (AMDGPU::isShader(CallConv)) {
1020  PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1021  Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1022  } else
1023  PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
1024 
1025  MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1026  CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1027  }
1028 
1029  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
1031  bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
1032  // Record that we know we have non-spill stack objects so we don't need to
1033  // check all stack objects later.
1034  if (HasStackObjects)
1035  Info->setHasNonSpillStackObjects(true);
1036 
1037  // Everything live out of a block is spilled with fast regalloc, so it's
1038  // almost certain that spilling will be required.
1039  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
1040  HasStackObjects = true;
1041 
1042  if (ST.isAmdCodeObjectV2(MF)) {
1043  if (HasStackObjects) {
1044  // If we have stack objects, we unquestionably need the private buffer
1045  // resource. For the Code Object V2 ABI, this will be the first 4 user
1046  // SGPR inputs. We can reserve those and use them directly.
1047 
      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
      Info->setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1055  } else {
      unsigned ReservedBufferReg
        = TRI->reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
1060 
1061  // We tentatively reserve the last registers (skipping the last two
1062  // which may contain VCC). After register allocation, we'll replace
1063  // these with the ones immediately after those which were really
1064  // allocated. In the prologue copies will be inserted from the argument
1065  // to these reserved registers.
1066  Info->setScratchRSrcReg(ReservedBufferReg);
1067  Info->setScratchWaveOffsetReg(ReservedOffsetReg);
1068  }
1069  } else {
1070  unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
1071 
1072  // Without HSA, relocations are used for the scratch pointer and the
1073  // buffer resource setup is always inserted in the prologue. Scratch wave
1074  // offset is still in an input SGPR.
1075  Info->setScratchRSrcReg(ReservedBufferReg);
1076 
1077  if (HasStackObjects) {
1078  unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1081  } else {
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
1084  Info->setScratchWaveOffsetReg(ReservedOffsetReg);
1085  }
1086  }
1087 
1088  if (Info->hasWorkItemIDX()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1091  CCInfo.AllocateReg(Reg);
1092  }
1093 
1094  if (Info->hasWorkItemIDY()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1097  CCInfo.AllocateReg(Reg);
1098  }
1099 
1100  if (Info->hasWorkItemIDZ()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1103  CCInfo.AllocateReg(Reg);
1104  }
1105 
1106  if (Chains.empty())
1107  return Chain;
1108 
1109  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1110 }
1111 
1112 SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
1115  const SmallVectorImpl<ISD::OutputArg> &Outs,
1116  const SmallVectorImpl<SDValue> &OutVals,
1117  const SDLoc &DL, SelectionDAG &DAG) const {
1118  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

1121  if (!AMDGPU::isShader(CallConv))
1122  return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
1123  OutVals, DL, DAG);
1124 
1125  Info->setIfReturnsVoid(Outs.size() == 0);
1126 
  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;
1129 
1130  // Split vectors into their elements.
1131  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
1132  const ISD::OutputArg &Out = Outs[i];
1133 
1134  if (Out.VT.isVector()) {
1135  MVT VT = Out.VT.getVectorElementType();
1136  ISD::OutputArg NewOut = Out;
1137  NewOut.Flags.setSplit();
1138  NewOut.VT = VT;
1139 
1140  // We want the original number of vector elements here, e.g.
1141  // three or five, not four or eight.
1142  unsigned NumElements = Out.ArgVT.getVectorNumElements();
1143 
1144  for (unsigned j = 0; j != NumElements; ++j) {
1145  SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
1146  DAG.getConstant(j, DL, MVT::i32));
1147  SplitVals.push_back(Elem);
1148  Splits.push_back(NewOut);
1149  NewOut.PartOffset += NewOut.VT.getStoreSize();
1150  }
1151  } else {
1152  SplitVals.push_back(OutVals[i]);
1153  Splits.push_back(Out);
1154  }
1155  }
1156 
1157  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

1160  // CCState - Info about the registers and stack slots.
1161  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1162  *DAG.getContext());
1163 
1164  // Analyze outgoing return values.
1165  AnalyzeReturn(CCInfo, Splits);
1166 
1167  SDValue Flag;
1168  SmallVector<SDValue, 48> RetOps;
1169  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1170 
1171  // Copy the result values into the output registers.
1172  for (unsigned i = 0, realRVLocIdx = 0;
1173  i != RVLocs.size();
1174  ++i, ++realRVLocIdx) {
1175  CCValAssign &VA = RVLocs[i];
1176  assert(VA.isRegLoc() && "Can only return in registers!");
1177 
1178  SDValue Arg = SplitVals[realRVLocIdx];
1179 
1180  // Copied from other backends.
1181  switch (VA.getLocInfo()) {
1182  default: llvm_unreachable("Unknown loc info!");
1183  case CCValAssign::Full:
1184  break;
1185  case CCValAssign::BCvt:
1186  Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
1187  break;
1188  }
1189 
1190  Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
1191  Flag = Chain.getValue(1);
1192  RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1193  }
1194 
1195  // Update chain and glue.
1196  RetOps[0] = Chain;
1197  if (Flag.getNode())
1198  RetOps.push_back(Flag);
1199 
1200  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
1201  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
1202 }
1203 
1204 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
1205  SelectionDAG &DAG) const {
1206  unsigned Reg = StringSwitch<unsigned>(RegName)
1207  .Case("m0", AMDGPU::M0)
1208  .Case("exec", AMDGPU::EXEC)
1209  .Case("exec_lo", AMDGPU::EXEC_LO)
1210  .Case("exec_hi", AMDGPU::EXEC_HI)
1211  .Case("flat_scratch", AMDGPU::FLAT_SCR)
1212  .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
1213  .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
1214  .Default(AMDGPU::NoRegister);
1215 
1216  if (Reg == AMDGPU::NoRegister) {
1217  report_fatal_error(Twine("invalid register name \""
1218  + StringRef(RegName) + "\"."));
1219 
1220  }
1221 
1223  Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
1224  report_fatal_error(Twine("invalid register \""
1225  + StringRef(RegName) + "\" for subtarget."));
1226  }
1227 
1228  switch (Reg) {
1229  case AMDGPU::M0:
1230  case AMDGPU::EXEC_LO:
1231  case AMDGPU::EXEC_HI:
1232  case AMDGPU::FLAT_SCR_LO:
1233  case AMDGPU::FLAT_SCR_HI:
1234  if (VT.getSizeInBits() == 32)
1235  return Reg;
1236  break;
1237  case AMDGPU::EXEC:
1238  case AMDGPU::FLAT_SCR:
1239  if (VT.getSizeInBits() == 64)
1240  return Reg;
1241  break;
1242  default:
1243  llvm_unreachable("missing register type checking");
1244  }
1245 
1246  report_fatal_error(Twine("invalid type for register \""
1247  + StringRef(RegName) + "\"."));
1248 }
1249 
1250 // If kill is not the last instruction, split the block so kill is always a
1251 // proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
1254  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1255 
1256  MachineBasicBlock::iterator SplitPoint(&MI);
1257  ++SplitPoint;
1258 
1259  if (SplitPoint == BB->end()) {
1260  // Don't bother with a new block.
1261  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
1262  return BB;
1263  }
1264 
1265  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

1269  MF->insert(++MachineFunction::iterator(BB), SplitBB);
1270  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
1271 
1272  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
1273  BB->addSuccessor(SplitBB);
1274 
1275  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
1276  return SplitBB;
1277 }
1278 
1279 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
1280 // wavefront. If the value is uniform and just happens to be in a VGPR, this
1281 // will only do one iteration. In the worst case, this will loop 64 times.
1282 //
1283 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
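// The structure produced here is roughly:
//
//   loop:
//     s_cur = v_readfirstlane_b32 idx
//     cond  = v_cmp_eq_u32 s_cur, idx
//     m0    = s_cur (+ offset)        ; or s_set_gpr_idx_idx in GPR-index mode
//     saved = s_and_saveexec_b64 cond ; exec &= cond
//     ... the indexed access is inserted here by the caller ...
//     exec  = s_xor_b64 exec, saved   ; clear the lanes that were just handled
//     s_cbranch_execnz loop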
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
  const SIInstrInfo *TII,
1286  MachineRegisterInfo &MRI,
1287  MachineBasicBlock &OrigBB,
1288  MachineBasicBlock &LoopBB,
1289  const DebugLoc &DL,
1290  const MachineOperand &IdxReg,
1291  unsigned InitReg,
1292  unsigned ResultReg,
1293  unsigned PhiReg,
1294  unsigned InitSaveExecReg,
1295  int Offset,
1296  bool UseGPRIdxMode) {
1297  MachineBasicBlock::iterator I = LoopBB.begin();
1298 
1299  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1300  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1301  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1302  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1303 
1304  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
1305  .addReg(InitReg)
1306  .addMBB(&OrigBB)
1307  .addReg(ResultReg)
1308  .addMBB(&LoopBB);
1309 
1310  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
1311  .addReg(InitSaveExecReg)
1312  .addMBB(&OrigBB)
1313  .addReg(NewExec)
1314  .addMBB(&LoopBB);
1315 
1316  // Read the next variant <- also loop target.
1317  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
1318  .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
1319 
1320  // Compare the just read M0 value to all possible Idx values.
1321  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
1322  .addReg(CurrentIdxReg)
1323  .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
1324 
1325  if (UseGPRIdxMode) {
1326  unsigned IdxReg;
1327  if (Offset == 0) {
1328  IdxReg = CurrentIdxReg;
1329  } else {
1330  IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1331  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
1332  .addReg(CurrentIdxReg, RegState::Kill)
1333  .addImm(Offset);
1334  }
1335 
1336  MachineInstr *SetIdx =
1337  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
1338  .addReg(IdxReg, RegState::Kill);
1339  SetIdx->getOperand(2).setIsUndef();
1340  } else {
1341  // Move index from VCC into M0
1342  if (Offset == 0) {
1343  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1344  .addReg(CurrentIdxReg, RegState::Kill);
1345  } else {
1346  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
1347  .addReg(CurrentIdxReg, RegState::Kill)
1348  .addImm(Offset);
1349  }
1350  }
1351 
1352  // Update EXEC, save the original EXEC value to VCC.
1353  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
1354  .addReg(CondReg, RegState::Kill);
1355 
1356  MRI.setSimpleHint(NewExec, CondReg);
1357 
1358  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1359  MachineInstr *InsertPt =
1360  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
1361  .addReg(AMDGPU::EXEC)
1362  .addReg(NewExec);
1363 
1364  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1365  // s_cbranch_scc0?
1366 
1367  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1368  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
1369  .addMBB(&LoopBB);
1370 
1371  return InsertPt->getIterator();
1372 }
1373 
1374 // This has slightly sub-optimal regalloc when the source vector is killed by
1375 // the read. The register allocator does not understand that the kill is
// per-workitem, so it is kept alive for the whole loop and we end up not
// re-using a subregister from it, using 1 more VGPR than necessary. This was
// avoided when this was expanded after register allocation.
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                  MachineBasicBlock &MBB,
                                                  MachineInstr &MI,
1382  unsigned InitResultReg,
1383  unsigned PhiReg,
1384  int Offset,
1385  bool UseGPRIdxMode) {
1386  MachineFunction *MF = MBB.getParent();
1387  MachineRegisterInfo &MRI = MF->getRegInfo();
1388  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

1391  unsigned DstReg = MI.getOperand(0).getReg();
1392  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1393  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1394 
1395  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
1396 
1397  // Save the EXEC mask
1398  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
1399  .addReg(AMDGPU::EXEC);
1400 
1401  // To insert the loop we need to split the block. Move everything after this
1402  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
1405  MachineFunction::iterator MBBI(MBB);
1406  ++MBBI;
1407 
1408  MF->insert(MBBI, LoopBB);
1409  MF->insert(MBBI, RemainderBB);
1410 
1411  LoopBB->addSuccessor(LoopBB);
1412  LoopBB->addSuccessor(RemainderBB);
1413 
1414  // Move the rest of the block into a new block.
1415  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
1416  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
1417 
1418  MBB.addSuccessor(LoopBB);
1419 
1420  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
1421 
1422  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
1423  InitResultReg, DstReg, PhiReg, TmpExec,
1424  Offset, UseGPRIdxMode);
1425 
1426  MachineBasicBlock::iterator First = RemainderBB->begin();
1427  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
1428  .addReg(SaveExec);
1429 
1430  return InsPt;
1431 }
1432 
1433 // Returns subreg index, offset
1434 static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC,
1437  unsigned VecReg,
1438  int Offset) {
1439  int NumElts = SuperRC->getSize() / 4;
1440 
1441  // Skip out of bounds offsets, or else we would end up using an undefined
1442  // register.
1443  if (Offset >= NumElts || Offset < 0)
1444  return std::make_pair(AMDGPU::sub0, Offset);
1445 
1446  return std::make_pair(AMDGPU::sub0 + Offset, 0);
1447 }
1448 
1449 // Return true if the index is an SGPR and was set.
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
                                 MachineRegisterInfo &MRI,
1452  MachineInstr &MI,
1453  int Offset,
1454  bool UseGPRIdxMode,
1455  bool IsIndirectSrc) {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

1460  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
1461  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
1462 
1463  assert(Idx->getReg() != AMDGPU::NoRegister);
1464 
1465  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
1466  return false;
1467 
1468  if (UseGPRIdxMode) {
    unsigned IdxMode = IsIndirectSrc ?
      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
1471  if (Offset == 0) {
1472  MachineInstr *SetOn =
1473  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
1474  .addOperand(*Idx)
1475  .addImm(IdxMode);
1476 
1477  SetOn->getOperand(3).setIsUndef();
1478  } else {
1479  unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1480  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
1481  .addOperand(*Idx)
1482  .addImm(Offset);
1483  MachineInstr *SetOn =
1484  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
1485  .addReg(Tmp, RegState::Kill)
1486  .addImm(IdxMode);
1487 
1488  SetOn->getOperand(3).setIsUndef();
1489  }
1490 
1491  return true;
1492  }
1493 
1494  if (Offset == 0) {
1495  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1496  .addOperand(*Idx);
1497  } else {
1498  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
1499  .addOperand(*Idx)
1500  .addImm(Offset);
1501  }
1502 
1503  return true;
1504 }
1505 
1506 // Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
1510  const SIInstrInfo *TII = ST.getInstrInfo();
1511  const SIRegisterInfo &TRI = TII->getRegisterInfo();
1512  MachineFunction *MF = MBB.getParent();
1513  MachineRegisterInfo &MRI = MF->getRegInfo();
1514 
1515  unsigned Dst = MI.getOperand(0).getReg();
1516  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
1517  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
1518 
1519  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
1520 
1521  unsigned SubReg;
1522  std::tie(SubReg, Offset)
1523  = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
1524 
1525  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
1526 
1527  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();
1530 
1531  if (UseGPRIdxMode) {
1532  // TODO: Look at the uses to avoid the copy. This may require rescheduling
1533  // to avoid interfering with other uses, so probably requires a new
1534  // optimization pass.
1535  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
1536  .addReg(SrcReg, RegState::Undef, SubReg)
1537  .addReg(SrcReg, RegState::Implicit)
1538  .addReg(AMDGPU::M0, RegState::Implicit);
1539  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
1540  } else {
1541  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
1542  .addReg(SrcReg, RegState::Undef, SubReg)
1543  .addReg(SrcReg, RegState::Implicit);
1544  }
1545 
1546  MI.eraseFromParent();
1547 
1548  return &MBB;
1549  }
1550 
1551 
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

1555  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1556  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1557 
1558  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
1559 
1560  if (UseGPRIdxMode) {
1561  MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
1562  .addImm(0) // Reset inside loop.
1564  SetOn->getOperand(3).setIsUndef();
1565 
1566  // Disable again after the loop.
1567  BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
1568  }
1569 
1570  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
1571  MachineBasicBlock *LoopBB = InsPt->getParent();
1572 
1573  if (UseGPRIdxMode) {
1574  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
1575  .addReg(SrcReg, RegState::Undef, SubReg)
1576  .addReg(SrcReg, RegState::Implicit)
1577  .addReg(AMDGPU::M0, RegState::Implicit);
1578  } else {
1579  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
1580  .addReg(SrcReg, RegState::Undef, SubReg)
1581  .addReg(SrcReg, RegState::Implicit);
1582  }
1583 
1584  MI.eraseFromParent();
1585 
1586  return LoopBB;
1587 }
1588 
1589 static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
1590  switch (VecRC->getSize()) {
1591  case 4:
1592  return AMDGPU::V_MOVRELD_B32_V1;
1593  case 8:
1594  return AMDGPU::V_MOVRELD_B32_V2;
1595  case 16:
1596  return AMDGPU::V_MOVRELD_B32_V4;
1597  case 32:
1598  return AMDGPU::V_MOVRELD_B32_V8;
1599  case 64:
1600  return AMDGPU::V_MOVRELD_B32_V16;
1601  default:
1602  llvm_unreachable("unsupported size for MOVRELD pseudos");
1603  }
1604 }
1605 
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
1609  const SIInstrInfo *TII = ST.getInstrInfo();
1610  const SIRegisterInfo &TRI = TII->getRegisterInfo();
1611  MachineFunction *MF = MBB.getParent();
1612  MachineRegisterInfo &MRI = MF->getRegInfo();
1613 
1614  unsigned Dst = MI.getOperand(0).getReg();
1615  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
1616  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
1617  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
1618  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
1619  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
1620 
1621  // This can be an immediate, but will be folded later.
1622  assert(Val->getReg());
1623 
1624  unsigned SubReg;
1625  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
1626  SrcVec->getReg(),
1627  Offset);
1628  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
1629 
1630  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();
1633 
1634  assert(Offset == 0);
1635 
1636  BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
1637  .addOperand(*SrcVec)
1638  .addOperand(*Val)
1639  .addImm(SubReg);
1640 
1641  MI.eraseFromParent();
1642  return &MBB;
1643  }
1644 
1645  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();
1648 
1649  if (UseGPRIdxMode) {
1650  BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
1651  .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
1652  .addOperand(*Val)
1653  .addReg(Dst, RegState::ImplicitDefine)
1654  .addReg(SrcVec->getReg(), RegState::Implicit)
1655  .addReg(AMDGPU::M0, RegState::Implicit);
1656 
1657  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
1658  } else {
1659  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
1660 
1661  BuildMI(MBB, I, DL, MovRelDesc)
1662  .addReg(Dst, RegState::Define)
1663  .addReg(SrcVec->getReg())
1664  .addOperand(*Val)
1665  .addImm(SubReg - AMDGPU::sub0);
1666  }
1667 
1668  MI.eraseFromParent();
1669  return &MBB;
1670  }
1671 
1672  if (Val->isReg())
1673  MRI.clearKillFlags(Val->getReg());
1674 
1675  const DebugLoc &DL = MI.getDebugLoc();
1676 
1677  if (UseGPRIdxMode) {
    MachineBasicBlock::iterator I(&MI);

1680  MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
1681  .addImm(0) // Reset inside loop.
1683  SetOn->getOperand(3).setIsUndef();
1684 
1685  // Disable again after the loop.
1686  BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
1687  }
1688 
1689  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
1690 
1691  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
1692  Offset, UseGPRIdxMode);
1693  MachineBasicBlock *LoopBB = InsPt->getParent();
1694 
1695  if (UseGPRIdxMode) {
1696  BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
1697  .addReg(PhiReg, RegState::Undef, SubReg) // vdst
1698  .addOperand(*Val) // src0
      .addReg(Dst, RegState::ImplicitDefine)
      .addReg(PhiReg, RegState::Implicit)
1701  .addReg(AMDGPU::M0, RegState::Implicit);
1702  } else {
1703  const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
1704 
1705  BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
1706  .addReg(Dst, RegState::Define)
1707  .addReg(PhiReg)
1708  .addOperand(*Val)
1709  .addImm(SubReg - AMDGPU::sub0);
1710  }
1711 
1712  MI.eraseFromParent();
1713 
1714  return LoopBB;
1715 }
1716 
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr &MI, MachineBasicBlock *BB) const {
1719 
1720  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1721  MachineFunction *MF = BB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

1724  if (TII->isMIMG(MI)) {
1725  if (!MI.memoperands_empty())
1726  return BB;
1727  // Add a memoperand for mimg instructions so that they aren't assumed to
    // be ordered memory instructions.
1729 
1730  MachinePointerInfo PtrInfo(MFI->getImagePSV());
    MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
    if (MI.mayStore())
1733  Flags |= MachineMemOperand::MOStore;
1734 
1735  if (MI.mayLoad())
1736  Flags |= MachineMemOperand::MOLoad;
1737 
1738  auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
1739  MI.addMemOperand(*MF, MMO);
1740  return BB;
1741  }
1742 
1743  switch (MI.getOpcode()) {
1744  case AMDGPU::SI_INIT_M0: {
1745  BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
1746  TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1747  .addOperand(MI.getOperand(0));
1748  MI.eraseFromParent();
1749  return BB;
1750  }
1751  case AMDGPU::GET_GROUPSTATICSIZE: {
1752  DebugLoc DL = MI.getDebugLoc();
1753  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
1754  .addOperand(MI.getOperand(0))
1755  .addImm(MFI->getLDSSize());
1756  MI.eraseFromParent();
1757  return BB;
1758  }
1759  case AMDGPU::SI_INDIRECT_SRC_V1:
1760  case AMDGPU::SI_INDIRECT_SRC_V2:
1761  case AMDGPU::SI_INDIRECT_SRC_V4:
1762  case AMDGPU::SI_INDIRECT_SRC_V8:
1763  case AMDGPU::SI_INDIRECT_SRC_V16:
1764  return emitIndirectSrc(MI, *BB, *getSubtarget());
1765  case AMDGPU::SI_INDIRECT_DST_V1:
1766  case AMDGPU::SI_INDIRECT_DST_V2:
1767  case AMDGPU::SI_INDIRECT_DST_V4:
1768  case AMDGPU::SI_INDIRECT_DST_V8:
1769  case AMDGPU::SI_INDIRECT_DST_V16:
1770  return emitIndirectDst(MI, *BB, *getSubtarget());
1771  case AMDGPU::SI_KILL:
1772  return splitKillBlock(MI, BB);
1773  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
1774  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
1775 
1776  unsigned Dst = MI.getOperand(0).getReg();
1777  unsigned Src0 = MI.getOperand(1).getReg();
1778  unsigned Src1 = MI.getOperand(2).getReg();
1779  const DebugLoc &DL = MI.getDebugLoc();
1780  unsigned SrcCond = MI.getOperand(3).getReg();
1781 
1782  unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1783  unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1784 
1785  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
1786  .addReg(Src0, 0, AMDGPU::sub0)
1787  .addReg(Src1, 0, AMDGPU::sub0)
1788  .addReg(SrcCond);
1789  BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
1790  .addReg(Src0, 0, AMDGPU::sub1)
1791  .addReg(Src1, 0, AMDGPU::sub1)
1792  .addReg(SrcCond);
1793 
1794  BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
1795  .addReg(DstLo)
1796  .addImm(AMDGPU::sub0)
1797  .addReg(DstHi)
1798  .addImm(AMDGPU::sub1);
1799  MI.eraseFromParent();
1800  return BB;
1801  }
1802  case AMDGPU::SI_BR_UNDEF: {
1803  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1804  const DebugLoc &DL = MI.getDebugLoc();
1805  MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
1806  .addOperand(MI.getOperand(0));
1807  Br->getOperand(1).setIsUndef(true); // read undef SCC
1808  MI.eraseFromParent();
1809  return BB;
1810  }
1811  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  }
1814 }
1815 
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
1818  // free fneg'd operands. As long as we have fast FMA (controlled by
1819  // isFMAFasterThanFMulAndFAdd), we should perform these.
1820 
1821  // When fma is quarter rate, for f64 where add / sub are at best half rate,
1822  // most of these combines appear to be cycle neutral but save on instruction
1823  // count / code size.
1824  return true;
1825 }
1826 
EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
1829  if (!VT.isVector()) {
1830  return MVT::i1;
1831  }
1832  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
1833 }
1834 
MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
  // TODO: Should i16 be used always if legal? For now it would force VALU
1837  // shifts.
1838  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
1839 }
1840 
// Answering this is somewhat tricky and depends on the specific device, since
// different devices have different rates for fma or all f64 operations.
1843 //
1844 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
1845 // regardless of which device (although the number of cycles differs between
1846 // devices), so it is always profitable for f64.
1847 //
1848 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
1849 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
1850 // which we can always do even without fused FP ops since it returns the same
1851 // result as the separate operations and since it is always full
1852 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
1853 // however does not support denormals, so we do report fma as faster if we have
1854 // a fast fma device and require denormals.
1855 //
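// For example, with f32 denormals required on a subtarget with fast FMA we
// report fma as faster (v_mad_f32 would flush the denormals), while on a
// quarter-rate fma device without denormals we keep preferring v_mad_f32.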
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();
1858 
1859  if (!VT.isSimple())
1860  return false;
1861 
1862  switch (VT.getSimpleVT().SimpleTy) {
1863  case MVT::f32:
1864  // This is as fast on some subtargets. However, we always have full rate f32
1865  // mad available which returns the same result as the separate operations
1866  // which we should prefer over fma. We can't use this if we want to support
1867  // denormals, so only report this in these cases.
    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
  case MVT::f64:
1870  return true;
1871  case MVT::f16:
    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
  default:
1874  break;
1875  }
1876 
1877  return false;
1878 }
1879 
1880 //===----------------------------------------------------------------------===//
1881 // Custom DAG Lowering Operations
1882 //===----------------------------------------------------------------------===//
1883 
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
1886  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
1887  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
1888  case ISD::LOAD: {
1889  SDValue Result = LowerLOAD(Op, DAG);
1890  assert((!Result.getNode() ||
1891  Result.getNode()->getNumValues() == 2) &&
1892  "Load should return a value and a chain");
1893  return Result;
1894  }
1895 
1896  case ISD::FSIN:
1897  case ISD::FCOS:
1898  return LowerTrig(Op, DAG);
1899  case ISD::SELECT: return LowerSELECT(Op, DAG);
1900  case ISD::FDIV: return LowerFDIV(Op, DAG);
1901  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
1902  case ISD::STORE: return LowerSTORE(Op, DAG);
1903  case ISD::GlobalAddress: {
1904  MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
1907  }
1908  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
1909  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
1910  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
1911  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
1912  case ISD::TRAP: return lowerTRAP(Op, DAG);
1913  case ISD::FP_ROUND:
1914  return lowerFP_ROUND(Op, DAG);
1915  }
1916  return SDValue();
1917 }
1918 
1919 /// \brief Helper function for LowerBRCOND
1920 static SDNode *findUser(SDValue Value, unsigned Opcode) {
1921 
1922  SDNode *Parent = Value.getNode();
1923  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
1924  I != E; ++I) {
1925 
1926  if (I.getUse().get() != Value)
1927  continue;
1928 
1929  if (I->getOpcode() == Opcode)
1930  return *I;
1931  }
1932  return nullptr;
1933 }
1934 
1935 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
1936  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1937  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
1938  case AMDGPUIntrinsic::amdgcn_if:
1939  case AMDGPUIntrinsic::amdgcn_else:
1940  case AMDGPUIntrinsic::amdgcn_end_cf:
1941  case AMDGPUIntrinsic::amdgcn_loop:
1942  return true;
1943  default:
1944  return false;
1945  }
1946  }
1947 
1948  if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
1949  switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
1950  case AMDGPUIntrinsic::amdgcn_break:
1951  case AMDGPUIntrinsic::amdgcn_if_break:
1952  case AMDGPUIntrinsic::amdgcn_else_break:
1953  return true;
1954  default:
1955  return false;
1956  }
1957  }
1958 
1959  return false;
1960 }
1961 
1962 void SITargetLowering::createDebuggerPrologueStackObjects(
1963  MachineFunction &MF) const {
1964  // Create stack objects that are used for emitting debugger prologue.
1965  //
1966  // Debugger prologue writes work group IDs and work item IDs to scratch memory
1967  // at fixed location in the following format:
1968  // offset 0: work group ID x
1969  // offset 4: work group ID y
1970  // offset 8: work group ID z
1971  // offset 16: work item ID x
1972  // offset 20: work item ID y
1973  // offset 24: work item ID z
1974  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1975  int ObjectIdx = 0;
1976 
1977  // For each dimension:
1978  for (unsigned i = 0; i < 3; ++i) {
1979  // Create fixed stack object for work group ID.
1980  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
1981  Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
1982  // Create fixed stack object for work item ID.
1983  ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
1984  Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
1985  }
1986 }
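// Illustrative sketch (added commentary, not part of the original file): the
// fixed scratch layout created by the loop above, spelled out as a struct.
// The field names are made up; only the byte offsets come from the comment.
#include <cstddef>
#include <cstdint>
struct DebuggerPrologueScratchSketch {
  uint32_t WorkGroupIDX;   // offset 0
  uint32_t WorkGroupIDY;   // offset 4
  uint32_t WorkGroupIDZ;   // offset 8
  uint32_t Reserved;       // offset 12, keeps the work item IDs at offset 16
  uint32_t WorkItemIDX;    // offset 16
  uint32_t WorkItemIDY;    // offset 20
  uint32_t WorkItemIDZ;    // offset 24
};
static_assert(offsetof(DebuggerPrologueScratchSketch, WorkItemIDX) == 16,
              "work item IDs start at byte offset 16, as documented above");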
1987 
1988 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
1989  const Triple &TT = getTargetMachine().getTargetTriple();
1990  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1992 }
1993 
1994 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
1995  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
1997  !shouldEmitFixup(GV) &&
1999 }
2000 
2001 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
2002  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
2003 }
2004 
2005 /// This transforms the control flow intrinsics to get the branch destination as
2006 /// the last parameter; it also switches the branch target with BR if the need arises.
2007 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
2008  SelectionDAG &DAG) const {
2009 
2010  SDLoc DL(BRCOND);
2011 
2012  SDNode *Intr = BRCOND.getOperand(1).getNode();
2013  SDValue Target = BRCOND.getOperand(2);
2014  SDNode *BR = nullptr;
2015  SDNode *SetCC = nullptr;
2016 
2017  if (Intr->getOpcode() == ISD::SETCC) {
2018  // As long as we negate the condition everything is fine
2019  SetCC = Intr;
2020  Intr = SetCC->getOperand(0).getNode();
2021 
2022  } else {
2023  // Get the target from BR if we don't negate the condition
2024  BR = findUser(BRCOND, ISD::BR);
2025  Target = BR->getOperand(1);
2026  }
2027 
2028  // FIXME: This changes the types of the intrinsics instead of introducing new
2029  // nodes with the correct types.
2030  // e.g. llvm.amdgcn.loop
2031 
2032  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
2033  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
2034 
2035  if (!isCFIntrinsic(Intr)) {
2036  // This is a uniform branch so we don't need to legalize.
2037  return BRCOND;
2038  }
2039 
2040  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
2041  Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
2042 
2043  assert(!SetCC ||
2044  (SetCC->getConstantOperandVal(1) == 1 &&
2045  cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
2046  ISD::SETNE));
2047 
2048  // operands of the new intrinsic call
2050  if (HaveChain)
2051  Ops.push_back(BRCOND.getOperand(0));
2052 
2053  Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
2054  Ops.push_back(Target);
2055 
2056  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
2057 
2058  // build the new intrinsic call
2059  SDNode *Result = DAG.getNode(
2060  Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
2061  DAG.getVTList(Res), Ops).getNode();
2062 
2063  if (!HaveChain) {
2064  SDValue Ops[] = {
2065  SDValue(Result, 0),
2066  BRCOND.getOperand(0)
2067  };
2068 
2069  Result = DAG.getMergeValues(Ops, DL).getNode();
2070  }
2071 
2072  if (BR) {
2073  // Give the branch instruction our target
2074  SDValue Ops[] = {
2075  BR->getOperand(0),
2076  BRCOND.getOperand(2)
2077  };
2078  SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
2079  DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
2080  BR = NewBR.getNode();
2081  }
2082 
2083  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
2084 
2085  // Copy the intrinsic results to registers
2086  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
2088  if (!CopyToReg)
2089  continue;
2090 
2091  Chain = DAG.getCopyToReg(
2092  Chain, DL,
2093  CopyToReg->getOperand(1),
2094  SDValue(Result, i - 1),
2095  SDValue());
2096 
2097  DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
2098  }
2099 
2100  // Remove the old intrinsic from the chain
2102  SDValue(Intr, Intr->getNumValues() - 1),
2103  Intr->getOperand(0));
2104 
2105  return Chain;
2106 }
2107 
2108 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
2109  SDValue Op,
2110  const SDLoc &DL,
2111  EVT VT) const {
2112  return Op.getValueType().bitsLE(VT) ?
2113  DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
2114  DAG.getNode(ISD::FTRUNC, DL, VT, Op);
2115 }
2116 
2117 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
2118  assert(Op.getValueType() == MVT::f16 &&
2119  "Do not know how to custom lower FP_ROUND for non-f16 type");
2120 
2121  SDValue Src = Op.getOperand(0);
2122  EVT SrcVT = Src.getValueType();
2123  if (SrcVT != MVT::f64)
2124  return Op;
2125 
2126  SDLoc DL(Op);
2127 
2128  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
2129  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
2130  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
2131 }
2132 
2133 SDValue SITargetLowering::getSegmentAperture(unsigned AS,
2134  SelectionDAG &DAG) const {
2135  SDLoc SL;
2136  MachineFunction &MF = DAG.getMachineFunction();
2137  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2138  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
2139  assert(UserSGPR != AMDGPU::NoRegister);
2140 
2141  SDValue QueuePtr = CreateLiveInRegister(
2142  DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
2143 
2144  // Offset into amd_queue_t for group_segment_aperture_base_hi /
2145  // private_segment_aperture_base_hi.
2146  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2147 
2148  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
2149  DAG.getConstant(StructOffset, SL, MVT::i64));
2150 
2151  // TODO: Use custom target PseudoSourceValue.
2152  // TODO: We should use the value from the IR intrinsic call, but it might not
2153  // be available and how do we get it?
2156 
2157  MachinePointerInfo PtrInfo(V, StructOffset);
2158  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
2159  MinAlign(64, StructOffset),
2162 }
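// Illustrative sketch (added commentary, not part of the original file): the
// aperture load above reads a 32-bit field at offset 0x40 or 0x44 from the
// queue pointer. The struct and field names here are assumptions for
// illustration; only the offsets come from the code above.
#include <cstddef>
#include <cstdint>
struct QueueApertureSketch {
  uint8_t Unrelated[0x40];
  uint32_t GroupSegmentApertureBaseHi;    // offset 0x40, used for LOCAL_ADDRESS
  uint32_t PrivateSegmentApertureBaseHi;  // offset 0x44, used for PRIVATE_ADDRESS
};
static_assert(offsetof(QueueApertureSketch, PrivateSegmentApertureBaseHi) == 0x44,
              "matches the StructOffset computed above");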
2163 
2164 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
2165  SelectionDAG &DAG) const {
2166  SDLoc SL(Op);
2167  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
2168 
2169  SDValue Src = ASC->getOperand(0);
2170 
2171  // FIXME: Really support non-0 null pointers.
2172  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
2173  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
2174 
2175  // flat -> local/private
2179  SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
2180  SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
2181 
2182  return DAG.getNode(ISD::SELECT, SL, MVT::i32,
2183  NonNull, Ptr, SegmentNullPtr);
2184  }
2185  }
2186 
2187  // local/private -> flat
2191  SDValue NonNull
2192  = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
2193 
2194  SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
2195  SDValue CvtPtr
2196  = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
2197 
2198  return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
2199  DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
2200  FlatNullPtr);
2201  }
2202  }
2203 
2204  // global <-> flat are no-ops and never emitted.
2205 
2206  const MachineFunction &MF = DAG.getMachineFunction();
2207  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2208  *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
2209  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
2210 
2211  return DAG.getUNDEF(ASC->getValueType(0));
2212 }
2213 
2214 bool
2216  // We can fold offsets for anything that doesn't require a GOT relocation.
2217  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
2219  !shouldEmitGOTReloc(GA->getGlobal());
2220 }
2221 
2223  SDLoc DL, unsigned Offset, EVT PtrVT,
2224  unsigned GAFlags = SIInstrInfo::MO_NONE) {
2225  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
2226  // lowered to the following code sequence:
2227  //
2228  // For constant address space:
2229  // s_getpc_b64 s[0:1]
2230  // s_add_u32 s0, s0, $symbol
2231  // s_addc_u32 s1, s1, 0
2232  //
2233  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2234  // a fixup or relocation is emitted to replace $symbol with a literal
2235  // constant, which is a pc-relative offset from the encoding of the $symbol
2236  // operand to the global variable.
2237  //
2238  // For global address space:
2239  // s_getpc_b64 s[0:1]
2240  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2241  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2242  //
2243  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2244  // fixups or relocations are emitted to replace $symbol@*@lo and
2245  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2246  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2247  // operand to the global variable.
2248  //
2249  // What we want here is an offset from the value returned by s_getpc
2250  // (which is the address of the s_add_u32 instruction) to the global
2251  // variable, but since the encoding of $symbol starts 4 bytes after the start
2252  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2253  // small. This requires us to add 4 to the global variable offset in order to
2254  // compute the correct address.
2255  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
2256  GAFlags);
2257  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
2258  GAFlags == SIInstrInfo::MO_NONE ?
2259  GAFlags : GAFlags + 1);
2260  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
2261 }
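// Illustrative sketch (added commentary, not part of the original file): the
// address arithmetic behind the "Offset + 4" above, with made-up addresses.
// The fixup is measured from the $symbol encoding (4 bytes into s_add_u32),
// so biasing the global-address offset by 4 makes the runtime sum land
// exactly on the global variable.
#include <cstdint>
static uint64_t pcrelResolutionSketch(uint64_t GlobalVarAddr,
                                      uint64_t GetPCResult) {
  const uint64_t SymbolEncodingAddr = GetPCResult + 4;             // $symbol literal
  const uint64_t EmittedFixup = (GlobalVarAddr + 4) - SymbolEncodingAddr;
  return GetPCResult + EmittedFixup;                               // == GlobalVarAddr
}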
2262 
2263 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
2264  SDValue Op,
2265  SelectionDAG &DAG) const {
2266  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
2267 
2270  return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
2271 
2272  SDLoc DL(GSD);
2273  const GlobalValue *GV = GSD->getGlobal();
2274  EVT PtrVT = Op.getValueType();
2275 
2276  if (shouldEmitFixup(GV))
2277  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
2278  else if (shouldEmitPCReloc(GV))
2279  return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
2281 
2282  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
2284 
2285  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
2287  const DataLayout &DataLayout = DAG.getDataLayout();
2288  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
2289  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
2290  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
2291 
2292  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
2295 }
2296 
2297 SDValue SITargetLowering::lowerTRAP(SDValue Op,
2298  SelectionDAG &DAG) const {
2299  const MachineFunction &MF = DAG.getMachineFunction();
2301  "trap handler not supported",
2302  Op.getDebugLoc(),
2303  DS_Warning);
2304  DAG.getContext()->diagnose(NoTrap);
2305 
2306  // Emit s_endpgm.
2307 
2308  // FIXME: This should really be selected to s_trap, but that requires
2309  // setting up the trap handler for it to do anything.
2310  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
2311  Op.getOperand(0));
2312 }
2313 
2314 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
2315  const SDLoc &DL, SDValue V) const {
2316  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
2317  // the destination register.
2318  //
2319  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
2320  // so we will end up with redundant moves to m0.
2321  //
2322  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
2323 
2324  // A Null SDValue creates a glue result.
2325  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
2326  V, Chain);
2327  return SDValue(M0, 0);
2328 }
2329 
2330 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
2331  SDValue Op,
2332  MVT VT,
2333  unsigned Offset) const {
2334  SDLoc SL(Op);
2335  SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
2336  DAG.getEntryNode(), Offset, false);
2337  // The local size values will have the hi 16-bits as zero.
2338  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
2339  DAG.getValueType(VT));
2340 }
2341 
2344  "non-hsa intrinsic with hsa target",
2345  DL.getDebugLoc());
2346  DAG.getContext()->diagnose(BadIntrin);
2347  return DAG.getUNDEF(VT);
2348 }
2349 
2352  "intrinsic not supported on subtarget",
2353  DL.getDebugLoc());
2354  DAG.getContext()->diagnose(BadIntrin);
2355  return DAG.getUNDEF(VT);
2356 }
2357 
2358 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2359  SelectionDAG &DAG) const {
2360  MachineFunction &MF = DAG.getMachineFunction();
2361  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
2362  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2363 
2364  EVT VT = Op.getValueType();
2365  SDLoc DL(Op);
2366  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2367 
2368  // TODO: Should this propagate fast-math-flags?
2369 
2370  switch (IntrinsicID) {
2371  case Intrinsic::amdgcn_implicit_buffer_ptr: {
2372  unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
2373  return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
2374  }
2375  case Intrinsic::amdgcn_dispatch_ptr:
2376  case Intrinsic::amdgcn_queue_ptr: {
2377  if (!Subtarget->isAmdCodeObjectV2(MF)) {
2378  DiagnosticInfoUnsupported BadIntrin(
2379  *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
2380  DL.getDebugLoc());
2381  DAG.getContext()->diagnose(BadIntrin);
2382  return DAG.getUNDEF(VT);
2383  }
2384 
2385  auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
2387  return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
2388  TRI->getPreloadedValue(MF, Reg), VT);
2389  }
2390  case Intrinsic::amdgcn_implicitarg_ptr: {
2391  unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
2392  return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
2393  }
2394  case Intrinsic::amdgcn_kernarg_segment_ptr: {
2395  unsigned Reg
2396  = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
2397  return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
2398  }
2399  case Intrinsic::amdgcn_dispatch_id: {
2400  unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
2401  return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
2402  }
2403  case Intrinsic::amdgcn_rcp:
2404  return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
2405  case Intrinsic::amdgcn_rsq:
2406  case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
2407  return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
2408  case Intrinsic::amdgcn_rsq_legacy: {
2410  return emitRemovedIntrinsicError(DAG, DL, VT);
2411 
2412  return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
2413  }
2414  case Intrinsic::amdgcn_rcp_legacy: {
2416  return emitRemovedIntrinsicError(DAG, DL, VT);
2417  return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
2418  }
2419  case Intrinsic::amdgcn_rsq_clamp: {
2421  return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
2422 
2423  Type *Type = VT.getTypeForEVT(*DAG.getContext());
2426 
2427  SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
2428  SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
2429  DAG.getConstantFP(Max, DL, VT));
2430  return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
2431  DAG.getConstantFP(Min, DL, VT));
2432  }
2433  case Intrinsic::r600_read_ngroups_x:
2434  if (Subtarget->isAmdHsaOS())
2435  return emitNonHSAIntrinsicError(DAG, DL, VT);
2436 
2437  return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2439  case Intrinsic::r600_read_ngroups_y:
2440  if (Subtarget->isAmdHsaOS())
2441  return emitNonHSAIntrinsicError(DAG, DL, VT);
2442 
2443  return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2445  case Intrinsic::r600_read_ngroups_z:
2446  if (Subtarget->isAmdHsaOS())
2447  return emitNonHSAIntrinsicError(DAG, DL, VT);
2448 
2449  return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2451  case Intrinsic::r600_read_global_size_x:
2452  if (Subtarget->isAmdHsaOS())
2453  return emitNonHSAIntrinsicError(DAG, DL, VT);
2454 
2455  return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2457  case Intrinsic::r600_read_global_size_y:
2458  if (Subtarget->isAmdHsaOS())
2459  return emitNonHSAIntrinsicError(DAG, DL, VT);
2460 
2461  return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2463  case Intrinsic::r600_read_global_size_z:
2464  if (Subtarget->isAmdHsaOS())
2465  return emitNonHSAIntrinsicError(DAG, DL, VT);
2466 
2467  return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
2469  case Intrinsic::r600_read_local_size_x:
2470  if (Subtarget->isAmdHsaOS())
2471  return emitNonHSAIntrinsicError(DAG, DL, VT);
2472 
2473  return lowerImplicitZextParam(DAG, Op, MVT::i16,
2475  case Intrinsic::r600_read_local_size_y:
2476  if (Subtarget->isAmdHsaOS())
2477  return emitNonHSAIntrinsicError(DAG, DL, VT);
2478 
2479  return lowerImplicitZextParam(DAG, Op, MVT::i16,
2481  case Intrinsic::r600_read_local_size_z:
2482  if (Subtarget->isAmdHsaOS())
2483  return emitNonHSAIntrinsicError(DAG, DL, VT);
2484 
2485  return lowerImplicitZextParam(DAG, Op, MVT::i16,
2487  case Intrinsic::amdgcn_workgroup_id_x:
2488  case Intrinsic::r600_read_tgid_x:
2489  return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
2490  TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
2491  case Intrinsic::amdgcn_workgroup_id_y:
2492  case Intrinsic::r600_read_tgid_y:
2493  return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
2494  TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
2495  case Intrinsic::amdgcn_workgroup_id_z:
2496  case Intrinsic::r600_read_tgid_z:
2497  return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
2498  TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
2499  case Intrinsic::amdgcn_workitem_id_x:
2500  case Intrinsic::r600_read_tidig_x:
2501  return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
2502  TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
2503  case Intrinsic::amdgcn_workitem_id_y:
2504  case Intrinsic::r600_read_tidig_y:
2505  return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
2506  TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
2507  case Intrinsic::amdgcn_workitem_id_z:
2508  case Intrinsic::r600_read_tidig_z:
2509  return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
2510  TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
2511  case AMDGPUIntrinsic::SI_load_const: {
2512  SDValue Ops[] = {
2513  Op.getOperand(1),
2514  Op.getOperand(2)
2515  };
2516 
2521  VT.getStoreSize(), 4);
2523  Op->getVTList(), Ops, VT, MMO);
2524  }
2525  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
2526  return lowerFDIV_FAST(Op, DAG);
2527  }
2528  case AMDGPUIntrinsic::SI_vs_load_input:
2529  return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
2530  Op.getOperand(1),
2531  Op.getOperand(2),
2532  Op.getOperand(3));
2533 
2534  case AMDGPUIntrinsic::SI_fs_constant: {
2535  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
2536  SDValue Glue = M0.getValue(1);
2537  return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
2538  DAG.getConstant(2, DL, MVT::i32), // P0
2539  Op.getOperand(1), Op.getOperand(2), Glue);
2540  }
2541  case AMDGPUIntrinsic::SI_packf16:
2542  if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
2543  return DAG.getUNDEF(MVT::i32);
2544  return Op;
2545  case AMDGPUIntrinsic::SI_fs_interp: {
2546  SDValue IJ = Op.getOperand(4);
2548  DAG.getConstant(0, DL, MVT::i32));
2550  DAG.getConstant(1, DL, MVT::i32));
2551  I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I);
2552  J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J);
2553  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
2554  SDValue Glue = M0.getValue(1);
2555  SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
2557  I, Op.getOperand(1), Op.getOperand(2), Glue);
2558  Glue = SDValue(P1.getNode(), 1);
2559  return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
2560  Op.getOperand(1), Op.getOperand(2), Glue);
2561  }
2562  case Intrinsic::amdgcn_interp_mov: {
2563  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
2564  SDValue Glue = M0.getValue(1);
2565  return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
2566  Op.getOperand(2), Op.getOperand(3), Glue);
2567  }
2568  case Intrinsic::amdgcn_interp_p1: {
2569  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
2570  SDValue Glue = M0.getValue(1);
2571  return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
2572  Op.getOperand(2), Op.getOperand(3), Glue);
2573  }
2574  case Intrinsic::amdgcn_interp_p2: {
2575  SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
2576  SDValue Glue = SDValue(M0.getNode(), 1);
2577  return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
2578  Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
2579  Glue);
2580  }
2581  case Intrinsic::amdgcn_sin:
2582  return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
2583 
2584  case Intrinsic::amdgcn_cos:
2585  return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
2586 
2587  case Intrinsic::amdgcn_log_clamp: {
2589  return SDValue();
2590 
2591  DiagnosticInfoUnsupported BadIntrin(
2592  *MF.getFunction(), "intrinsic not supported on subtarget",
2593  DL.getDebugLoc());
2594  DAG.getContext()->diagnose(BadIntrin);
2595  return DAG.getUNDEF(VT);
2596  }
2597  case Intrinsic::amdgcn_ldexp:
2598  return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
2599  Op.getOperand(1), Op.getOperand(2));
2600 
2601  case Intrinsic::amdgcn_fract:
2602  return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
2603 
2604  case Intrinsic::amdgcn_class:
2605  return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
2606  Op.getOperand(1), Op.getOperand(2));
2607  case Intrinsic::amdgcn_div_fmas:
2608  return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
2609  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
2610  Op.getOperand(4));
2611 
2612  case Intrinsic::amdgcn_div_fixup:
2613  return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
2614  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
2615 
2616  case Intrinsic::amdgcn_trig_preop:
2617  return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
2618  Op.getOperand(1), Op.getOperand(2));
2619  case Intrinsic::amdgcn_div_scale: {
2620  // The 3rd parameter is required to be a constant.
2621  const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
2622  if (!Param)
2623  return DAG.getUNDEF(VT);
2624 
2625  // Translate to the operands expected by the machine instruction. The
2626  // first parameter must be the same as the first instruction.
2627  SDValue Numerator = Op.getOperand(1);
2628  SDValue Denominator = Op.getOperand(2);
2629 
2630  // Note this order is opposite of the machine instruction's operations,
2631  // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
2632  // intrinsic has the numerator as the first operand to match a normal
2633  // division operation.
2634 
2635  SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
2636 
2637  return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
2638  Denominator, Numerator);
2639  }
2640  case Intrinsic::amdgcn_icmp: {
2641  const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
2642  int CondCode = CD->getSExtValue();
2643 
2644  if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
2645  CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
2646  return DAG.getUNDEF(VT);
2647 
2648  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
2649  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
2650  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
2651  Op.getOperand(2), DAG.getCondCode(CCOpcode));
2652  }
2653  case Intrinsic::amdgcn_fcmp: {
2654  const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
2655  int CondCode = CD->getSExtValue();
2656 
2657  if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
2658  CondCode >= FCmpInst::Predicate::FCMP_TRUE)
2659  return DAG.getUNDEF(VT);
2660 
2661  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
2662  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
2663  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
2664  Op.getOperand(2), DAG.getCondCode(CCOpcode));
2665  }
2666  case Intrinsic::amdgcn_fmul_legacy:
2667  return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
2668  Op.getOperand(1), Op.getOperand(2));
2669  case Intrinsic::amdgcn_sffbh:
2670  case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
2671  return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
2672  default:
2673  return AMDGPUTargetLowering::LowerOperation(Op, DAG);
2674  }
2675 }
2676 
2677 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
2678  SelectionDAG &DAG) const {
2679  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
2680  SDLoc DL(Op);
2681  switch (IntrID) {
2682  case Intrinsic::amdgcn_atomic_inc:
2683  case Intrinsic::amdgcn_atomic_dec: {
2684  MemSDNode *M = cast<MemSDNode>(Op);
2685  unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
2687  SDValue Ops[] = {
2688  M->getOperand(0), // Chain
2689  M->getOperand(2), // Ptr
2690  M->getOperand(3) // Value
2691  };
2692 
2693  return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
2694  M->getMemoryVT(), M->getMemOperand());
2695  }
2696  case Intrinsic::amdgcn_buffer_load:
2697  case Intrinsic::amdgcn_buffer_load_format: {
2698  SDValue Ops[] = {
2699  Op.getOperand(0), // Chain
2700  Op.getOperand(2), // rsrc
2701  Op.getOperand(3), // vindex
2702  Op.getOperand(4), // offset
2703  Op.getOperand(5), // glc
2704  Op.getOperand(6) // slc
2705  };
2706  MachineFunction &MF = DAG.getMachineFunction();
2708 
2709  unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
2711  EVT VT = Op.getValueType();
2712  EVT IntVT = VT.changeTypeToInteger();
2713 
2717  VT.getStoreSize(), VT.getStoreSize());
2718 
2719  return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
2720  }
2721  default:
2722  return SDValue();
2723  }
2724 }
2725 
2726 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
2727  SelectionDAG &DAG) const {
2728  MachineFunction &MF = DAG.getMachineFunction();
2729  SDLoc DL(Op);
2730  SDValue Chain = Op.getOperand(0);
2731  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
2732 
2733  switch (IntrinsicID) {
2734  case AMDGPUIntrinsic::SI_sendmsg:
2735  case Intrinsic::amdgcn_s_sendmsg: {
2736  Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
2737  SDValue Glue = Chain.getValue(1);
2738  return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
2739  Op.getOperand(2), Glue);
2740  }
2741  case Intrinsic::amdgcn_s_sendmsghalt: {
2742  Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
2743  SDValue Glue = Chain.getValue(1);
2744  return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain,
2745  Op.getOperand(2), Glue);
2746  }
2747  case AMDGPUIntrinsic::SI_tbuffer_store: {
2748  SDValue Ops[] = {
2749  Chain,
2750  Op.getOperand(2),
2751  Op.getOperand(3),
2752  Op.getOperand(4),
2753  Op.getOperand(5),
2754  Op.getOperand(6),
2755  Op.getOperand(7),
2756  Op.getOperand(8),
2757  Op.getOperand(9),
2758  Op.getOperand(10),
2759  Op.getOperand(11),
2760  Op.getOperand(12),
2761  Op.getOperand(13),
2762  Op.getOperand(14)
2763  };
2764 
2765  EVT VT = Op.getOperand(3).getValueType();
2766 
2770  VT.getStoreSize(), 4);
2772  Op->getVTList(), Ops, VT, MMO);
2773  }
2774  case AMDGPUIntrinsic::AMDGPU_kill: {
2775  SDValue Src = Op.getOperand(2);
2776  if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
2777  if (!K->isNegative())
2778  return Chain;
2779 
2780  SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
2781  return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
2782  }
2783 
2784  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
2785  return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
2786  }
2787  case AMDGPUIntrinsic::SI_export: {
2788  const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
2789  const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
2790  const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
2791  const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
2792  const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
2793 
2794  const SDValue Ops[] = {
2795  Chain,
2796  DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
2797  DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1),
2798  DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
2799  DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
2800  Op.getOperand(7), // src0
2801  Op.getOperand(8), // src1
2802  Op.getOperand(9), // src2
2803  Op.getOperand(10) // src3
2804  };
2805 
2806  unsigned Opc = Done->isNullValue() ?
2808  return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
2809  }
2810  default:
2811  return SDValue();
2812  }
2813 }
2814 
2815 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2816  SDLoc DL(Op);
2817  LoadSDNode *Load = cast<LoadSDNode>(Op);
2818  ISD::LoadExtType ExtType = Load->getExtensionType();
2819  EVT MemVT = Load->getMemoryVT();
2820 
2821  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
2822  // FIXME: Copied from PPC
2823  // First, load into 32 bits, then truncate to 1 bit.
2824 
2825  SDValue Chain = Load->getChain();
2826  SDValue BasePtr = Load->getBasePtr();
2827  MachineMemOperand *MMO = Load->getMemOperand();
2828 
2829  EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
2830 
2831  SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
2832  BasePtr, RealMemVT, MMO);
2833 
2834  SDValue Ops[] = {
2835  DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
2836  NewLD.getValue(1)
2837  };
2838 
2839  return DAG.getMergeValues(Ops, DL);
2840  }
2841 
2842  if (!MemVT.isVector())
2843  return SDValue();
2844 
2846  "Custom lowering for non-i32 vectors hasn't been implemented.");
2847 
2848  unsigned AS = Load->getAddressSpace();
2849  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2850  AS, Load->getAlignment())) {
2851  SDValue Ops[2];
2852  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2853  return DAG.getMergeValues(Ops, DL);
2854  }
2855 
2856  MachineFunction &MF = DAG.getMachineFunction();
2857  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2858  // If there is a possibility that flat instructions access scratch memory
2859  // then we need to use the same legalization rules we use for private.
2860  if (AS == AMDGPUAS::FLAT_ADDRESS)
2861  AS = MFI->hasFlatScratchInit() ?
2863 
2864  unsigned NumElements = MemVT.getVectorNumElements();
2865  switch (AS) {
2867  if (isMemOpUniform(Load))
2868  return SDValue();
2869  // Non-uniform loads will be selected to MUBUF instructions, so they
2870  // have the same legalization requirements as global and private
2871  // loads.
2872  //
2874  case AMDGPUAS::GLOBAL_ADDRESS: {
2877  return SDValue();
2878  // Non-uniform loads will be selected to MUBUF instructions, so they
2879  // have the same legalization requirements as global and private
2880  // loads.
2881  //
2882  }
2885  if (NumElements > 4)
2886  return SplitVectorLoad(Op, DAG);
2887  // v4 loads are supported for private and global memory.
2888  return SDValue();
2890  // Depending on the setting of the private_element_size field in the
2891  // resource descriptor, we can only make private accesses up to a certain
2892  // size.
2893  switch (Subtarget->getMaxPrivateElementSize()) {
2894  case 4:
2895  return scalarizeVectorLoad(Load, DAG);
2896  case 8:
2897  if (NumElements > 2)
2898  return SplitVectorLoad(Op, DAG);
2899  return SDValue();
2900  case 16:
2901  // Same as global/flat
2902  if (NumElements > 4)
2903  return SplitVectorLoad(Op, DAG);
2904  return SDValue();
2905  default:
2906  llvm_unreachable("unsupported private_element_size");
2907  }
2908  }
2909  case AMDGPUAS::LOCAL_ADDRESS: {
2910  if (NumElements > 2)
2911  return SplitVectorLoad(Op, DAG);
2912 
2913  if (NumElements == 2)
2914  return SDValue();
2915 
2916  // If properly aligned, splitting may let us use ds_read_b64.
2917  return SplitVectorLoad(Op, DAG);
2918  }
2919  default:
2920  return SDValue();
2921  }
2922 }
2923 
2924 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2925  if (Op.getValueType() != MVT::i64)
2926  return SDValue();
2927 
2928  SDLoc DL(Op);
2929  SDValue Cond = Op.getOperand(0);
2930 
2931  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2932  SDValue One = DAG.getConstant(1, DL, MVT::i32);
2933 
2934  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
2935  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
2936 
2937  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
2938  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
2939 
2940  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
2941 
2942  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
2943  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
2944 
2945  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
2946 
2947  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
2948  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
2949 }
2950 
2951 // Catch division cases where we can use shortcuts with rcp and rsq
2952 // instructions.
2953 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
2954  SelectionDAG &DAG) const {
2955  SDLoc SL(Op);
2956  SDValue LHS = Op.getOperand(0);
2957  SDValue RHS = Op.getOperand(1);
2958  EVT VT = Op.getValueType();
2959  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
2960 
2961  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
2962  if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
2963  VT == MVT::f16) {
2964  if (CLHS->isExactlyValue(1.0)) {
2965  // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
2966  // the CI documentation, have a worst-case error of 1 ulp.
2967  // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
2968  // use it as long as we aren't trying to use denormals.
2969  //
2970  // v_rcp_f16 and v_rsq_f16 DO support denormals.
2971 
2972  // 1.0 / sqrt(x) -> rsq(x)
2973 
2974  // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
2975  // error seems really high at 2^29 ULP.
2976  if (RHS.getOpcode() == ISD::FSQRT)
2977  return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
2978 
2979  // 1.0 / x -> rcp(x)
2980  return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
2981  }
2982 
2983  // Same as for 1.0, but expand the sign out of the constant.
2984  if (CLHS->isExactlyValue(-1.0)) {
2985  // -1.0 / x -> rcp (fneg x)
2986  SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2987  return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
2988  }
2989  }
2990  }
2991 
2992  const SDNodeFlags *Flags = Op->getFlags();
2993 
2994  if (Unsafe || Flags->hasAllowReciprocal()) {
2995  // Turn into multiply by the reciprocal.
2996  // x / y -> x * (1.0 / y)
2998  Flags.setUnsafeAlgebra(true);
2999  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
3000  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
3001  }
3002 
3003  return SDValue();
3004 }
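// Illustrative sketch (added commentary, not part of the original file): the
// fast-math shortcuts selected above, written as plain host code. rcpSketch
// stands in for the ~1 ulp v_rcp_f32 hardware approximation, and the
// 1.0 / sqrt(x) case (which becomes rsq(x) above) is folded into the same
// pattern for brevity.
#include <cmath>
static float rcpSketch(float X) { return 1.0f / X; }
static float fastUnsafeFDivSketch(float LHS, float RHS) {
  if (LHS == 1.0f)
    return rcpSketch(RHS);        // 1.0 / x -> rcp(x); 1.0 / sqrt(x) -> rsq(x)
  if (LHS == -1.0f)
    return rcpSketch(-RHS);       // -1.0 / x -> rcp(fneg x)
  return LHS * rcpSketch(RHS);    // x / y -> x * (1.0 / y)
}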
3005 
3006 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
3007  EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
3008  if (GlueChain->getNumValues() <= 1) {
3009  return DAG.getNode(Opcode, SL, VT, A, B);
3010  }
3011 
3012  assert(GlueChain->getNumValues() == 3);
3013 
3014  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
3015  switch (Opcode) {
3016  default: llvm_unreachable("no chain equivalent for opcode");
3017  case ISD::FMUL:
3018  Opcode = AMDGPUISD::FMUL_W_CHAIN;
3019  break;
3020  }
3021 
3022  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
3023  GlueChain.getValue(2));
3024 }
3025 
3026 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
3027  EVT VT, SDValue A, SDValue B, SDValue C,
3028  SDValue GlueChain) {
3029  if (GlueChain->getNumValues() <= 1) {
3030  return DAG.getNode(Opcode, SL, VT, A, B, C);
3031  }
3032 
3033  assert(GlueChain->getNumValues() == 3);
3034 
3035  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
3036  switch (Opcode) {
3037  default: llvm_unreachable("no chain equivalent for opcode");
3038  case ISD::FMA:
3039  Opcode = AMDGPUISD::FMA_W_CHAIN;
3040  break;
3041  }
3042 
3043  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
3044  GlueChain.getValue(2));
3045 }
3046 
3047 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
3048  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
3049  return FastLowered;
3050 
3051  SDLoc SL(Op);
3052  SDValue Src0 = Op.getOperand(0);
3053  SDValue Src1 = Op.getOperand(1);
3054 
3055  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3056  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3057 
3058  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
3059  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
3060 
3061  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
3062  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
3063 
3064  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
3065 }
3066 
3067 // Faster 2.5 ULP division that does not support denormals.
3068 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
3069  SDLoc SL(Op);
3070  SDValue LHS = Op.getOperand(1);
3071  SDValue RHS = Op.getOperand(2);
3072 
3073  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
3074 
3075  const APFloat K0Val(BitsToFloat(0x6f800000));
3076  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
3077 
3078  const APFloat K1Val(BitsToFloat(0x2f800000));
3079  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
3080 
3081  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
3082 
3083  EVT SetCCVT =
3085 
3086  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
3087 
3088  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
3089 
3090  // TODO: Should this propagate fast-math-flags?
3091  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
3092 
3093  // rcp does not support denormals.
3094  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
3095 
3096  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
3097 
3098  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
3099 }
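// Illustrative sketch (added commentary, not part of the original file): the
// scaling trick implemented above, as plain host code. When |RHS| is very
// large (above the 0x6f800000 threshold, about 2^96), it is pre-scaled by
// 2^-32 (0x2f800000) so the reciprocal stays in range, and the final product
// is multiplied by the same factor to compensate.
#include <cmath>
#include <cstdint>
#include <cstring>
static float bitsToFloatSketch(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}
static float fdivFastSketch(float LHS, float RHS) {
  const float K0 = bitsToFloatSketch(0x6f800000u); // "denominator is huge" threshold
  const float K1 = bitsToFloatSketch(0x2f800000u); // 2^-32 scale factor
  const float Scale = (std::fabs(RHS) > K0) ? K1 : 1.0f;
  const float Rcp = 1.0f / (RHS * Scale);          // stands in for v_rcp_f32
  return Scale * (LHS * Rcp);                      // undo the scale on the result
}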
3100 
3101 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
3102  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
3103  return FastLowered;
3104 
3105  SDLoc SL(Op);
3106  SDValue LHS = Op.getOperand(0);
3107  SDValue RHS = Op.getOperand(1);
3108 
3109  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
3110 
3111  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
3112 
3113  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
3114  RHS, RHS, LHS);
3115  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
3116  LHS, RHS, LHS);
3117 
3118  // Denominator is scaled to not be denormal, so using rcp is ok.
3119  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
3120  DenominatorScaled);
3121  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
3122  DenominatorScaled);
3123 
3124  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
3127 
3128  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
3129 
3130  if (!Subtarget->hasFP32Denormals()) {
3131  SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
3132  const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
3133  SL, MVT::i32);
3134  SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
3135  DAG.getEntryNode(),
3136  EnableDenormValue, BitField);
3137  SDValue Ops[3] = {
3138  NegDivScale0,
3139  EnableDenorm.getValue(0),
3140  EnableDenorm.getValue(1)
3141  };
3142 
3143  NegDivScale0 = DAG.getMergeValues(Ops, SL);
3144  }
3145 
3146  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
3147  ApproxRcp, One, NegDivScale0);
3148 
3149  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
3150  ApproxRcp, Fma0);
3151 
3152  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
3153  Fma1, Fma1);
3154 
3155  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
3156  NumeratorScaled, Mul);
3157 
3158  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
3159 
3160  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
3161  NumeratorScaled, Fma3);
3162 
3163  if (!Subtarget->hasFP32Denormals()) {
3164  const SDValue DisableDenormValue =
3166  SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
3167  Fma4.getValue(1),
3168  DisableDenormValue,
3169  BitField,
3170  Fma4.getValue(2));
3171 
3172  SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
3173  DisableDenorm, DAG.getRoot());
3174  DAG.setRoot(OutputChain);
3175  }
3176 
3177  SDValue Scale = NumeratorScaled.getValue(1);
3178  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
3179  Fma4, Fma1, Fma3, Scale);
3180 
3181  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
3182 }
3183 
3184 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
3185  if (DAG.getTarget().Options.UnsafeFPMath)
3186  return lowerFastUnsafeFDIV(Op, DAG);
3187 
3188  SDLoc SL(Op);
3189  SDValue X = Op.getOperand(0);
3190  SDValue Y = Op.getOperand(1);
3191 
3192  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
3193 
3194  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
3195 
3196  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
3197 
3198  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
3199 
3200  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
3201 
3202  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
3203 
3204  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
3205 
3206  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
3207 
3208  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
3209 
3210  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
3211  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
3212 
3213  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
3214  NegDivScale0, Mul, DivScale1);
3215 
3216  SDValue Scale;
3217 
3219  // Workaround a hardware bug on SI where the condition output from div_scale
3220  // is not usable.
3221 
3222  const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
3223 
3224  // Figure out which scale to use for div_fmas.
3225  SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
3226  SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
3227  SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
3228  SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
3229 
3230  SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
3231  SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
3232 
3233  SDValue Scale0Hi
3234  = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
3235  SDValue Scale1Hi
3236  = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
3237 
3238  SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
3239  SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
3240  Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
3241  } else {
3242  Scale = DivScale1.getValue(1);
3243  }
3244 
3245  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
3246  Fma4, Fma3, Mul, Scale);
3247 
3248  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
3249 }
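// Illustrative sketch (added commentary, not part of the original file): the
// fma chain above is a Newton-Raphson refinement of an approximate 1/Y,
// followed by a residual correction of the quotient. div_scale, div_fixup and
// the SI condition-output workaround handle exponent scaling and special
// cases and are omitted here.
#include <cmath>
static double fdiv64RefinementSketch(double X, double Y) {
  double R = 1.0 / Y;                  // stands in for the approximate v_rcp_f64
  double E = std::fma(-Y, R, 1.0);     // Fma0: residual 1 - Y*R
  R = std::fma(R, E, R);               // Fma1: first refinement
  E = std::fma(-Y, R, 1.0);            // Fma2: new residual
  R = std::fma(R, E, R);               // Fma3: second refinement
  double Q = X * R;                    // Mul: initial quotient
  double Rem = std::fma(-Y, Q, X);     // Fma4: remainder X - Y*Q
  return std::fma(Rem, R, Q);          // div_fmas role: final correction
}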
3250 
3251 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
3252  EVT VT = Op.getValueType();
3253 
3254  if (VT == MVT::f32)
3255  return LowerFDIV32(Op, DAG);
3256 
3257  if (VT == MVT::f64)
3258  return LowerFDIV64(Op, DAG);
3259 
3260  if (VT == MVT::f16)
3261  return LowerFDIV16(Op, DAG);
3262 
3263  llvm_unreachable("Unexpected type for fdiv");
3264 }
3265 
3266 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3267  SDLoc DL(Op);
3268  StoreSDNode *Store = cast<StoreSDNode>(Op);
3269  EVT VT = Store->getMemoryVT();
3270 
3271  if (VT == MVT::i1) {
3272  return DAG.getTruncStore(Store->getChain(), DL,
3273  DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
3274  Store->getBasePtr(), MVT::i1, Store->getMemOperand());
3275  }
3276 
3277  assert(VT.isVector() &&
3278  Store->getValue().getValueType().getScalarType() == MVT::i32);
3279 
3280  unsigned AS = Store->getAddressSpace();
3281  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
3282  AS, Store->getAlignment())) {
3283  return expandUnalignedStore(Store, DAG);
3284  }
3285 
3286  MachineFunction &MF = DAG.getMachineFunction();
3287  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3288  // If there is a possibility that flat instructions access scratch memory
3289  // then we need to use the same legalization rules we use for private.
3290  if (AS == AMDGPUAS::FLAT_ADDRESS)
3291  AS = MFI->hasFlatScratchInit() ?
3293 
3294  unsigned NumElements = VT.getVectorNumElements();
3295  switch (AS) {
3298  if (NumElements > 4)
3299  return SplitVectorStore(Op, DAG);
3300  return SDValue();
3302  switch (Subtarget->getMaxPrivateElementSize()) {
3303  case 4:
3304  return scalarizeVectorStore(Store, DAG);
3305  case 8:
3306  if (NumElements > 2)
3307  return SplitVectorStore(Op, DAG);
3308  return SDValue();
3309  case 16:
3310  if (NumElements > 4)
3311  return SplitVectorStore(Op, DAG);
3312  return SDValue();
3313  default:
3314  llvm_unreachable("unsupported private_element_size");
3315  }
3316  }
3317  case AMDGPUAS::LOCAL_ADDRESS: {
3318  if (NumElements > 2)
3319  return SplitVectorStore(Op, DAG);
3320 
3321  if (NumElements == 2)
3322  return Op;
3323 
3324  // If properly aligned, splitting may let us use ds_write_b64.
3325  return SplitVectorStore(Op, DAG);
3326  }
3327  default:
3328  llvm_unreachable("unhandled address space");
3329  }
3330 }
3331 
3332 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
3333  SDLoc DL(Op);
3334  EVT VT = Op.getValueType();
3335  SDValue Arg = Op.getOperand(0);
3336  // TODO: Should this propagate fast-math-flags?
3337  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
3338  DAG.getNode(ISD::FMUL, DL, VT, Arg,
3339  DAG.getConstantFP(0.5/M_PI, DL,
3340  VT)));
3341 
3342  switch (Op.getOpcode()) {
3343  case ISD::FCOS:
3344  return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
3345  case ISD::FSIN:
3346  return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
3347  default:
3348  llvm_unreachable("Wrong trig opcode");
3349  }
3350 }
3351 
3352 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
3353  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
3354  assert(AtomicNode->isCompareAndSwap());
3355  unsigned AS = AtomicNode->getAddressSpace();
3356 
3357  // No custom lowering required for local address space
3358  if (!isFlatGlobalAddrSpace(AS))
3359  return Op;
3360 
3361  // Non-local address space requires custom lowering for atomic compare
3362  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
3363  SDLoc DL(Op);
3364  SDValue ChainIn = Op.getOperand(0);
3365  SDValue Addr = Op.getOperand(1);
3366  SDValue Old = Op.getOperand(2);
3367  SDValue New = Op.getOperand(3);
3368  EVT VT = Op.getValueType();
3369  MVT SimpleVT = VT.getSimpleVT();
3370  MVT VecType = MVT::getVectorVT(SimpleVT, 2);
3371 
3372  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
3373  SDValue Ops[] = { ChainIn, Addr, NewOld };
3374 
3376  Ops, VT, AtomicNode->getMemOperand());
3377 }
3378 
3379 //===----------------------------------------------------------------------===//
3380 // Custom DAG optimizations
3381 //===----------------------------------------------------------------------===//
3382 
3383 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
3384  DAGCombinerInfo &DCI) const {
3385  EVT VT = N->getValueType(0);
3386  EVT ScalarVT = VT.getScalarType();
3387  if (ScalarVT != MVT::f32)
3388  return SDValue();
3389 
3390  SelectionDAG &DAG = DCI.DAG;
3391  SDLoc DL(N);
3392 
3393  SDValue Src = N->getOperand(0);
3394  EVT SrcVT = Src.getValueType();
3395 
3396  // TODO: We could try to match extracting the higher bytes, which would be
3397  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
3398  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
3399  // about in practice.
3400  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
3401  if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
3402  SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
3403  DCI.AddToWorklist(Cvt.getNode());
3404  return Cvt;
3405  }
3406  }
3407 
3408  return SDValue();
3409 }
3410 
3411 /// \brief Return true if the given offset Size in bytes can be folded into
3412 /// the immediate offsets of a memory instruction for the given address space.
3413 static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
3414  const SISubtarget &STI) {
3415  switch (AS) {
3416  case AMDGPUAS::GLOBAL_ADDRESS: {
3417  // MUBUF instructions have a 12-bit offset in bytes.
3418  return isUInt<12>(OffsetSize);
3419  }
3421  // SMRD instructions have an 8-bit offset in dwords on SI and
3422  // a 20-bit offset in bytes on VI.
3424  return isUInt<20>(OffsetSize);
3425  else
3426  return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
3427  }
3429  case AMDGPUAS::REGION_ADDRESS: {
3430  // The single offset versions have a 16-bit offset in bytes.
3431  return isUInt<16>(OffsetSize);
3432  }
3434  // Indirect register addressing does not use any offsets.
3435  default:
3436  return 0;
3437  }
3438 }
3439 
3440 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
3441 
3442 // This is a variant of
3443 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
3444 //
3445 // The normal DAG combiner will do this, but only if the add has one use since
3446 // that would increase the number of instructions.
3447 //
3448 // This prevents us from seeing a constant offset that can be folded into a
3449 // memory instruction's addressing mode. If we know the resulting add offset of
3450 // a pointer can be folded into an addressing offset, we can replace the pointer
3451 // operand with the add of new constant offset. This eliminates one of the uses,
3452 // and may allow the remaining use to also be simplified.
3453 //
3454 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
3455  unsigned AddrSpace,
3456  DAGCombinerInfo &DCI) const {
3457  SDValue N0 = N->getOperand(0);
3458  SDValue N1 = N->getOperand(1);
3459 
3460  if (N0.getOpcode() != ISD::ADD)
3461  return SDValue();
3462 
3463  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
3464  if (!CN1)
3465  return SDValue();
3466 
3467  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
3468  if (!CAdd)
3469  return SDValue();
3470 
3471  // If the resulting offset is too large, we can't fold it into the addressing
3472  // mode offset.
3473  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
3474  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
3475  return SDValue();
3476 
3477  SelectionDAG &DAG = DCI.DAG;
3478  SDLoc SL(N);
3479  EVT VT = N->getValueType(0);
3480 
3481  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
3482  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
3483 
3484  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
3485 }
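// Illustrative sketch (added commentary, not part of the original file): the
// algebra behind the combine above, checked with made-up constants. The
// rewritten constant must still fit the addressing-mode immediate (e.g. the
// 12-bit MUBUF offset) for the fold to be worthwhile.
#include <cstdint>
static constexpr uint64_t shiftedOffsetSketch(uint64_t C1, unsigned C2) {
  return C1 << C2; // (x + c1) << c2 == (x << c2) + (c1 << c2)
}
static_assert(shiftedOffsetSketch(16, 2) == 64, "algebraic identity");
static_assert(shiftedOffsetSketch(16, 2) < (1u << 12),
              "64 still fits a 12-bit MUBUF offset");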
3486 
3487 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
3488  DAGCombinerInfo &DCI) const {
3489  SDValue Ptr = N->getBasePtr();
3490  SelectionDAG &DAG = DCI.DAG;
3491  SDLoc SL(N);
3492 
3493  // TODO: We could also do this for multiplies.
3494  unsigned AS = N->getAddressSpace();
3495  if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
3496  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
3497  if (NewPtr) {
3498  SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
3499 
3500  NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
3501  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
3502  }
3503  }
3504 
3505  return SDValue();
3506 }
3507 
3508 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
3509  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
3510  (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
3511  (Opc == ISD::XOR && Val == 0);
3512 }
3513 
3514 // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor ops. This
3515 // will typically happen anyway for a VALU 64-bit AND. This exposes other 32-bit
3516 // integer combine opportunities since most 64-bit operations are decomposed
3517 // this way. TODO: We won't want this for SALU especially if it is an inline
3518 // immediate.
3519 SDValue SITargetLowering::splitBinaryBitConstantOp(
3520  DAGCombinerInfo &DCI,
3521  const SDLoc &SL,
3522  unsigned Opc, SDValue LHS,
3523  const ConstantSDNode *CRHS) const {
3524  uint64_t Val = CRHS->getZExtValue();
3525  uint32_t ValLo = Lo_32(Val);
3526  uint32_t ValHi = Hi_32(Val);
3527  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3528 
3529  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
3530  bitOpWithConstantIsReducible(Opc, ValHi)) ||
3531  (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
3532  // If we need to materialize a 64-bit immediate, it will be split up later
3533  // anyway. Avoid creating the harder to understand 64-bit immediate
3534  // materialization.
3535  return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
3536  }
3537 
3538  return SDValue();
3539 }
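// Illustrative sketch (added commentary, not part of the original file): why
// splitting a 64-bit AND with a constant often shrinks to a single 32-bit op.
// If one half of the constant is all-zeros (or all-ones for AND/OR), that
// half of the result needs no instruction at all.
#include <cstdint>
static constexpr uint32_t lo32Sketch(uint64_t V) { return static_cast<uint32_t>(V); }
static constexpr uint32_t hi32Sketch(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
static_assert(lo32Sketch(0xffff000000000000ull) == 0,
              "low half of the AND folds to zero");
static_assert(hi32Sketch(0xffff000000000000ull) == 0xffff0000u,
              "only the high half still needs a 32-bit AND");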
3540 
3541 SDValue SITargetLowering::performAndCombine(SDNode *N,
3542  DAGCombinerInfo &DCI) const {
3543  if (DCI.isBeforeLegalize())
3544  return SDValue();
3545 
3546  SelectionDAG &DAG = DCI.DAG;
3547  EVT VT = N->getValueType(0);
3548  SDValue LHS = N->getOperand(0);
3549  SDValue RHS = N->getOperand(1);
3550 
3551 
3552  if (VT == MVT::i64) {
3553  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
3554  if (CRHS) {
3555  if (SDValue Split
3556  = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
3557  return Split;
3558  }
3559  }
3560 
3561  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
3562  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
3563  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
3564  ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
3565  ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
3566 
3567  SDValue X = LHS.getOperand(0);
3568  SDValue Y = RHS.getOperand(0);
3569  if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
3570  return SDValue();
3571 
3572  if (LCC == ISD::SETO) {
3573  if (X != LHS.getOperand(1))
3574  return SDValue();
3575 
3576  if (RCC == ISD::SETUNE) {
3577  const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
3578  if (!C1 || !C1->isInfinity() || C1->isNegative())
3579  return SDValue();
3580 
3581  const uint32_t Mask = SIInstrFlags::N_ZERO |
3582  SIInstrFlags::P_ZERO |
3583  SIInstrFlags::N_NORMAL |
3584  SIInstrFlags::P_NORMAL |
3585  SIInstrFlags::N_SUBNORMAL |
3586  SIInstrFlags::P_SUBNORMAL;
3587 
3588  static_assert(((~(SIInstrFlags::S_NAN |
3589  SIInstrFlags::Q_NAN |
3590  SIInstrFlags::N_INFINITY |
3591  SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
3592  "mask not equal");
3593 
3594  SDLoc DL(N);
3595  return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
3596  X, DAG.getConstant(Mask, DL, MVT::i32));
3597  }
3598  }
3599  }
3600 
3601  return SDValue();
3602 }
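// Editor's note: the (fcmp ord x, x) && (fcmp une (fabs x), inf) pattern matched
// above is typically what an isfinite(x)-style check lowers to, so it can be
// selected as a single v_cmp_class with the non-NaN, non-infinity class bits set
// (the Mask computed above).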
3603 
3604 SDValue SITargetLowering::performOrCombine(SDNode *N,
3605  DAGCombinerInfo &DCI) const {
3606  SelectionDAG &DAG = DCI.DAG;
3607  SDValue LHS = N->getOperand(0);
3608  SDValue RHS = N->getOperand(1);
3609 
3610  EVT VT = N->getValueType(0);
3611  if (VT == MVT::i1) {
3612  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
3613  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
3614  RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
3615  SDValue Src = LHS.getOperand(0);
3616  if (Src != RHS.getOperand(0))
3617  return SDValue();
3618 
3619  const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
3620  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
3621  if (!CLHS || !CRHS)
3622  return SDValue();
3623 
3624  // Only 10 bits are used.
3625  static const uint32_t MaxMask = 0x3ff;
3626 
3627  uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
3628  SDLoc DL(N);
3629  return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
3630  Src, DAG.getConstant(NewMask, DL, MVT::i32));
3631  }
3632 
3633  return SDValue();
3634  }
3635 
3636  if (VT != MVT::i64)
3637  return SDValue();
3638 
3639  // TODO: This could be a generic combine with a predicate for extracting the
3640  // high half of an integer being free.
3641 
3642  // (or i64:x, (zero_extend i32:y)) ->
3643  // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
3644  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
3645  RHS.getOpcode() != ISD::ZERO_EXTEND)
3646  std::swap(LHS, RHS);
3647 
3648  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
3649  SDValue ExtSrc = RHS.getOperand(0);
3650  EVT SrcVT = ExtSrc.getValueType();
3651  if (SrcVT == MVT::i32) {
3652  SDLoc SL(N);
3653  SDValue LowLHS, HiBits;
3654  std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
3655  SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
3656 
3657  DCI.AddToWorklist(LowOr.getNode());
3658  DCI.AddToWorklist(HiBits.getNode());
3659 
3660  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3661  LowOr, HiBits);
3662  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3663  }
3664  }
3665 
3666  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3667  if (CRHS) {
3668  if (SDValue Split
3669  = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
3670  return Split;
3671  }
3672 
3673  return SDValue();
3674 }
3675 
3676 SDValue SITargetLowering::performXorCombine(SDNode *N,
3677  DAGCombinerInfo &DCI) const {
3678  EVT VT = N->getValueType(0);
3679  if (VT != MVT::i64)
3680  return SDValue();
3681 
3682  SDValue LHS = N->getOperand(0);
3683  SDValue RHS = N->getOperand(1);
3684 
3685  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
3686  if (CRHS) {
3687  if (SDValue Split
3688  = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
3689  return Split;
3690  }
3691 
3692  return SDValue();
3693 }
3694 
3695 SDValue SITargetLowering::performClassCombine(SDNode *N,
3696  DAGCombinerInfo &DCI) const {
3697  SelectionDAG &DAG = DCI.DAG;
3698  SDValue Mask = N->getOperand(1);
3699 
3700  // fp_class x, 0 -> false
3701  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
3702  if (CMask->isNullValue())
3703  return DAG.getConstant(0, SDLoc(N), MVT::i1);
3704  }
3705 
3706  if (N->getOperand(0).isUndef())
3707  return DAG.getUNDEF(MVT::i1);
3708 
3709  return SDValue();
3710 }
3711 
3712 // Constant fold canonicalize.
3713 SDValue SITargetLowering::performFCanonicalizeCombine(
3714  SDNode *N,
3715  DAGCombinerInfo &DCI) const {
3716  ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3717  if (!CFP)
3718  return SDValue();
3719 
3720  SelectionDAG &DAG = DCI.DAG;
3721  const APFloat &C = CFP->getValueAPF();
3722 
3723  // Flush denormals to 0 if not enabled.
3724  if (C.isDenormal()) {
3725  EVT VT = N->getValueType(0);
3726  if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
3727  return DAG.getConstantFP(0.0, SDLoc(N), VT);
3728 
3729  if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
3730  return DAG.getConstantFP(0.0, SDLoc(N), VT);
3731 
3732  if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
3733  return DAG.getConstantFP(0.0, SDLoc(N), VT);
3734  }
3735 
3736  if (C.isNaN()) {
3737  EVT VT = N->getValueType(0);
3738  APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
3739  if (C.isSignaling()) {
3740  // Quiet a signaling NaN.
3741  return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
3742  }
3743 
3744  // Make sure it is the canonical NaN bitpattern.
3745  //
3746  // TODO: Can we use -1 as the canonical NaN value since it's an inline
3747  // immediate?
3748  if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
3749  return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
3750  }
3751 
3752  return SDValue(CFP, 0);
3753 }
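// Editor's note (illustrative sketch, not part of the original source): example
// f32 constant folds with denormals disabled:
//   fcanonicalize 0x00000001 (smallest denormal)  -> 0.0
//   fcanonicalize 0x7F800001 (signaling NaN)      -> 0x7FC00000 (canonical qNaN)
//   fcanonicalize 0x7FC00001 (non-canonical qNaN) -> 0x7FC00000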
3754 
3755 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
3756  switch (Opc) {
3757  case ISD::FMAXNUM:
3758  return AMDGPUISD::FMAX3;
3759  case ISD::SMAX:
3760  return AMDGPUISD::SMAX3;
3761  case ISD::UMAX:
3762  return AMDGPUISD::UMAX3;
3763  case ISD::FMINNUM:
3764  return AMDGPUISD::FMIN3;
3765  case ISD::SMIN:
3766  return AMDGPUISD::SMIN3;
3767  case ISD::UMIN:
3768  return AMDGPUISD::UMIN3;
3769  default:
3770  llvm_unreachable("Not a min/max opcode");
3771  }
3772 }
3773 
3774 static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
3775  SDValue Op0, SDValue Op1, bool Signed) {
3776  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
3777  if (!K1)
3778  return SDValue();
3779 
3780  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
3781  if (!K0)
3782  return SDValue();
3783 
3784  if (Signed) {
3785  if (K0->getAPIntValue().sge(K1->getAPIntValue()))
3786  return SDValue();
3787  } else {
3788  if (K0->getAPIntValue().uge(K1->getAPIntValue()))
3789  return SDValue();
3790  }
3791 
3792  EVT VT = K0->getValueType(0);
3793 
3794  MVT NVT = MVT::i32;
3795  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3796 
3797  SDValue Tmp1, Tmp2, Tmp3;
3798  Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
3799  Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
3800  Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
3801 
3802  if (VT == MVT::i16) {
3803  Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
3804  Tmp1, Tmp2, Tmp3);
3805 
3806  return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
3807  } else
3808  return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
3809  Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
3810 }
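// Editor's note (illustrative sketch, not part of the original source): a signed
// clamp such as
//   smin (smax %x, -5), 17
// satisfies K0 (-5) < K1 (17) and becomes
//   smed3 %x, -5, 17
// For i16 operands the three inputs are first extended to i32, the med3 is done
// in 32 bits, and the result is truncated back to i16.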
3811 
3812 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
3813  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
3814  return true;
3815 
3816  return DAG.isKnownNeverNaN(Op);
3817 }
3818 
3819 static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
3820  SDValue Op0, SDValue Op1) {
3821  ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
3822  if (!K1)
3823  return SDValue();
3824 
3825  ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
3826  if (!K0)
3827  return SDValue();
3828 
3829  // Ordered >= (although NaN inputs should have folded away by now).
3830  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
3831  if (Cmp == APFloat::cmpGreaterThan)
3832  return SDValue();
3833 
3834  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
3835  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
3836  // give the other result, which is different from med3 with a NaN input.
3837  SDValue Var = Op0.getOperand(0);
3838  if (!isKnownNeverSNan(DAG, Var))
3839  return SDValue();
3840 
3841  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
3842  Var, SDValue(K0, 0), SDValue(K1, 0));
3843 }
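// Editor's note (illustrative sketch, not part of the original source): a float
// clamp of the form
//   fminnum (fmaxnum %x, 1.0), 4.0
// becomes
//   fmed3 %x, 1.0, 4.0
// provided K0 <= K1 and %x is known not to be a signaling NaN (see the comment
// above about IEEE min/max quieting sNaNs).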
3844 
3845 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
3846  DAGCombinerInfo &DCI) const {
3847  SelectionDAG &DAG = DCI.DAG;
3848 
3849  unsigned Opc = N->getOpcode();
3850  SDValue Op0 = N->getOperand(0);
3851  SDValue Op1 = N->getOperand(1);
3852 
3853  // Only do this if the inner op has one use, since this will just increase
3854  // register pressure for no benefit.
3855 
3856  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
3857  // max(max(a, b), c) -> max3(a, b, c)
3858  // min(min(a, b), c) -> min3(a, b, c)
3859  if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
3860  SDLoc DL(N);
3861  return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
3862  DL,
3863  N->getValueType(0),
3864  Op0.getOperand(0),
3865  Op0.getOperand(1),
3866  Op1);
3867  }
3868 
3869  // Try commuted.
3870  // max(a, max(b, c)) -> max3(a, b, c)
3871  // min(a, min(b, c)) -> min3(a, b, c)
3872  if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
3873  SDLoc DL(N);
3874  return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
3875  DL,
3876  N->getValueType(0),
3877  Op0,
3878  Op1.getOperand(0),
3879  Op1.getOperand(1));
3880  }
3881  }
3882 
3883  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
3884  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
3885  if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
3886  return Med3;
3887  }
3888 
3889  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
3890  if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
3891  return Med3;
3892  }
3893 
3894  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
3895  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
3896  (Opc == AMDGPUISD::FMIN_LEGACY &&
3897  Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
3898  N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
3899  if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
3900  return Res;
3901  }
3902 
3903  return SDValue();
3904 }
3905 
3906 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
3907  const SDNode *N0,
3908  const SDNode *N1) const {
3909  EVT VT = N0->getValueType(0);
3910 
3911  // Only do this if we are not trying to support denormals. v_mad_f32 does not
3912  // support denormals ever.
3913  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
3914  (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
3915  return ISD::FMAD;
3916 
3917  const TargetOptions &Options = DAG.getTarget().Options;
3918  if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
3919  Options.UnsafeFPMath ||
3920  (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&
3921  cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) &&
3922  isFMAFasterThanFMulAndFAdd(VT)) {
3923  return ISD::FMA;
3924  }
3925 
3926  return 0;
3927 }
3928 
3929 SDValue SITargetLowering::performFAddCombine(SDNode *N,
3930  DAGCombinerInfo &DCI) const {
3931  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3932  return SDValue();
3933 
3934  SelectionDAG &DAG = DCI.DAG;
3935  EVT VT = N->getValueType(0);
3936  assert(!VT.isVector());
3937 
3938  SDLoc SL(N);
3939  SDValue LHS = N->getOperand(0);
3940  SDValue RHS = N->getOperand(1);
3941 
3942  // These should really be instruction patterns, but writing patterns with
3943  // source modifiers is a pain.
3944 
3945  // fadd (fadd (a, a), b) -> mad 2.0, a, b
3946  if (LHS.getOpcode() == ISD::FADD) {
3947  SDValue A = LHS.getOperand(0);
3948  if (A == LHS.getOperand(1)) {
3949  unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
3950  if (FusedOp != 0) {
3951  const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
3952  return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
3953  }
3954  }
3955  }
3956 
3957  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
3958  if (RHS.getOpcode() == ISD::FADD) {
3959  SDValue A = RHS.getOperand(0);
3960  if (A == RHS.getOperand(1)) {
3961  unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
3962  if (FusedOp != 0) {
3963  const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
3964  return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
3965  }
3966  }
3967  }
3968 
3969  return SDValue();
3970 }
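// Editor's note: for example, with f32 denormals disabled, (fadd (fadd a, a), b)
// is turned into FMAD 2.0, a, b (v_mad_f32); with denormals enabled,
// getFusedOpcode only returns ISD::FMA when unsafe/fast FP math permits it, and
// otherwise the fold is skipped (FusedOp == 0).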
3971 
3972 SDValue SITargetLowering::performFSubCombine(SDNode *N,
3973  DAGCombinerInfo &DCI) const {
3974  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3975  return SDValue();
3976 
3977  SelectionDAG &DAG = DCI.DAG;
3978  SDLoc SL(N);
3979  EVT VT = N->getValueType(0);
3980  assert(!VT.isVector());
3981 
3982  // Try to get the fneg to fold into the source modifier. This undoes generic
3983  // DAG combines and folds them into the mad.
3984  //
3985  // Only do this if we are not trying to support denormals. v_mad_f32 does
3986  // not support denormals ever.
3987  SDValue LHS = N->getOperand(0);
3988  SDValue RHS = N->getOperand(1);
3989  if (LHS.getOpcode() == ISD::FADD) {
3990  // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
3991  SDValue A = LHS.getOperand(0);
3992  if (A == LHS.getOperand(1)) {
3993  unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
3994  if (FusedOp != 0){
3995  const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
3996  SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3997 
3998  return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
3999  }
4000  }
4001  }
4002 
4003  if (RHS.getOpcode() == ISD::FADD) {
4004  // (fsub c, (fadd a, a)) -> mad -2.0, a, c
4005 
4006  SDValue A = RHS.getOperand(0);
4007  if (A == RHS.getOperand(1)) {
4008  unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
4009  if (FusedOp != 0){
4010  const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
4011  return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
4012  }
4013  }
4014  }
4015 
4016  return SDValue();
4017 }
4018 
4019 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
4020  DAGCombinerInfo &DCI) const {
4021  SelectionDAG &DAG = DCI.DAG;
4022  SDLoc SL(N);
4023 
4024  SDValue LHS = N->getOperand(0);
4025  SDValue RHS = N->getOperand(1);
4026  EVT VT = LHS.getValueType();
4027 
4028  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
4029  VT != MVT::f16))
4030  return SDValue();
4031 
4032  // Match isinf pattern
4033  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
4034  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
4035  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
4036  const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4037  if (!CRHS)
4038  return SDValue();
4039 
4040  const APFloat &APF = CRHS->getValueAPF();
4041  if (APF.isInfinity() && !APF.isNegative()) {
4042  const unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
4043  return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
4044  DAG.getConstant(Mask, SL, MVT::i32));
4045  }
4046  }
4047 
4048  return SDValue();
4049 }
4050 
4051 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
4052  DAGCombinerInfo &DCI) const {
4053  SelectionDAG &DAG = DCI.DAG;
4054  SDLoc SL(N);
4055  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
4056 
4057  SDValue Src = N->getOperand(0);
4058  SDValue Srl = N->getOperand(0);
4059  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
4060  Srl = Srl.getOperand(0);
4061 
4062  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
4063  if (Srl.getOpcode() == ISD::SRL) {
4064  // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
4065  // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
4066  // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
4067 
4068  if (const ConstantSDNode *C =
4069  dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
4070  Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
4071  EVT(MVT::i32));
4072 
4073  unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
4074  if (SrcOffset < 32 && SrcOffset % 8 == 0) {
4075  return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
4076  MVT::f32, Srl);
4077  }
4078  }
4079  }
4080 
4081  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
4082 
4083  APInt KnownZero, KnownOne;
4084  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4085  !DCI.isBeforeLegalizeOps());
4086  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4087  if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
4088  TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
4089  DCI.CommitTargetLoweringOpt(TLO);
4090  }
4091 
4092  return SDValue();
4093 }
4094 
4095 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
4096  DAGCombinerInfo &DCI) const {
4097  switch (N->getOpcode()) {
4098  default:
4099  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
4100  case ISD::FADD:
4101  return performFAddCombine(N, DCI);
4102  case ISD::FSUB:
4103  return performFSubCombine(N, DCI);
4104  case ISD::SETCC:
4105  return performSetCCCombine(N, DCI);
4106  case ISD::FMAXNUM:
4107  case ISD::FMINNUM:
4108  case ISD::SMAX:
4109  case ISD::SMIN:
4110  case ISD::UMAX:
4111  case ISD::UMIN:
4112  case AMDGPUISD::FMIN_LEGACY:
4113  case AMDGPUISD::FMAX_LEGACY: {
4114  if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
4115  N->getValueType(0) != MVT::f64 &&
4116  getTargetMachine().getOptLevel() > CodeGenOpt::None)
4117  return performMinMaxCombine(N, DCI);
4118  break;
4119  }
4120  case ISD::LOAD:
4121  case ISD::STORE:
4122  case ISD::ATOMIC_LOAD:
4123  case ISD::ATOMIC_STORE:
4124  case ISD::ATOMIC_CMP_SWAP:
4125  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
4126  case ISD::ATOMIC_SWAP:
4127  case ISD::ATOMIC_LOAD_ADD:
4128  case ISD::ATOMIC_LOAD_SUB:
4129  case ISD::ATOMIC_LOAD_AND:
4130  case ISD::ATOMIC_LOAD_OR:
4131  case ISD::ATOMIC_LOAD_XOR:
4132  case ISD::ATOMIC_LOAD_NAND:
4133  case ISD::ATOMIC_LOAD_MIN:
4134  case ISD::ATOMIC_LOAD_MAX:
4135  case ISD::ATOMIC_LOAD_UMIN:
4136  case ISD::ATOMIC_LOAD_UMAX:
4137  case AMDGPUISD::ATOMIC_INC:
4138  case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
4139  if (DCI.isBeforeLegalize())
4140  break;
4141  return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
4142  }
4143  case ISD::AND:
4144  return performAndCombine(N, DCI);
4145  case ISD::OR:
4146  return performOrCombine(N, DCI);
4147  case ISD::XOR:
4148  return performXorCombine(N, DCI);
4149  case AMDGPUISD::FP_CLASS:
4150  return performClassCombine(N, DCI);
4151  case ISD::FCANONICALIZE:
4152  return performFCanonicalizeCombine(N, DCI);
4153  case AMDGPUISD::FRACT:
4154  case AMDGPUISD::RCP:
4155  case AMDGPUISD::RSQ:
4156  case AMDGPUISD::RCP_LEGACY:
4157  case AMDGPUISD::RSQ_LEGACY:
4158  case AMDGPUISD::RSQ_CLAMP:
4159  case AMDGPUISD::LDEXP: {
4160  SDValue Src = N->getOperand(0);
4161  if (Src.isUndef())
4162  return Src;
4163  break;
4164  }
4165  case ISD::SINT_TO_FP:
4166  case ISD::UINT_TO_FP:
4167  return performUCharToFloatCombine(N, DCI);
4168  case AMDGPUISD::CVT_F32_UBYTE0:
4169  case AMDGPUISD::CVT_F32_UBYTE1:
4170  case AMDGPUISD::CVT_F32_UBYTE2:
4171  case AMDGPUISD::CVT_F32_UBYTE3:
4172  return performCvtF32UByteNCombine(N, DCI);
4173  }
4174  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
4175 }
4176 
4177 /// \brief Helper function for adjustWritemask
4178 static unsigned SubIdx2Lane(unsigned Idx) {
4179  switch (Idx) {
4180  default: return 0;
4181  case AMDGPU::sub0: return 0;
4182  case AMDGPU::sub1: return 1;
4183  case AMDGPU::sub2: return 2;
4184  case AMDGPU::sub3: return 3;
4185  }
4186 }
4187 
4188 /// \brief Adjust the writemask of MIMG instructions
4189 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
4190  SelectionDAG &DAG) const {
4191  SDNode *Users[4] = { };
4192  unsigned Lane = 0;
4193  unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
4194  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
4195  unsigned NewDmask = 0;
4196 
4197  // Try to figure out the used register components
4198  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
4199  I != E; ++I) {
4200 
4201  // Abort if we can't understand the usage
4202  if (!I->isMachineOpcode() ||
4203  I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
4204  return;
4205 
4206  // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
4207  // Note that subregs are packed, i.e. Lane==0 is the first bit set
4208  // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
4209  // set, etc.
4210  Lane = SubIdx2Lane(I->getConstantOperandVal(1));
4211 
4212  // Set which texture component corresponds to the lane.
4213  unsigned Comp;
4214  for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
4215  assert(Dmask);
4216  Comp = countTrailingZeros(Dmask);
4217  Dmask &= ~(1 << Comp);
4218  }
4219 
4220  // Abort if we have more than one user per component
4221  if (Users[Lane])
4222  return;
4223 
4224  Users[Lane] = *I;
4225  NewDmask |= 1 << Comp;
4226  }
4227 
4228  // Abort if there's no change
4229  if (NewDmask == OldDmask)
4230  return;
4231 
4232  // Adjust the writemask in the node
4233  std::vector<SDValue> Ops;
4234  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
4235  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
4236  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
4237  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
4238 
4239  // If we only got one lane, replace it with a copy
4240  // (if NewDmask has only one bit set...)
4241  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
4242  SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
4243  MVT::i32);
4244  SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4245  SDLoc(), Users[Lane]->getValueType(0),
4246  SDValue(Node, 0), RC);
4247  DAG.ReplaceAllUsesWith(Users[Lane], Copy);
4248  return;
4249  }
4250 
4251  // Update the users of the node with the new indices
4252  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
4253 
4254  SDNode *User = Users[i];
4255  if (!User)
4256  continue;
4257 
4258  SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
4259  DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
4260 
4261  switch (Idx) {
4262  default: break;
4263  case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
4264  case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
4265  case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
4266  }
4267  }
4268 }
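// Editor's note (illustrative sketch, not part of the original source): if an
// image sample writes four components (dmask = 0xf) but only sub0 and sub2 are
// extracted, the dmask is shrunk to 0x5 and the two EXTRACT_SUBREG users are
// renumbered to sub0 and sub1 of the narrowed result. If only one component
// remains, the result is instead copied out through COPY_TO_REGCLASS into a
// single VGPR.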
4269 
4270 static bool isFrameIndexOp(SDValue Op) {
4271  if (Op.getOpcode() == ISD::AssertZext)
4272  Op = Op.getOperand(0);
4273 
4274  return isa<FrameIndexSDNode>(Op);
4275 }
4276 
4277 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
4278 /// with frame index operands.
4279 /// LLVM assumes that inputs to these instructions are registers.
4280 void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
4281  SelectionDAG &DAG) const {
4282 
4283  SmallVector<SDValue, 8> Ops;
4284  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
4285  if (!isFrameIndexOp(Node->getOperand(i))) {
4286  Ops.push_back(Node->getOperand(i));
4287  continue;
4288  }
4289 
4290  SDLoc DL(Node);
4291  Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
4292  Node->getOperand(i).getValueType(),
4293  Node->getOperand(i)), 0));
4294  }
4295 
4296  DAG.UpdateNodeOperands(Node, Ops);
4297 }
4298 
4299 /// \brief Fold the instructions after selecting them.
4300 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
4301  SelectionDAG &DAG) const {
4302  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4303  unsigned Opcode = Node->getMachineOpcode();
4304 
4305  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
4306  !TII->isGather4(Opcode))
4307  adjustWritemask(Node, DAG);
4308 
4309  if (Opcode == AMDGPU::INSERT_SUBREG ||
4310  Opcode == AMDGPU::REG_SEQUENCE) {
4311  legalizeTargetIndependentNode(Node, DAG);
4312  return Node;
4313  }
4314  return Node;
4315 }
4316 
4317 /// \brief Assign the register class depending on the number of
4318 /// bits set in the writemask
4319 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
4320  SDNode *Node) const {
4321  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4322 
4323  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4324 
4325  if (TII->isVOP3(MI.getOpcode())) {
4326  // Make sure constant bus requirements are respected.
4327  TII->legalizeOperandsVOP3(MRI, MI);
4328  return;
4329  }
4330 
4331  if (TII->isMIMG(MI)) {
4332  unsigned VReg = MI.getOperand(0).getReg();
4333  const TargetRegisterClass *RC = MRI.getRegClass(VReg);
4334  // TODO: Need mapping tables to handle other cases (register classes).
4335  if (RC != &AMDGPU::VReg_128RegClass)
4336  return;
4337 
4338  unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
4339  unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
4340  unsigned BitsSet = 0;
4341  for (unsigned i = 0; i < 4; ++i)
4342  BitsSet += Writemask & (1 << i) ? 1 : 0;
4343  switch (BitsSet) {
4344  default: return;
4345  case 1: RC = &AMDGPU::VGPR_32RegClass; break;
4346  case 2: RC = &AMDGPU::VReg_64RegClass; break;
4347  case 3: RC = &AMDGPU::VReg_96RegClass; break;
4348  }
4349 
4350  unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
4351  MI.setDesc(TII->get(NewOpcode));
4352  MRI.setRegClass(VReg, RC);
4353  return;
4354  }
4355 
4356  // Replace unused atomics with the no return version.
4357  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
4358  if (NoRetAtomicOp != -1) {
4359  if (!Node->hasAnyUseOfValue(0)) {
4360  MI.setDesc(TII->get(NoRetAtomicOp));
4361  MI.RemoveOperand(0);
4362  return;
4363  }
4364 
4365  // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
4366  // instruction, because the return type of these instructions is a vec2 of
4367  // the memory type, so it can be tied to the input operand.
4368  // This means these instructions always have a use, so we need to add a
4369  // special case to check if the atomic has only one extract_subreg use,
4370  // which itself has no uses.
4371  if ((Node->hasNUsesOfValue(1, 0) &&
4372  Node->use_begin()->isMachineOpcode() &&
4373  Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
4374  !Node->use_begin()->hasAnyUseOfValue(0))) {
4375  unsigned Def = MI.getOperand(0).getReg();
4376 
4377  // Change this into a noret atomic.
4378  MI.setDesc(TII->get(NoRetAtomicOp));
4379  MI.RemoveOperand(0);
4380 
4381  // If we only remove the def operand from the atomic instruction, the
4382  // extract_subreg will be left with a use of a vreg without a def.
4383  // So we need to insert an implicit_def to avoid machine verifier
4384  // errors.
4385  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
4386  TII->get(AMDGPU::IMPLICIT_DEF), Def);
4387  }
4388  return;
4389  }
4390 }
4391 
4392 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
4393  uint64_t Val) {
4394  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
4395  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
4396 }
4397 
4398 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
4399  const SDLoc &DL,
4400  SDValue Ptr) const {
4401  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4402 
4403  // Build the half of the subregister with the constants before building the
4404  // full 128-bit register. If we are building multiple resource descriptors,
4405  // this will allow CSEing of the 2-component register.
4406  const SDValue Ops0[] = {
4407  DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
4408  buildSMovImm32(DAG, DL, 0),
4409  DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
4410  buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
4411  DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
4412  };
4413 
4414  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
4415  MVT::v2i32, Ops0), 0);
4416 
4417  // Combine the constants and the pointer.
4418  const SDValue Ops1[] = {
4419  DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
4420  Ptr,
4421  DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
4422  SubRegHi,
4423  DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
4424  };
4425 
4426  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
4427 }
4428 
4429 /// \brief Return a resource descriptor with the 'Add TID' bit enabled
4430 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
4431 /// of the resource descriptor) to create an offset, which is added to
4432 /// the resource pointer.
4433 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
4434  SDValue Ptr, uint32_t RsrcDword1,
4435  uint64_t RsrcDword2And3) const {
4436  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
4437  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
4438  if (RsrcDword1) {
4439  PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
4440  DAG.getConstant(RsrcDword1, DL, MVT::i32)),
4441  0);
4442  }
4443 
4444  SDValue DataLo = buildSMovImm32(DAG, DL,
4445  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
4446  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
4447 
4448  const SDValue Ops[] = {
4449  DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
4450  PtrLo,
4451  DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
4452  PtrHi,
4453  DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
4454  DataLo,
4455  DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
4456  DataHi,
4457  DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
4458  };
4459 
4460  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
4461 }
4462 
4463 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4464  const TargetRegisterClass *RC,
4465  unsigned Reg, EVT VT) const {
4466  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
4467 
4468  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
4469  cast<RegisterSDNode>(VReg)->getReg(), VT);
4470 }
4471 
4472 //===----------------------------------------------------------------------===//
4473 // SI Inline Assembly Support
4474 //===----------------------------------------------------------------------===//
4475 
4476 std::pair<unsigned, const TargetRegisterClass *>
4477 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4478  StringRef Constraint,
4479  MVT VT) const {
4480  if (!isTypeLegal(VT))
4481  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4482 
4483  if (Constraint.size() == 1) {
4484  switch (Constraint[0]) {
4485  case 's':
4486  case 'r':
4487  switch (VT.getSizeInBits()) {
4488  default:
4489  return std::make_pair(0U, nullptr);
4490  case 32:
4491  case 16:
4492  return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
4493  case 64:
4494  return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
4495  case 128:
4496  return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
4497  case 256:
4498  return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
4499  }
4500 
4501  case 'v':
4502  switch (VT.getSizeInBits()) {
4503  default:
4504  return std::make_pair(0U, nullptr);
4505  case 32:
4506  case 16:
4507  return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
4508  case 64:
4509  return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
4510  case 96:
4511  return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
4512  case 128:
4513  return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
4514  case 256:
4515  return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
4516  case 512:
4517  return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
4518  }
4519  }
4520  }
4521 
4522  if (Constraint.size() > 1) {
4523  const TargetRegisterClass *RC = nullptr;
4524  if (Constraint[1] == 'v') {
4525  RC = &AMDGPU::VGPR_32RegClass;
4526  } else if (Constraint[1] == 's') {
4527  RC = &AMDGPU::SGPR_32RegClass;
4528  }
4529 
4530  if (RC) {
4531  uint32_t Idx;
4532  bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
4533  if (!Failed && Idx < RC->getNumRegs())
4534  return std::make_pair(RC->getRegister(Idx), RC);
4535  }
4536  }
4537  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4538 }
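// Editor's note (illustrative sketch, not part of the original source): inline
// assembly such as
//   asm volatile("v_mov_b32 %0, %1" : "=v"(out) : "s"(in));
// maps the 'v' output constraint to VGPR_32 and the 's' input constraint to
// SReg_32_XM0 for 32-bit values; wider legal types pick the matching VReg_* or
// SReg_* class, and other constraints fall back to the generic TargetLowering
// handling.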
4539 
4540 SITargetLowering::ConstraintType
4541 SITargetLowering::getConstraintType(StringRef Constraint) const {
4542  if (Constraint.size() == 1) {
4543  switch (Constraint[0]) {
4544  default: break;
4545  case 's':
4546  case 'v':
4547  return C_RegisterClass;
4548  }
4549  }
4550  return TargetLowering::getConstraintType(Constraint);
4551 }
value_iterator value_begin() const
std::enable_if< std::numeric_limits< T >::is_signed, bool >::type getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:494
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:500
constexpr bool isUInt< 32 >(uint64_t x)
Definition: MathExtras.h:315
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:467
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array...
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:762
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:61
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:102
const AMDGPURegisterInfo * getRegisterInfo() const override=0
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:524
Interface definition for SIRegisterInfo.
SDValue getValue(unsigned R) const
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
const DebugLoc & getDebugLoc() const
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:55
int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const
Given a MIMG Opcode that writes all 4 channels, return the equivalent opcode that writes Channels Cha...
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
LLVMContext * getContext() const
Definition: SelectionDAG.h:333
Diagnostic information for unsupported feature in backend.
AMDGPU specific subclass of TargetSubtarget.
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, SDLoc DL, EVT VT)
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1309
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
Definition: SelectionDAG.h:804
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isNaN() const
Definition: APFloat.h:1033
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR (an vector value) starting with the ...
Definition: ISDOpcodes.h:304
bool isKnownNeverNaN(SDValue Op) const
Test whether the given SDValue is known to never be NaN.
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
size_t i
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:572
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
LocInfo getLocInfo() const
unsigned getRegister(unsigned i) const
Return the specified register in the class.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
static MVT getVectorVT(MVT VT, unsigned NumElements)
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getDestAddressSpace() const
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:313
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
void AnalyzeFormalArguments(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
const TargetMachine & getTargetMachine() const
SDVTList getVTList() const
unsigned createVirtualRegister(const TargetRegisterClass *RegClass)
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool hasUnalignedBufferAccess() const
constexpr uint32_t Lo_32(uint64_t Value)
Lo_32 - This function returns the low 32 bits of a 64 bit value.
Definition: MathExtras.h:253
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:329
unsigned addLiveIn(unsigned PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:163
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:605
bool hasFastFMAF32() const
void setIsUndef(bool Val=true)
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI)
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
This class represents a function call, abstracting a target machine's calling convention.
const SIInstrInfo * getInstrInfo() const override
unsigned getSrcAddressSpace() const
const GlobalValue * getGlobal() const
static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC)
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
Definition: Type.cpp:655
float BitsToFloat(uint32_t Bits)
BitsToFloat - This function takes a 32-bit integer and returns the bit equivalent float...
Definition: MathExtras.h:558
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
Type * getTypeForEVT(LLVMContext &Context) const
getTypeForEVT - This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:204
unsigned getSizeInBits() const
bool isMemOpUniform(const SDNode *N) const
bool isSignaling() const
Definition: APFloat.h:1037
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:711
unsigned getNumOperands() const
Return the number of values used by this operation.
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain)
A debug info location.
Definition: DebugLoc.h:34
const SDValue & getOperand(unsigned Num) const
bool isSGPRClass(const TargetRegisterClass *RC) const
const Function * getFunction() const
getFunction - Return the LLVM function that this machine code represents
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
Definition: SelectionDAG.h:817
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:471
Address space for local memory.
Definition: AMDGPU.h:141
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT TVT, unsigned Alignment=0, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
[US]{MIN/MAX} - Binary minimum or maximum or signed or unsigned integers.
Definition: ISDOpcodes.h:330
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
iv Induction Variable Users
Definition: IVUsers.cpp:51
unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI)
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:387
SDValue scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode)
const SDValue & getBasePtr() const
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1 at the ...
Definition: ISDOpcodes.h:299
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
const AMDGPUImagePseudoSourceValue * getImagePSV() const
bool isAmdCodeObjectV2(const MachineFunction &MF) const
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:369
bool bitsLT(EVT VT) const
bitsLT - Return true if this has less bits than VT.
Definition: ValueTypes.h:212
bool isRegLoc() const
bool isAllOnesValue() const
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:159
static bool isFlatGlobalAddrSpace(unsigned AS)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
const Triple & getTargetTriple() const
static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand &Opnd)
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors...
bool isVector() const
isVector - Return true if this is a vector value type.
Definition: ValueTypes.h:133
void setPrivateSegmentWaveByteOffset(unsigned Reg)
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:374
void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx)
Sets stack object index for Dim's work group ID to ObjectIdx.
unsigned getSize() const
Return the size of the register in bytes, which is also the size of a stack slot allocated to hold a ...
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, SDLoc DL, EVT VT)
bool isNegative() const
Return true if the value is negative.
unsigned reservedPrivateSegmentWaveByteOffsetReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch wave offset in case spilling is needed...
static unsigned findFirstFreeSGPR(CCState &CCInfo)
const AMDGPUBufferPseudoSourceValue * getBufferPSV() const
A description of a memory reference used in the backend.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
struct fuzzer::@269 Flags
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
const HexagonInstrInfo * TII
Shift and rotation operations.
Definition: ISDOpcodes.h:344
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:359
void setHasNonSpillStackObjects(bool StackObject=true)
unsigned addDispatchID(const SIRegisterInfo &TRI)
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:351
void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx)
Sets stack object index for Dim's work item ID to ObjectIdx.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s), MachineInstr opcode, and operands.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:327
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
CopyToReg - This node has three operands: a chain, a register number to set to this value...
Definition: ISDOpcodes.h:170
unsigned SubReg
virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition: MachineInstr.h:592
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
const TargetRegisterClass * getRegClass(unsigned Reg) const
Return the register class of the specified virtual register.
LLVM_ATTRIBUTE_ALWAYS_INLINE R Default(const T &Value) const
Definition: StringSwitch.h:244
unsigned getAddressSpace() const
Reg
All possible values of the reg field in the ModR/M byte.
SimpleValueType SimpleTy
APInt bitcastToAPInt() const
Definition: APFloat.h:1012
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
The memory access is dereferenceable (i.e., doesn't trap).
EVT getScalarType() const
getScalarType - If this is a vector type, return the element type, otherwise return this...
Definition: ValueTypes.h:233
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt) For double-word atomic operations: ValLo, ValHi, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amtLo, amtHi) ValLo, ValHi, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amtLo, amtHi) These correspond to the atomicrmw instruction.
Definition: ISDOpcodes.h:719
bool isUndef() const
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
unsigned getStoreSize() const
getStoreSize - Return the number of bytes overwritten by a store of the specified value type...
This is an SDNode representing atomic operations.
bool getScalarizeGlobalBehavior() const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment=0, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:60
unsigned getNumOperands() const
Access to explicit operands of the instruction.
Definition: MachineInstr.h:277
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
EVT getVectorElementType() const
getVectorElementType - Given a vector type, return the type of each element.
Definition: ValueTypes.h:239
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
void RemoveOperand(unsigned i)
Erase an operand from an instruction, leaving it with one fewer operand than it started with...
LLVM_ATTRIBUTE_ALWAYS_INLINE StringSwitch & Case(const char(&S)[N], const T &Value)
Definition: StringSwitch.h:74
bool isShuffleMaskLegal(const SmallVectorImpl< int > &, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations, those with specific masks.
unsigned getLocReg() const
Class to represent function types.
Definition: DerivedTypes.h:102
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:410
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose...
const AMDGPUSubtarget * Subtarget
bool hasStackObjects() const
Return true if there are any stack objects in this function.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
value_iterator value_end() const
MachineBasicBlock * MBB
bool hasVGPRIndexMode() const
const RegList & Regs
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:200
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const SISubtarget &ST)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always beneficiates from combining into FMA for a given value type...
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
const SDValue & getBasePtr() const
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
Definition: SelectionDAG.h:737
static GCRegistry::Add< OcamlGC > B("ocaml","ocaml 3.10-compatible GC")
cmpResult
IEEE-754R 5.11: Floating Point Comparison Relations.
Definition: APFloat.h:153
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out...
Definition: ISDOpcodes.h:842
const APInt & getAPIntValue() const
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
EVT getMemoryVT() const
Return the type of the in-memory value.
CodeGenOpt::Level getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
int64_t getImm() const
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:487
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:656
void addMemOperand(MachineFunction &MF, MachineMemOperand *MO)
Add a MachineMemOperand to the machine instruction.
QueuePtr(false)
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE size_t size() const
size - Get the string size.
Definition: StringRef.h:135
Generation getGeneration() const
unsigned getUndefRegState(bool B)
void markPSInputAllocated(unsigned Index)
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:151
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:328
bool bitsLE(EVT VT) const
bitsLE - Return true if this has no more bits than VT.
Definition: ValueTypes.h:218
const SIRegisterInfo & getRegisterInfo() const
Definition: SIInstrInfo.h:117
Class to represent pointers.
Definition: DerivedTypes.h:443
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
This class is used to represent ISD::STORE nodes.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:273
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:453
Address space for region memory.
Definition: AMDGPU.h:143
uint32_t FloatToBits(float Float)
FloatToBits - This function takes a float and returns the bit equivalent 32-bit integer.
Definition: MathExtras.h:580
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:121
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:131
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a vector with the specified, possibly variable...
Definition: ISDOpcodes.h:274
bool isPSInputAllocated(unsigned Index) const
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition: APInt.h:518
bool isDenormal() const
Definition: APFloat.h:1036
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const override
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
MinAlign - A and B are either alignments or offsets.
Definition: MathExtras.h:589
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
unsigned getStoreSize() const
getStoreSize - Return the number of bytes overwritten by a store of the specified value type...
Definition: ValueTypes.h:268
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:43
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:133
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:395
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:166
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fuse-fp-ops=xxx option.
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:551
const SDValue & getBasePtr() const
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors...
constexpr bool isUInt< 8 >(uint64_t x)
Definition: MathExtras.h:309
unsigned const MachineRegisterInfo * MRI
std::size_t countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the least significant bit to the most stopping at the first 1...
Definition: MathExtras.h:111
unsigned getVectorNumElements() const
bool hasFloatingPointExceptions() const
Return true if target supports floating point exceptions.
MVT - Machine Value Type.
bool isShader(CallingConv::ID cc)
const SDValue & getOperand(unsigned i) const
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:48
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type...
Address space for constant memory (VTX2)
Definition: AMDGPU.h:140
Simple binary floating point operators.
Definition: ISDOpcodes.h:246
void setTargetDAGCombine(ISD::NodeType NT)
Targets should invoke this method for each target independent node that they want to provide a custom...
unsigned getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const
Returns the physical register that Value is stored in.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align=0, bool Vol=false, bool ReadMem=true, bool WriteMem=true, unsigned Size=0)
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1135
MVT getLocVT() const
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Address space for private memory.
Definition: AMDGPU.h:138
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
bool isVector() const
isVector - Return true if this is a vector value type.
static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1)
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:818
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL...
Definition: ISDOpcodes.h:279
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:279
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const SISubtarget &ST)
unsigned getLiveInVirtReg(unsigned PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual register.
EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override
Returns the target specific optimal type for load and store operations as a result of memset...
LLVM_NODISCARD LLVM_ATTRIBUTE_ALWAYS_INLINE StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:587
bool hasUnalignedScratchAccess() const
SI DAG Lowering interface definition.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
This class provides iterator support for SDUse operands that use a specific SDNode.
SITargetLowering(const TargetMachine &tm, const SISubtarget &STI)
bool hasAllowReciprocal() const
uint32_t Offset
unsigned PartOffset
Offset in bytes of current input value relative to the beginning of original argument.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang","erlang-compatible garbage collector")
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:880
unsigned addQueuePtr(const SIRegisterInfo &TRI)
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1119
unsigned getOpcode() const
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:676
bool hasFP32Denormals() const
Value * getOperand(unsigned i) const
Definition: User.h:145
uint64_t getDefaultRsrcDataFormat() const
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:57
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:371
self_iterator getIterator()
Definition: ilist_node.h:81
The memory access is non-temporal.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:281
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool Immutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
Address space for flat memory.
Definition: AMDGPU.h:142
const SDValue & getValue() const
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
bool has16BitInsts() const
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:49
static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1, bool Signed)
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:350
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo...
Definition: ISDOpcodes.h:705
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:392
unsigned getSubReg() const
EVT - Extended Value Type.
Definition: ValueTypes.h:31
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1337
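A hedged sketch, assuming LLVM 4.0 headers; it builds an undef i8 value in a throwaway context:
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>
int main() {
  llvm::LLVMContext Ctx;
  llvm::UndefValue *U = llvm::UndefValue::get(llvm::Type::getInt8Ty(Ctx));
  assert(U->getType()->isIntegerTy(8));  // undef carries the requested type
  return 0;
}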
const APFloat & getValueAPF() const
void setScratchWaveOffsetReg(unsigned Reg)
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
bool shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements)
getVectorVT - Returns the EVT that represents a vector NumElements in length, where each element is o...
Definition: ValueTypes.h:70
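An illustrative sketch, assuming LLVM 4.0 headers; it builds a v4i32 EVT and checks the derived properties documented elsewhere on this page:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
int main() {
  llvm::LLVMContext Ctx;
  llvm::EVT V4I32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4);
  assert(V4I32.isVector() && V4I32.getVectorNumElements() == 4);
  assert(V4I32.getSizeInBits() == 128);  // 4 x 32 bits
  return 0;
}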
This class contains a discriminated union of information about pointers in memory operands...
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Calling convention used for Mesa pixel shaders.
Definition: CallingConv.h:188
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool hasFP64Denormals() const
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:378
unsigned addDispatchPtr(const SIRegisterInfo &TRI)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, unsigned Alignment=0, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands...
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
The memory access writes data.
bool isInlineConstant(const APInt &Imm) const
bool bitsGT(EVT VT) const
bitsGT - Return true if this has more bits than VT.
Definition: ValueTypes.h:200
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset, bool UseGPRIdxMode, bool IsIndirectSrc)
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
TokenFactor - This node takes multiple tokens as input and produces a single token result...
Definition: ISDOpcodes.h:50
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
EVT is not used in-tree, but is used by out-of-tree targets.
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:689
bool memoperands_empty() const
Return true if we don't have any memory operands which describe the memory access done by this instruction.
Definition: MachineInstr.h:363
Iterator for intrusive lists based on ilist_node.
CCState - This class holds information needed while lowering arguments and return values...
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
const DebugLoc & getDebugLoc() const
void setDesc(const MCInstrDesc &tid)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override
Return the preferred vector type legalization action.
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:266
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
bool isInfinity() const
Return true if the value is an infinity.
bool isNegative() const
Definition: APFloat.h:1035
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:285
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
const SIRegisterInfo * getRegisterInfo() const override
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:166
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:504
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:843
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:230
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:639
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:175
CCValAssign - Represent assignment of one arg/retval to a location.
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:566
An SDNode that represents everything that will be needed to construct a MachineInstr.
const SDValue & getChain() const
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:347
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:510
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain)
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
This is an abstract virtual class for memory operations.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const fltSemantics & getFltSemantics() const
Definition: Type.h:167
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, SDLoc DL, unsigned Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array...
static bool canFoldOffset(unsigned OffsetSize, unsigned AS, const SISubtarget &STI)
Return true if the given offset Size in bytes can be folded into the immediate offsets of a memory in...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:584
static GCRegistry::Add< ShadowStackGC > C("shadow-stack","Very portable GC for uncooperative code generators")
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:586
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, unsigned base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SynchronizationScope SynchScope=CrossThread, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
Target - Wrapper for Target specific information.
Class for arbitrary precision integers.
Definition: APInt.h:77
const Value * getValue() const
Return the base address of the memory access.
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:354
Interface for the AMDGPU Implementation of the Intrinsic Info class.
int64_t getSExtValue() const
op_iterator op_begin() const
static use_iterator use_end()
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:400
bool isAmdHsaOS() const
AddrMode
ARM Addressing Modes.
Definition: ARMBaseInfo.h:235
bool isMemLoc() const
unsigned getVectorNumElements() const
Definition: DerivedTypes.h:438
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:250
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const
Return the preferred vector type legalization action.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override
Determine if the target supports unaligned memory accesses.
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:303
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:503
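A minimal sketch, assuming LLVM 4.0 headers; bits in [loBit, hiBit) are set, so (32, 4, 12) yields 0xFF0:
#include "llvm/ADT/APInt.h"
#include <cassert>
int main() {
  llvm::APInt Mid = llvm::APInt::getBitsSet(32, 4, 12);  // bits 4..11 set
  assert(Mid.getZExtValue() == 0xFF0u);
  return 0;
}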
unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI)
Flags
Flags values. These may be or'd together.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
unsigned addFlatScratchInit(const SIRegisterInfo &TRI)
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
The memory access reads data.
bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
const SISubtarget * getSubtarget() const
Representation of each machine instruction.
Definition: MachineInstr.h:52
These are IR-level optimization flags that may be propagated to SDNodes.
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:259
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
unsigned PartOffset
Offset in bytes of current output value relative to the beginning of original argument.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
void AnalyzeReturn(CCState &State, const SmallVectorImpl< ISD::OutputArg > &Outs) const
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:333
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any uses of the indicated value.
bool isUndef() const
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:610
unsigned getMaxPrivateElementSize() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:418
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:536
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
unsigned getSizeInBits() const
getSizeInBits - Return the size of the specified value type in bits.
Definition: ValueTypes.h:256
void ReplaceAllUsesWith(SDValue From, SDValue Op)
Modify anything using 'From' to use 'To' instead.
unsigned getOrigArgIndex() const
static cl::opt< bool > EnableVGPRIndexMode("amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false))
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.cpp:230
The memory access always returns the same value (or traps).
bool isAllocated(unsigned Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void clearKillFlags(unsigned Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
op_iterator op_end() const
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument...
bool shouldEmitConstantsToTextSection(const Triple &TT)
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types...
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:287
static volatile int Zero
bool hasUnsafeAlgebra() const
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:43
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
uint64_t getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type...
Definition: DataLayout.h:391
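Illustrative only, assuming LLVM 4.0 headers; an i64 always occupies 8 bytes of store, independent of the (here empty, default) layout string:
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>
int main() {
  llvm::LLVMContext Ctx;
  llvm::DataLayout DL("");  // default layout rules
  assert(DL.getTypeStoreSize(llvm::Type::getInt64Ty(Ctx)) == 8);
  return 0;
}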
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:890
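A hedged sketch, assuming LLVM 4.0 headers; the float semantics are taken from an existing APFloat rather than named explicitly:
#include "llvm/ADT/APFloat.h"
#include <cassert>
int main() {
  llvm::APFloat One(1.0f);
  llvm::APFloat Max = llvm::APFloat::getLargest(One.getSemantics());
  assert(!Max.isInfinity() && !Max.isNegative());  // largest finite positive value
  return 0;
}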
constexpr bool isUInt< 16 >(uint64_t x)
Definition: MathExtras.h:312
void setSimpleHint(unsigned VReg, unsigned PrefReg)
Specify the preferred register allocation hint for the specified virtual register.
EVT getValueType() const
Return the ValueType of the referenced return value.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getCondCode(ISD::CondCode Cond)
static bool isFrameIndexOp(SDValue Op)
void setUnsafeAlgebra(bool b)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode)
void setABIArgOffset(unsigned NewOffset)
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:291
unsigned getReg() const
getReg - Returns the register number.
bool isFloatingPoint() const
isFloatingPoint - Return true if this is a FP, or a vector FP type.
Definition: ValueTypes.h:118
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
void insert(iterator MBBI, MachineBasicBlock *MBB)
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, unsigned Alignment=1, bool *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
bool isSimple() const
isSimple - Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:107
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array...
constexpr uint32_t Hi_32(uint64_t Value)
Hi_32 - This function returns the high 32 bits of a 64-bit value.
Definition: MathExtras.h:248
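Illustrative only, assuming LLVM 4.0 headers:
#include "llvm/Support/MathExtras.h"
#include <cassert>
int main() {
  assert(llvm::Hi_32(0x1234ABCD00000000ULL) == 0x1234ABCDu);  // upper 32-bit word
  return 0;
}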
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:537
LLVM Value Representation.
Definition: Value.h:71
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:249
unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch buffer in case spilling is needed...
SDValue getRegister(unsigned Reg, EVT VT)
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned char TargetFlags=0) const
EVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type...
Definition: ValueTypes.h:95
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:239
SDValue getValueType(EVT)
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:81
bool isInfinity() const
Definition: APFloat.h:1032
const MachineInstrBuilder & addOperand(const MachineOperand &MO) const
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:331
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.Val alone...
cmpResult compare(const APFloat &RHS) const
Definition: APFloat.h:1018
Primary interface to the complete machine description for the target machine.
IRTranslator LLVM IR MI
void setRegClass(unsigned Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:47
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations...
Definition: ISDOpcodes.h:253
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:377
uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml","ocaml 3.10-compatible collector")
bool hasFP16Denormals() const
unsigned getLocMemOffset() const
MVT getVectorElementType() const
Conversion operators.
Definition: ISDOpcodes.h:397
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
int * Ptr
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:381
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:698
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:406
static void Split(std::vector< std::string > &V, StringRef S)
Split - Splits a string of comma-separated items into a vector of strings.
unsigned getAlignment() const
unsigned AllocateReg(unsigned Reg)
AllocateReg - Attempt to allocate one register.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation...
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
static GCRegistry::Add< ErlangGC > A("erlang","erlang-compatible garbage collector")
FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW, FLOG, FLOG2, FLOG10, FEXP, FEXP2, FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR - Perform various unary floating point operations.
Definition: ISDOpcodes.h:516
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:167
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:694
const fltSemantics & getSemantics() const
Definition: APFloat.h:1043
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:321
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:872
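A minimal sketch, assuming LLVM 4.0 headers; as with getLargest above, the semantics are borrowed from an existing value:
#include "llvm/ADT/APFloat.h"
#include <cassert>
int main() {
  llvm::APFloat One(1.0);
  llvm::APFloat NaN = llvm::APFloat::getQNaN(One.getSemantics());
  assert(NaN.isNaN());  // quiet NaN in the same semantics as One
  return 0;
}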
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself...
MVT getSimpleVT() const
getSimpleVT - Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:226
const SDNodeFlags * getFlags() const
This could be defined as a virtual function and implemented more simply and directly, but it is not, in order to avoid creating a vtable for this class.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode...
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &IdxReg, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode)
unsigned getRegisterByName(const char *RegName, EVT VT, SelectionDAG &DAG) const override
Return the register ID of the name passed in.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned char TargetFlags=0)
Definition: SelectionDAG.h:529
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:139
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
uint64_t getZExtValue() const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
unsigned getVectorNumElements() const
getVectorNumElements - Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:248
This class is used to represent ISD::LOAD nodes.