LLVM  9.0.0svn
SystemZISelLowering.cpp
1 //===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SystemZTargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "SystemZISelLowering.h"
14 #include "SystemZCallingConv.h"
17 #include "SystemZTargetMachine.h"
22 #include "llvm/IR/Intrinsics.h"
23 #include "llvm/IR/IntrinsicInst.h"
25 #include "llvm/Support/KnownBits.h"
26 #include <cctype>
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "systemz-lower"
31 
32 namespace {
33 // Represents information about a comparison.
34 struct Comparison {
35  Comparison(SDValue Op0In, SDValue Op1In)
36  : Op0(Op0In), Op1(Op1In), Opcode(0), ICmpType(0), CCValid(0), CCMask(0) {}
37 
38  // The operands to the comparison.
39  SDValue Op0, Op1;
40 
41  // The opcode that should be used to compare Op0 and Op1.
42  unsigned Opcode;
43 
44  // A SystemZICMP value. Only used for integer comparisons.
45  unsigned ICmpType;
46 
47  // The mask of CC values that Opcode can produce.
48  unsigned CCValid;
49 
50  // The mask of CC values for which the original condition is true.
51  unsigned CCMask;
52 };
53 } // end anonymous namespace
54 
55 // Classify VT as either 32 or 64 bit.
56 static bool is32Bit(EVT VT) {
57  switch (VT.getSimpleVT().SimpleTy) {
58  case MVT::i32:
59  return true;
60  case MVT::i64:
61  return false;
62  default:
63  llvm_unreachable("Unsupported type");
64  }
65 }
66 
67 // Return a version of MachineOperand that can be safely used before the
68 // final use.
69 static MachineOperand earlyUseOperand(MachineOperand Op) {
70  if (Op.isReg())
71  Op.setIsKill(false);
72  return Op;
73 }
74 
75 SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
76  const SystemZSubtarget &STI)
77  : TargetLowering(TM), Subtarget(STI) {
78  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
79 
80  // Set up the register classes.
81  if (Subtarget.hasHighWord())
82  addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
83  else
84  addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
85  addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
86  if (Subtarget.hasVector()) {
87  addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
88  addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
89  } else {
90  addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
91  addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
92  }
93  if (Subtarget.hasVectorEnhancements1())
94  addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
95  else
96  addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
97 
98  if (Subtarget.hasVector()) {
99  addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
100  addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
101  addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
102  addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
103  addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
104  addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
105  }
106 
107  // Compute derived properties from the register classes
109 
110  // Set up special registers.
112 
113  // TODO: It may be better to default to latency-oriented scheduling; however,
114  // LLVM's current latency-oriented scheduler can't handle physreg definitions
115  // such as SystemZ has with CC, so use the register-pressure scheduler,
116  // which can handle them.
118 
121 
122  // Instructions are strings of 2-byte aligned 2-byte values.
124  // For performance reasons we prefer 16-byte alignment.
126 
127  // Handle operations that are handled in a similar way for all types.
128  for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
130  ++I) {
131  MVT VT = MVT::SimpleValueType(I);
132  if (isTypeLegal(VT)) {
133  // Lower SET_CC into an IPM-based sequence.
135 
136  // Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
138 
139  // Lower SELECT_CC and BR_CC into separate comparisons and branches.
142  }
143  }
144 
145  // Expand jump table branches as address arithmetic followed by an
146  // indirect jump.
148 
149  // Expand BRCOND into a BR_CC (see above).
151 
152  // Handle integer types.
153  for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
155  ++I) {
156  MVT VT = MVT::SimpleValueType(I);
157  if (isTypeLegal(VT)) {
158  // Expand individual DIV and REMs into DIVREMs.
165 
166  // Support addition/subtraction with overflow.
169 
170  // Support addition/subtraction with carry.
173 
174  // Support carry in as value rather than glue.
177 
178  // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
179  // stores, putting a serialization instruction after the stores.
182 
183  // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
184  // available, or if the operand is constant.
186 
187  // Use POPCNT on z196 and above.
188  if (Subtarget.hasPopulationCount())
190  else
192 
193  // No special instructions for these.
196 
197  // Use *MUL_LOHI where possible instead of MULH*.
202 
203  // Only z196 and above have native support for conversions to unsigned.
204  // On z10, promoting to i64 doesn't generate an inexact condition for
205  // values that are outside the i32 range but in the i64 range, so use
206  // the default expansion.
207  if (!Subtarget.hasFPExtension())
209  }
210  }
211 
212  // Type legalization will convert 8- and 16-bit atomic operations into
213  // forms that operate on i32s (but still keeping the original memory VT).
214  // Lower them into full i32 operations.
226 
227  // Even though i128 is not a legal type, we still need to custom lower
228  // the atomic operations in order to exploit SystemZ instructions.
231 
232  // We can use the CC result of compare-and-swap to implement
233  // the "success" result of ATOMIC_CMP_SWAP_WITH_SUCCESS.
237 
239 
240  // Traps are legal, as we will convert them to "j .+2".
242 
243  // z10 has instructions for signed but not unsigned FP conversion.
244  // Handle unsigned 32-bit types as signed 64-bit types.
245  if (!Subtarget.hasFPExtension()) {
248  }
249 
250  // We have native support for a 64-bit CTLZ, via FLOGR.
254 
255  // Give LowerOperation the chance to replace 64-bit ORs with subregs.
257 
258  // FIXME: Can we support these natively?
262 
263  // We have native instructions for i8, i16 and i32 extensions, but not i1.
265  for (MVT VT : MVT::integer_valuetypes()) {
269  }
270 
271  // Handle the various types of symbolic address.
277 
278  // We need to handle dynamic allocations specially because of the
279  // 160-byte area at the bottom of the stack.
282 
283  // Use custom expanders so that we can force the function to use
284  // a frame pointer.
287 
288  // Handle prefetches with PFD or PFDRL.
290 
291  for (MVT VT : MVT::vector_valuetypes()) {
292  // Assume by default that all vector operations need to be expanded.
293  for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
294  if (getOperationAction(Opcode, VT) == Legal)
295  setOperationAction(Opcode, VT, Expand);
296 
297  // Likewise all truncating stores and extending loads.
298  for (MVT InnerVT : MVT::vector_valuetypes()) {
299  setTruncStoreAction(VT, InnerVT, Expand);
300  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
301  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
302  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
303  }
304 
305  if (isTypeLegal(VT)) {
306  // These operations are legal for anything that can be stored in a
307  // vector register, even if there is no native support for the format
308  // as such. In particular, we can do these for v4f32 even though there
309  // are no specific instructions for that format.
315 
316  // Likewise, except that we need to replace the nodes with something
317  // more specific.
320  }
321  }
322 
323  // Handle integer vector types.
324  for (MVT VT : MVT::integer_vector_valuetypes()) {
325  if (isTypeLegal(VT)) {
326  // These operations have direct equivalents.
331  if (VT != MVT::v2i64)
336  if (Subtarget.hasVectorEnhancements1())
338  else
342 
343  // Convert a GPR scalar to a vector by inserting it into element 0.
345 
346  // Use a series of unpacks for extensions.
349 
350  // Detect shifts by a scalar amount and convert them into
351  // V*_BY_SCALAR.
355 
356  // At present ROTL isn't matched by DAGCombiner. ROTR should be
357  // converted into ROTL.
360 
361  // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
362  // and inverting the result as necessary.
364  }
365  }
366 
367  if (Subtarget.hasVector()) {
368  // There should be no need to check for float types other than v2f64
369  // since <2 x f32> isn't a legal type.
378  }
379 
380  // Handle floating-point types.
381  for (unsigned I = MVT::FIRST_FP_VALUETYPE;
383  ++I) {
384  MVT VT = MVT::SimpleValueType(I);
385  if (isTypeLegal(VT)) {
386  // We can use FI for FRINT.
388 
389  // We can use the extended form of FI for other rounding operations.
390  if (Subtarget.hasFPExtension()) {
396  }
397 
398  // No special instructions for these.
404  }
405  }
406 
407  // Handle floating-point vector types.
408  if (Subtarget.hasVector()) {
409  // Scalar-to-vector conversion is just a subreg.
412 
413  // Some insertions and extractions can be done directly but others
414  // need to go via integers.
419 
420  // These operations have direct equivalents.
435  }
436 
437  // The vector enhancements facility 1 has instructions for these.
438  if (Subtarget.hasVectorEnhancements1()) {
453 
458 
463 
468 
473 
478  }
479 
480  // We have fused multiply-addition for f32 and f64 but not f128.
483  if (Subtarget.hasVectorEnhancements1())
485  else
487 
488  // We don't have a copysign instruction on vector registers.
489  if (Subtarget.hasVectorEnhancements1())
491 
492  // Needed so that we don't try to implement f128 constant loads using
493  // a load-and-extend of an f80 constant (in cases where the constant
494  // would fit in an f80).
495  for (MVT VT : MVT::fp_valuetypes())
497 
498  // We don't have extending load instructions on vector registers.
499  if (Subtarget.hasVectorEnhancements1()) {
502  }
503 
504  // Floating-point truncation and stores need to be done separately.
508 
509  // We have 64-bit FPR<->GPR moves, but need special handling for
510  // 32-bit forms.
511  if (!Subtarget.hasVector()) {
514  }
515 
516  // VASTART and VACOPY need to deal with the SystemZ-specific varargs
517  // structure, but VAEND is a no-op.
521 
522  // Codes for which we want to perform some z-specific combinations.
536 
537  // Handle intrinsics.
540 
541  // We want to use MVC in preference to even a single load/store pair.
542  MaxStoresPerMemcpy = 0;
544 
545  // The main memset sequence is a byte store followed by an MVC.
546  // Two STC or MV..I stores win over that, but the kind of fused stores
547  // generated by target-independent code don't when the byte value is
548  // variable. E.g. "STC <reg>;MHI <reg>,257;STH <reg>" is not better
549  // than "STC;MVC". Handle the choice in target-specific code instead.
550  MaxStoresPerMemset = 0;
552 }
553 
554 EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
555  LLVMContext &, EVT VT) const {
556  if (!VT.isVector())
557  return MVT::i32;
558  return VT.changeVectorElementTypeToInteger();
559 }
560 
561 bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
562  VT = VT.getScalarType();
563 
564  if (!VT.isSimple())
565  return false;
566 
567  switch (VT.getSimpleVT().SimpleTy) {
568  case MVT::f32:
569  case MVT::f64:
570  return true;
571  case MVT::f128:
572  return Subtarget.hasVectorEnhancements1();
573  default:
574  break;
575  }
576 
577  return false;
578 }
579 
580 
581 // Return true if Imm can be generated with a vector instruction, such as VGM.
582 static bool
583 analyzeFPImm(const APFloat &Imm, unsigned BitWidth, unsigned &Start,
584  unsigned &End, const SystemZInstrInfo *TII) {
585  APInt IntImm = Imm.bitcastToAPInt();
586  if (IntImm.getActiveBits() > 64)
587  return false;
588 
589  // See if this immediate could be generated with VGM.
590  bool Success = TII->isRxSBGMask(IntImm.getZExtValue(), BitWidth, Start, End);
591  if (!Success)
592  return false;
593  // isRxSBGMask returns the bit numbers for a full 64-bit value,
594  // with 0 denoting 1 << 63 and 63 denoting 1. Convert them to
596 // bit numbers for a BitsPerElement value, so that 0 denotes
596  // 1 << (BitsPerElement-1).
597  Start -= 64 - BitWidth;
598  End -= 64 - BitWidth;
599  return true;
600 }
601 
602 bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
603  // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
604  if (Imm.isZero() || Imm.isNegZero())
605  return true;
606 
607  if (!Subtarget.hasVector())
608  return false;
609  const SystemZInstrInfo *TII =
610  static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
611  unsigned Start, End;
612  return analyzeFPImm(Imm, VT.getSizeInBits(), Start, End, TII);
613 }
614 
615 bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
616  // We can use CGFI or CLGFI.
617  return isInt<32>(Imm) || isUInt<32>(Imm);
618 }
619 
620 bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
621  // We can use ALGFI or SLGFI.
622  return isUInt<32>(Imm) || isUInt<32>(-Imm);
623 }
624 
625 bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
626  unsigned,
627  unsigned,
628  bool *Fast) const {
629  // Unaligned accesses should never be slower than the expanded version.
630  // We check specifically for aligned accesses in the few cases where
631  // they are required.
632  if (Fast)
633  *Fast = true;
634  return true;
635 }
636 
637 // Information about the addressing mode for a memory access.
638 struct AddressingMode {
639  // True if a long displacement is supported.
640  bool LongDisplacement;
641 
642  // True if use of index register is supported.
643  bool IndexReg;
644 
645  AddressingMode(bool LongDispl, bool IdxReg) :
646  LongDisplacement(LongDispl), IndexReg(IdxReg) {}
647 };
648 
649 // Return the desired addressing mode for a Load whose only use (in the
650 // same block) is a Store.
651 static AddressingMode getLoadStoreAddrMode(bool HasVector,
652  Type *Ty) {
653  // With vector support a Load->Store combination may be combined to either
654  // an MVC or vector operations and it seems to work best to allow the
655  // vector addressing mode.
656  if (HasVector)
657  return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
658 
659  // Otherwise only the MVC case is special.
660  bool MVC = Ty->isIntegerTy(8);
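  // MVC takes only a base register and a 12-bit displacement for each operand
  // (no index register, no long displacement), so restrict the mode accordingly.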
661  return AddressingMode(!MVC/*LongDispl*/, !MVC/*IdxReg*/);
662 }
663 
664 // Return the addressing mode which seems most desirable given an LLVM
665 // Instruction pointer.
666 static AddressingMode
667 supportedAddressingMode(Instruction *I, bool HasVector) {
668  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
669  switch (II->getIntrinsicID()) {
670  default: break;
671  case Intrinsic::memset:
672  case Intrinsic::memmove:
673  case Intrinsic::memcpy:
674  return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
675  }
676  }
677 
678  if (isa<LoadInst>(I) && I->hasOneUse()) {
679  auto *SingleUser = dyn_cast<Instruction>(*I->user_begin());
680  if (SingleUser->getParent() == I->getParent()) {
681  if (isa<ICmpInst>(SingleUser)) {
682  if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
683  if (C->getBitWidth() <= 64 &&
684  (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue())))
685  // Comparison of memory with 16 bit signed / unsigned immediate
686  return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
687  } else if (isa<StoreInst>(SingleUser))
688  // Load->Store
689  return getLoadStoreAddrMode(HasVector, I->getType());
690  }
691  } else if (auto *StoreI = dyn_cast<StoreInst>(I)) {
692  if (auto *LoadI = dyn_cast<LoadInst>(StoreI->getValueOperand()))
693  if (LoadI->hasOneUse() && LoadI->getParent() == I->getParent())
694  // Load->Store
695  return getLoadStoreAddrMode(HasVector, LoadI->getType());
696  }
697 
698  if (HasVector && (isa<LoadInst>(I) || isa<StoreInst>(I))) {
699 
700  // * Use LDE instead of LE/LEY for z13 to avoid partial register
701  // dependencies (LDE only supports small offsets).
702  // * Utilize the vector registers to hold floating point
703  // values (vector load / store instructions only support small
704  // offsets).
705 
706  Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
707  I->getOperand(0)->getType());
708  bool IsFPAccess = MemAccessTy->isFloatingPointTy();
709  bool IsVectorAccess = MemAccessTy->isVectorTy();
710 
711  // A store of an extracted vector element will be combined into a VSTE type
712  // instruction.
713  if (!IsVectorAccess && isa<StoreInst>(I)) {
714  Value *DataOp = I->getOperand(0);
715  if (isa<ExtractElementInst>(DataOp))
716  IsVectorAccess = true;
717  }
718 
719  // A load which gets inserted into a vector element will be combined into a
720  // VLE type instruction.
721  if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) {
722  User *LoadUser = *I->user_begin();
723  if (isa<InsertElementInst>(LoadUser))
724  IsVectorAccess = true;
725  }
726 
727  if (IsFPAccess || IsVectorAccess)
728  return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
729  }
730 
731  return AddressingMode(true/*LongDispl*/, true/*IdxReg*/);
732 }
733 
734 bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
735  const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const {
736  // Punt on globals for now, although they can be used in limited
737  // RELATIVE LONG cases.
738  if (AM.BaseGV)
739  return false;
740 
741  // Require a 20-bit signed offset.
742  if (!isInt<20>(AM.BaseOffs))
743  return false;
744 
745  AddressingMode SupportedAM(true, true);
746  if (I != nullptr)
747  SupportedAM = supportedAddressingMode(I, Subtarget.hasVector());
748 
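  // If only short (unsigned 12-bit) displacements are supported, reject
  // offsets that would need a long displacement.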
749  if (!SupportedAM.LongDisplacement && !isUInt<12>(AM.BaseOffs))
750  return false;
751 
752  if (!SupportedAM.IndexReg)
753  // No indexing allowed.
754  return AM.Scale == 0;
755  else
756  // Indexing is OK but no scale factor can be applied.
757  return AM.Scale == 0 || AM.Scale == 1;
758 }
759 
760 bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
761  if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
762  return false;
763  unsigned FromBits = FromType->getPrimitiveSizeInBits();
764  unsigned ToBits = ToType->getPrimitiveSizeInBits();
765  return FromBits > ToBits;
766 }
767 
768 bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
769  if (!FromVT.isInteger() || !ToVT.isInteger())
770  return false;
771  unsigned FromBits = FromVT.getSizeInBits();
772  unsigned ToBits = ToVT.getSizeInBits();
773  return FromBits > ToBits;
774 }
775 
776 //===----------------------------------------------------------------------===//
777 // Inline asm support
778 //===----------------------------------------------------------------------===//
779 
780 TargetLowering::ConstraintType
781 SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
782  if (Constraint.size() == 1) {
783  switch (Constraint[0]) {
784  case 'a': // Address register
785  case 'd': // Data register (equivalent to 'r')
786  case 'f': // Floating-point register
787  case 'h': // High-part register
788  case 'r': // General-purpose register
789  case 'v': // Vector register
790  return C_RegisterClass;
791 
792  case 'Q': // Memory with base and unsigned 12-bit displacement
793  case 'R': // Likewise, plus an index
794  case 'S': // Memory with base and signed 20-bit displacement
795  case 'T': // Likewise, plus an index
796  case 'm': // Equivalent to 'T'.
797  return C_Memory;
798 
799  case 'I': // Unsigned 8-bit constant
800  case 'J': // Unsigned 12-bit constant
801  case 'K': // Signed 16-bit constant
802  case 'L': // Signed 20-bit displacement (on all targets we support)
803  case 'M': // 0x7fffffff
804  return C_Other;
805 
806  default:
807  break;
808  }
809  }
810  return TargetLowering::getConstraintType(Constraint);
811 }
812 
813 TargetLowering::ConstraintWeight SystemZTargetLowering::
814 getSingleConstraintMatchWeight(AsmOperandInfo &info,
815  const char *constraint) const {
816  ConstraintWeight weight = CW_Invalid;
817  Value *CallOperandVal = info.CallOperandVal;
818  // If we don't have a value, we can't do a match,
819  // but allow it at the lowest weight.
820  if (!CallOperandVal)
821  return CW_Default;
822  Type *type = CallOperandVal->getType();
823  // Look at the constraint type.
824  switch (*constraint) {
825  default:
826  weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
827  break;
828 
829  case 'a': // Address register
830  case 'd': // Data register (equivalent to 'r')
831  case 'h': // High-part register
832  case 'r': // General-purpose register
833  if (CallOperandVal->getType()->isIntegerTy())
834  weight = CW_Register;
835  break;
836 
837  case 'f': // Floating-point register
838  if (type->isFloatingPointTy())
839  weight = CW_Register;
840  break;
841 
842  case 'v': // Vector register
843  if ((type->isVectorTy() || type->isFloatingPointTy()) &&
844  Subtarget.hasVector())
845  weight = CW_Register;
846  break;
847 
848  case 'I': // Unsigned 8-bit constant
849  if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
850  if (isUInt<8>(C->getZExtValue()))
851  weight = CW_Constant;
852  break;
853 
854  case 'J': // Unsigned 12-bit constant
855  if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
856  if (isUInt<12>(C->getZExtValue()))
857  weight = CW_Constant;
858  break;
859 
860  case 'K': // Signed 16-bit constant
861  if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
862  if (isInt<16>(C->getSExtValue()))
863  weight = CW_Constant;
864  break;
865 
866  case 'L': // Signed 20-bit displacement (on all targets we support)
867  if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
868  if (isInt<20>(C->getSExtValue()))
869  weight = CW_Constant;
870  break;
871 
872  case 'M': // 0x7fffffff
873  if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
874  if (C->getZExtValue() == 0x7fffffff)
875  weight = CW_Constant;
876  break;
877  }
878  return weight;
879 }
880 
881 // Parse a "{tNNN}" register constraint for which the register type "t"
882 // has already been verified. MC is the class associated with "t" and
883 // Map maps 0-based register numbers to LLVM register numbers.
884 static std::pair<unsigned, const TargetRegisterClass *>
885 parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
886  const unsigned *Map, unsigned Size) {
887  assert(*(Constraint.end()-1) == '}' && "Missing '}'");
888  if (isdigit(Constraint[2])) {
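  // The register number is the text between the "{t" prefix and the trailing '}'.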
889  unsigned Index;
890  bool Failed =
891  Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
892  if (!Failed && Index < Size && Map[Index])
893  return std::make_pair(Map[Index], RC);
894  }
895  return std::make_pair(0U, nullptr);
896 }
897 
898 std::pair<unsigned, const TargetRegisterClass *>
899 SystemZTargetLowering::getRegForInlineAsmConstraint(
900  const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
901  if (Constraint.size() == 1) {
902  // GCC Constraint Letters
903  switch (Constraint[0]) {
904  default: break;
905  case 'd': // Data register (equivalent to 'r')
906  case 'r': // General-purpose register
907  if (VT == MVT::i64)
908  return std::make_pair(0U, &SystemZ::GR64BitRegClass);
909  else if (VT == MVT::i128)
910  return std::make_pair(0U, &SystemZ::GR128BitRegClass);
911  return std::make_pair(0U, &SystemZ::GR32BitRegClass);
912 
913  case 'a': // Address register
914  if (VT == MVT::i64)
915  return std::make_pair(0U, &SystemZ::ADDR64BitRegClass);
916  else if (VT == MVT::i128)
917  return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
918  return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
919 
920  case 'h': // High-part register (an LLVM extension)
921  return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
922 
923  case 'f': // Floating-point register
924  if (VT == MVT::f64)
925  return std::make_pair(0U, &SystemZ::FP64BitRegClass);
926  else if (VT == MVT::f128)
927  return std::make_pair(0U, &SystemZ::FP128BitRegClass);
928  return std::make_pair(0U, &SystemZ::FP32BitRegClass);
929 
930  case 'v': // Vector register
931  if (Subtarget.hasVector()) {
932  if (VT == MVT::f32)
933  return std::make_pair(0U, &SystemZ::VR32BitRegClass);
934  if (VT == MVT::f64)
935  return std::make_pair(0U, &SystemZ::VR64BitRegClass);
936  return std::make_pair(0U, &SystemZ::VR128BitRegClass);
937  }
938  break;
939  }
940  }
941  if (Constraint.size() > 0 && Constraint[0] == '{') {
942  // We need to override the default register parsing for GPRs and FPRs
943  // because the interpretation depends on VT. The internal names of
944  // the registers are also different from the external names
945  // (F0D and F0S instead of F0, etc.).
946  if (Constraint[1] == 'r') {
947  if (VT == MVT::i32)
948  return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
949  SystemZMC::GR32Regs, 16);
950  if (VT == MVT::i128)
951  return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
953  return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
954  SystemZMC::GR64Regs, 16);
955  }
956  if (Constraint[1] == 'f') {
957  if (VT == MVT::f32)
958  return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
959  SystemZMC::FP32Regs, 16);
960  if (VT == MVT::f128)
961  return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
963  return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
964  SystemZMC::FP64Regs, 16);
965  }
966  if (Constraint[1] == 'v') {
967  if (VT == MVT::f32)
968  return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
969  SystemZMC::VR32Regs, 32);
970  if (VT == MVT::f64)
971  return parseRegisterNumber(Constraint, &SystemZ::VR64BitRegClass,
972  SystemZMC::VR64Regs, 32);
973  return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass,
975  }
976  }
977  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
978 }
979 
980 void SystemZTargetLowering::
981 LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
982  std::vector<SDValue> &Ops,
983  SelectionDAG &DAG) const {
984  // Only support length 1 constraints for now.
985  if (Constraint.length() == 1) {
986  switch (Constraint[0]) {
987  case 'I': // Unsigned 8-bit constant
988  if (auto *C = dyn_cast<ConstantSDNode>(Op))
989  if (isUInt<8>(C->getZExtValue()))
990  Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
991  Op.getValueType()));
992  return;
993 
994  case 'J': // Unsigned 12-bit constant
995  if (auto *C = dyn_cast<ConstantSDNode>(Op))
996  if (isUInt<12>(C->getZExtValue()))
997  Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
998  Op.getValueType()));
999  return;
1000 
1001  case 'K': // Signed 16-bit constant
1002  if (auto *C = dyn_cast<ConstantSDNode>(Op))
1003  if (isInt<16>(C->getSExtValue()))
1004  Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
1005  Op.getValueType()));
1006  return;
1007 
1008  case 'L': // Signed 20-bit displacement (on all targets we support)
1009  if (auto *C = dyn_cast<ConstantSDNode>(Op))
1010  if (isInt<20>(C->getSExtValue()))
1011  Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
1012  Op.getValueType()));
1013  return;
1014 
1015  case 'M': // 0x7fffffff
1016  if (auto *C = dyn_cast<ConstantSDNode>(Op))
1017  if (C->getZExtValue() == 0x7fffffff)
1018  Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
1019  Op.getValueType()));
1020  return;
1021  }
1022  }
1023  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
1024 }
1025 
1026 //===----------------------------------------------------------------------===//
1027 // Calling conventions
1028 //===----------------------------------------------------------------------===//
1029 
1030 #include "SystemZGenCallingConv.inc"
1031 
1032 const MCPhysReg *SystemZTargetLowering::getScratchRegisters(
1033  CallingConv::ID) const {
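  // A zero-terminated list of registers that can safely be used as scratch registers.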
1034  static const MCPhysReg ScratchRegs[] = { SystemZ::R0D, SystemZ::R1D,
1035  SystemZ::R14D, 0 };
1036  return ScratchRegs;
1037 }
1038 
1039 bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
1040  Type *ToType) const {
1041  return isTruncateFree(FromType, ToType);
1042 }
1043 
1044 bool SystemZTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1045  return CI->isTailCall();
1046 }
1047 
1048 // We do not yet support 128-bit single-element vector types. If the user
1049 // attempts to use such types as a function argument or return type, prefer
1050 // to error out instead of emitting code violating the ABI.
1051 static void VerifyVectorType(MVT VT, EVT ArgVT) {
1052  if (ArgVT.isVector() && !VT.isVector())
1053  report_fatal_error("Unsupported vector argument or return type");
1054 }
1055 
1056 static void VerifyVectorTypes(const SmallVectorImpl<ISD::InputArg> &Ins) {
1057  for (unsigned i = 0; i < Ins.size(); ++i)
1058  VerifyVectorType(Ins[i].VT, Ins[i].ArgVT);
1059 }
1060 
1061 static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1062  for (unsigned i = 0; i < Outs.size(); ++i)
1063  VerifyVectorType(Outs[i].VT, Outs[i].ArgVT);
1064 }
1065 
1066 // Value is a value that has been passed to us in the location described by VA
1067 // (and so has type VA.getLocVT()). Convert Value to VA.getValVT(), chaining
1068 // any loads onto Chain.
1069 static SDValue convertLocVTToValVT(SelectionDAG &DAG, const SDLoc &DL,
1070  CCValAssign &VA, SDValue Chain,
1071  SDValue Value) {
1072  // If the argument has been promoted from a smaller type, insert an
1073  // assertion to capture this.
1074  if (VA.getLocInfo() == CCValAssign::SExt)
1075  Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
1076  DAG.getValueType(VA.getValVT()));
1077  else if (VA.getLocInfo() == CCValAssign::ZExt)
1078  Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
1079  DAG.getValueType(VA.getValVT()));
1080 
1081  if (VA.isExtInLoc())
1082  Value = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);
1083  else if (VA.getLocInfo() == CCValAssign::BCvt) {
1084  // If this is a short vector argument loaded from the stack,
1085  // extend from i64 to full vector size and then bitcast.
1086  assert(VA.getLocVT() == MVT::i64);
1087  assert(VA.getValVT().isVector());
1088  Value = DAG.getBuildVector(MVT::v2i64, DL, {Value, DAG.getUNDEF(MVT::i64)});
1089  Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
1090  } else
1091  assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
1092  return Value;
1093 }
1094 
1095 // Value is a value of type VA.getValVT() that we need to copy into
1096 // the location described by VA. Return a copy of Value converted to
1097 // VA.getLocVT(). The caller is responsible for handling indirect values.
1098 static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
1099  CCValAssign &VA, SDValue Value) {
1100  switch (VA.getLocInfo()) {
1101  case CCValAssign::SExt:
1102  return DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Value);
1103  case CCValAssign::ZExt:
1104  return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
1105  case CCValAssign::AExt:
1106  return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
1107  case CCValAssign::BCvt:
1108  // If this is a short vector argument to be stored to the stack,
1109  // bitcast to v2i64 and then extract first element.
1110  assert(VA.getLocVT() == MVT::i64);
1111  assert(VA.getValVT().isVector());
1112  Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
1113  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
1114  DAG.getConstant(0, DL, MVT::i32));
1115  case CCValAssign::Full:
1116  return Value;
1117  default:
1118  llvm_unreachable("Unhandled getLocInfo()");
1119  }
1120 }
1121 
1122 SDValue SystemZTargetLowering::LowerFormalArguments(
1123  SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1124  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1125  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1126  MachineFunction &MF = DAG.getMachineFunction();
1127  MachineFrameInfo &MFI = MF.getFrameInfo();
1128  MachineRegisterInfo &MRI = MF.getRegInfo();
1129  SystemZMachineFunctionInfo *FuncInfo =
1130  MF.getInfo<SystemZMachineFunctionInfo>();
1131  auto *TFL =
1132  static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
1133  EVT PtrVT = getPointerTy(DAG.getDataLayout());
1134 
1135  // Detect unsupported vector argument types.
1136  if (Subtarget.hasVector())
1137  VerifyVectorTypes(Ins);
1138 
1139  // Assign locations to all of the incoming arguments.
1141  SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1142  CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
1143 
1144  unsigned NumFixedGPRs = 0;
1145  unsigned NumFixedFPRs = 0;
1146  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1147  SDValue ArgValue;
1148  CCValAssign &VA = ArgLocs[I];
1149  EVT LocVT = VA.getLocVT();
1150  if (VA.isRegLoc()) {
1151  // Arguments passed in registers
1152  const TargetRegisterClass *RC;
1153  switch (LocVT.getSimpleVT().SimpleTy) {
1154  default:
1155  // Integers smaller than i64 should be promoted to i64.
1156  llvm_unreachable("Unexpected argument type");
1157  case MVT::i32:
1158  NumFixedGPRs += 1;
1159  RC = &SystemZ::GR32BitRegClass;
1160  break;
1161  case MVT::i64:
1162  NumFixedGPRs += 1;
1163  RC = &SystemZ::GR64BitRegClass;
1164  break;
1165  case MVT::f32:
1166  NumFixedFPRs += 1;
1167  RC = &SystemZ::FP32BitRegClass;
1168  break;
1169  case MVT::f64:
1170  NumFixedFPRs += 1;
1171  RC = &SystemZ::FP64BitRegClass;
1172  break;
1173  case MVT::v16i8:
1174  case MVT::v8i16:
1175  case MVT::v4i32:
1176  case MVT::v2i64:
1177  case MVT::v4f32:
1178  case MVT::v2f64:
1179  RC = &SystemZ::VR128BitRegClass;
1180  break;
1181  }
1182 
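  // Make the incoming physical register live-in to the function and copy its
  // value into a fresh virtual register.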
1183  unsigned VReg = MRI.createVirtualRegister(RC);
1184  MRI.addLiveIn(VA.getLocReg(), VReg);
1185  ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
1186  } else {
1187  assert(VA.isMemLoc() && "Argument not register or memory");
1188 
1189  // Create the frame index object for this incoming parameter.
1190  int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
1191  VA.getLocMemOffset(), true);
1192 
1193  // Create the SelectionDAG nodes corresponding to a load
1194  // from this parameter. Unpromoted ints and floats are
1195  // passed as right-justified 8-byte values.
1196  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1197  if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
1198  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
1199  DAG.getIntPtrConstant(4, DL));
1200  ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
1202  }
1203 
1204  // Convert the value of the argument register into the value that's
1205  // being passed.
1206  if (VA.getLocInfo() == CCValAssign::Indirect) {
1207  InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
1208  MachinePointerInfo()));
1209  // If the original argument was split (e.g. i128), we need
1210  // to load all parts of it here (using the same address).
1211  unsigned ArgIndex = Ins[I].OrigArgIndex;
1212  assert (Ins[I].PartOffset == 0);
1213  while (I + 1 != E && Ins[I + 1].OrigArgIndex == ArgIndex) {
1214  CCValAssign &PartVA = ArgLocs[I + 1];
1215  unsigned PartOffset = Ins[I + 1].PartOffset;
1216  SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
1217  DAG.getIntPtrConstant(PartOffset, DL));
1218  InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
1219  MachinePointerInfo()));
1220  ++I;
1221  }
1222  } else
1223  InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
1224  }
1225 
1226  if (IsVarArg) {
1227  // Save the number of non-varargs registers for later use by va_start, etc.
1228  FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
1229  FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
1230 
1231  // Likewise the address (in the form of a frame index) of where the
1232  // first stack vararg would be. The 1-byte size here is arbitrary.
1233  int64_t StackSize = CCInfo.getNextStackOffset();
1234  FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
1235 
1236  // ...and a similar frame index for the caller-allocated save area
1237  // that will be used to store the incoming registers.
1238  int64_t RegSaveOffset = TFL->getOffsetOfLocalArea();
1239  unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
1240  FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
1241 
1242  // Store the FPR varargs in the reserved frame slots. (We store the
1243  // GPRs as part of the prologue.)
1244  if (NumFixedFPRs < SystemZ::NumArgFPRs) {
1245  SDValue MemOps[SystemZ::NumArgFPRs];
1246  for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
1247  unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
1248  int FI = MFI.CreateFixedObject(8, RegSaveOffset + Offset, true);
1249  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
1250  unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
1251  &SystemZ::FP64BitRegClass);
1252  SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
1253  MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
1255  }
1256  // Join the stores, which are independent of one another.
1257  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1258  makeArrayRef(&MemOps[NumFixedFPRs],
1259  SystemZ::NumArgFPRs-NumFixedFPRs));
1260  }
1261  }
1262 
1263  return Chain;
1264 }
1265 
1266 static bool canUseSiblingCall(const CCState &ArgCCInfo,
1267  SmallVectorImpl<CCValAssign> &ArgLocs,
1268  SmallVectorImpl<ISD::OutputArg> &Outs) {
1269  // Punt if there are any indirect or stack arguments, or if the call
1270  // needs the callee-saved argument register R6, or if the call uses
1271  // the callee-saved register arguments SwiftSelf and SwiftError.
1272  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1273  CCValAssign &VA = ArgLocs[I];
1274  if (VA.getLocInfo() == CCValAssign::Indirect)
1275  return false;
1276  if (!VA.isRegLoc())
1277  return false;
1278  unsigned Reg = VA.getLocReg();
1279  if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
1280  return false;
1281  if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
1282  return false;
1283  }
1284  return true;
1285 }
1286 
1287 SDValue
1288 SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
1289  SmallVectorImpl<SDValue> &InVals) const {
1290  SelectionDAG &DAG = CLI.DAG;
1291  SDLoc &DL = CLI.DL;
1293  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1295  SDValue Chain = CLI.Chain;
1296  SDValue Callee = CLI.Callee;
1297  bool &IsTailCall = CLI.IsTailCall;
1298  CallingConv::ID CallConv = CLI.CallConv;
1299  bool IsVarArg = CLI.IsVarArg;
1300  MachineFunction &MF = DAG.getMachineFunction();
1301  EVT PtrVT = getPointerTy(MF.getDataLayout());
1302 
1303  // Detect unsupported vector argument and return types.
1304  if (Subtarget.hasVector()) {
1305  VerifyVectorTypes(Outs);
1306  VerifyVectorTypes(Ins);
1307  }
1308 
1309  // Analyze the operands of the call, assigning locations to each operand.
1311  SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1312  ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
1313 
1314  // We don't support GuaranteedTailCallOpt, only automatically-detected
1315  // sibling calls.
1316  if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs, Outs))
1317  IsTailCall = false;
1318 
1319  // Get a count of how many bytes are to be pushed on the stack.
1320  unsigned NumBytes = ArgCCInfo.getNextStackOffset();
1321 
1322  // Mark the start of the call.
1323  if (!IsTailCall)
1324  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
1325 
1326  // Copy argument values to their designated locations.
1328  SmallVector<SDValue, 8> MemOpChains;
1329  SDValue StackPtr;
1330  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1331  CCValAssign &VA = ArgLocs[I];
1332  SDValue ArgValue = OutVals[I];
1333 
1334  if (VA.getLocInfo() == CCValAssign::Indirect) {
1335  // Store the argument in a stack slot and pass its address.
1336  SDValue SpillSlot = DAG.CreateStackTemporary(Outs[I].ArgVT);
1337  int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1338  MemOpChains.push_back(
1339  DAG.getStore(Chain, DL, ArgValue, SpillSlot,
1341  // If the original argument was split (e.g. i128), we need
1342  // to store all parts of it here (and pass just one address).
1343  unsigned ArgIndex = Outs[I].OrigArgIndex;
1344  assert (Outs[I].PartOffset == 0);
1345  while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
1346  SDValue PartValue = OutVals[I + 1];
1347  unsigned PartOffset = Outs[I + 1].PartOffset;
1348  SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
1349  DAG.getIntPtrConstant(PartOffset, DL));
1350  MemOpChains.push_back(
1351  DAG.getStore(Chain, DL, PartValue, Address,
1353  ++I;
1354  }
1355  ArgValue = SpillSlot;
1356  } else
1357  ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
1358 
1359  if (VA.isRegLoc())
1360  // Queue up the argument copies and emit them at the end.
1361  RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
1362  else {
1363  assert(VA.isMemLoc() && "Argument not register or memory");
1364 
1365  // Work out the address of the stack slot. Unpromoted ints and
1366  // floats are passed as right-justified 8-byte values.
1367  if (!StackPtr.getNode())
1368  StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
1369  unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
1370  if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
1371  Offset += 4;
1372  SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
1373  DAG.getIntPtrConstant(Offset, DL));
1374 
1375  // Emit the store.
1376  MemOpChains.push_back(
1377  DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
1378  }
1379  }
1380 
1381  // Join the stores, which are independent of one another.
1382  if (!MemOpChains.empty())
1383  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
1384 
1385  // Accept direct calls by converting symbolic call addresses to the
1386  // associated Target* opcodes. Force %r1 to be used for indirect
1387  // tail calls.
1388  SDValue Glue;
1389  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1390  Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
1391  Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
1392  } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1393  Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
1394  Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
1395  } else if (IsTailCall) {
1396  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
1397  Glue = Chain.getValue(1);
1398  Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
1399  }
1400 
1401  // Build a sequence of copy-to-reg nodes, chained and glued together.
1402  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
1403  Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
1404  RegsToPass[I].second, Glue);
1405  Glue = Chain.getValue(1);
1406  }
1407 
1408  // The first call operand is the chain and the second is the target address.
1410  Ops.push_back(Chain);
1411  Ops.push_back(Callee);
1412 
1413  // Add argument registers to the end of the list so that they are
1414  // known live into the call.
1415  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
1416  Ops.push_back(DAG.getRegister(RegsToPass[I].first,
1417  RegsToPass[I].second.getValueType()));
1418 
1419  // Add a register mask operand representing the call-preserved registers.
1420  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1421  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
1422  assert(Mask && "Missing call preserved mask for calling convention");
1423  Ops.push_back(DAG.getRegisterMask(Mask));
1424 
1425  // Glue the call to the argument copies, if any.
1426  if (Glue.getNode())
1427  Ops.push_back(Glue);
1428 
1429  // Emit the call.
1430  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1431  if (IsTailCall)
1432  return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
1433  Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
1434  Glue = Chain.getValue(1);
1435 
1436  // Mark the end of the call, which is glued to the call itself.
1437  Chain = DAG.getCALLSEQ_END(Chain,
1438  DAG.getConstant(NumBytes, DL, PtrVT, true),
1439  DAG.getConstant(0, DL, PtrVT, true),
1440  Glue, DL);
1441  Glue = Chain.getValue(1);
1442 
1443  // Assign locations to each value returned by this call.
1445  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
1446  RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
1447 
1448  // Copy all of the result registers out of their specified physreg.
1449  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
1450  CCValAssign &VA = RetLocs[I];
1451 
1452  // Copy the value out, gluing the copy to the end of the call sequence.
1453  SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
1454  VA.getLocVT(), Glue);
1455  Chain = RetValue.getValue(1);
1456  Glue = RetValue.getValue(2);
1457 
1458  // Convert the value of the return register into the value that's
1459  // being returned.
1460  InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
1461  }
1462 
1463  return Chain;
1464 }
1465 
1466 bool SystemZTargetLowering::
1467 CanLowerReturn(CallingConv::ID CallConv,
1468  MachineFunction &MF, bool isVarArg,
1469  const SmallVectorImpl<ISD::OutputArg> &Outs,
1470  LLVMContext &Context) const {
1471  // Detect unsupported vector return types.
1472  if (Subtarget.hasVector())
1473  VerifyVectorTypes(Outs);
1474 
1475  // Special case that we cannot easily detect in RetCC_SystemZ since
1476  // i128 is not a legal type.
1477  for (auto &Out : Outs)
1478  if (Out.ArgVT == MVT::i128)
1479  return false;
1480 
1482  CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context);
1483  return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ);
1484 }
1485 
1486 SDValue
1487 SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
1488  bool IsVarArg,
1489  const SmallVectorImpl<ISD::OutputArg> &Outs,
1490  const SmallVectorImpl<SDValue> &OutVals,
1491  const SDLoc &DL, SelectionDAG &DAG) const {
1492  MachineFunction &MF = DAG.getMachineFunction();
1493 
1494  // Detect unsupported vector return types.
1495  if (Subtarget.hasVector())
1496  VerifyVectorTypes(Outs);
1497 
1498  // Assign locations to each returned value.
1500  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
1501  RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
1502 
1503  // Quick exit for void returns
1504  if (RetLocs.empty())
1505  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Chain);
1506 
1507  // Copy the result values into the output registers.
1508  SDValue Glue;
1509  SmallVector<SDValue, 4> RetOps;
1510  RetOps.push_back(Chain);
1511  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
1512  CCValAssign &VA = RetLocs[I];
1513  SDValue RetValue = OutVals[I];
1514 
1515  // Make the return register live on exit.
1516  assert(VA.isRegLoc() && "Can only return in registers!");
1517 
1518  // Promote the value as required.
1519  RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue);
1520 
1521  // Chain and glue the copies together.
1522  unsigned Reg = VA.getLocReg();
1523  Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue);
1524  Glue = Chain.getValue(1);
1525  RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT()));
1526  }
1527 
1528  // Update chain and glue.
1529  RetOps[0] = Chain;
1530  if (Glue.getNode())
1531  RetOps.push_back(Glue);
1532 
1533  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
1534 }
1535 
1536 // Return true if Op is an intrinsic node with chain that returns the CC value
1537 // as its only (other) argument. Provide the associated SystemZISD opcode and
1538 // the mask of valid CC values if so.
1539 static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
1540  unsigned &CCValid) {
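  // Operand 0 is the chain, so the intrinsic ID is operand 1.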
1541  unsigned Id = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1542  switch (Id) {
1543  case Intrinsic::s390_tbegin:
1544  Opcode = SystemZISD::TBEGIN;
1545  CCValid = SystemZ::CCMASK_TBEGIN;
1546  return true;
1547 
1548  case Intrinsic::s390_tbegin_nofloat:
1549  Opcode = SystemZISD::TBEGIN_NOFLOAT;
1550  CCValid = SystemZ::CCMASK_TBEGIN;
1551  return true;
1552 
1553  case Intrinsic::s390_tend:
1554  Opcode = SystemZISD::TEND;
1555  CCValid = SystemZ::CCMASK_TEND;
1556  return true;
1557 
1558  default:
1559  return false;
1560  }
1561 }
1562 
1563 // Return true if Op is an intrinsic node without chain that returns the
1564 // CC value as its final argument. Provide the associated SystemZISD
1565 // opcode and the mask of valid CC values if so.
1566 static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
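  // There is no chain, so the intrinsic ID is operand 0.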
1567  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1568  switch (Id) {
1569  case Intrinsic::s390_vpkshs:
1570  case Intrinsic::s390_vpksfs:
1571  case Intrinsic::s390_vpksgs:
1572  Opcode = SystemZISD::PACKS_CC;
1573  CCValid = SystemZ::CCMASK_VCMP;
1574  return true;
1575 
1576  case Intrinsic::s390_vpklshs:
1577  case Intrinsic::s390_vpklsfs:
1578  case Intrinsic::s390_vpklsgs:
1579  Opcode = SystemZISD::PACKLS_CC;
1580  CCValid = SystemZ::CCMASK_VCMP;
1581  return true;
1582 
1583  case Intrinsic::s390_vceqbs:
1584  case Intrinsic::s390_vceqhs:
1585  case Intrinsic::s390_vceqfs:
1586  case Intrinsic::s390_vceqgs:
1587  Opcode = SystemZISD::VICMPES;
1588  CCValid = SystemZ::CCMASK_VCMP;
1589  return true;
1590 
1591  case Intrinsic::s390_vchbs:
1592  case Intrinsic::s390_vchhs:
1593  case Intrinsic::s390_vchfs:
1594  case Intrinsic::s390_vchgs:
1595  Opcode = SystemZISD::VICMPHS;
1596  CCValid = SystemZ::CCMASK_VCMP;
1597  return true;
1598 
1599  case Intrinsic::s390_vchlbs:
1600  case Intrinsic::s390_vchlhs:
1601  case Intrinsic::s390_vchlfs:
1602  case Intrinsic::s390_vchlgs:
1603  Opcode = SystemZISD::VICMPHLS;
1604  CCValid = SystemZ::CCMASK_VCMP;
1605  return true;
1606 
1607  case Intrinsic::s390_vtm:
1608  Opcode = SystemZISD::VTM;
1609  CCValid = SystemZ::CCMASK_VCMP;
1610  return true;
1611 
1612  case Intrinsic::s390_vfaebs:
1613  case Intrinsic::s390_vfaehs:
1614  case Intrinsic::s390_vfaefs:
1615  Opcode = SystemZISD::VFAE_CC;
1616  CCValid = SystemZ::CCMASK_ANY;
1617  return true;
1618 
1619  case Intrinsic::s390_vfaezbs:
1620  case Intrinsic::s390_vfaezhs:
1621  case Intrinsic::s390_vfaezfs:
1622  Opcode = SystemZISD::VFAEZ_CC;
1623  CCValid = SystemZ::CCMASK_ANY;
1624  return true;
1625 
1626  case Intrinsic::s390_vfeebs:
1627  case Intrinsic::s390_vfeehs:
1628  case Intrinsic::s390_vfeefs:
1629  Opcode = SystemZISD::VFEE_CC;
1630  CCValid = SystemZ::CCMASK_ANY;
1631  return true;
1632 
1633  case Intrinsic::s390_vfeezbs:
1634  case Intrinsic::s390_vfeezhs:
1635  case Intrinsic::s390_vfeezfs:
1636  Opcode = SystemZISD::VFEEZ_CC;
1637  CCValid = SystemZ::CCMASK_ANY;
1638  return true;
1639 
1640  case Intrinsic::s390_vfenebs:
1641  case Intrinsic::s390_vfenehs:
1642  case Intrinsic::s390_vfenefs:
1643  Opcode = SystemZISD::VFENE_CC;
1644  CCValid = SystemZ::CCMASK_ANY;
1645  return true;
1646 
1647  case Intrinsic::s390_vfenezbs:
1648  case Intrinsic::s390_vfenezhs:
1649  case Intrinsic::s390_vfenezfs:
1650  Opcode = SystemZISD::VFENEZ_CC;
1651  CCValid = SystemZ::CCMASK_ANY;
1652  return true;
1653 
1654  case Intrinsic::s390_vistrbs:
1655  case Intrinsic::s390_vistrhs:
1656  case Intrinsic::s390_vistrfs:
1657  Opcode = SystemZISD::VISTR_CC;
1659  return true;
1660 
1661  case Intrinsic::s390_vstrcbs:
1662  case Intrinsic::s390_vstrchs:
1663  case Intrinsic::s390_vstrcfs:
1664  Opcode = SystemZISD::VSTRC_CC;
1665  CCValid = SystemZ::CCMASK_ANY;
1666  return true;
1667 
1668  case Intrinsic::s390_vstrczbs:
1669  case Intrinsic::s390_vstrczhs:
1670  case Intrinsic::s390_vstrczfs:
1671  Opcode = SystemZISD::VSTRCZ_CC;
1672  CCValid = SystemZ::CCMASK_ANY;
1673  return true;
1674 
1675  case Intrinsic::s390_vfcedbs:
1676  case Intrinsic::s390_vfcesbs:
1677  Opcode = SystemZISD::VFCMPES;
1678  CCValid = SystemZ::CCMASK_VCMP;
1679  return true;
1680 
1681  case Intrinsic::s390_vfchdbs:
1682  case Intrinsic::s390_vfchsbs:
1683  Opcode = SystemZISD::VFCMPHS;
1684  CCValid = SystemZ::CCMASK_VCMP;
1685  return true;
1686 
1687  case Intrinsic::s390_vfchedbs:
1688  case Intrinsic::s390_vfchesbs:
1689  Opcode = SystemZISD::VFCMPHES;
1690  CCValid = SystemZ::CCMASK_VCMP;
1691  return true;
1692 
1693  case Intrinsic::s390_vftcidb:
1694  case Intrinsic::s390_vftcisb:
1695  Opcode = SystemZISD::VFTCI;
1696  CCValid = SystemZ::CCMASK_VCMP;
1697  return true;
1698 
1699  case Intrinsic::s390_tdc:
1700  Opcode = SystemZISD::TDC;
1701  CCValid = SystemZ::CCMASK_TDC;
1702  return true;
1703 
1704  default:
1705  return false;
1706  }
1707 }
1708 
1709 // Emit an intrinsic with chain and an explicit CC register result.
1710 static SDNode *emitIntrinsicWithCCAndChain(SelectionDAG &DAG, SDValue Op,
1711  unsigned Opcode) {
1712  // Copy all operands except the intrinsic ID.
1713  unsigned NumOps = Op.getNumOperands();
1715  Ops.reserve(NumOps - 1);
1716  Ops.push_back(Op.getOperand(0));
1717  for (unsigned I = 2; I < NumOps; ++I)
1718  Ops.push_back(Op.getOperand(I));
1719 
1720  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
1721  SDVTList RawVTs = DAG.getVTList(MVT::i32, MVT::Other);
1722  SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
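  // The original intrinsic produced (CC, Chain); splice users of the old chain
  // over to the chain result of the new node.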
1723  SDValue OldChain = SDValue(Op.getNode(), 1);
1724  SDValue NewChain = SDValue(Intr.getNode(), 1);
1725  DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
1726  return Intr.getNode();
1727 }
1728 
1729 // Emit an intrinsic with an explicit CC register result.
1730 static SDNode *emitIntrinsicWithCC(SelectionDAG &DAG, SDValue Op,
1731  unsigned Opcode) {
1732  // Copy all operands except the intrinsic ID.
1733  unsigned NumOps = Op.getNumOperands();
1735  Ops.reserve(NumOps - 1);
1736  for (unsigned I = 1; I < NumOps; ++I)
1737  Ops.push_back(Op.getOperand(I));
1738 
1739  SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), Op->getVTList(), Ops);
1740  return Intr.getNode();
1741 }
1742 
1743 // CC is a comparison that will be implemented using an integer or
1744 // floating-point comparison. Return the condition code mask for
1745 // a branch on true. In the integer case, CCMASK_CMP_UO is set for
1746 // unsigned comparisons and clear for signed ones. In the floating-point
1747 // case, CCMASK_CMP_UO has its normal mask meaning (unordered).
1748 static unsigned CCMaskForCondCode(ISD::CondCode CC) {
1749 #define CONV(X) \
1750  case ISD::SET##X: return SystemZ::CCMASK_CMP_##X; \
1751  case ISD::SETO##X: return SystemZ::CCMASK_CMP_##X; \
1752  case ISD::SETU##X: return SystemZ::CCMASK_CMP_UO | SystemZ::CCMASK_CMP_##X
1753 
1754  switch (CC) {
1755  default:
1756  llvm_unreachable("Invalid integer condition!");
1757 
1758  CONV(EQ);
1759  CONV(NE);
1760  CONV(GT);
1761  CONV(GE);
1762  CONV(LT);
1763  CONV(LE);
1764 
1765  case ISD::SETO: return SystemZ::CCMASK_CMP_O;
1766  case ISD::SETUO: return SystemZ::CCMASK_CMP_UO;
1767  }
1768 #undef CONV
1769 }
1770 
1771 // If C can be converted to a comparison against zero, adjust the operands
1772 // as necessary.
1773 static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
1774  if (C.ICmpType == SystemZICMP::UnsignedOnly)
1775  return;
1776 
1777  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1.getNode());
1778  if (!ConstOp1)
1779  return;
1780 
1781  int64_t Value = ConstOp1->getSExtValue();
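  // Turn comparisons like "x > -1" into "x >= 0" and "x < 1" into "x <= 0"
  // by toggling the equality bit, so that the comparison is against zero.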
1782  if ((Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_GT) ||
1783  (Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_LE) ||
1784  (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_LT) ||
1785  (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_GE)) {
1786  C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
1787  C.Op1 = DAG.getConstant(0, DL, C.Op1.getValueType());
1788  }
1789 }
1790 
1791 // If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
1792 // adjust the operands as necessary.
1793 static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
1794  Comparison &C) {
1795  // For us to make any changes, it must be a comparison between a single-use
1796  // load and a constant.
1797  if (!C.Op0.hasOneUse() ||
1798  C.Op0.getOpcode() != ISD::LOAD ||
1799  C.Op1.getOpcode() != ISD::Constant)
1800  return;
1801 
1802  // We must have an 8- or 16-bit load.
1803  auto *Load = cast<LoadSDNode>(C.Op0);
1804  unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
1805  if (NumBits != 8 && NumBits != 16)
1806  return;
1807 
1808  // The load must be an extending one and the constant must be within the
1809  // range of the unextended value.
1810  auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
1811  uint64_t Value = ConstOp1->getZExtValue();
1812  uint64_t Mask = (1 << NumBits) - 1;
1813  if (Load->getExtensionType() == ISD::SEXTLOAD) {
1814  // Make sure that ConstOp1 is in range of C.Op0.
1815  int64_t SignedValue = ConstOp1->getSExtValue();
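  // Reject constants outside the signed range of the unextended value,
  // i.e. outside [-2^(NumBits-1), 2^(NumBits-1)-1].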
1816  if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask)
1817  return;
1818  if (C.ICmpType != SystemZICMP::SignedOnly) {
1819  // Unsigned comparison between two sign-extended values is equivalent
1820  // to unsigned comparison between two zero-extended values.
1821  Value &= Mask;
1822  } else if (NumBits == 8) {
1823  // Try to treat the comparison as unsigned, so that we can use CLI.
1824  // Adjust CCMask and Value as necessary.
1825  if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
1826  // Test whether the high bit of the byte is set.
1827  Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
1828  else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
1829  // Test whether the high bit of the byte is clear.
1830  Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
1831  else
1832  // No instruction exists for this combination.
1833  return;
1834  C.ICmpType = SystemZICMP::UnsignedOnly;
1835  }
1836  } else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
1837  if (Value > Mask)
1838  return;
1839  // If the constant is in range, we can use any comparison.
1840  C.ICmpType = SystemZICMP::Any;
1841  } else
1842  return;
1843 
1844  // Make sure that the first operand is an i32 of the right extension type.
1845  ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
1846  ISD::SEXTLOAD :
1847  ISD::ZEXTLOAD);
1848  if (C.Op0.getValueType() != MVT::i32 ||
1849  Load->getExtensionType() != ExtType) {
1850  C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
1851  Load->getBasePtr(), Load->getPointerInfo(),
1852  Load->getMemoryVT(), Load->getAlignment(),
1853  Load->getMemOperand()->getFlags());
1854  // Update the chain uses.
1855  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), C.Op0.getValue(1));
1856  }
1857 
1858  // Make sure that the second operand is an i32 with the right value.
1859  if (C.Op1.getValueType() != MVT::i32 ||
1860  Value != ConstOp1->getZExtValue())
1861  C.Op1 = DAG.getConstant(Value, DL, MVT::i32);
1862 }
1863 
1864 // Return true if Op is either an unextended load, or a load suitable
1865 // for integer register-memory comparisons of type ICmpType.
1866 static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
1867  auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
1868  if (Load) {
1869  // There are no instructions to compare a register with a memory byte.
1870  if (Load->getMemoryVT() == MVT::i8)
1871  return false;
1872  // Otherwise decide on extension type.
1873  switch (Load->getExtensionType()) {
1874  case ISD::NON_EXTLOAD:
1875  return true;
1876  case ISD::SEXTLOAD:
1877  return ICmpType != SystemZICMP::UnsignedOnly;
1878  case ISD::ZEXTLOAD:
1879  return ICmpType != SystemZICMP::SignedOnly;
1880  default:
1881  break;
1882  }
1883  }
1884  return false;
1885 }
1886 
1887 // Return true if it is better to swap the operands of C.
1888 static bool shouldSwapCmpOperands(const Comparison &C) {
1889  // Leave f128 comparisons alone, since they have no memory forms.
1890  if (C.Op0.getValueType() == MVT::f128)
1891  return false;
1892 
1893  // Always keep a floating-point constant second, since comparisons with
1894  // zero can use LOAD TEST and comparisons with other constants make a
1895  // natural memory operand.
1896  if (isa<ConstantFPSDNode>(C.Op1))
1897  return false;
1898 
1899  // Never swap comparisons with zero since there are many ways to optimize
1900  // those later.
1901  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
1902  if (ConstOp1 && ConstOp1->getZExtValue() == 0)
1903  return false;
1904 
1905  // Also keep natural memory operands second if the loaded value is
1906  // only used here. Several comparisons have memory forms.
1907  if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
1908  return false;
1909 
1910  // Look for cases where C.Op0 is a single-use load and C.Op1 isn't.
1911  // In that case we generally prefer the memory to be second.
1912  if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
1913  // The only exceptions are when the second operand is a constant and
1914  // we can use things like CHHSI.
1915  if (!ConstOp1)
1916  return true;
1917  // The unsigned memory-immediate instructions can handle 16-bit
1918  // unsigned integers.
1919  if (C.ICmpType != SystemZICMP::SignedOnly &&
1920  isUInt<16>(ConstOp1->getZExtValue()))
1921  return false;
1922  // The signed memory-immediate instructions can handle 16-bit
1923  // signed integers.
1924  if (C.ICmpType != SystemZICMP::UnsignedOnly &&
1925  isInt<16>(ConstOp1->getSExtValue()))
1926  return false;
1927  return true;
1928  }
1929 
1930  // Try to promote the use of CGFR and CLGFR.
1931  unsigned Opcode0 = C.Op0.getOpcode();
1932  if (C.ICmpType != SystemZICMP::UnsignedOnly && Opcode0 == ISD::SIGN_EXTEND)
1933  return true;
1934  if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND)
1935  return true;
1936  if (C.ICmpType != SystemZICMP::SignedOnly &&
1937  Opcode0 == ISD::AND &&
1938  C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
1939  cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff)
1940  return true;
1941 
1942  return false;
1943 }
1944 
1945 // Return a version of comparison CC mask CCMask in which the LT and GT
1946 // actions are swapped.
1947 static unsigned reverseCCMask(unsigned CCMask) {
1948  return ((CCMask & SystemZ::CCMASK_CMP_EQ) |
1949  (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
1950  (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
1951  (CCMask & SystemZ::CCMASK_CMP_UO));
1952 }
1953 
1954 // Check whether C tests for equality between X and Y and whether X - Y
1955 // or Y - X is also computed. In that case it's better to compare the
1956 // result of the subtraction against zero.
1957 static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
1958  Comparison &C) {
1959  if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
1960  C.CCMask == SystemZ::CCMASK_CMP_NE) {
1961  for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
1962  SDNode *N = *I;
1963  if (N->getOpcode() == ISD::SUB &&
1964  ((N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1) ||
1965  (N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) {
1966  C.Op0 = SDValue(N, 0);
1967  C.Op1 = DAG.getConstant(0, DL, N->getValueType(0));
1968  return;
1969  }
1970  }
1971  }
1972 }
1973 
1974 // Check whether C compares a floating-point value with zero and if that
1975 // floating-point value is also negated. In this case we can use the
1976  // negation to set CC, thereby avoiding separate LOAD AND TEST and
1977 // LOAD (NEGATIVE/COMPLEMENT) instructions.
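// For example, if the program computes Y = -X and branches on "X < 0.0",
// the branch can instead test "Y > 0.0", letting the negation itself set CC
// (the LT and GT parts of the mask are swapped by reverseCCMask).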
1978 static void adjustForFNeg(Comparison &C) {
1979  auto *C1 = dyn_cast<ConstantFPSDNode>(C.Op1);
1980  if (C1 && C1->isZero()) {
1981  for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
1982  SDNode *N = *I;
1983  if (N->getOpcode() == ISD::FNEG) {
1984  C.Op0 = SDValue(N, 0);
1985  C.CCMask = reverseCCMask(C.CCMask);
1986  return;
1987  }
1988  }
1989  }
1990 }
1991 
1992 // Check whether C compares (shl X, 32) with 0 and whether X is
1993 // also sign-extended. In that case it is better to test the result
1994 // of the sign extension using LTGFR.
1995 //
1996 // This case is important because InstCombine transforms a comparison
1997 // with (sext (trunc X)) into a comparison with (shl X, 32).
1998 static void adjustForLTGFR(Comparison &C) {
1999  // Check for a comparison between (shl X, 32) and 0.
2000  if (C.Op0.getOpcode() == ISD::SHL &&
2001  C.Op0.getValueType() == MVT::i64 &&
2002  C.Op1.getOpcode() == ISD::Constant &&
2003  cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
2004  auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
2005  if (C1 && C1->getZExtValue() == 32) {
2006  SDValue ShlOp0 = C.Op0.getOperand(0);
2007  // See whether X has any SIGN_EXTEND_INREG uses.
2008  for (auto I = ShlOp0->use_begin(), E = ShlOp0->use_end(); I != E; ++I) {
2009  SDNode *N = *I;
2010  if (N->getOpcode() == ISD::SIGN_EXTEND_INREG &&
2011  cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32) {
2012  C.Op0 = SDValue(N, 0);
2013  return;
2014  }
2015  }
2016  }
2017  }
2018 }
2019 
2020 // If C compares the truncation of an extending load, try to compare
2021 // the untruncated value instead. This exposes more opportunities to
2022 // reuse CC.
2023 static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
2024  Comparison &C) {
2025  if (C.Op0.getOpcode() == ISD::TRUNCATE &&
2026  C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
2027  C.Op1.getOpcode() == ISD::Constant &&
2028  cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
2029  auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
2030  if (L->getMemoryVT().getStoreSizeInBits() <= C.Op0.getValueSizeInBits()) {
2031  unsigned Type = L->getExtensionType();
2032  if ((Type == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly) ||
2033  (Type == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly)) {
2034  C.Op0 = C.Op0.getOperand(0);
2035  C.Op1 = DAG.getConstant(0, DL, C.Op0.getValueType());
2036  }
2037  }
2038  }
2039 }
2040 
2041 // Return true if shift operation N has an in-range constant shift value.
2042 // Store it in ShiftVal if so.
2043 static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
2044  auto *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
2045  if (!Shift)
2046  return false;
2047 
2048  uint64_t Amount = Shift->getZExtValue();
2049  if (Amount >= N.getValueSizeInBits())
2050  return false;
2051 
2052  ShiftVal = Amount;
2053  return true;
2054 }
2055 
2056 // Check whether an AND with Mask is suitable for a TEST UNDER MASK
2057 // instruction and whether the CC value is descriptive enough to handle
2058 // a comparison of type Opcode between the AND result and CmpVal.
2059 // CCMask says which comparison result is being tested and BitSize is
2060 // the number of bits in the operands. If TEST UNDER MASK can be used,
2061 // return the corresponding CC mask, otherwise return 0.
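// Illustrative example: for Mask == 0x00f0, HighShift is 7, so High == 0x80
// and Low == 0x10; a test "(X & 0xf0) == 0" then maps to CCMASK_TM_ALL_0,
// while "(X & 0xf0) != 0" maps to CCMASK_TM_SOME_1.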
2062 static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
2063  uint64_t Mask, uint64_t CmpVal,
2064  unsigned ICmpType) {
2065  assert(Mask != 0 && "ANDs with zero should have been removed by now");
2066 
2067  // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
2068  if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
2069  !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
2070  return 0;
2071 
2072  // Work out the masks for the lowest and highest bits.
2073  unsigned HighShift = 63 - countLeadingZeros(Mask);
2074  uint64_t High = uint64_t(1) << HighShift;
2075  uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);
2076 
2077  // Signed ordered comparisons are effectively unsigned if the sign
2078  // bit is dropped.
2079  bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
2080 
2081  // Check for equality comparisons with 0, or the equivalent.
2082  if (CmpVal == 0) {
2083  if (CCMask == SystemZ::CCMASK_CMP_EQ)
2084  return SystemZ::CCMASK_TM_ALL_0;
2085  if (CCMask == SystemZ::CCMASK_CMP_NE)
2086  return SystemZ::CCMASK_TM_SOME_1;
2087  }
2088  if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
2089  if (CCMask == SystemZ::CCMASK_CMP_LT)
2090  return SystemZ::CCMASK_TM_ALL_0;
2091  if (CCMask == SystemZ::CCMASK_CMP_GE)
2092  return SystemZ::CCMASK_TM_SOME_1;
2093  }
2094  if (EffectivelyUnsigned && CmpVal < Low) {
2095  if (CCMask == SystemZ::CCMASK_CMP_LE)
2096  return SystemZ::CCMASK_TM_ALL_0;
2097  if (CCMask == SystemZ::CCMASK_CMP_GT)
2098  return SystemZ::CCMASK_TM_SOME_1;
2099  }
2100 
2101  // Check for equality comparisons with the mask, or the equivalent.
2102  if (CmpVal == Mask) {
2103  if (CCMask == SystemZ::CCMASK_CMP_EQ)
2104  return SystemZ::CCMASK_TM_ALL_1;
2105  if (CCMask == SystemZ::CCMASK_CMP_NE)
2106  return SystemZ::CCMASK_TM_SOME_0;
2107  }
2108  if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
2109  if (CCMask == SystemZ::CCMASK_CMP_GT)
2110  return SystemZ::CCMASK_TM_ALL_1;
2111  if (CCMask == SystemZ::CCMASK_CMP_LE)
2112  return SystemZ::CCMASK_TM_SOME_0;
2113  }
2114  if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
2115  if (CCMask == SystemZ::CCMASK_CMP_GE)
2116  return SystemZ::CCMASK_TM_ALL_1;
2117  if (CCMask == SystemZ::CCMASK_CMP_LT)
2118  return SystemZ::CCMASK_TM_SOME_0;
2119  }
2120 
2121  // Check for ordered comparisons with the top bit.
2122  if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
2123  if (CCMask == SystemZ::CCMASK_CMP_LE)
2124  return SystemZ::CCMASK_TM_MSB_0;
2125  if (CCMask == SystemZ::CCMASK_CMP_GT)
2126  return SystemZ::CCMASK_TM_MSB_1;
2127  }
2128  if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
2129  if (CCMask == SystemZ::CCMASK_CMP_LT)
2130  return SystemZ::CCMASK_TM_MSB_0;
2131  if (CCMask == SystemZ::CCMASK_CMP_GE)
2132  return SystemZ::CCMASK_TM_MSB_1;
2133  }
2134 
2135  // If there are just two bits, we can do equality checks for Low and High
2136  // as well.
2137  if (Mask == Low + High) {
2138  if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
2139  return SystemZ::CCMASK_TM_MIXED_MSB_0;
2140  if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
2141  return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
2142  if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
2143  return SystemZ::CCMASK_TM_MIXED_MSB_1;
2144  if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
2145  return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
2146  }
2147 
2148  // Looks like we've exhausted our options.
2149  return 0;
2150 }
2151 
2152 // See whether C can be implemented as a TEST UNDER MASK instruction.
2153 // Update the arguments with the TM version if so.
2154 static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
2155  Comparison &C) {
2156  // Check that we have a comparison with a constant.
2157  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
2158  if (!ConstOp1)
2159  return;
2160  uint64_t CmpVal = ConstOp1->getZExtValue();
2161 
2162  // Check whether the nonconstant input is an AND with a constant mask.
2163  Comparison NewC(C);
2164  uint64_t MaskVal;
2165  ConstantSDNode *Mask = nullptr;
2166  if (C.Op0.getOpcode() == ISD::AND) {
2167  NewC.Op0 = C.Op0.getOperand(0);
2168  NewC.Op1 = C.Op0.getOperand(1);
2169  Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
2170  if (!Mask)
2171  return;
2172  MaskVal = Mask->getZExtValue();
2173  } else {
2174  // There is no instruction to compare with a 64-bit immediate
2175  // so use TMHH instead if possible. We need an unsigned ordered
2176  // comparison with an i64 immediate.
2177  if (NewC.Op0.getValueType() != MVT::i64 ||
2178  NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
2179  NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
2180  NewC.ICmpType == SystemZICMP::SignedOnly)
2181  return;
2182  // Convert LE and GT comparisons into LT and GE.
2183  if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
2184  NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
2185  if (CmpVal == uint64_t(-1))
2186  return;
2187  CmpVal += 1;
2188  NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
2189  }
2190  // If the low N bits of Op1 are zero, then the low N bits of Op0 can
2191  // be masked off without changing the result.
2192  MaskVal = -(CmpVal & -CmpVal);
2193  NewC.ICmpType = SystemZICMP::UnsignedOnly;
2194  }
2195  if (!MaskVal)
2196  return;
2197 
2198  // Check whether the combination of mask, comparison value and comparison
2199  // type are suitable.
2200  unsigned BitSize = NewC.Op0.getValueSizeInBits();
2201  unsigned NewCCMask, ShiftVal;
2202  if (NewC.ICmpType != SystemZICMP::SignedOnly &&
2203  NewC.Op0.getOpcode() == ISD::SHL &&
2204  isSimpleShift(NewC.Op0, ShiftVal) &&
2205  (MaskVal >> ShiftVal != 0) &&
2206  ((CmpVal >> ShiftVal) << ShiftVal) == CmpVal &&
2207  (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
2208  MaskVal >> ShiftVal,
2209  CmpVal >> ShiftVal,
2210  SystemZICMP::Any))) {
2211  NewC.Op0 = NewC.Op0.getOperand(0);
2212  MaskVal >>= ShiftVal;
2213  } else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
2214  NewC.Op0.getOpcode() == ISD::SRL &&
2215  isSimpleShift(NewC.Op0, ShiftVal) &&
2216  (MaskVal << ShiftVal != 0) &&
2217  ((CmpVal << ShiftVal) >> ShiftVal) == CmpVal &&
2218  (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
2219  MaskVal << ShiftVal,
2220  CmpVal << ShiftVal,
2221  SystemZICMP::UnsignedOnly))) {
2222  NewC.Op0 = NewC.Op0.getOperand(0);
2223  MaskVal <<= ShiftVal;
2224  } else {
2225  NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
2226  NewC.ICmpType);
2227  if (!NewCCMask)
2228  return;
2229  }
2230 
2231  // Go ahead and make the change.
2232  C.Opcode = SystemZISD::TM;
2233  C.Op0 = NewC.Op0;
2234  if (Mask && Mask->getZExtValue() == MaskVal)
2235  C.Op1 = SDValue(Mask, 0);
2236  else
2237  C.Op1 = DAG.getConstant(MaskVal, DL, C.Op0.getValueType());
2238  C.CCValid = SystemZ::CCMASK_TM;
2239  C.CCMask = NewCCMask;
2240 }
2241 
2242 // See whether the comparison argument contains a redundant AND
2243 // and remove it if so. This sometimes happens due to the generic
2244 // BRCOND expansion.
2245 static void adjustForRedundantAnd(SelectionDAG &DAG, const SDLoc &DL,
2246  Comparison &C) {
2247  if (C.Op0.getOpcode() != ISD::AND)
2248  return;
2249  auto *Mask = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
2250  if (!Mask)
2251  return;
2252  KnownBits Known = DAG.computeKnownBits(C.Op0.getOperand(0));
2253  if ((~Known.Zero).getZExtValue() & ~Mask->getZExtValue())
2254  return;
2255 
2256  C.Op0 = C.Op0.getOperand(0);
2257 }
2258 
2259 // Return a Comparison that tests the condition-code result of intrinsic
2260 // node Call against constant integer CC using comparison code Cond.
2261 // Opcode is the opcode of the SystemZISD operation for the intrinsic
2262 // and CCValid is the set of possible condition-code results.
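// As an illustration, CC value c is represented by mask bit (1 << (3 - c)),
// so for Cond == SETLT with CC == 2 the mask must cover CC values 0 and 1:
// (~0U << (4 - 2)) sets bits 2 and above, and the final AND with CCValid
// keeps just bits 3 and 2.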
2263 static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
2264  SDValue Call, unsigned CCValid, uint64_t CC,
2265  ISD::CondCode Cond) {
2266  Comparison C(Call, SDValue());
2267  C.Opcode = Opcode;
2268  C.CCValid = CCValid;
2269  if (Cond == ISD::SETEQ)
2270  // bit 3 for CC==0, bit 0 for CC==3, always false for CC>3.
2271  C.CCMask = CC < 4 ? 1 << (3 - CC) : 0;
2272  else if (Cond == ISD::SETNE)
2273  // ...and the inverse of that.
2274  C.CCMask = CC < 4 ? ~(1 << (3 - CC)) : -1;
2275  else if (Cond == ISD::SETLT || Cond == ISD::SETULT)
2276  // bits above bit 3 for CC==0 (always false), bits above bit 0 for CC==3,
2277  // always true for CC>3.
2278  C.CCMask = CC < 4 ? ~0U << (4 - CC) : -1;
2279  else if (Cond == ISD::SETGE || Cond == ISD::SETUGE)
2280  // ...and the inverse of that.
2281  C.CCMask = CC < 4 ? ~(~0U << (4 - CC)) : 0;
2282  else if (Cond == ISD::SETLE || Cond == ISD::SETULE)
2283  // bit 3 and above for CC==0, bit 0 and above for CC==3 (always true),
2284  // always true for CC>3.
2285  C.CCMask = CC < 4 ? ~0U << (3 - CC) : -1;
2286  else if (Cond == ISD::SETGT || Cond == ISD::SETUGT)
2287  // ...and the inverse of that.
2288  C.CCMask = CC < 4 ? ~(~0U << (3 - CC)) : 0;
2289  else
2290  llvm_unreachable("Unexpected integer comparison type");
2291  C.CCMask &= CCValid;
2292  return C;
2293 }
2294 
2295 // Decide how to implement a comparison of type Cond between CmpOp0 and CmpOp1.
2296 static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
2297  ISD::CondCode Cond, const SDLoc &DL) {
2298  if (CmpOp1.getOpcode() == ISD::Constant) {
2299  uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
2300  unsigned Opcode, CCValid;
2301  if (CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
2302  CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
2303  isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
2304  return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
2305  if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
2306  CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
2307  isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
2308  return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
2309  }
2310  Comparison C(CmpOp0, CmpOp1);
2311  C.CCMask = CCMaskForCondCode(Cond);
2312  if (C.Op0.getValueType().isFloatingPoint()) {
2313  C.CCValid = SystemZ::CCMASK_FCMP;
2314  C.Opcode = SystemZISD::FCMP;
2315  adjustForFNeg(C);
2316  } else {
2317  C.CCValid = SystemZ::CCMASK_ICMP;
2318  C.Opcode = SystemZISD::ICMP;
2319  // Choose the type of comparison. Equality and inequality tests can
2320  // use either signed or unsigned comparisons. The choice also doesn't
2321  // matter if both sign bits are known to be clear. In those cases we
2322  // want to give the main isel code the freedom to choose whichever
2323  // form fits best.
2324  if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
2325  C.CCMask == SystemZ::CCMASK_CMP_NE ||
2326  (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
2327  C.ICmpType = SystemZICMP::Any;
2328  else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
2329  C.ICmpType = SystemZICMP::UnsignedOnly;
2330  else
2331  C.ICmpType = SystemZICMP::SignedOnly;
2332  C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
2333  adjustForRedundantAnd(DAG, DL, C);
2334  adjustZeroCmp(DAG, DL, C);
2335  adjustSubwordCmp(DAG, DL, C);
2336  adjustForSubtraction(DAG, DL, C);
2337  adjustForLTGFR(C);
2338  adjustICmpTruncate(DAG, DL, C);
2339  }
2340 
2341  if (shouldSwapCmpOperands(C)) {
2342  std::swap(C.Op0, C.Op1);
2343  C.CCMask = reverseCCMask(C.CCMask);
2344  }
2345 
2346  adjustForTestUnderMask(DAG, DL, C);
2347  return C;
2348 }
2349 
2350 // Emit the comparison instruction described by C.
2351 static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
2352  if (!C.Op1.getNode()) {
2353  SDNode *Node;
2354  switch (C.Op0.getOpcode()) {
2355  case ISD::INTRINSIC_W_CHAIN:
2356  Node = emitIntrinsicWithCCAndChain(DAG, C.Op0, C.Opcode);
2357  return SDValue(Node, 0);
2358  case ISD::INTRINSIC_WO_CHAIN:
2359  Node = emitIntrinsicWithCC(DAG, C.Op0, C.Opcode);
2360  return SDValue(Node, Node->getNumValues() - 1);
2361  default:
2362  llvm_unreachable("Invalid comparison operands");
2363  }
2364  }
2365  if (C.Opcode == SystemZISD::ICMP)
2366  return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
2367  DAG.getConstant(C.ICmpType, DL, MVT::i32));
2368  if (C.Opcode == SystemZISD::TM) {
2369  bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
2370  bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
2371  return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
2372  DAG.getConstant(RegisterOnly, DL, MVT::i32));
2373  }
2374  return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
2375 }
2376 
2377 // Implement a 32-bit *MUL_LOHI operation by extending both operands to
2378 // 64 bits. Extend is the extension type to use. Store the high part
2379 // in Hi and the low part in Lo.
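// For instance, with Extend == ISD::ZERO_EXTEND and both operands equal to
// 0xffffffff, the 64-bit product is 0xfffffffe00000001, so Hi receives
// 0xfffffffe and Lo receives 0x00000001.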
2380 static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
2381  SDValue Op0, SDValue Op1, SDValue &Hi,
2382  SDValue &Lo) {
2383  Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
2384  Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
2385  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
2386  Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2387  DAG.getConstant(32, DL, MVT::i64));
2388  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
2389  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
2390 }
2391 
2392 // Lower a binary operation that produces two VT results, one in each
2393 // half of a GR128 pair. Op0 and Op1 are the VT operands to the operation,
2394 // and Opcode performs the GR128 operation. Store the even register result
2395 // in Even and the odd register result in Odd.
2396 static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
2397  unsigned Opcode, SDValue Op0, SDValue Op1,
2398  SDValue &Even, SDValue &Odd) {
2399  SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped, Op0, Op1);
2400  bool Is32Bit = is32Bit(VT);
2401  Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
2402  Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
2403 }
2404 
2405 // Return an i32 value that is 1 if the CC value produced by CCReg is
2406 // in the mask CCMask and 0 otherwise. CC is known to have a value
2407 // in CCValid, so other values can be ignored.
2408 static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg,
2409  unsigned CCValid, unsigned CCMask) {
2410  SDValue Ops[] = { DAG.getConstant(1, DL, MVT::i32),
2411  DAG.getConstant(0, DL, MVT::i32),
2412  DAG.getConstant(CCValid, DL, MVT::i32),
2413  DAG.getConstant(CCMask, DL, MVT::i32), CCReg };
2414  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops);
2415 }
2416 
2417 // Return the SystemZISD vector comparison operation for CC, or 0 if it cannot
2418 // be done directly. IsFP is true if CC is for a floating-point rather than
2419 // integer comparison.
2420 static unsigned getVectorComparison(ISD::CondCode CC, bool IsFP) {
2421  switch (CC) {
2422  case ISD::SETOEQ:
2423  case ISD::SETEQ:
2424  return IsFP ? SystemZISD::VFCMPE : SystemZISD::VICMPE;
2425 
2426  case ISD::SETOGE:
2427  case ISD::SETGE:
2428  return IsFP ? SystemZISD::VFCMPHE : static_cast<SystemZISD::NodeType>(0);
2429 
2430  case ISD::SETOGT:
2431  case ISD::SETGT:
2432  return IsFP ? SystemZISD::VFCMPH : SystemZISD::VICMPH;
2433 
2434  case ISD::SETUGT:
2435  return IsFP ? static_cast<SystemZISD::NodeType>(0) : SystemZISD::VICMPHL;
2436 
2437  default:
2438  return 0;
2439  }
2440 }
2441 
2442 // Return the SystemZISD vector comparison operation for CC or its inverse,
2443 // or 0 if neither can be done directly. Indicate in Invert whether the
2444 // result is for the inverse of CC. IsFP is true if CC is for a
2445 // floating-point rather than integer comparison.
2446 static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
2447  bool &Invert) {
2448  if (unsigned Opcode = getVectorComparison(CC, IsFP)) {
2449  Invert = false;
2450  return Opcode;
2451  }
2452 
2453  CC = ISD::getSetCCInverse(CC, !IsFP);
2454  if (unsigned Opcode = getVectorComparison(CC, IsFP)) {
2455  Invert = true;
2456  return Opcode;
2457  }
2458 
2459  return 0;
2460 }
2461 
2462 // Return a v2f64 that contains the extended form of elements Start and Start+1
2463 // of v4f32 value Op.
2464 static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
2465  SDValue Op) {
2466  int Mask[] = { Start, -1, Start + 1, -1 };
2467  Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
2468  return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op);
2469 }
2470 
2471 // Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
2472 // producing a result of type VT.
2473 SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode,
2474  const SDLoc &DL, EVT VT,
2475  SDValue CmpOp0,
2476  SDValue CmpOp1) const {
2477  // There is no hardware support for v4f32 (unless we have the vector
2478  // enhancements facility 1), so extend the vector into two v2f64s
2479  // and compare those.
2480  if (CmpOp0.getValueType() == MVT::v4f32 &&
2481  !Subtarget.hasVectorEnhancements1()) {
2482  SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0);
2483  SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0);
2484  SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
2485  SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1);
2486  SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1);
2487  SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1);
2488  return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
2489  }
2490  return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
2491 }
2492 
2493 // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
2494 // an integer mask of type VT.
2495 SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG,
2496  const SDLoc &DL, EVT VT,
2497  ISD::CondCode CC,
2498  SDValue CmpOp0,
2499  SDValue CmpOp1) const {
2500  bool IsFP = CmpOp0.getValueType().isFloatingPoint();
2501  bool Invert = false;
2502  SDValue Cmp;
2503  switch (CC) {
2504  // Handle tests for order using (or (ogt y x) (oge x y)).
2505  case ISD::SETUO:
2506  Invert = true;
2507  LLVM_FALLTHROUGH;
2508  case ISD::SETO: {
2509  assert(IsFP && "Unexpected integer comparison");
2510  SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
2511  SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
2512  Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
2513  break;
2514  }
2515 
2516  // Handle <> tests using (or (ogt y x) (ogt x y)).
2517  case ISD::SETUEQ:
2518  Invert = true;
2519  LLVM_FALLTHROUGH;
2520  case ISD::SETONE: {
2521  assert(IsFP && "Unexpected integer comparison");
2522  SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
2523  SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
2524  Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
2525  break;
2526  }
2527 
2528  // Otherwise a single comparison is enough. It doesn't really
2529  // matter whether we try the inversion or the swap first, since
2530  // there are no cases where both work.
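// For instance, an integer SETLE is handled by inverting it to SETGT
// (VICMPH), whereas integer SETLT has no direct or inverted form and is
// instead swapped to SETGT with the operands exchanged.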
2531  default:
2532  if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
2533  Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
2534  else {
2535  CC = ISD::getSetCCSwappedOperands(CC);
2536  if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
2537  Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
2538  else
2539  llvm_unreachable("Unhandled comparison");
2540  }
2541  break;
2542  }
2543  if (Invert) {
2544  SDValue Mask =
2545  DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64));
2546  Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask);
2547  }
2548  return Cmp;
2549 }
2550 
2551 SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
2552  SelectionDAG &DAG) const {
2553  SDValue CmpOp0 = Op.getOperand(0);
2554  SDValue CmpOp1 = Op.getOperand(1);
2555  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2556  SDLoc DL(Op);
2557  EVT VT = Op.getValueType();
2558  if (VT.isVector())
2559  return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1);
2560 
2561  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
2562  SDValue CCReg = emitCmp(DAG, DL, C);
2563  return emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
2564 }
2565 
2566 SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
2567  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
2568  SDValue CmpOp0 = Op.getOperand(2);
2569  SDValue CmpOp1 = Op.getOperand(3);
2570  SDValue Dest = Op.getOperand(4);
2571  SDLoc DL(Op);
2572 
2573  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
2574  SDValue CCReg = emitCmp(DAG, DL, C);
2575  return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
2576  Op.getOperand(0), DAG.getConstant(C.CCValid, DL, MVT::i32),
2577  DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, CCReg);
2578 }
2579 
2580 // Return true if Pos is CmpOp and Neg is the negative of CmpOp,
2581 // allowing Pos and Neg to be wider than CmpOp.
2582 static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
2583  return (Neg.getOpcode() == ISD::SUB &&
2584  Neg.getOperand(0).getOpcode() == ISD::Constant &&
2585  cast<ConstantSDNode>(Neg.getOperand(0))->getZExtValue() == 0 &&
2586  Neg.getOperand(1) == Pos &&
2587  (Pos == CmpOp ||
2588  (Pos.getOpcode() == ISD::SIGN_EXTEND &&
2589  Pos.getOperand(0) == CmpOp)));
2590 }
2591 
2592 // Return the absolute or negative absolute of Op; IsNegative decides which.
2593 static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
2594  bool IsNegative) {
2595  Op = DAG.getNode(SystemZISD::IABS, DL, Op.getValueType(), Op);
2596  if (IsNegative)
2597  Op = DAG.getNode(ISD::SUB, DL, Op.getValueType(),
2598  DAG.getConstant(0, DL, Op.getValueType()), Op);
2599  return Op;
2600 }
2601 
2602 SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
2603  SelectionDAG &DAG) const {
2604  SDValue CmpOp0 = Op.getOperand(0);
2605  SDValue CmpOp1 = Op.getOperand(1);
2606  SDValue TrueOp = Op.getOperand(2);
2607  SDValue FalseOp = Op.getOperand(3);
2608  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
2609  SDLoc DL(Op);
2610 
2611  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
2612 
2613  // Check for absolute and negative-absolute selections, including those
2614  // where the comparison value is sign-extended (for LPGFR and LNGFR).
2615  // This check supplements the one in DAGCombiner.
2616  if (C.Opcode == SystemZISD::ICMP &&
2617  C.CCMask != SystemZ::CCMASK_CMP_EQ &&
2618  C.CCMask != SystemZ::CCMASK_CMP_NE &&
2619  C.Op1.getOpcode() == ISD::Constant &&
2620  cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
2621  if (isAbsolute(C.Op0, TrueOp, FalseOp))
2622  return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
2623  if (isAbsolute(C.Op0, FalseOp, TrueOp))
2624  return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
2625  }
2626 
2627  SDValue CCReg = emitCmp(DAG, DL, C);
2628  SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, DL, MVT::i32),
2629  DAG.getConstant(C.CCMask, DL, MVT::i32), CCReg};
2630 
2631  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops);
2632 }
2633 
2634 SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
2635  SelectionDAG &DAG) const {
2636  SDLoc DL(Node);
2637  const GlobalValue *GV = Node->getGlobal();
2638  int64_t Offset = Node->getOffset();
2639  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2640  CodeModel::Model CM = DAG.getTarget().getCodeModel();
2641 
2642  SDValue Result;
2643  if (Subtarget.isPC32DBLSymbol(GV, CM)) {
2644  // Assign anchors at 1<<12 byte boundaries.
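// For example, an offset of 0x1002 uses the anchor 0x1000; the remaining
// offset of 2 is halfword-aligned, so it can be folded into the address
// below via PCREL_OFFSET.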
2645  uint64_t Anchor = Offset & ~uint64_t(0xfff);
2646  Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
2647  Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
2648 
2649  // The offset can be folded into the address if it is aligned to a halfword.
2650  Offset -= Anchor;
2651  if (Offset != 0 && (Offset & 1) == 0) {
2652  SDValue Full = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
2653  Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
2654  Offset = 0;
2655  }
2656  } else {
2657  Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
2658  Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
2659  Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
2660  MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2661  }
2662 
2663  // If there was a non-zero offset that we didn't fold, create an explicit
2664  // addition for it.
2665  if (Offset != 0)
2666  Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
2667  DAG.getConstant(Offset, DL, PtrVT));
2668 
2669  return Result;
2670 }
2671 
2672 SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
2673  SelectionDAG &DAG,
2674  unsigned Opcode,
2675  SDValue GOTOffset) const {
2676  SDLoc DL(Node);
2677  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2678  SDValue Chain = DAG.getEntryNode();
2679  SDValue Glue;
2680 
2681  // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
2682  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
2683  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue);
2684  Glue = Chain.getValue(1);
2685  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue);
2686  Glue = Chain.getValue(1);
2687 
2688  // The first call operand is the chain and the second is the TLS symbol.
2689  SmallVector<SDValue, 8> Ops;
2690  Ops.push_back(Chain);
2691  Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
2692  Node->getValueType(0),
2693  0, 0));
2694 
2695  // Add argument registers to the end of the list so that they are
2696  // known live into the call.
2697  Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
2698  Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
2699 
2700  // Add a register mask operand representing the call-preserved registers.
2701  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2702  const uint32_t *Mask =
2703  TRI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
2704  assert(Mask && "Missing call preserved mask for calling convention");
2705  Ops.push_back(DAG.getRegisterMask(Mask));
2706 
2707  // Glue the call to the argument copies.
2708  Ops.push_back(Glue);
2709 
2710  // Emit the call.
2711  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2712  Chain = DAG.getNode(Opcode, DL, NodeTys, Ops);
2713  Glue = Chain.getValue(1);
2714 
2715  // Copy the return value from %r2.
2716  return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
2717 }
2718 
2719 SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
2720  SelectionDAG &DAG) const {
2721  SDValue Chain = DAG.getEntryNode();
2722  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2723 
2724  // The high part of the thread pointer is in access register 0.
2725  SDValue TPHi = DAG.getCopyFromReg(Chain, DL, SystemZ::A0, MVT::i32);
2726  TPHi = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, TPHi);
2727 
2728  // The low part of the thread pointer is in access register 1.
2729  SDValue TPLo = DAG.getCopyFromReg(Chain, DL, SystemZ::A1, MVT::i32);
2730  TPLo = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TPLo);
2731 
2732  // Merge them into a single 64-bit address.
2733  SDValue TPHiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, TPHi,
2734  DAG.getConstant(32, DL, PtrVT));
2735  return DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
2736 }
2737 
2738 SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
2739  SelectionDAG &DAG) const {
2740  if (DAG.getTarget().useEmulatedTLS())
2741  return LowerToTLSEmulatedModel(Node, DAG);
2742  SDLoc DL(Node);
2743  const GlobalValue *GV = Node->getGlobal();
2744  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2745  TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
2746 
2747  SDValue TP = lowerThreadPointer(DL, DAG);
2748 
2749  // Get the offset of GA from the thread pointer, based on the TLS model.
2750  SDValue Offset;
2751  switch (model) {
2752  case TLSModel::GeneralDynamic: {
2753  // Load the GOT offset of the tls_index (module ID / per-symbol offset).
2754  SystemZConstantPoolValue *CPV =
2755  SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
2756 
2757  Offset = DAG.getConstantPool(CPV, PtrVT, 8);
2758  Offset = DAG.getLoad(
2759  PtrVT, DL, DAG.getEntryNode(), Offset,
2760  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2761 
2762  // Call __tls_get_offset to retrieve the offset.
2763  Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
2764  break;
2765  }
2766 
2767  case TLSModel::LocalDynamic: {
2768  // Load the GOT offset of the module ID.
2769  SystemZConstantPoolValue *CPV =
2770  SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
2771 
2772  Offset = DAG.getConstantPool(CPV, PtrVT, 8);
2773  Offset = DAG.getLoad(
2774  PtrVT, DL, DAG.getEntryNode(), Offset,
2775  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2776 
2777  // Call __tls_get_offset to retrieve the module base offset.
2778  Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
2779 
2780  // Note: The SystemZLDCleanupPass will remove redundant computations
2781  // of the module base offset. Count total number of local-dynamic
2782  // accesses to trigger execution of that pass.
2783  SystemZMachineFunctionInfo* MFI =
2784  DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
2785  MFI->incNumLocalDynamicTLSAccesses();
2786 
2787  // Add the per-symbol offset.
2788  CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
2789 
2790  SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
2791  DTPOffset = DAG.getLoad(
2792  PtrVT, DL, DAG.getEntryNode(), DTPOffset,
2793  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2794 
2795  Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
2796  break;
2797  }
2798 
2799  case TLSModel::InitialExec: {
2800  // Load the offset from the GOT.
2801  Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2802  SystemZII::MO_INDNTPOFF);
2803  Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
2804  Offset =
2805  DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
2806  MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2807  break;
2808  }
2809 
2810  case TLSModel::LocalExec: {
2811  // Force the offset into the constant pool and load it from there.
2812  SystemZConstantPoolValue *CPV =
2813  SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
2814 
2815  Offset = DAG.getConstantPool(CPV, PtrVT, 8);
2816  Offset = DAG.getLoad(
2817  PtrVT, DL, DAG.getEntryNode(), Offset,
2818  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2819  break;
2820  }
2821  }
2822 
2823  // Add the base and offset together.
2824  return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
2825 }
2826 
2827 SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
2828  SelectionDAG &DAG) const {
2829  SDLoc DL(Node);
2830  const BlockAddress *BA = Node->getBlockAddress();
2831  int64_t Offset = Node->getOffset();
2832  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2833 
2834  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset);
2835  Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
2836  return Result;
2837 }
2838 
2839 SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
2840  SelectionDAG &DAG) const {
2841  SDLoc DL(JT);
2842  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2843  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
2844 
2845  // Use LARL to load the address of the table.
2846  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
2847 }
2848 
2849 SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
2850  SelectionDAG &DAG) const {
2851  SDLoc DL(CP);
2852  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2853 
2854  SDValue Result;
2855  if (CP->isMachineConstantPoolEntry())
2856  Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2857  CP->getAlignment());
2858  else
2859  Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2860  CP->getAlignment(), CP->getOffset());
2861 
2862  // Use LARL to load the address of the constant pool entry.
2863  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
2864 }
2865 
2866 SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
2867  SelectionDAG &DAG) const {
2868  MachineFunction &MF = DAG.getMachineFunction();
2869  MachineFrameInfo &MFI = MF.getFrameInfo();
2870  MFI.setFrameAddressIsTaken(true);
2871 
2872  SDLoc DL(Op);
2873  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2874  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2875 
2876  // If the back chain frame index has not been allocated yet, do so.
2877  SystemZMachineFunctionInfo *FI = MF.getInfo<SystemZMachineFunctionInfo>();
2878  int BackChainIdx = FI->getFramePointerSaveIndex();
2879  if (!BackChainIdx) {
2880  // By definition, the frame address is the address of the back chain.
2881  BackChainIdx = MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize, false);
2882  FI->setFramePointerSaveIndex(BackChainIdx);
2883  }
2884  SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);
2885 
2886  // FIXME The frontend should detect this case.
2887  if (Depth > 0) {
2888  report_fatal_error("Unsupported stack frame traversal count");
2889  }
2890 
2891  return BackChain;
2892 }
2893 
2894 SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
2895  SelectionDAG &DAG) const {
2896  MachineFunction &MF = DAG.getMachineFunction();
2897  MachineFrameInfo &MFI = MF.getFrameInfo();
2898  MFI.setReturnAddressIsTaken(true);
2899 
2900  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
2901  return SDValue();
2902 
2903  SDLoc DL(Op);
2904  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2905  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2906 
2907  // FIXME The frontend should detect this case.
2908  if (Depth > 0) {
2909  report_fatal_error("Unsupported stack frame traversal count");
2910  }
2911 
2912  // Return R14D, which has the return address. Mark it an implicit live-in.
2913  unsigned LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
2914  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT);
2915 }
2916 
2917 SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
2918  SelectionDAG &DAG) const {
2919  SDLoc DL(Op);
2920  SDValue In = Op.getOperand(0);
2921  EVT InVT = In.getValueType();
2922  EVT ResVT = Op.getValueType();
2923 
2924  // Convert loads directly. This is normally done by DAGCombiner,
2925  // but we need this case for bitcasts that are created during lowering
2926  // and which are then lowered themselves.
2927  if (auto *LoadN = dyn_cast<LoadSDNode>(In))
2928  if (ISD::isNormalLoad(LoadN)) {
2929  SDValue NewLoad = DAG.getLoad(ResVT, DL, LoadN->getChain(),
2930  LoadN->getBasePtr(), LoadN->getMemOperand());
2931  // Update the chain uses.
2932  DAG.ReplaceAllUsesOfValueWith(SDValue(LoadN, 1), NewLoad.getValue(1));
2933  return NewLoad;
2934  }
2935 
2936  if (InVT == MVT::i32 && ResVT == MVT::f32) {
2937  SDValue In64;
2938  if (Subtarget.hasHighWord()) {
2939  SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
2940  MVT::i64);
2941  In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
2942  MVT::i64, SDValue(U64, 0), In);
2943  } else {
2944  In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
2945  In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64,
2946  DAG.getConstant(32, DL, MVT::i64));
2947  }
2948  SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
2949  return DAG.getTargetExtractSubreg(SystemZ::subreg_h32,
2950  DL, MVT::f32, Out64);
2951  }
2952  if (InVT == MVT::f32 && ResVT == MVT::i32) {
2953  SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
2954  SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
2955  MVT::f64, SDValue(U64, 0), In);
2956  SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
2957  if (Subtarget.hasHighWord())
2958  return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
2959  MVT::i32, Out64);
2960  SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64,
2961  DAG.getConstant(32, DL, MVT::i64));
2962  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
2963  }
2964  llvm_unreachable("Unexpected bitcast combination");
2965 }
2966 
2967 SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
2968  SelectionDAG &DAG) const {
2969  MachineFunction &MF = DAG.getMachineFunction();
2970  SystemZMachineFunctionInfo *FuncInfo =
2971  MF.getInfo<SystemZMachineFunctionInfo>();
2972  EVT PtrVT = getPointerTy(DAG.getDataLayout());
2973 
2974  SDValue Chain = Op.getOperand(0);
2975  SDValue Addr = Op.getOperand(1);
2976  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2977  SDLoc DL(Op);
2978 
2979  // The initial values of each field.
2980  const unsigned NumFields = 4;
2981  SDValue Fields[NumFields] = {
2982  DAG.getConstant(FuncInfo->getVarArgsFirstGPR(), DL, PtrVT),
2983  DAG.getConstant(FuncInfo->getVarArgsFirstFPR(), DL, PtrVT),
2984  DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT),
2985  DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT)
2986  };
2987 
2988  // Store each field into its respective slot.
2989  SDValue MemOps[NumFields];
2990  unsigned Offset = 0;
2991  for (unsigned I = 0; I < NumFields; ++I) {
2992  SDValue FieldAddr = Addr;
2993  if (Offset != 0)
2994  FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
2995  DAG.getIntPtrConstant(Offset, DL));
2996  MemOps[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr,
2997  MachinePointerInfo(SV, Offset));
2998  Offset += 8;
2999  }
3000  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3001 }
3002 
3003 SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
3004  SelectionDAG &DAG) const {
3005  SDValue Chain = Op.getOperand(0);
3006  SDValue DstPtr = Op.getOperand(1);
3007  SDValue SrcPtr = Op.getOperand(2);
3008  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
3009  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
3010  SDLoc DL(Op);
3011 
3012  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL),
3013  /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
3014  /*isTailCall*/false,
3015  MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
3016 }
3017 
3018 SDValue SystemZTargetLowering::
3019 lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
3020  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
3021  MachineFunction &MF = DAG.getMachineFunction();
3022  bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack");
3023  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
3024 
3025  SDValue Chain = Op.getOperand(0);
3026  SDValue Size = Op.getOperand(1);
3027  SDValue Align = Op.getOperand(2);
3028  SDLoc DL(Op);
3029 
3030  // If the user has set the "no-realign-stack" function attribute, ignore
3031  // alloca alignments.
3032  uint64_t AlignVal = (RealignOpt ?
3033  dyn_cast<ConstantSDNode>(Align)->getZExtValue() : 0);
3034 
3035  uint64_t StackAlign = TFI->getStackAlignment();
3036  uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
3037  uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;
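// For example, with an 8-byte stack alignment and a 32-byte alloca
// alignment, 24 extra bytes are requested so that the result can later be
// rounded up to a 32-byte boundary within the allocated block.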
3038 
3039  unsigned SPReg = getStackPointerRegisterToSaveRestore();
3040  SDValue NeededSpace = Size;
3041 
3042  // Get a reference to the stack pointer.
3043  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);
3044 
3045  // If we need a backchain, save it now.
3046  SDValue Backchain;
3047  if (StoreBackchain)
3048  Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
3049 
3050  // Add extra space for alignment if needed.
3051  if (ExtraAlignSpace)
3052  NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
3053  DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
3054 
3055  // Get the new stack pointer value.
3056  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
3057 
3058  // Copy the new stack pointer back.
3059  Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
3060 
3061  // The allocated data lives above the 160 bytes allocated for the standard
3062  // frame, plus any outgoing stack arguments. We don't know how much that
3063  // amounts to yet, so emit a special ADJDYNALLOC placeholder.
3064  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
3065  SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);
3066 
3067  // Dynamically realign if needed.
3068  if (RequiredAlign > StackAlign) {
3069  Result =
3070  DAG.getNode(ISD::ADD, DL, MVT::i64, Result,
3071  DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
3072  Result =
3073  DAG.getNode(ISD::AND, DL, MVT::i64, Result,
3074  DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
3075  }
3076 
3077  if (StoreBackchain)
3078  Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
3079 
3080  SDValue Ops[2] = { Result, Chain };
3081  return DAG.getMergeValues(Ops, DL);
3082 }
3083 
3084 SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET(
3085  SDValue Op, SelectionDAG &DAG) const {
3086  SDLoc DL(Op);
3087 
3088  return DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
3089 }
3090 
3091 SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
3092  SelectionDAG &DAG) const {
3093  EVT VT = Op.getValueType();
3094  SDLoc DL(Op);
3095  SDValue Ops[2];
3096  if (is32Bit(VT))
3097  // Just do a normal 64-bit multiplication and extract the results.
3098  // We define this so that it can be used for constant division.
3099  lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
3100  Op.getOperand(1), Ops[1], Ops[0]);
3101  else if (Subtarget.hasMiscellaneousExtensions2())
3102  // SystemZISD::SMUL_LOHI returns the low result in the odd register and
3103  // the high result in the even register. ISD::SMUL_LOHI is defined to
3104  // return the low half first, so the results are in reverse order.
3105  lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI,
3106  Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
3107  else {
3108  // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI:
3109  //
3110  // (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
3111  //
3112  // but using the fact that the upper halves are either all zeros
3113  // or all ones:
3114  //
3115  // (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
3116  //
3117  // and grouping the right-hand terms together, since they are quicker to
3118  // compute than the multiplication:
3119  //
3120  // (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
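// (This works because lh is either 0 or all ones: when lh is all ones,
// lh * rl equals -rl and lh & rl equals rl, so the shifted terms match;
// when lh is 0 both vanish. The same reasoning applies to rh.)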
3121  SDValue C63 = DAG.getConstant(63, DL, MVT::i64);
3122  SDValue LL = Op.getOperand(0);
3123  SDValue RL = Op.getOperand(1);
3124  SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
3125  SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
3126  // SystemZISD::UMUL_LOHI returns the low result in the odd register and
3127  // the high result in the even register. ISD::SMUL_LOHI is defined to
3128  // return the low half first, so the results are in reverse order.
3129  lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
3130  LL, RL, Ops[1], Ops[0]);
3131  SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
3132  SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
3133  SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
3134  Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
3135  }
3136  return DAG.getMergeValues(Ops, DL);
3137 }
3138 
3139 SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
3140  SelectionDAG &DAG) const {
3141  EVT VT = Op.getValueType();
3142  SDLoc DL(Op);
3143  SDValue Ops[2];
3144  if (is32Bit(VT))
3145  // Just do a normal 64-bit multiplication and extract the results.
3146  // We define this so that it can be used for constant division.
3147  lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
3148  Op.getOperand(1), Ops[1], Ops[0]);
3149  else
3150  // SystemZISD::UMUL_LOHI returns the low result in the odd register and
3151  // the high result in the even register. ISD::UMUL_LOHI is defined to
3152  // return the low half first, so the results are in reverse order.
3153  lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
3154  Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
3155  return DAG.getMergeValues(Ops, DL);
3156 }
3157 
3158 SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
3159  SelectionDAG &DAG) const {
3160  SDValue Op0 = Op.getOperand(0);
3161  SDValue Op1 = Op.getOperand(1);
3162  EVT VT = Op.getValueType();
3163  SDLoc DL(Op);
3164 
3165  // We use DSGF for 32-bit division. This means the first operand must
3166  // always be 64-bit, and the second operand should be 32-bit whenever
3167  // that is possible, to improve performance.
3168  if (is32Bit(VT))
3169  Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
3170  else if (DAG.ComputeNumSignBits(Op1) > 32)
3171  Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
3172 
3173  // DSG(F) returns the remainder in the even register and the
3174  // quotient in the odd register.
3175  SDValue Ops[2];
3176  lowerGR128Binary(DAG, DL, VT, SystemZISD::SDIVREM, Op0, Op1, Ops[1], Ops[0]);
3177  return DAG.getMergeValues(Ops, DL);
3178 }
3179 
3180 SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
3181  SelectionDAG &DAG) const {
3182  EVT VT = Op.getValueType();
3183  SDLoc DL(Op);
3184 
3185  // DL(G) returns the remainder in the even register and the
3186  // quotient in the odd register.
3187  SDValue Ops[2];
3188  lowerGR128Binary(DAG, DL, VT, SystemZISD::UDIVREM,
3189  Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
3190  return DAG.getMergeValues(Ops, DL);
3191 }
3192 
3193 SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
3194  assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");
3195 
3196  // Get the known-zero masks for each operand.
3197  SDValue Ops[] = {Op.getOperand(0), Op.getOperand(1)};
3198  KnownBits Known[2] = {DAG.computeKnownBits(Ops[0]),
3199  DAG.computeKnownBits(Ops[1])};
3200 
3201  // See if the upper 32 bits of one operand and the lower 32 bits of the
3202  // other are known zero. They are the low and high operands respectively.
3203  uint64_t Masks[] = { Known[0].Zero.getZExtValue(),
3204  Known[1].Zero.getZExtValue() };
3205  unsigned High, Low;
3206  if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff)
3207  High = 1, Low = 0;
3208  else if ((Masks[1] >> 32) == 0xffffffff && uint32_t(Masks[0]) == 0xffffffff)
3209  High = 0, Low = 1;
3210  else
3211  return Op;
3212 
3213  SDValue LowOp = Ops[Low];
3214  SDValue HighOp = Ops[High];
3215 
3216  // If the high part is a constant, we're better off using IILH.
3217  if (HighOp.getOpcode() == ISD::Constant)
3218  return Op;
3219 
3220  // If the low part is a constant that is outside the range of LHI,
3221  // then we're better off using IILF.
3222  if (LowOp.getOpcode() == ISD::Constant) {
3223  int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
3224  if (!isInt<16>(Value))
3225  return Op;
3226  }
3227 
3228  // Check whether the high part is an AND that doesn't change the
3229  // high 32 bits and just masks out low bits. We can skip it if so.
3230  if (HighOp.getOpcode() == ISD::AND &&
3231  HighOp.getOperand(1).getOpcode() == ISD::Constant) {
3232  SDValue HighOp0 = HighOp.getOperand(0);
3233  uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
3234  if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
3235  HighOp = HighOp0;
3236  }
3237 
3238  // Take advantage of the fact that all GR32 operations only change the
3239  // low 32 bits by truncating Low to an i32 and inserting it directly
3240  // using a subreg. The interesting cases are those where the truncation
3241  // can be folded.
3242  SDLoc DL(Op);
3243  SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
3244  return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
3245  MVT::i64, HighOp, Low32);
3246 }
3247 
3248 // Lower SADDO/SSUBO/UADDO/USUBO nodes.
3249 SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
3250  SelectionDAG &DAG) const {
3251  SDNode *N = Op.getNode();
3252  SDValue LHS = N->getOperand(0);
3253  SDValue RHS = N->getOperand(1);
3254  SDLoc DL(N);
3255  unsigned BaseOp = 0;
3256  unsigned CCValid = 0;
3257  unsigned CCMask = 0;
3258 
3259  switch (Op.getOpcode()) {
3260  default: llvm_unreachable("Unknown instruction!");
3261  case ISD::SADDO:
3262  BaseOp = SystemZISD::SADDO;
3263  CCValid = SystemZ::CCMASK_ARITH;
3264  CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
3265  break;
3266  case ISD::SSUBO:
3267  BaseOp = SystemZISD::SSUBO;
3268  CCValid = SystemZ::CCMASK_ARITH;
3269  CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
3270  break;
3271  case ISD::UADDO:
3272  BaseOp = SystemZISD::UADDO;
3273  CCValid = SystemZ::CCMASK_LOGICAL;
3274  CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
3275  break;
3276  case ISD::USUBO:
3277  BaseOp = SystemZISD::USUBO;
3278  CCValid = SystemZ::CCMASK_LOGICAL;
3279  CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
3280  break;
3281  }
3282 
3283  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
3284  SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
3285 
3286  SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
3287  if (N->getValueType(1) == MVT::i1)
3288  SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
3289 
3290  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
3291 }
3292 
3293 // Lower ADDCARRY/SUBCARRY nodes.
3294 SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op,
3295  SelectionDAG &DAG) const {
3296 
3297  SDNode *N = Op.getNode();
3298  MVT VT = N->getSimpleValueType(0);
3299 
3300  // Let legalize expand this if it isn't a legal type yet.
3301  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3302  return SDValue();
3303 
3304  SDValue LHS = N->getOperand(0);
3305  SDValue RHS = N->getOperand(1);
3306  SDValue Carry = Op.getOperand(2);
3307  SDLoc DL(N);
3308  unsigned BaseOp = 0;
3309  unsigned CCValid = 0;
3310  unsigned CCMask = 0;
3311 
3312  switch (Op.getOpcode()) {
3313  default: llvm_unreachable("Unknown instruction!");
3314  case ISD::ADDCARRY:
3315  BaseOp = SystemZISD::ADDCARRY;
3316  CCValid = SystemZ::CCMASK_LOGICAL;
3318  break;
3319  case ISD::SUBCARRY:
3320  BaseOp = SystemZISD::SUBCARRY;
3321  CCValid = SystemZ::CCMASK_LOGICAL;
3322  CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
3323  break;
3324  }
3325 
3326  // Set the condition code from the carry flag.
3327  Carry = DAG.getNode(SystemZISD::GET_CCMASK, DL, MVT::i32, Carry,
3328  DAG.getConstant(CCValid, DL, MVT::i32),
3329  DAG.getConstant(CCMask, DL, MVT::i32));
3330 
3331  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
3332  SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS, Carry);
3333 
3334  SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
3335  if (N->getValueType(1) == MVT::i1)
3336  SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
3337 
3338  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
3339 }
3340 
3341 SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
3342  SelectionDAG &DAG) const {
3343  EVT VT = Op.getValueType();
3344  SDLoc DL(Op);
3345  Op = Op.getOperand(0);
3346 
3347  // Handle vector types via VPOPCT.
3348  if (VT.isVector()) {
3349  Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
3350  Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op);
3351  switch (VT.getScalarSizeInBits()) {
3352  case 8:
3353  break;
3354  case 16: {
3355  Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
3356  SDValue Shift = DAG.getConstant(8, DL, MVT::i32);
3357  SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift);
3358  Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
3359  Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift);
3360  break;
3361  }
3362  case 32: {
3363  SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
3364  DAG.getConstant(0, DL, MVT::i32));
3365  Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
3366  break;
3367  }
3368  case 64: {
3369  SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
3370  DAG.getConstant(0, DL, MVT::i32));
3371  Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp);
3372  Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
3373  break;
3374  }
3375  default:
3376  llvm_unreachable("Unexpected type");
3377  }
3378  return Op;
3379  }
3380 
3381  // Get the known-zero mask for the operand.
3382  KnownBits Known = DAG.computeKnownBits(Op);
3383  unsigned NumSignificantBits = (~Known.Zero).getActiveBits();
3384  if (NumSignificantBits == 0)
3385  return DAG.getConstant(0, DL, VT);
3386 
3387  // Skip known-zero high parts of the operand.
3388  int64_t OrigBitSize = VT.getSizeInBits();
3389  int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits);
3390  BitSize = std::min(BitSize, OrigBitSize);
3391 
3392  // The POPCNT instruction counts the number of bits in each byte.
3393  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
3394  Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
3395  Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
3396 
3397  // Add up per-byte counts in a binary tree. All bits of Op at
3398  // position larger than BitSize remain zero throughout.
3399  for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
3400  SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, DL, VT));
3401  if (BitSize != OrigBitSize)
3402  Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
3403  DAG.getConstant(((uint64_t)1 << BitSize) - 1, DL, VT));
3404  Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
3405  }
3406 
3407  // Extract overall result from high byte.
3408  if (BitSize > 8)
3409  Op = DAG.getNode(ISD::SRL, DL, VT, Op,
3410  DAG.getConstant(BitSize - 8, DL, VT));
3411 
3412  return Op;
3413 }
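
// --- Illustrative sketch; not part of this file. ---
// The scalar lowering above builds, as DAG nodes, the arithmetic shown here
// on a plain 64-bit value: POPCNT produces one count per byte, the counts are
// summed in a binary tree of shift-and-add steps, and the total ends up in
// the most significant byte. popcntPerByte() is a stand-in for the POPCNT
// node; the helper names are ours.
#include <cstdint>

static uint64_t popcntPerByte(uint64_t V) {
  uint64_t R = 0;
  for (unsigned B = 0; B < 64; B += 8) {
    uint64_t Byte = (V >> B) & 0xff, C = 0;
    while (Byte) { C += Byte & 1; Byte >>= 1; }
    R |= C << B;
  }
  return R;
}

static unsigned popcount64(uint64_t V) {
  uint64_t Op = popcntPerByte(V);           // one count per byte
  for (unsigned I = 64 / 2; I >= 8; I /= 2)
    Op = Op + (Op << I);                    // add up per-byte counts
  return unsigned(Op >> (64 - 8));          // total sits in the high byte
}
// --- End of sketch. ---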
3414 
3415 SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
3416  SelectionDAG &DAG) const {
3417  SDLoc DL(Op);
3418  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
3419  cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
3420  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
3421  cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
3422 
3423  // The only fence that needs an instruction is a sequentially-consistent
3424  // cross-thread fence.
3425  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
3426  FenceSSID == SyncScope::System) {
3427  return SDValue(DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other,
3428  Op.getOperand(0)),
3429  0);
3430  }
3431 
3432  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
3433  return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
3434 }
3435 
3436 // Op is an atomic load. Lower it into a normal volatile load.
3437 SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
3438  SelectionDAG &DAG) const {
3439  auto *Node = cast<AtomicSDNode>(Op.getNode());
3440  return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
3441  Node->getChain(), Node->getBasePtr(),
3442  Node->getMemoryVT(), Node->getMemOperand());
3443 }
3444 
3445 // Op is an atomic store. Lower it into a normal volatile store.
3446 SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
3447  SelectionDAG &DAG) const {
3448  auto *Node = cast<AtomicSDNode>(Op.getNode());
3449  SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
3450  Node->getBasePtr(), Node->getMemoryVT(),
3451  Node->getMemOperand());
3452  // We have to enforce sequential consistency by performing a
3453  // serialization operation after the store.
3454  if (Node->getOrdering() == AtomicOrdering::SequentiallyConsistent)
3455  Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
3456  MVT::Other, Chain), 0);
3457  return Chain;
3458 }
3459 
3460 // Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation. Lower the first
3461 // two into the fullword ATOMIC_LOADW_* operation given by Opcode.
3462 SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
3463  SelectionDAG &DAG,
3464  unsigned Opcode) const {
3465  auto *Node = cast<AtomicSDNode>(Op.getNode());
3466 
3467  // 32-bit operations need no code outside the main loop.
3468  EVT NarrowVT = Node->getMemoryVT();
3469  EVT WideVT = MVT::i32;
3470  if (NarrowVT == WideVT)
3471  return Op;
3472 
3473  int64_t BitSize = NarrowVT.getSizeInBits();
3474  SDValue ChainIn = Node->getChain();
3475  SDValue Addr = Node->getBasePtr();
3476  SDValue Src2 = Node->getVal();
3477  MachineMemOperand *MMO = Node->getMemOperand();
3478  SDLoc DL(Node);
3479  EVT PtrVT = Addr.getValueType();
3480 
3481  // Convert atomic subtracts of constants into additions.
3482  if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
3483  if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
3484  Opcode = SystemZISD::ATOMIC_LOADW_ADD;
3485  Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType());
3486  }
3487 
3488  // Get the address of the containing word.
3489  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
3490  DAG.getConstant(-4, DL, PtrVT));
3491 
3492  // Get the number of bits that the word must be rotated left in order
3493  // to bring the field to the top bits of a GR32.
3494  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
3495  DAG.getConstant(3, DL, PtrVT));
3496  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
3497 
3498  // Get the complementing shift amount, for rotating a field in the top
3499  // bits back to its proper position.
3500  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
3501  DAG.getConstant(0, DL, WideVT), BitShift);
3502 
3503  // Extend the source operand to 32 bits and prepare it for the inner loop.
3504  // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
3505  // operations require the source to be shifted in advance. (This shift
3506  // can be folded if the source is constant.) For AND and NAND, the lower
3507  // bits must be set, while for other opcodes they should be left clear.
3508  if (Opcode != SystemZISD::ATOMIC_SWAPW)
3509  Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
3510  DAG.getConstant(32 - BitSize, DL, WideVT));
3511  if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
3512  Opcode == SystemZISD::ATOMIC_LOADW_NAND)
3513  Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
3514  DAG.getConstant(uint32_t(-1) >> BitSize, DL, WideVT));
3515 
3516  // Construct the ATOMIC_LOADW_* node.
3517  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
3518  SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
3519  DAG.getConstant(BitSize, DL, WideVT) };
3520  SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
3521  NarrowVT, MMO);
3522 
3523  // Rotate the result of the final CS so that the field is in the lower
3524  // bits of a GR32, then truncate it.
3525  SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
3526  DAG.getConstant(BitSize, DL, WideVT));
3527  SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
3528 
3529  SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
3530  return DAG.getMergeValues(RetOps, DL);
3531 }
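
// --- Illustrative sketch; not part of this file. ---
// The address arithmetic used by the expansion above, restated on plain
// integers for the big-endian SystemZ layout: the containing word lives at
// Addr & -4, and rotating it left by 8 * (Addr & 3) bits brings the BitSize-
// bit field into the top bits of a GR32. Struct and helper names are ours.
#include <cstdint>

static uint32_t rotl32(uint32_t V, unsigned N) {
  N &= 31;
  return N ? (V << N) | (V >> (32 - N)) : V;
}

struct SubwordField {
  uint64_t AlignedAddr; // address of the containing 32-bit word (Addr & -4)
  unsigned BitShift;    // rotate-left amount that exposes the field
  unsigned NegBitShift; // complementing rotation, back to memory position
};

static SubwordField describeSubwordField(uint64_t Addr) {
  SubwordField F;
  F.AlignedAddr = Addr & ~uint64_t(3);
  F.BitShift = unsigned(Addr << 3) & 31; // 8 * byte offset within the word
  F.NegBitShift = (32 - F.BitShift) & 31;
  return F;
}

// Rotate the containing word so the field occupies the top bits of a GR32,
// which is the form the ATOMIC_LOADW_* loop operates on.
static uint32_t fieldToTopBits(uint32_t ContainingWord, uint64_t Addr) {
  return rotl32(ContainingWord, describeSubwordField(Addr).BitShift);
}
// --- End of sketch. ---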
3532 
3533 // Op is an ATOMIC_LOAD_SUB operation. Lower 8- and 16-bit operations
3534 // into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit
3535 // operations into additions.
3536 SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
3537  SelectionDAG &DAG) const {
3538  auto *Node = cast<AtomicSDNode>(Op.getNode());
3539  EVT MemVT = Node->getMemoryVT();
3540  if (MemVT == MVT::i32 || MemVT == MVT::i64) {
3541  // A full-width operation.
3542  assert(Op.getValueType() == MemVT && "Mismatched VTs");
3543  SDValue Src2 = Node->getVal();
3544  SDValue NegSrc2;
3545  SDLoc DL(Src2);
3546 
3547  if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) {
3548  // Use an addition if the operand is constant and either LAA(G) is
3549  // available or the negative value is in the range of A(G)FHI.
3550  int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
3551  if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
3552  NegSrc2 = DAG.getConstant(Value, DL, MemVT);
3553  } else if (Subtarget.hasInterlockedAccess1())
3554  // Use LAA(G) if available.
3555  NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT),
3556  Src2);
3557 
3558  if (NegSrc2.getNode())
3559  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
3560  Node->getChain(), Node->getBasePtr(), NegSrc2,
3561  Node->getMemOperand());
3562 
3563  // Use the node as-is.
3564  return Op;
3565  }
3566 
3567  return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
3568 }
3569 
3570 // Lower 8/16/32/64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS node.
3571 SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
3572  SelectionDAG &DAG) const {
3573  auto *Node = cast<AtomicSDNode>(Op.getNode());
3574  SDValue ChainIn = Node->getOperand(0);
3575  SDValue Addr = Node->getOperand(1);
3576  SDValue CmpVal = Node->getOperand(2);
3577  SDValue SwapVal = Node->getOperand(3);
3578  MachineMemOperand *MMO = Node->getMemOperand();
3579  SDLoc DL(Node);
3580 
3581  // We have native support for 32-bit and 64-bit compare and swap, but we
3582  // still need to expand extracting the "success" result from the CC.
3583  EVT NarrowVT = Node->getMemoryVT();
3584  EVT WideVT = NarrowVT == MVT::i64 ? MVT::i64 : MVT::i32;
3585  if (NarrowVT == WideVT) {
3586  SDVTList Tys = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
3587  SDValue Ops[] = { ChainIn, Addr, CmpVal, SwapVal };
3588  SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP,
3589  DL, Tys, Ops, NarrowVT, MMO);
3590  SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
3591  SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
3592 
3593  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
3594  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
3595  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
3596  return SDValue();
3597  }
3598 
3599  // Convert 8-bit and 16-bit compare and swap to a loop, implemented
3600  // via a fullword ATOMIC_CMP_SWAPW operation.
3601  int64_t BitSize = NarrowVT.getSizeInBits();
3602  EVT PtrVT = Addr.getValueType();
3603 
3604  // Get the address of the containing word.
3605  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
3606  DAG.getConstant(-4, DL, PtrVT));
3607 
3608  // Get the number of bits that the word must be rotated left in order
3609  // to bring the field to the top bits of a GR32.
3610  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
3611  DAG.getConstant(3, DL, PtrVT));
3612  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
3613 
3614  // Get the complementing shift amount, for rotating a field in the top
3615  // bits back to its proper position.
3616  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
3617  DAG.getConstant(0, DL, WideVT), BitShift);
3618 
3619  // Construct the ATOMIC_CMP_SWAPW node.
3620  SDVTList VTList = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
3621  SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
3622  NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
3623  SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
3624  VTList, Ops, NarrowVT, MMO);
3625  SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
3626  SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);
3627 
3628  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
3629  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
3630  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
3631  return SDValue();
3632 }
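
// --- Illustrative sketch; not part of this file. ---
// What the 8-/16-bit ATOMIC_CMP_SWAPW expansion above amounts to, written
// with std::atomic on a plain aligned word: compare the field in the top
// BitSize bits of the rotated word, splice in the replacement, rotate back,
// and retry the full-word compare-and-swap if an unrelated byte changed
// underneath. The function name and layout assumptions are ours.
#include <atomic>
#include <cstdint>

static bool cmpSwapSubword(std::atomic<uint32_t> &Word, unsigned BitShift,
                           unsigned BitSize, uint32_t Cmp, uint32_t Swap) {
  // Cmp and Swap carry the field in their top BitSize bits.
  uint32_t FieldMask = ~uint32_t(0) << (32 - BitSize);
  uint32_t Old = Word.load();
  for (;;) {
    uint32_t Rot = BitShift ? (Old << BitShift) | (Old >> (32 - BitShift))
                            : Old;
    if ((Rot & FieldMask) != (Cmp & FieldMask))
      return false;                               // field mismatch: CAS fails
    uint32_t NewRot = (Rot & ~FieldMask) | (Swap & FieldMask);
    unsigned Neg = (32 - BitShift) & 31;
    uint32_t New = Neg ? (NewRot << Neg) | (NewRot >> (32 - Neg)) : NewRot;
    if (Word.compare_exchange_weak(Old, New))
      return true;                                // field swapped successfully
  }
}
// --- End of sketch. ---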
3633 
3634 SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
3635  SelectionDAG &DAG) const {
3636  MachineFunction &MF = DAG.getMachineFunction();
3637  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
3638  return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
3639  SystemZ::R15D, Op.getValueType());
3640 }
3641 
3642 SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
3643  SelectionDAG &DAG) const {
3644  MachineFunction &MF = DAG.getMachineFunction();
3645  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
3646  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
3647 
3648  SDValue Chain = Op.getOperand(0);
3649  SDValue NewSP = Op.getOperand(1);
3650  SDValue Backchain;
3651  SDLoc DL(Op);
3652 
3653  if (StoreBackchain) {
3654  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
3655  Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
3656  }
3657 
3658  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, NewSP);
3659 
3660  if (StoreBackchain)
3661  Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
3662 
3663  return Chain;
3664 }
3665 
3666 SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
3667  SelectionDAG &DAG) const {
3668  bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3669  if (!IsData)
3670  // Just preserve the chain.
3671  return Op.getOperand(0);
3672 
3673  SDLoc DL(Op);
3674  bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3675  unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
3676  auto *Node = cast<MemIntrinsicSDNode>(Op.getNode());
3677  SDValue Ops[] = {
3678  Op.getOperand(0),
3679  DAG.getConstant(Code, DL, MVT::i32),
3680  Op.getOperand(1)
3681  };
3682  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL,
3683  Node->getVTList(), Ops,
3684  Node->getMemoryVT(), Node->getMemOperand());
3685 }
3686 
3687 // Convert condition code in CCReg to an i32 value.
3688 static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
3689  SDLoc DL(CCReg);
3690  SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
3691  return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
3692  DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
3693 }
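
// --- Illustrative sketch; not part of this file. ---
// IPM deposits the condition code in bits 29:28 of the 32-bit result, so the
// SRL above leaves CC as a small integer. A scalar restatement with a helper
// name of our choosing:
#include <cstdint>

static unsigned ccFromIPM(uint32_t IPMValue) {
  return (IPMValue >> 28) & 3; // CC in the range 0..3
}
// --- End of sketch. ---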
3694 
3695 SDValue
3696 SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
3697  SelectionDAG &DAG) const {
3698  unsigned Opcode, CCValid;
3699  if (isIntrinsicWithCCAndChain(Op, Opcode, CCValid)) {
3700  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
3701  SDNode *Node = emitIntrinsicWithCCAndChain(DAG, Op, Opcode);
3702  SDValue CC = getCCResult(DAG, SDValue(Node, 0));
3703  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CC);
3704  return SDValue();
3705  }
3706 
3707  return SDValue();
3708 }
3709 
3710 SDValue
3711 SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
3712  SelectionDAG &DAG) const {
3713  unsigned Opcode, CCValid;
3714  if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
3715  SDNode *Node = emitIntrinsicWithCC(DAG, Op, Opcode);
3716  if (Op->getNumValues() == 1)
3717  return getCCResult(DAG, SDValue(Node, 0));
3718  assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
3719  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(),
3720  SDValue(Node, 0), getCCResult(DAG, SDValue(Node, 1)));
3721  }
3722 
3723  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3724  switch (Id) {
3725  case Intrinsic::thread_pointer:
3726  return lowerThreadPointer(SDLoc(Op), DAG);
3727 
3728  case Intrinsic::s390_vpdi:
3729  return DAG.getNode(SystemZISD::PERMUTE_DWORDS, SDLoc(Op), Op.getValueType(),
3730  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3731 
3732  case Intrinsic::s390_vperm:
3733  return DAG.getNode(SystemZISD::PERMUTE, SDLoc(Op), Op.getValueType(),
3734  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3735 
3736  case Intrinsic::s390_vuphb:
3737  case Intrinsic::s390_vuphh:
3738  case Intrinsic::s390_vuphf:
3739  return DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(Op), Op.getValueType(),
3740  Op.getOperand(1));
3741 
3742  case Intrinsic::s390_vuplhb:
3743  case Intrinsic::s390_vuplhh:
3744  case Intrinsic::s390_vuplhf:
3745  return DAG.getNode(SystemZISD::UNPACKL_HIGH, SDLoc(Op), Op.getValueType(),
3746  Op.getOperand(1));
3747 
3748  case Intrinsic::s390_vuplb:
3749  case Intrinsic::s390_vuplhw:
3750  case Intrinsic::s390_vuplf:
3751  return DAG.getNode(SystemZISD::UNPACK_LOW, SDLoc(Op), Op.getValueType(),
3752  Op.getOperand(1));
3753 
3754  case Intrinsic::s390_vupllb:
3755  case Intrinsic::s390_vupllh:
3756  case Intrinsic::s390_vupllf:
3757  return DAG.getNode(SystemZISD::UNPACKL_LOW, SDLoc(Op), Op.getValueType(),
3758  Op.getOperand(1));
3759 
3760  case Intrinsic::s390_vsumb:
3761  case Intrinsic::s390_vsumh:
3762  case Intrinsic::s390_vsumgh:
3763  case Intrinsic::s390_vsumgf:
3764  case Intrinsic::s390_vsumqf:
3765  case Intrinsic::s390_vsumqg:
3766  return DAG.getNode(SystemZISD::VSUM, SDLoc(Op), Op.getValueType(),
3767  Op.getOperand(1), Op.getOperand(2));
3768  }
3769 
3770  return SDValue();
3771 }
3772 
3773 namespace {
3774 // Says that SystemZISD operation Opcode can be used to perform the equivalent
3775 // of a VPERM with permute vector Bytes. If Opcode takes three operands,
3776 // Operand is the constant third operand, otherwise it is the number of
3777 // bytes in each element of the result.
3778 struct Permute {
3779  unsigned Opcode;
3780  unsigned Operand;
3781  unsigned char Bytes[SystemZ::VectorBytes];
3782 };
3783 }
3784 
3785 static const Permute PermuteForms[] = {
3786  // VMRHG
3787  { SystemZISD::MERGE_HIGH, 8,
3788  { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } },
3789  // VMRHF
3790  { SystemZISD::MERGE_HIGH, 4,
3791  { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } },
3792  // VMRHH
3793  { SystemZISD::MERGE_HIGH, 2,
3794  { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } },
3795  // VMRHB
3796  { SystemZISD::MERGE_HIGH, 1,
3797  { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } },
3798  // VMRLG
3799  { SystemZISD::MERGE_LOW, 8,
3800  { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } },
3801  // VMRLF
3802  { SystemZISD::MERGE_LOW, 4,
3803  { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
3804  // VMRLH
3805  { SystemZISD::MERGE_LOW, 2,
3806  { 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
3807  // VMRLB
3808  { SystemZISD::MERGE_LOW, 1,
3809  { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } },
3810  // VPKG
3811  { SystemZISD::PACK, 4,
3812  { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } },
3813  // VPKF
3814  { SystemZISD::PACK, 2,
3815  { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
3816  // VPKH
3817  { SystemZISD::PACK, 1,
3818  { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
3819  // VPDI V1, V2, 4 (low half of V1, high half of V2)
3820  { SystemZISD::PERMUTE_DWORDS, 4,
3821  { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } },
3822  // VPDI V1, V2, 1 (high half of V1, low half of V2)
3823  { SystemZISD::PERMUTE_DWORDS, 1,
3824  { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } }
3825 };
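
// --- Illustrative sketch; not part of this file. ---
// Each table entry above is the byte selector that the named instruction
// implements: selectors 0..15 pick bytes of the first operand, 16..31 bytes
// of the second. For instance, the MERGE_HIGH (VMRH*) rows follow directly
// from the element size in bytes; the generator below (our helper, not
// LLVM's) reproduces them.
#include <array>

static std::array<unsigned char, 16> mergeHighSelectors(unsigned ElemBytes) {
  std::array<unsigned char, 16> Bytes{};
  for (unsigned I = 0; I < 16; ++I) {
    unsigned Elem = I / (2 * ElemBytes);  // which interleaved element pair
    unsigned Byte = I % ElemBytes;        // byte within that element
    bool FromOp1 = (I / ElemBytes) & 1;   // elements alternate between inputs
    Bytes[I] = (unsigned char)(Elem * ElemBytes + Byte + (FromOp1 ? 16 : 0));
  }
  return Bytes;
}
// mergeHighSelectors(8) yields { 0,...,7, 16,...,23 }      (the VMRHG row)
// mergeHighSelectors(1) yields { 0,16, 1,17, ..., 7,23 }   (the VMRHB row)
// --- End of sketch. ---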
3826 
3827 // Called after matching a vector shuffle against a particular pattern.
3828 // Both the original shuffle and the pattern have two vector operands.
3829 // OpNos[0] is the operand of the original shuffle that should be used for
3830 // operand 0 of the pattern, or -1 if operand 0 of the pattern can be anything.
3831 // OpNos[1] is the same for operand 1 of the pattern. Resolve these -1s and
3832 // set OpNo0 and OpNo1 to the shuffle operands that should actually be used
3833 // for operands 0 and 1 of the pattern.
3834 static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) {
3835  if (OpNos[0] < 0) {
3836  if (OpNos[1] < 0)
3837  return false;
3838  OpNo0 = OpNo1 = OpNos[1];
3839  } else if (OpNos[1] < 0) {
3840  OpNo0 = OpNo1 = OpNos[0];
3841  } else {
3842  OpNo0 = OpNos[0];
3843  OpNo1 = OpNos[1];
3844  }
3845  return true;
3846 }
3847 
3848 // Bytes is a VPERM-like permute vector, except that -1 is used for
3849 // undefined bytes. Return true if the VPERM can be implemented using P.
3850 // When returning true set OpNo0 to the VPERM operand that should be
3851 // used for operand 0 of P and likewise OpNo1 for operand 1 of P.
3852 //
3853 // For example, if swapping the VPERM operands allows P to match, OpNo0
3854 // will be 1 and OpNo1 will be 0. If instead Bytes only refers to one
3855 // operand, but rewriting it to use two duplicated operands allows it to
3856 // match P, then OpNo0 and OpNo1 will be the same.
3857 static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P,
3858  unsigned &OpNo0, unsigned &OpNo1) {
3859  int OpNos[] = { -1, -1 };
3860  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
3861  int Elt = Bytes[I];
3862  if (Elt >= 0) {
3863  // Make sure that the two permute vectors use the same suboperand
3864  // byte number. Only the operand numbers (the high bits) are
3865  // allowed to differ.
3866  if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1))
3867  return false;
3868  int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes;
3869  int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes;
3870  // Make sure that the operand mappings are consistent with previous
3871  // elements.
3872  if (OpNos[ModelOpNo] == 1 - RealOpNo)
3873  return false;
3874  OpNos[ModelOpNo] = RealOpNo;
3875  }
3876  }
3877  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
3878 }
3879 
3880 // As above, but search for a matching permute.
3881 static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes,
3882  unsigned &OpNo0, unsigned &OpNo1) {
3883  for (auto &P : PermuteForms)
3884  if (matchPermute(Bytes, P, OpNo0, OpNo1))
3885  return &P;
3886  return nullptr;
3887 }
3888 
3889 // Bytes is a VPERM-like permute vector, except that -1 is used for
3890 // undefined bytes. This permute is an operand of an outer permute.
3891 // See whether redistributing the -1 bytes gives a shuffle that can be
3892 // implemented using P. If so, set Transform to a VPERM-like permute vector
3893 // that, when applied to the result of P, gives the original permute in Bytes.
3894 static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes,
3895  const Permute &P,
3896  SmallVectorImpl<int> &Transform) {
3897  unsigned To = 0;
3898  for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) {
3899  int Elt = Bytes[From];
3900  if (Elt < 0)
3901  // Byte number From of the result is undefined.
3902  Transform[From] = -1;
3903  else {
3904  while (P.Bytes[To] != Elt) {
3905  To += 1;
3906  if (To == SystemZ::VectorBytes)
3907  return false;
3908  }
3909  Transform[From] = To;
3910  }
3911  }
3912  return true;
3913 }
3914 
3915 // As above, but search for a matching permute.
3916 static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
3917  SmallVectorImpl<int> &Transform) {
3918  for (auto &P : PermuteForms)
3919  if (matchDoublePermute(Bytes, P, Transform))
3920  return &P;
3921  return nullptr;
3922 }
3923 
3924 // Convert the mask of the given shuffle op into a byte-level mask,
3925 // as if it had type vNi8.
3926 static bool getVPermMask(SDValue ShuffleOp,
3927  SmallVectorImpl<int> &Bytes) {
3928  EVT VT = ShuffleOp.getValueType();
3929  unsigned NumElements = VT.getVectorNumElements();
3930  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
3931 
3932  if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(ShuffleOp)) {
3933  Bytes.resize(NumElements * BytesPerElement, -1);
3934  for (unsigned I = 0; I < NumElements; ++I) {
3935  int Index = VSN->getMaskElt(I);
3936  if (Index >= 0)
3937  for (unsigned J = 0; J < BytesPerElement; ++J)
3938  Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
3939  }
3940  return true;
3941  }
3942  if (SystemZISD::SPLAT == ShuffleOp.getOpcode() &&
3943  isa<ConstantSDNode>(ShuffleOp.getOperand(1))) {
3944  unsigned Index = ShuffleOp.getConstantOperandVal(1);
3945  Bytes.resize(NumElements * BytesPerElement, -1);
3946  for (unsigned I = 0; I < NumElements; ++I)
3947  for (unsigned J = 0; J < BytesPerElement; ++J)
3948  Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
3949  return true;
3950  }
3951  return false;
3952 }
3953 
3954 // Bytes is a VPERM-like permute vector, except that -1 is used for
3955 // undefined bytes. See whether bytes [Start, Start + BytesPerElement) of
3956 // the result come from a contiguous sequence of bytes from one input.
3957 // Set Base to the selector for the first byte if so.
3958 static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
3959  unsigned BytesPerElement, int &Base) {
3960  Base = -1;
3961  for (unsigned I = 0; I < BytesPerElement; ++I) {
3962  if (Bytes[Start + I] >= 0) {
3963  unsigned Elem = Bytes[Start + I];
3964  if (Base < 0) {
3965  Base = Elem - I;
3966  // Make sure the bytes would come from one input operand.
3967  if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size())
3968  return false;
3969  } else if (unsigned(Base) != Elem - I)
3970  return false;
3971  }
3972  }
3973  return true;
3974 }
3975 
3976 // Bytes is a VPERM-like permute vector, except that -1 is used for
3977 // undefined bytes. Return true if it can be performed using VSLDI.
3978 // When returning true, set StartIndex to the shift amount and OpNo0
3979 // and OpNo1 to the VPERM operands that should be used as the first
3980 // and second shift operand respectively.
3981 static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
3982  unsigned &StartIndex, unsigned &OpNo0,
3983  unsigned &OpNo1) {
3984  int OpNos[] = { -1, -1 };
3985  int Shift = -1;
3986  for (unsigned I = 0; I < 16; ++I) {
3987  int Index = Bytes[I];
3988  if (Index >= 0) {
3989  int ExpectedShift = (Index - I) % SystemZ::VectorBytes;
3990  int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes;
3991  int RealOpNo = unsigned(Index) / SystemZ::VectorBytes;
3992  if (Shift < 0)
3993  Shift = ExpectedShift;
3994  else if (Shift != ExpectedShift)
3995  return false;
3996  // Make sure that the operand mappings are consistent with previous
3997  // elements.
3998  if (OpNos[ModelOpNo] == 1 - RealOpNo)
3999  return false;
4000  OpNos[ModelOpNo] = RealOpNo;
4001  }
4002  }
4003  StartIndex = Shift;
4004  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
4005 }
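
// --- Illustrative sketch; not part of this file. ---
// A SHL_DOUBLE (VSLDI) with shift amount Shift takes byte Shift + I of the
// 32-byte concatenation of its two operands as result byte I; that is the
// pattern the matcher above tests for, letting -1 (undefined) bytes match
// anything. The generator name is ours.
#include <array>

static std::array<int, 16> shlDoubleSelectors(unsigned Shift) {
  std::array<int, 16> Bytes{};
  for (unsigned I = 0; I < 16; ++I)
    Bytes[I] = int(Shift + I); // 0..15: first operand, 16..31: second operand
  return Bytes;
}
// --- End of sketch. ---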
4006 
4007 // Create a node that performs P on operands Op0 and Op1, casting the
4008 // operands to the appropriate type. The type of the result is determined by P.
4009 static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4010  const Permute &P, SDValue Op0, SDValue Op1) {
4011  // VPDI (PERMUTE_DWORDS) always operates on v2i64s. The input
4012  // elements of a PACK are twice as wide as the outputs.
4013  unsigned InBytes = (P.Opcode == SystemZISD::PERMUTE_DWORDS ? 8 :
4014  P.Opcode == SystemZISD::PACK ? P.Operand * 2 :
4015  P.Operand);
4016  // Cast both operands to the appropriate type.
4017  MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8),
4018  SystemZ::VectorBytes / InBytes);
4019  Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0);
4020  Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);
4021  SDValue Op;
4022  if (P.Opcode == SystemZISD::PERMUTE_DWORDS) {
4023  SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32);
4024  Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2);
4025  } else if (P.Opcode == SystemZISD::PACK) {
4026  MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
4027  SystemZ::VectorBytes / P.Operand);
4028  Op = DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1);
4029  } else {
4030  Op = DAG.getNode(P.Opcode, DL, InVT, Op0, Op1);
4031  }
4032  return Op;
4033 }
4034 
4035 // Bytes is a VPERM-like permute vector, except that -1 is used for
4036 // undefined bytes. Implement it on operands Ops[0] and Ops[1] using
4037 // VSLDI or VPERM.
4038 static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4039  SDValue *Ops,
4040  const SmallVectorImpl<int> &Bytes) {
4041  for (unsigned I = 0; I < 2; ++I)
4042  Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]);
4043 
4044  // First see whether VSLDI can be used.
4045  unsigned StartIndex, OpNo0, OpNo1;
4046  if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
4047  return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
4048  Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32));
4049 
4050  // Fall back on VPERM. Construct an SDNode for the permute vector.
4051  SDValue IndexNodes[SystemZ::VectorBytes];
4052  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
4053  if (Bytes[I] >= 0)
4054  IndexNodes[I] = DAG.getConstant(Bytes[I], DL, MVT::i32);
4055  else
4056  IndexNodes[I] = DAG.getUNDEF(MVT::i32);
4057  SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
4058  return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2);
4059 }
4060 
4061 namespace {
4062 // Describes a general N-operand vector shuffle.
4063 struct GeneralShuffle {
4064  GeneralShuffle(EVT vt) : VT(vt) {}
4065  void addUndef();
4066  bool add(SDValue, unsigned);
4067  SDValue getNode(SelectionDAG &, const SDLoc &);
4068 
4069  // The operands of the shuffle.
4070  SmallVector<SDValue, SystemZ::VectorBytes> Ops;
4071 
4072  // Index I is -1 if byte I of the result is undefined. Otherwise the
4073  // result comes from byte Bytes[I] % SystemZ::VectorBytes of operand
4074  // Bytes[I] / SystemZ::VectorBytes.
4075  SmallVector<int, SystemZ::VectorBytes> Bytes;
4076 
4077  // The type of the shuffle result.
4078  EVT VT;
4079 };
4080 }
4081 
4082 // Add an extra undefined element to the shuffle.
4083 void GeneralShuffle::addUndef() {
4084  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
4085  for (unsigned I = 0; I < BytesPerElement; ++I)
4086  Bytes.push_back(-1);
4087 }
4088 
4089 // Add an extra element to the shuffle, taking it from element Elem of Op.
4090 // A null Op indicates a vector input whose value will be calculated later;
4091 // there is at most one such input per shuffle and it always has the same
4092 // type as the result. Aborts and returns false if the source vector elements
4093 // of an EXTRACT_VECTOR_ELT are smaller than the destination elements. Per
4094 // LLVM they become implicitly extended, but this is rare and not optimized.
4095 bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
4096  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
4097 
4098  // The source vector can have wider elements than the result,
4099  // either through an explicit TRUNCATE or because of type legalization.
4100  // We want the least significant part.
4101  EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
4102  unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
4103 
4104  // Return false if the source elements are smaller than their destination
4105  // elements.
4106  if (FromBytesPerElement < BytesPerElement)
4107  return false;
4108 
4109  unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
4110  (FromBytesPerElement - BytesPerElement));
4111 
4112  // Look through things like shuffles and bitcasts.
4113  while (Op.getNode()) {
4114  if (Op.getOpcode() == ISD::BITCAST)
4115  Op = Op.getOperand(0);
4116  else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) {
4117  // See whether the bytes we need come from a contiguous part of one
4118  // operand.
4119  SmallVector<int, SystemZ::VectorBytes> OpBytes;
4120  if (!getVPermMask(Op, OpBytes))
4121  break;
4122  int NewByte;
4123  if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
4124  break;
4125  if (NewByte < 0) {
4126  addUndef();
4127  return true;
4128  }
4129  Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
4130  Byte = unsigned(NewByte) % SystemZ::VectorBytes;
4131  } else if (Op.isUndef()) {
4132  addUndef();
4133  return true;
4134  } else
4135  break;
4136  }
4137 
4138  // Make sure that the source of the extraction is in Ops.
4139  unsigned OpNo = 0;
4140  for (; OpNo < Ops.size(); ++OpNo)
4141  if (Ops[OpNo] == Op)
4142  break;
4143  if (OpNo == Ops.size())
4144  Ops.push_back(Op);
4145 
4146  // Add the element to Bytes.
4147  unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
4148  for (unsigned I = 0; I < BytesPerElement; ++I)
4149  Bytes.push_back(Base + I);
4150 
4151  return true;
4152 }
4153 
4154 // Return SDNodes for the completed shuffle.
4155 SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
4156  assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");
4157 
4158  if (Ops.size() == 0)
4159  return DAG.getUNDEF(VT);
4160 
4161  // Make sure that there are at least two shuffle operands.
4162  if (Ops.size() == 1)
4163  Ops.push_back(DAG.getUNDEF(MVT::v16i8));
4164 
4165  // Create a tree of shuffles, deferring root node until after the loop.
4166  // Try to redistribute the undefined elements of non-root nodes so that
4167  // the non-root shuffles match something like a pack or merge, then adjust
4168  // the parent node's permute vector to compensate for the new order.
4169  // Among other things, this copes with vectors like <2 x i16> that were
4170  // padded with undefined elements during type legalization.
4171  //
4172  // In the best case this redistribution will lead to the whole tree
4173  // using packs and merges. It should rarely be a loss in other cases.
4174  unsigned Stride = 1;
4175  for (; Stride * 2 < Ops.size(); Stride *= 2) {
4176  for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
4177  SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
4178 
4179  // Create a mask for just these two operands.
4180  SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes);
4181  for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
4182  unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes;
4183  unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes;
4184  if (OpNo == I)
4185  NewBytes[J] = Byte;
4186  else if (OpNo == I + Stride)
4187  NewBytes[J] = SystemZ::VectorBytes + Byte;
4188  else
4189  NewBytes[J] = -1;
4190  }
4191  // See if it would be better to reorganize NewMask to avoid using VPERM.
4192  SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes);
4193  if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) {
4194  Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]);
4195  // Applying NewBytesMap to Ops[I] gets back to NewBytes.
4196  for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
4197  if (NewBytes[J] >= 0) {
4198  assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes &&
4199  "Invalid double permute");
4200  Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J];
4201  } else
4202  assert(NewBytesMap[J] < 0 && "Invalid double permute");
4203  }
4204  } else {
4205  // Just use NewBytes on the operands.
4206  Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes);
4207  for (unsigned J = 0; J < SystemZ::VectorBytes; ++J)
4208  if (NewBytes[J] >= 0)
4209  Bytes[J] = I * SystemZ::VectorBytes + J;
4210  }
4211  }
4212  }
4213 
4214  // Now we just have 2 inputs. Put the second operand in Ops[1].
4215  if (Stride > 1) {
4216  Ops[1] = Ops[Stride];
4217  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
4218  if (Bytes[I] >= int(SystemZ::VectorBytes))
4219  Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
4220  }
4221 
4222  // Look for an instruction that can do the permute without resorting
4223  // to VPERM.
4224  unsigned OpNo0, OpNo1;
4225  SDValue Op;
4226  if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
4227  Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
4228  else
4229  Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
4230  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
4231 }
4232 
4233 // Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
4234 static bool isScalarToVector(SDValue Op) {
4235  for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
4236  if (!Op.getOperand(I).isUndef())
4237  return false;
4238  return true;
4239 }
4240 
4241 // Return a vector of type VT that contains Value in the first element.
4242 // The other elements don't matter.
4243 static SDValue buildScalarToVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
4244  SDValue Value) {
4245  // If we have a constant, replicate it to all elements and let the
4246  // BUILD_VECTOR lowering take care of it.
4247  if (Value.getOpcode() == ISD::Constant ||
4248  Value.getOpcode() == ISD::ConstantFP) {
4249  SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Value);
4250  return DAG.getBuildVector(VT, DL, Ops);
4251  }
4252  if (Value.isUndef())
4253  return DAG.getUNDEF(VT);
4254  return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
4255 }
4256 
4257 // Return a vector of type VT in which Op0 is in element 0 and Op1 is in
4258 // element 1. Used for cases in which replication is cheap.
4259 static SDValue buildMergeScalars(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
4260  SDValue Op0, SDValue Op1) {
4261  if (Op0.isUndef()) {
4262  if (Op1.isUndef())
4263  return DAG.getUNDEF(VT);
4264  return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op1);
4265  }
4266  if (Op1.isUndef())
4267  return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0);
4268  return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT,
4269  buildScalarToVector(DAG, DL, VT, Op0),
4270  buildScalarToVector(DAG, DL, VT, Op1));
4271 }
4272 
4273 // Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64
4274 // vector for them.
4275 static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
4276  SDValue Op1) {
4277  if (Op0.isUndef() && Op1.isUndef())
4278  return DAG.getUNDEF(MVT::v2i64);
4279  // If one of the two inputs is undefined then replicate the other one,
4280  // in order to avoid using another register unnecessarily.
4281  if (Op0.isUndef())
4282  Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
4283  else if (Op1.isUndef())
4284  Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
4285  else {
4286  Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
4287  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
4288  }
4289  return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1);
4290 }
4291 
4292 // Try to represent constant BUILD_VECTOR node BVN using a BYTE MASK style
4293 // mask. Store the mask value in Mask on success.
4294 static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
4295  Mask = 0;
4296  EVT ElemVT = BVN->getValueType(0).getVectorElementType();
4297  unsigned BytesPerElement = ElemVT.getStoreSize();
4298  for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) {
4299  SDValue Op = BVN->getOperand(I);
4300  if (!Op.isUndef()) {
4301  uint64_t Value;
4302  if (Op.getOpcode() == ISD::Constant)
4303  Value = cast<ConstantSDNode>(Op)->getZExtValue();
4304  else if (Op.getOpcode() == ISD::ConstantFP)
4305  Value = (cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt()
4306  .getZExtValue());
4307  else
4308  return false;
4309  for (unsigned J = 0; J < BytesPerElement; ++J) {
4310  uint64_t Byte = (Value >> (J * 8)) & 0xff;
4311  if (Byte == 0xff)
4312  Mask |= 1ULL << ((E - I - 1) * BytesPerElement + J);
4313  else if (Byte != 0)
4314  return false;
4315  }
4316  }
4317  }
4318  return true;
4319 }
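
// --- Illustrative sketch; not part of this file. ---
// For a v16i8 constant, the loop above builds the VECTOR GENERATE BYTE MASK
// immediate: the most significant of the 16 mask bits controls result byte 0,
// a set bit stands for 0xff and a clear bit for 0x00, and any other byte
// value makes the constant unrepresentable this way. A standalone
// restatement (the helper name is ours):
#include <array>
#include <cstdint>
#include <optional>

static std::optional<uint16_t>
vgbmImmediateFor(const std::array<uint8_t, 16> &ResultBytes) {
  uint16_t Mask = 0;
  for (unsigned I = 0; I < 16; ++I) {
    if (ResultBytes[I] == 0xff)
      Mask |= uint16_t(1) << (15 - I); // leftmost byte -> highest mask bit
    else if (ResultBytes[I] != 0)
      return std::nullopt;             // not a VGBM-style constant
  }
  return Mask;
}
// --- End of sketch. ---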
4320 
4321 // Try to load a vector constant in which BitsPerElement-bit value Value
4322 // is replicated to fill the vector. VT is the type of the resulting
4323 // constant, which may have elements of a different size from BitsPerElement.
4324 // Return the SDValue of the constant on success, otherwise return
4325 // an empty value.
4326 static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
4327  const SystemZInstrInfo *TII,
4328  const SDLoc &DL, EVT VT, uint64_t Value,
4329  unsigned BitsPerElement) {
4330  // Signed 16-bit values can be replicated using VREPI.
4331  // Mark the constants as opaque or DAGCombiner will convert back to
4332  // BUILD_VECTOR.
4333  int64_t SignedValue = SignExtend64(Value, BitsPerElement);
4334  if (isInt<16>(SignedValue)) {
4335  MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
4336  SystemZ::VectorBits / BitsPerElement);
4337  SDValue Op = DAG.getNode(
4338  SystemZISD::REPLICATE, DL, VecVT,
4339  DAG.getConstant(SignedValue, DL, MVT::i32, false, true /*isOpaque*/));
4340  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
4341  }
4342  // See whether rotating the constant left some N places gives a value that
4343  // is one less than a power of 2 (i.e. all zeros followed by all ones).
4344  // If so we can use VGM.
4345  unsigned Start, End;
4346  if (TII->isRxSBGMask(Value, BitsPerElement, Start, End)) {
4347  // isRxSBGMask returns the bit numbers for a full 64-bit value,
4348  // with 0 denoting 1 << 63 and 63 denoting 1. Convert them to
4349  // bit numbers for an BitsPerElement value, so that 0 denotes
4350  // 1 << (BitsPerElement-1).
4351  Start -= 64 - BitsPerElement;
4352  End -= 64 - BitsPerElement;
4353  MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
4354  SystemZ::VectorBits / BitsPerElement);
4355  SDValue Op = DAG.getNode(
4356  SystemZISD::ROTATE_MASK, DL, VecVT,
4357  DAG.getConstant(Start, DL, MVT::i32, false, true /*isOpaque*/),
4358  DAG.getConstant(End, DL, MVT::i32, false, true /*isOpaque*/));
4359  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
4360  }
4361  return SDValue();
4362 }
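
// --- Illustrative sketch; not part of this file. ---
// The two representability tests behind the code above, restated on plain
// integers: VREPI needs a per-element value that sign-extends from 16 bits,
// and VGM needs the element to be a single (possibly wrapping) run of
// contiguous one bits. Helper names are ours.
#include <cstdint>

static bool fitsVREPI(uint64_t Value, unsigned Bits) {
  int64_t S = int64_t(Value << (64 - Bits)) >> (64 - Bits); // sign-extend
  return S >= -32768 && S <= 32767;
}

static bool fitsVGM(uint64_t Value, unsigned Bits) {
  uint64_t Mask = Bits < 64 ? (uint64_t(1) << Bits) - 1 : ~uint64_t(0);
  Value &= Mask;
  if (Value == 0)
    return false;
  if (Value == Mask)
    return true;
  // A single cyclic run of ones has exactly one run start: the bits where a
  // one follows a zero (cyclically) must form a power of two.
  uint64_t RotL1 = ((Value << 1) | (Value >> (Bits - 1))) & Mask;
  uint64_t RunStarts = Value & ~RotL1;
  return (RunStarts & (RunStarts - 1)) == 0;
}
// --- End of sketch. ---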
4363 
4364 // If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
4365 // better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
4366 // the non-EXTRACT_VECTOR_ELT elements. See if the given BUILD_VECTOR
4367 // would benefit from this representation and return it if so.
4368 static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
4369  BuildVectorSDNode *BVN) {
4370  EVT VT = BVN->getValueType(0);
4371  unsigned NumElements = VT.getVectorNumElements();
4372 
4373  // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation
4374  // on byte vectors. If there are non-EXTRACT_VECTOR_ELT elements that still
4375  // need a BUILD_VECTOR, add an additional placeholder operand for that
4376  // BUILD_VECTOR and store its operands in ResidueOps.
4377  GeneralShuffle GS(VT);
4378  SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps;
4379  bool FoundOne = false;
4380  for (unsigned I = 0; I < NumElements; ++I) {
4381  SDValue Op = BVN->getOperand(I);
4382  if (Op.getOpcode() == ISD::TRUNCATE)
4383  Op = Op.getOperand(0);
4384  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
4385  Op.getOperand(1).getOpcode() == ISD::Constant) {
4386  unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4387  if (!GS.add(Op.getOperand(0), Elem))
4388  return SDValue();
4389  FoundOne = true;
4390  } else if (Op.isUndef()) {
4391  GS.addUndef();
4392  } else {
4393  if (!GS.add(SDValue(), ResidueOps.size()))
4394  return SDValue();
4395  ResidueOps.push_back(BVN->getOperand(I));
4396  }
4397  }
4398 
4399  // Nothing to do if there are no EXTRACT_VECTOR_ELTs.
4400  if (!FoundOne)
4401  return SDValue();
4402 
4403  // Create the BUILD_VECTOR for the remaining elements, if any.
4404  if (!ResidueOps.empty()) {
4405  while (ResidueOps.size() < NumElements)
4406  ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType()));
4407  for (auto &Op : GS.Ops) {
4408  if (!Op.getNode()) {
4409  Op = DAG.getBuildVector(VT, SDLoc(BVN), ResidueOps);
4410  break;
4411  }
4412  }
4413  }
4414  return GS.getNode(DAG, SDLoc(BVN));
4415 }
4416 
4417 // Combine GPR scalar values Elems into a vector of type VT.
4418 static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
4419  SmallVectorImpl<SDValue> &Elems) {
4420  // See whether there is a single replicated value.
4421  SDValue Single;
4422  unsigned int NumElements = Elems.size();
4423  unsigned int Count = 0;
4424  for (auto Elem : Elems) {
4425  if (!Elem.isUndef()) {
4426  if (!Single.getNode())
4427  Single = Elem;
4428  else if (Elem != Single) {
4429  Single = SDValue();
4430  break;
4431  }
4432  Count += 1;
4433  }
4434  }
4435  // There are three cases here:
4436  //
4437  // - if the only defined element is a loaded one, the best sequence
4438  // is a replicating load.
4439  //
4440  // - otherwise, if the only defined element is an i64 value, we will
4441  // end up with the same VLVGP sequence regardless of whether we short-cut
4442  // for replication or fall through to the later code.
4443  //
4444  // - otherwise, if the only defined element is an i32 or smaller value,
4445  // we would need 2 instructions to replicate it: VLVGP followed by VREPx.
4446  // This is only a win if the single defined element is used more than once.
4447  // In other cases we're better off using a single VLVGx.
4448  if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD))
4449  return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
4450 
4451  // If all elements are loads, use VLREP/VLEs (below).
4452  bool AllLoads = true;
4453  for (auto Elem : Elems)
4454  if (Elem.getOpcode() != ISD::LOAD || cast<LoadSDNode>(Elem)->isIndexed()) {
4455  AllLoads = false;
4456  break;
4457  }
4458 
4459  // The best way of building a v2i64 from two i64s is to use VLVGP.
4460  if (VT == MVT::v2i64 && !AllLoads)
4461  return joinDwords(DAG, DL, Elems[0], Elems[1]);
4462 
4463  // Use a 64-bit merge high to combine two doubles.
4464  if (VT == MVT::v2f64 && !AllLoads)
4465  return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
4466 
4467  // Build v4f32 values directly from the FPRs:
4468  //
4469  // <Axxx> <Bxxx> <Cxxx> <Dxxx>
4470  // V V VMRHF
4471  // <ABxx> <CDxx>
4472  // V VMRHG
4473  // <ABCD>
4474  if (VT == MVT::v4f32 && !AllLoads) {
4475  SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
4476  SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
4477  // Avoid unnecessary undefs by reusing the other operand.
4478  if (Op01.isUndef())
4479  Op01 = Op23;
4480  else if (Op23.isUndef())
4481  Op23 = Op01;
4482  // Merging identical replications is a no-op.
4483  if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
4484  return Op01;
4485  Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
4486  Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
4487  SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
4488  DL, MVT::v2i64, Op01, Op23);
4489  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
4490  }
4491 
4492  // Collect the constant terms.
4493  SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
4494  SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
4495 
4496  unsigned NumConstants = 0;
4497  for (unsigned I = 0; I < NumElements; ++I) {
4498  SDValue Elem = Elems[I];
4499  if (Elem.getOpcode() == ISD::Constant ||
4500  Elem.getOpcode() == ISD::ConstantFP) {
4501  NumConstants += 1;
4502  Constants[I] = Elem;
4503  Done[I] = true;
4504  }
4505  }
4506  // If there was at least one constant, fill in the other elements of
4507  // Constants with undefs to get a full vector constant and use that
4508  // as the starting point.
4509  SDValue Result;
4510  SDValue ReplicatedVal;
4511  if (NumConstants > 0) {
4512  for (unsigned I = 0; I < NumElements; ++I)
4513  if (!Constants[I].getNode())
4514  Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
4515  Result = DAG.getBuildVector(VT, DL, Constants);
4516  } else {
4517  // Otherwise try to use VLREP or VLVGP to start the sequence in order to
4518  // avoid a false dependency on any previous contents of the vector
4519  // register.
4520 
4521  // Use a VLREP if at least one element is a load. Make sure to replicate
4522  // the load with the most elements having its value.
4523  std::map<const SDNode*, unsigned> UseCounts;
4524  SDNode *LoadMaxUses = nullptr;
4525  for (unsigned I = 0; I < NumElements; ++I)
4526  if (Elems[I].getOpcode() == ISD::LOAD &&
4527  cast<LoadSDNode>(Elems[I])->isUnindexed()) {
4528  SDNode *Ld = Elems[I].getNode();
4529  UseCounts[Ld]++;
4530  if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
4531  LoadMaxUses = Ld;
4532  }
4533  if (LoadMaxUses != nullptr) {
4534  ReplicatedVal = SDValue(LoadMaxUses, 0);
4535  Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, ReplicatedVal);
4536  } else {
4537  // Try to use VLVGP.
4538  unsigned I1 = NumElements / 2 - 1;
4539  unsigned I2 = NumElements - 1;
4540  bool Def1 = !Elems[I1].isUndef();
4541  bool Def2 = !Elems[I2].isUndef();
4542  if (Def1 || Def2) {
4543  SDValue Elem1 = Elems[Def1 ? I1 : I2];
4544  SDValue Elem2 = Elems[Def2 ? I2 : I1];
4545  Result = DAG.getNode(ISD::BITCAST, DL, VT,
4546  joinDwords(DAG, DL, Elem1, Elem2));
4547  Done[I1] = true;
4548  Done[I2] = true;
4549  } else
4550  Result = DAG.getUNDEF(VT);
4551  }
4552  }
4553 
4554  // Use VLVGx to insert the other elements.
4555  for (unsigned I = 0; I < NumElements; ++I)
4556  if (!Done[I] && !Elems[I].isUndef() && Elems[I] != ReplicatedVal)
4557  Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
4558  DAG.getConstant(I, DL, MVT::i32));
4559  return Result;
4560 }
4561 
4562 SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
4563  SelectionDAG &DAG) const {
4564  const SystemZInstrInfo *TII =
4565  static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
4566  auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
4567  SDLoc DL(Op);
4568  EVT VT = Op.getValueType();
4569 
4570  if (BVN->isConstant()) {
4571  // Try using VECTOR GENERATE BYTE MASK. This is the architecturally-
4572  // preferred way of creating all-zero and all-one vectors so give it
4573  // priority over other methods below.
4574  uint64_t Mask;
4575  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
4576  ISD::isBuildVectorAllOnes(Op.getNode()) ||
4577  (VT.isInteger() && tryBuildVectorByteMask(BVN, Mask)))
4578  return Op;
4579 
4580  // Try using some form of replication.
4581  APInt SplatBits, SplatUndef;
4582  unsigned SplatBitSize;
4583  bool HasAnyUndefs;
4584  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
4585  8, true) &&
4586  SplatBitSize <= 64) {
4587  // First try assuming that any undefined bits above the highest set bit
4588  // and below the lowest set bit are 1s. This increases the likelihood of
4589  // being able to use a sign-extended element value in VECTOR REPLICATE
4590  // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
4591  uint64_t SplatBitsZ = SplatBits.getZExtValue();
4592  uint64_t SplatUndefZ = SplatUndef.getZExtValue();
4593  uint64_t Lower = (SplatUndefZ
4594  & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
4595  uint64_t Upper = (SplatUndefZ
4596  & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
4597  uint64_t Value = SplatBitsZ | Upper | Lower;
4598  SDValue Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value,
4599  SplatBitSize);
4600  if (Op.getNode())
4601  return Op;
4602 
4603  // Now try assuming that any undefined bits between the first and
4604  // last defined set bits are set. This increases the chances of
4605  // using a non-wraparound mask.
4606  uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
4607  Value = SplatBitsZ | Middle;
4608  Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, SplatBitSize);
4609  if (Op.getNode())
4610  return Op;
4611  }
4612 
4613  // Fall back to loading it from memory.
4614  return SD