//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to a X86 dag.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <stdint.h>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
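  /// For example, the operand "8(%rbx,%rcx,4)" corresponds to Base_Reg = %rbx,
  /// IndexReg = %rcx, Scale = 4 and Disp = 8, with the register leaves held as
  /// SDValues rather than register numbers.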
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex;

    unsigned Scale;
    SDValue IndexReg;
    int32_t Disp;
    SDValue Segment;
    const GlobalValue *GV;
    const Constant *CP;
    const BlockAddress *BlockAddr;
    const char *ES;
    MCSymbol *MCSym;
    int JT;
    unsigned Align;             // CP alignment.
    unsigned char SymbolFlags;  // X86II::MO_*
    bool NegateIndex = false;

    X86ISelAddressMode()
        : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
          Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
          MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}

    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Align << '\n';
    }
#endif
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for code size instead of
    /// performance.
    bool OptForSize;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    StringRef getPassName() const override {
      return "X86 DAG->DAG Instruction Selection";
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForSize = MF.getFunction().hasOptSize();
      OptForMinSize = MF.getFunction().hasMinSize();
      assert((!OptForMinSize || OptForSize) &&
             "OptForMinSize implies OptForSize");

      SelectionDAGISel::runOnMachineFunction(MF);
      return true;
    }

    void EmitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
                          SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
                             SDValue &Base, SDValue &Scale,
                             SDValue &Index, SDValue &Disp,
                             SDValue &Segment,
                             SDValue &NodeWithChain);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);

      Scale = getI8Imm(AM.Scale, DL);

      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                             AM.Align, AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "Unexpected symbol flags with MCSym.");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
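    // For example, at -Os a constant that feeds several adds in one block:
    //   addl $0x12345678, (%rdi)
    //   addl $0x12345678, (%rsi)
    // is smaller when materialized once in a register and reused, rather than
    // being encoded as a 4-byte immediate in each instruction.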
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!OptForSize)
        return false;

      // Walk all the users of the immediate.
      for (SDNode::use_iterator UI = N->use_begin(),
           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {

        SDNode *User = *UI;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above). Those instructions won't
        // match in ISEL, for now, and would be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this can match to INC/DEC, don't count it as a use.
        if (User->getOpcode() == ISD::ADD &&
            (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD    ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }

    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    // Helper to detect unneeded 'and' instructions on shift amounts. Called
    // from PatFrags in tablegen.
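    // For example, for an i32 shift Width is 5: in (srl X, (and Y, 31)) the
    // hardware already masks the shift amount to 5 bits, so an 'and' whose
    // mask has at least 5 trailing ones is unneeded.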
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();

      if (Val.countTrailingOnes() >= Width)
        return true;

      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
      return Mask.countTrailingOnes() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    /// Returns whether this is a relocatable immediate in the range
    /// [-2^Width .. 2^Width-1].
    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
      if (auto *CN = dyn_cast<ConstantSDNode>(N))
        return isInt<Width>(CN->getSExtValue());
      return isSExtAbsoluteSymbolRef(Width, N);
    }

    // Indicates we should prefer to use a non-temporal load for this load.
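    // MOVNTDQA-style non-temporal loads require SSE4.1 for 16-byte, AVX2 for
    // 32-byte and AVX512 for 64-byte accesses, which is what the feature
    // checks below test for.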
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      if (N->getAlignment() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }

    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool combineIncDecVector(SDNode *Node);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InFlag);

    bool tryOptimizeRem8Extend(SDNode *N);

    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
  };
}

// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // zero extended.
    EVT OpVT = N->getOperand(0).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;

  return false;
}

// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}

bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;

  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::ADDCARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In the case where the increment is 1,
      // the saving can be 4 bytes (by using incl %eax).
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed
        // to make sure immediates created by shrinkAndImmediate are always
        // folded. Ideally we would narrow the load during DAG combine and get
        // the best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;

        // If this is really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign extended 8 bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;

  return true;
}

/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}

/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}

void X86DAGToDAGISel::PreprocessISelDAG() {
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }

    switch (N->getOpcode()) {
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
      case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector() ||
          N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
        break;

      unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                            ? ISD::ZERO_EXTEND
                            : ISD::ZERO_EXTEND_VECTOR_INREG;

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::FCEIL:
    case ISD::FFLOOR:
    case ISD::FTRUNC:
    case ISD::FNEARBYINT:
    case ISD::FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::FRINT:      Imm = 0x4; break;
      }
      SDLoc dl(N);
      SDValue Res = CurDAG->getNode(
          X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0),
          CurDAG->getTargetConstant(Imm, dl, MVT::i8));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine?
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(0));
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        }
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
      } else {
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      }
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    }

    if (OptLevel != CodeGenOpt::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useRetpolineIndirectCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load  = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be a store
    // and a load to the stack. This is a gross hack. We would like to simply
    // mark these as being illegal, but when we do that, legalize produces
    // these when it expands calls, then expands these in the same legalize
    // pass. We would like dag combine to be able to hack on these between the
    // call expansion and the node legalization. As such this pass basically
    // does "really late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT;
      if (N->getOpcode() == ISD::FP_ROUND)
        MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      else
        MemVT = SrcIsSSE ? SrcVT : DstVT;

      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
                                            MemTmp, MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
      break;
    }

    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT;
      if (N->getOpcode() == ISD::STRICT_FP_ROUND)
        MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      else
        MemVT = SrcIsSSE ? SrcVT : DstVT;

      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      // Since the operation is StrictFP, use the preexisting chain.
      SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
                                            MemTmp, MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
      break;
    }
    }

    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    CurDAG->DeleteNode(N);
  }

  // The load+call transform above can leave some dead nodes in the graph. Make
  // sure we remove them. It's possible some of the other transforms do too, so
  // just remove dead nodes unconditionally.
  CurDAG->RemoveDeadNodes();
}

// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
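// For example, reading the remainder of an i8 divrem extracts AH via a
// MOVZX32rr8_NOREX/MOVSX32rr8_NOREX, so a later movzx/movsx of the extracted
// 8-bit value repeats an extension that has already been done.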
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the low byte of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits, we still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(N, Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
  }

  return true;
}

void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOpt::None)
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (tryOptimizeRem8Extend(N)) {
      MadeChange = true;
      continue;
    }

    // Look for a TESTrr+ANDrr pattern where both operands of the test are
    // the same. Rewrite to remove the AND.
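    // For example:
    //   %1 = AND32rr %x, %y
    //   TEST32rr %1, %1
    // can become "TEST32rr %x, %y" when %1 has no other uses, since TEST sets
    // the same flags on the AND of its operands.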
    unsigned Opc = N->getMachineOpcode();
    if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
         Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode()) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
          N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
        MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
                                                     MVT::i32,
                                                     And.getOperand(0),
                                                     And.getOperand(1));
        ReplaceUses(N, Test);
        MadeChange = true;
        continue;
      }
      if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
          N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
        unsigned NewOpc;
        switch (N0Opc) {
        case X86::AND8rm:  NewOpc = X86::TEST8mr;  break;
        case X86::AND16rm: NewOpc = X86::TEST16mr; break;
        case X86::AND32rm: NewOpc = X86::TEST32mr; break;
        case X86::AND64rm: NewOpc = X86::TEST64mr; break;
        }

        // Need to swap the memory and register operand.
        SDValue Ops[] = { And.getOperand(1),
                          And.getOperand(2),
                          And.getOperand(3),
                          And.getOperand(4),
                          And.getOperand(5),
                          And.getOperand(0),
                          And.getOperand(6)  /* Chain */ };
        MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                     MVT::i32, MVT::Other, Ops);
        ReplaceUses(N, Test);
        MadeChange = true;
        continue;
      }
    }

    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
    // used. We're doing this late so we can prefer to fold the AND into masked
    // comparisons. Doing that can be better for the live range of the mask
    // register.
    if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
         Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode() &&
        onlyUsesZeroFlag(SDValue(N, 0))) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
      // KAND instructions and KTEST use the same ISA feature.
      if (N0Opc == X86::KANDBrr ||
          (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
          N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
        unsigned NewOpc;
        switch (Opc) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
        case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
        case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
        case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
        }
        MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                      MVT::i32,
                                                      And.getOperand(0),
                                                      And.getOperand(1));
        ReplaceUses(N, KTest);
        MadeChange = true;
        continue;
      }
    }

    // Attempt to remove vector moves that were inserted to zero upper bits.
    if (Opc != TargetOpcode::SUBREG_TO_REG)
      continue;

    unsigned SubRegIdx = N->getConstantOperandVal(2);
    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
      continue;

    SDValue Move = N->getOperand(1);
    if (!Move.isMachineOpcode())
      continue;

    // Make sure it's one of the move opcodes we recognize.
    switch (Move.getMachineOpcode()) {
    default:
      continue;
    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
      break;
    }

    SDValue In = Move.getOperand(0);
    if (!In.isMachineOpcode() ||
        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
      continue;

    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
    // the SHA instructions which use a legacy encoding.
    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
        (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
        (TSFlags & X86II::EncodingMask) != X86II::XOP)
      continue;

    // The producing instruction is another vector instruction. We can drop
    // the move.
    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
    MadeChange = true;
  }

  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}

/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
  if (Subtarget->isTargetCygMing()) {
    TargetLowering::ArgListTy Args;
    auto &DL = CurDAG->getDataLayout();

    TargetLowering::CallLoweringInfo CLI(*CurDAG);
    CLI.setChain(CurDAG->getRoot())
        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
                   std::move(Args));
    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
    CurDAG->setRoot(Result.second);
  }
}

void X86DAGToDAGISel::EmitFunctionEntryCode() {
  // If this is main, emit special code for main.
  const Function &F = MF->getFunction();
  if (F.hasExternalLinkage() && F.getName() == "main")
    emitSpecialCodeForMain();
}

static bool isDispSafeForFrameIndex(int64_t Val) {
  // On 64-bit platforms, we can run into an issue where a frame index
  // includes a displacement that, when added to the explicit displacement,
  // will overflow the displacement field. Assuming that the frame index
  // displacement fits into a 31-bit integer (which is only slightly more
  // aggressive than the current fundamental assumption that it fits into
  // a 32-bit integer), a 31-bit disp should always be safe.
  return isInt<31>(Val);
}

bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // If there's no offset to fold, we don't need to do any work.
  if (Offset == 0)
    return false;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (AM.ES || AM.MCSym)
    return true;

  int64_t Val = AM.Disp + Offset;
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (!X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
  }
  AM.Disp = Val;
  return false;
}

bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
        !IndirectTlsSegRefs &&
        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
         Subtarget->isTargetFuchsia()))
      switch (N->getPointerInfo().getAddrSpace()) {
      case 256:
        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
        return false;
      case 257:
        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
        return false;
      // Address space 258 is not handled here, because it is not used to
      // address TLS areas.
      }

  return true;
}

/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() &&
      ((M == CodeModel::Large && !IsRIPRelTLS) ||
       (M == CodeModel::Medium && !IsRIPRel)))
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Align = CP->getAlignment();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}

/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  // TODO: Which other code models can use this?
  switch (TM.getCodeModel()) {
  default: break;
  case CodeModel::Small:
  case CodeModel::Kernel:
    if (Subtarget->is64Bit() &&
        AM.Scale == 1 &&
        AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr &&
        AM.SymbolFlags == X86II::MO_NO_FLAG &&
        AM.hasSymbolicDisplacement())
      AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
    break;
  }

  return false;
}

bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
    return false;
  AM = Backup;

  // Try again after commuting the operands.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);
    AM.Scale = 1;
    return false;
  }
  N = Handle.getValue();
  return true;
}

// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());
  }
}

// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
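// For example, with C1 == 2: "(X >> 6) & 0x3fc" becomes
// "((X >> 8) & 0xff) << 2", i.e. an h-register extract with Scale == 4.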
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse())
    return true;

  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, Eight);
  insertDAGNode(DAG, N, Srl);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, And);
  insertDAGNode(DAG, N, ShlCount);
  insertDAGNode(DAG, N, Shl);
  DAG.ReplaceAllUsesWith(N, Shl);
  DAG.RemoveDeadNode(N.getNode());
  AM.IndexReg = And;
  AM.Scale = (1 << ScaleLog);
  return false;
}
1606 
1607 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1608 // allows us to fold the shift into this addressing mode. Returns false if the
1609 // transform succeeded.
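// For illustration: (X << 3) & 0xf8 is rewritten to (X & 0x1f) << 3; the
// narrow AND stays in the data path and the << 3 is absorbed below as an
// addressing-mode scale of 8.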
1610 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
1611  X86ISelAddressMode &AM) {
1612  SDValue Shift = N.getOperand(0);
1613 
1614  // Use a signed mask so that shifting right will insert sign bits. These
1615  // bits will be removed when we shift the result left so it doesn't matter
1616  // what we use. This might allow a smaller immediate encoding.
1617  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
1618 
1619  // If we have an any_extend feeding the AND, look through it to see if there
1620  // is a shift behind it. But only if the AND doesn't use the extended bits.
1621  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
1622  bool FoundAnyExtend = false;
1623  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
1624  Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
1625  isUInt<32>(Mask)) {
1626  FoundAnyExtend = true;
1627  Shift = Shift.getOperand(0);
1628  }
1629 
1630  if (Shift.getOpcode() != ISD::SHL ||
1631  !isa<ConstantSDNode>(Shift.getOperand(1)))
1632  return true;
1633 
1634  SDValue X = Shift.getOperand(0);
1635 
1636  // Not likely to be profitable if either the AND or SHIFT node has more
1637  // than one use (unless all uses are for address computation). Besides,
1638  // isel mechanism requires their node ids to be reused.
1639  if (!N.hasOneUse() || !Shift.hasOneUse())
1640  return true;
1641 
1642  // Verify that the shift amount is something we can fold.
1643  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1644  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
1645  return true;
1646 
1647  MVT VT = N.getSimpleValueType();
1648  SDLoc DL(N);
1649  if (FoundAnyExtend) {
1650  SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
1651  insertDAGNode(DAG, N, NewX);
1652  X = NewX;
1653  }
1654 
1655  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
1656  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
1657  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
1658 
1659  // Insert the new nodes into the topological ordering. We must do this in
1660  // a valid topological ordering as nothing is going to go back and re-sort
1661  // these nodes. We continually insert before 'N' in sequence as this is
1662  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1663  // hierarchy left to express.
1664  insertDAGNode(DAG, N, NewMask);
1665  insertDAGNode(DAG, N, NewAnd);
1666  insertDAGNode(DAG, N, NewShift);
1667  DAG.ReplaceAllUsesWith(N, NewShift);
1668  DAG.RemoveDeadNode(N.getNode());
1669 
1670  AM.Scale = 1 << ShiftAmt;
1671  AM.IndexReg = NewAnd;
1672  return false;
1673 }
1674 
1675 // Implement some heroics to detect shifts of masked values where the mask can
1676 // be replaced by extending the shift and undoing that in the addressing mode
1677 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1678 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1679 // the addressing mode. This results in code such as:
1680 //
1681 // int f(short *y, int *lookup_table) {
1682 // ...
1683 // return *y + lookup_table[*y >> 11];
1684 // }
1685 //
1686 // Turning into:
1687 // movzwl (%rdi), %eax
1688 // movl %eax, %ecx
1689 // shrl $11, %ecx
1690 // addl (%rsi,%rcx,4), %eax
1691 //
1692 // Instead of:
1693 // movzwl (%rdi), %eax
1694 // movl %eax, %ecx
1695 // shrl $9, %ecx
1696 // andl $124, %ecx
1697 // addl (%rsi,%rcx), %eax
1698 //
1699 // Note that this function assumes the mask is provided as a mask *after* the
1700 // value is shifted. The input chain may or may not match that, but computing
1701 // such a mask is trivial.
1702 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1703  uint64_t Mask,
1704  SDValue Shift, SDValue X,
1705  X86ISelAddressMode &AM) {
1706  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1707  !isa<ConstantSDNode>(Shift.getOperand(1)))
1708  return true;
1709 
1710  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1711  unsigned MaskLZ = countLeadingZeros(Mask);
1712  unsigned MaskTZ = countTrailingZeros(Mask);
1713 
1714  // The amount of shift we're trying to fit into the addressing mode is taken
1715  // from the trailing zeros of the mask.
1716  unsigned AMShiftAmt = MaskTZ;
1717 
1718  // There is nothing we can do here unless the mask is removing some bits.
1719  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1720  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1721 
1722  // We also need to ensure that the mask is a contiguous run of bits.
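 // For example, Mask == 0x7f8 passes: countTrailingOnes(0xff) == 8 plus
 // MaskTZ == 3 plus MaskLZ == 53 sums to 64, while a gappy mask such as
 // 0x5f8 sums to only 62 and is rejected.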
1723  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
1724 
1725  // Scale the leading zero count down based on the actual size of the value.
1726  // Also scale it down based on the size of the shift.
1727  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1728  if (MaskLZ < ScaleDown)
1729  return true;
1730  MaskLZ -= ScaleDown;
1731 
1732  // The final check is to ensure that any masked out high bits of X are
1733  // already known to be zero. Otherwise, the mask has a semantic impact
1734  // other than masking out a couple of low bits. Unfortunately, because of
1735  // the mask, zero extensions will be removed from operands in some cases.
1736  // This code works extra hard to look through extensions because we can
1737  // replace them with zero extensions cheaply if necessary.
1738  bool ReplacingAnyExtend = false;
1739  if (X.getOpcode() == ISD::ANY_EXTEND) {
1740  unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1741  X.getOperand(0).getSimpleValueType().getSizeInBits();
1742  // Assume that we'll replace the any-extend with a zero-extend, and
1743  // narrow the search to the extended value.
1744  X = X.getOperand(0);
1745  MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1746  ReplacingAnyExtend = true;
1747  }
1748  APInt MaskedHighBits =
1749  APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1750  KnownBits Known = DAG.computeKnownBits(X);
1751  if (MaskedHighBits != Known.Zero) return true;
1752 
1753  // We've identified a pattern that can be transformed into a single shift
1754  // and an addressing mode. Make it so.
1755  MVT VT = N.getSimpleValueType();
1756  if (ReplacingAnyExtend) {
1757  assert(X.getValueType() != VT);
1758  // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1759  SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1760  insertDAGNode(DAG, N, NewX);
1761  X = NewX;
1762  }
1763  SDLoc DL(N);
1764  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1765  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1766  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1767  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1768 
1769  // Insert the new nodes into the topological ordering. We must do this in
1770  // a valid topological ordering as nothing is going to go back and re-sort
1771  // these nodes. We continually insert before 'N' in sequence as this is
1772  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1773  // hierarchy left to express.
1774  insertDAGNode(DAG, N, NewSRLAmt);
1775  insertDAGNode(DAG, N, NewSRL);
1776  insertDAGNode(DAG, N, NewSHLAmt);
1777  insertDAGNode(DAG, N, NewSHL);
1778  DAG.ReplaceAllUsesWith(N, NewSHL);
1779  DAG.RemoveDeadNode(N.getNode());
1780 
1781  AM.Scale = 1 << AMShiftAmt;
1782  AM.IndexReg = NewSRL;
1783  return false;
1784 }
1785 
1786 // Transform "(X >> SHIFT) & (MASK << C1)" to
1787 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1788 // matched to a BEXTR later. Returns false if the simplification is performed.
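// For illustration, with SHIFT == 4, MASK == 0xff, and C1 == 2:
// (X >> 4) & 0x3fc is rewritten to ((X >> 6) & 0xff) << 2; the shift-and-mask
// becomes a BEXTR candidate and the << 2 becomes a scale of 4.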
1789 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1790  uint64_t Mask,
1791  SDValue Shift, SDValue X,
1792  X86ISelAddressMode &AM,
1793  const X86Subtarget &Subtarget) {
1794  if (Shift.getOpcode() != ISD::SRL ||
1795  !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1796  !Shift.hasOneUse() || !N.hasOneUse())
1797  return true;
1798 
1799  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1800  if (!Subtarget.hasTBM() &&
1801  !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
1802  return true;
1803 
1804  // We need to ensure that the mask is a contiguous run of bits.
1805  if (!isShiftedMask_64(Mask)) return true;
1806 
1807  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1808 
1809  // The amount of shift we're trying to fit into the addressing mode is taken
1810  // from the trailing zeros of the mask.
1811  unsigned AMShiftAmt = countTrailingZeros(Mask);
1812 
1813  // There is nothing we can do here unless the mask is removing some bits.
1814  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1815  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1816 
1817  MVT VT = N.getSimpleValueType();
1818  SDLoc DL(N);
1819  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1820  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1821  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
1822  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
1823  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1824  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
1825 
1826  // Insert the new nodes into the topological ordering. We must do this in
1827  // a valid topological ordering as nothing is going to go back and re-sort
1828  // these nodes. We continually insert before 'N' in sequence as this is
1829  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1830  // hierarchy left to express.
1831  insertDAGNode(DAG, N, NewSRLAmt);
1832  insertDAGNode(DAG, N, NewSRL);
1833  insertDAGNode(DAG, N, NewMask);
1834  insertDAGNode(DAG, N, NewAnd);
1835  insertDAGNode(DAG, N, NewSHLAmt);
1836  insertDAGNode(DAG, N, NewSHL);
1837  DAG.ReplaceAllUsesWith(N, NewSHL);
1838  DAG.RemoveDeadNode(N.getNode());
1839 
1840  AM.Scale = 1 << AMShiftAmt;
1841  AM.IndexReg = NewAnd;
1842  return false;
1843 }
1844 
1845 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
1846  unsigned Depth) {
1847  SDLoc dl(N);
1848  LLVM_DEBUG({
1849  dbgs() << "MatchAddress: ";
1850  AM.dump(CurDAG);
1851  });
1852  // Limit recursion.
1853  if (Depth > 5)
1854  return matchAddressBase(N, AM);
1855 
1856  // If this is already a %rip relative address, we can only merge immediates
1857  // into it. Instead of handling this in every case, we handle it here.
1858  // RIP relative addressing: %rip + 32-bit displacement!
1859  if (AM.isRIPRelative()) {
1860  // FIXME: JumpTable and ExternalSymbol address currently don't like
1861  // displacements. It isn't very important, but this should be fixed for
1862  // consistency.
1863  if (!(AM.ES || AM.MCSym) && AM.JT != -1)
1864  return true;
1865 
1866  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
1867  if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
1868  return false;
1869  return true;
1870  }
1871 
1872  switch (N.getOpcode()) {
1873  default: break;
1874  case ISD::LOCAL_RECOVER: {
1875  if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
1876  if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
1877  // Use the symbol and don't prefix it.
1878  AM.MCSym = ESNode->getMCSymbol();
1879  return false;
1880  }
1881  break;
1882  }
1883  case ISD::Constant: {
1884  uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1885  if (!foldOffsetIntoAddress(Val, AM))
1886  return false;
1887  break;
1888  }
1889 
1890  case X86ISD::Wrapper:
1891  case X86ISD::WrapperRIP:
1892  if (!matchWrapper(N, AM))
1893  return false;
1894  break;
1895 
1896  case ISD::LOAD:
1897  if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
1898  return false;
1899  break;
1900 
1901  case ISD::FrameIndex:
1902  if (AM.BaseType == X86ISelAddressMode::RegBase &&
1903  AM.Base_Reg.getNode() == nullptr &&
1904  (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
1905  AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1906  AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1907  return false;
1908  }
1909  break;
1910 
1911  case ISD::SHL:
1912  if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
1913  break;
1914 
1915  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1916  unsigned Val = CN->getZExtValue();
1917  // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1918  // that the base operand remains free for further matching. If
1919  // the base doesn't end up getting used, a post-processing step
1920  // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1921  if (Val == 1 || Val == 2 || Val == 3) {
1922  AM.Scale = 1 << Val;
1923  SDValue ShVal = N.getOperand(0);
1924 
1925  // Okay, we know that we have a scale by now. However, if the scaled
1926  // value is an add of something and a constant, we can fold the
1927  // constant into the disp field here.
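 // For example, (shl (add %x, 8), 2) is matched as IndexReg = %x,
 // Scale = 4, Disp = 32.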
1928  if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1929  AM.IndexReg = ShVal.getOperand(0);
1930  ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
1931  uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1932  if (!foldOffsetIntoAddress(Disp, AM))
1933  return false;
1934  }
1935 
1936  AM.IndexReg = ShVal;
1937  return false;
1938  }
1939  }
1940  break;
1941 
1942  case ISD::SRL: {
1943  // Scale must not be used already.
1944  if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
1945 
1946  // We only handle up to 64-bit values here as those are what matter for
1947  // addressing mode optimizations.
1948  assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
1949  "Unexpected value size!");
1950 
1951  SDValue And = N.getOperand(0);
1952  if (And.getOpcode() != ISD::AND) break;
1953  SDValue X = And.getOperand(0);
1954 
1955  // The mask used for the transform is expected to be post-shift, but we
1956  // found the shift first so just apply the shift to the mask before passing
1957  // it down.
1958  if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1959  !isa<ConstantSDNode>(And.getOperand(1)))
1960  break;
1961  uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1962 
1963  // Try to fold the mask and shift into the scale, and return false if we
1964  // succeed.
1965  if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1966  return false;
1967  break;
1968  }
1969 
1970  case ISD::SMUL_LOHI:
1971  case ISD::UMUL_LOHI:
1972  // A mul_lohi where we need the low part can be folded as a plain multiply.
1973  if (N.getResNo() != 0) break;
1974  LLVM_FALLTHROUGH;
1975  case ISD::MUL:
1976  case X86ISD::MUL_IMM:
1977  // X*[3,5,9] -> X+X*[2,4,8]
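 // e.g. X*9 can be selected as "lea (%reg,%reg,8), %dst" with X in %reg.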
1978  if (AM.BaseType == X86ISelAddressMode::RegBase &&
1979  AM.Base_Reg.getNode() == nullptr &&
1980  AM.IndexReg.getNode() == nullptr) {
1981  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
1982  if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
1983  CN->getZExtValue() == 9) {
1984  AM.Scale = unsigned(CN->getZExtValue())-1;
1985 
1986  SDValue MulVal = N.getOperand(0);
1987  SDValue Reg;
1988 
1989  // Okay, we know that we have a scale by now. However, if the scaled
1990  // value is an add of something and a constant, we can fold the
1991  // constant into the disp field here.
1992  if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
1993  isa<ConstantSDNode>(MulVal.getOperand(1))) {
1994  Reg = MulVal.getOperand(0);
1995  ConstantSDNode *AddVal =
1996  cast<ConstantSDNode>(MulVal.getOperand(1));
1997  uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1998  if (foldOffsetIntoAddress(Disp, AM))
1999  Reg = N.getOperand(0);
2000  } else {
2001  Reg = N.getOperand(0);
2002  }
2003 
2004  AM.IndexReg = AM.Base_Reg = Reg;
2005  return false;
2006  }
2007  }
2008  break;
2009 
2010  case ISD::SUB: {
2011  // Given A-B, if A can be completely folded into the address while
2012  // leaving the index field unused, use -B as the index.
2013  // This is a win if A has multiple parts that can be folded into
2014  // the address. It also saves a mov if the base register has
2015  // other uses, since it avoids a two-address sub instruction; however,
2016  // it costs an additional mov if the index register has other uses.
2017 
2018  // Add an artificial use to this node so that we can keep track of
2019  // it if it gets CSE'd with a different node.
2020  HandleSDNode Handle(N);
2021 
2022  // Test if the LHS of the sub can be folded.
2023  X86ISelAddressMode Backup = AM;
2024  if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2025  N = Handle.getValue();
2026  AM = Backup;
2027  break;
2028  }
2029  N = Handle.getValue();
2030  // Test if the index field is free for use.
2031  if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2032  AM = Backup;
2033  break;
2034  }
2035 
2036  int Cost = 0;
2037  SDValue RHS = N.getOperand(1);
2038  // If the RHS involves a register with multiple uses, this
2039  // transformation incurs an extra mov, due to the neg instruction
2040  // clobbering its operand.
2041  if (!RHS.getNode()->hasOneUse() ||
2042  RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2043  RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2044  RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2045  (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2046  RHS.getOperand(0).getValueType() == MVT::i32))
2047  ++Cost;
2048  // If the base is a register with multiple uses, this
2049  // transformation may save a mov.
2050  if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2051  !AM.Base_Reg.getNode()->hasOneUse()) ||
2052  AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2053  --Cost;
2054  // If the folded LHS was interesting, this transformation saves
2055  // address arithmetic.
2056  if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2057  ((AM.Disp != 0) && (Backup.Disp == 0)) +
2058  (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2059  --Cost;
2060  // If it doesn't look like it may be an overall win, don't do it.
2061  if (Cost >= 0) {
2062  AM = Backup;
2063  break;
2064  }
2065 
2066  // Ok, the transformation is legal and appears profitable. Go for it.
2067  // Negation will be emitted later to avoid creating dangling nodes if this
2068  // was an unprofitable LEA.
2069  AM.IndexReg = RHS;
2070  AM.NegateIndex = true;
2071  AM.Scale = 1;
2072  return false;
2073  }
2074 
2075  case ISD::ADD:
2076  if (!matchAdd(N, AM, Depth))
2077  return false;
2078  break;
2079 
2080  case ISD::OR:
2081  // We want to look through a transform in InstCombine and DAGCombiner that
2082  // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
2083  // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
2084  // An 'lea' can then be used to match the shift (multiply) and add:
2085  // and $1, %esi
2086  // lea (%rsi, %rdi, 8), %rax
2087  if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
2088  !matchAdd(N, AM, Depth))
2089  return false;
2090  break;
2091 
2092  case ISD::AND: {
2093  // Perform some heroic transforms on an and of a constant-count shift
2094  // with a constant to enable use of the scaled offset field.
2095 
2096  // Scale must not be used already.
2097  if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2098 
2099  // We only handle up to 64-bit values here as those are what matter for
2100  // addressing mode optimizations.
2101  assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2102  "Unexpected value size!");
2103 
2104  if (!isa<ConstantSDNode>(N.getOperand(1)))
2105  break;
2106 
2107  if (N.getOperand(0).getOpcode() == ISD::SRL) {
2108  SDValue Shift = N.getOperand(0);
2109  SDValue X = Shift.getOperand(0);
2110 
2111  uint64_t Mask = N.getConstantOperandVal(1);
2112 
2113  // Try to fold the mask and shift into an extract and scale.
2114  if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2115  return false;
2116 
2117  // Try to fold the mask and shift directly into the scale.
2118  if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2119  return false;
2120 
2121  // Try to fold the mask and shift into BEXTR and scale.
2122  if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2123  return false;
2124  }
2125 
2126  // Try to swap the mask and shift to place shifts which can be done as
2127  // a scale on the outside of the mask.
2128  if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2129  return false;
2130 
2131  break;
2132  }
2133  case ISD::ZERO_EXTEND: {
2134  // Try to widen a zexted shift left to the same size as its use, so we can
2135  // match the shift as a scale factor.
2136  if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2137  break;
2138  if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
2139  break;
2140 
2141  // Give up if the shift is not a valid scale factor [1,2,3].
2142  SDValue Shl = N.getOperand(0);
2143  auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
2144  if (!ShAmtC || ShAmtC->getZExtValue() > 3)
2145  break;
2146 
2147  // The narrow shift must only shift out zero bits (it must be 'nuw').
2148  // That makes it safe to widen to the destination type.
2149  APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
2150  ShAmtC->getZExtValue());
2151  if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
2152  break;
2153 
2154  // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
2155  MVT VT = N.getSimpleValueType();
2156  SDLoc DL(N);
2157  SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
2158  SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
2159 
2160  // Convert the shift to scale factor.
2161  AM.Scale = 1 << ShAmtC->getZExtValue();
2162  AM.IndexReg = Zext;
2163 
2164  insertDAGNode(*CurDAG, N, Zext);
2165  insertDAGNode(*CurDAG, N, NewShl);
2166  CurDAG->ReplaceAllUsesWith(N, NewShl);
2167  CurDAG->RemoveDeadNode(N.getNode());
2168  return false;
2169  }
2170  }
2171 
2172  return matchAddressBase(N, AM);
2173 }
2174 
2175 /// Helper for MatchAddress. Add the specified node to the
2176 /// specified addressing mode without any further recursion.
2177 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2178  // Is the base register already occupied?
2179  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2180  // If so, check to see if the scale index register is set.
2181  if (!AM.IndexReg.getNode()) {
2182  AM.IndexReg = N;
2183  AM.Scale = 1;
2184  return false;
2185  }
2186 
2187  // Otherwise, we cannot select it.
2188  return true;
2189  }
2190 
2191  // Default, generate it as a register.
2192  AM.BaseType = X86ISelAddressMode::RegBase;
2193  AM.Base_Reg = N;
2194  return false;
2195 }
2196 
2197 /// Helper for selectVectorAddr. Handles things that can be folded into a
2198 /// gather/scatter address. The index register and scale should have already
2199 /// been handled.
2200 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2201  // TODO: Support other operations.
2202  switch (N.getOpcode()) {
2203  case ISD::Constant: {
2204  uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2205  if (!foldOffsetIntoAddress(Val, AM))
2206  return false;
2207  break;
2208  }
2209  case X86ISD::Wrapper:
2210  if (!matchWrapper(N, AM))
2211  return false;
2212  break;
2213  }
2214 
2215  return matchAddressBase(N, AM);
2216 }
2217 
2218 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
2219  SDValue &Scale, SDValue &Index,
2220  SDValue &Disp, SDValue &Segment) {
2221  X86ISelAddressMode AM;
2222  auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
2223  AM.IndexReg = Mgs->getIndex();
2224  AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
2225 
2226  unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2227  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2228  if (AddrSpace == 256)
2229  AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2230  if (AddrSpace == 257)
2231  AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2232  if (AddrSpace == 258)
2233  AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2234 
2235  SDLoc DL(N);
2236  MVT VT = N.getSimpleValueType();
2237 
2238  // Try to match into the base and displacement fields.
2239  if (matchVectorAddress(N, AM))
2240  return false;
2241 
2242  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2243  return true;
2244 }
2245 
2246 /// Returns true if it is able to pattern match an addressing mode.
2247 /// The operands which make up the maximal addressing mode it can match are
2248 /// returned by reference.
2249 ///
2250 /// Parent is the parent node of the addr operand that is being matched. It
2251 /// is always a load, store, atomic node, or null. It is only null when
2252 /// checking memory operands for inline asm nodes.
2253 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2254  SDValue &Scale, SDValue &Index,
2255  SDValue &Disp, SDValue &Segment) {
2256  X86ISelAddressMode AM;
2257 
2258  if (Parent &&
2259  // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2260  // that are not a MemSDNode, and thus don't have proper addrspace info.
2261  Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2262  Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2263  Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2264  Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2265  Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2266  Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2267  Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2268  unsigned AddrSpace =
2269  cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2270  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2271  if (AddrSpace == 256)
2272  AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2273  if (AddrSpace == 257)
2274  AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2275  if (AddrSpace == 258)
2276  AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2277  }
2278 
2279  // Save the DL and VT before calling matchAddress, it can invalidate N.
2280  SDLoc DL(N);
2281  MVT VT = N.getSimpleValueType();
2282 
2283  if (matchAddress(N, AM))
2284  return false;
2285 
2286  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2287  return true;
2288 }
2289 
2290 // We can only fold a load if all nodes between it and the root node have a
2291 // single use. If there are additional uses, we could end up duplicating the
2292 // load.
2293 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
2294  while (User != Root) {
2295  if (!User->hasOneUse())
2296  return false;
2297  User = *User->use_begin();
2298  }
2299 
2300  return true;
2301 }
2302 
2303 /// Match a scalar SSE load. In particular, we want to match a load whose top
2304 /// elements are either undef or zeros. The load flavor is derived from the
2305 /// type of N, which is either v4f32 or v2f64.
2306 ///
2307 /// We also return:
2308 /// PatternChainNode: this is the matched node that has a chain input and
2309 /// output.
2310 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
2311  SDValue N, SDValue &Base,
2312  SDValue &Scale, SDValue &Index,
2313  SDValue &Disp, SDValue &Segment,
2314  SDValue &PatternNodeWithChain) {
2315  if (!hasSingleUsesFromRoot(Root, Parent))
2316  return false;
2317 
2318  // We can allow a full vector load here since narrowing a load is ok unless
2319  // it's volatile or atomic.
2320  if (ISD::isNON_EXTLoad(N.getNode())) {
2321  LoadSDNode *LD = cast<LoadSDNode>(N);
2322  if (LD->isSimple() &&
2323  IsProfitableToFold(N, LD, Root) &&
2324  IsLegalToFold(N, Parent, Root, OptLevel)) {
2325  PatternNodeWithChain = N;
2326  return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2327  Segment);
2328  }
2329  }
2330 
2331  // We can also match the special zero extended load opcode.
2332  if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
2333  PatternNodeWithChain = N;
2334  if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2335  IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
2336  auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
2337  return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
2338  Segment);
2339  }
2340  }
2341 
2342  // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
2343  // once. Otherwise the load might get duplicated and the chain output of the
2344  // duplicate load will not be observed by all dependencies.
2345  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
2346  PatternNodeWithChain = N.getOperand(0);
2347  if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2348  IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2349  IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2350  LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2351  return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2352  Segment);
2353  }
2354  }
2355 
2356  return false;
2357 }
2358 
2359 
2360 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2361  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2362  uint64_t ImmVal = CN->getZExtValue();
2363  if (!isUInt<32>(ImmVal))
2364  return false;
2365 
2366  Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
2367  return true;
2368  }
2369 
2370  // In static codegen with small code model, we can get the address of a label
2371  // into a register with 'movl'
2372  if (N->getOpcode() != X86ISD::Wrapper)
2373  return false;
2374 
2375  N = N.getOperand(0);
2376 
2377  // At least GNU as does not accept 'movl' for TPOFF relocations.
2378  // FIXME: We could use 'movl' when we know we are targeting MC.
2379  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2380  return false;
2381 
2382  Imm = N;
2383  if (N->getOpcode() != ISD::TargetGlobalAddress)
2384  return TM.getCodeModel() == CodeModel::Small;
2385 
2386  Optional<ConstantRange> CR =
2387  cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2388  if (!CR)
2389  return TM.getCodeModel() == CodeModel::Small;
2390 
2391  return CR->getUnsignedMax().ult(1ull << 32);
2392 }
2393 
2394 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2395  SDValue &Scale, SDValue &Index,
2396  SDValue &Disp, SDValue &Segment) {
2397  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2398  SDLoc DL(N);
2399 
2400  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2401  return false;
2402 
2403  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2404  if (RN && RN->getReg() == 0)
2405  Base = CurDAG->getRegister(0, MVT::i64);
2406  else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2407  // Base could already be %rip, particularly in the x32 ABI.
2408  SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2409  MVT::i64), 0);
2410  Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2411  Base);
2412  }
2413 
2414  RN = dyn_cast<RegisterSDNode>(Index);
2415  if (RN && RN->getReg() == 0)
2416  Index = CurDAG->getRegister(0, MVT::i64);
2417  else {
2418  assert(Index.getValueType() == MVT::i32 &&
2419  "Expect to be extending 32-bit registers for use in LEA");
2420  SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2421  MVT::i64), 0);
2422  Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2423  Index);
2424  }
2425 
2426  return true;
2427 }
2428 
2429 /// Calls SelectAddr and determines if the maximal addressing
2430 /// mode it matches can be cost effectively emitted as an LEA instruction.
2431 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2432  SDValue &Base, SDValue &Scale,
2433  SDValue &Index, SDValue &Disp,
2434  SDValue &Segment) {
2435  X86ISelAddressMode AM;
2436 
2437  // Save the DL and VT before calling matchAddress, it can invalidate N.
2438  SDLoc DL(N);
2439  MVT VT = N.getSimpleValueType();
2440 
2441  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2442  // segments.
2443  SDValue Copy = AM.Segment;
2444  SDValue T = CurDAG->getRegister(0, MVT::i32);
2445  AM.Segment = T;
2446  if (matchAddress(N, AM))
2447  return false;
2448  assert (T == AM.Segment);
2449  AM.Segment = Copy;
2450 
2451  unsigned Complexity = 0;
2452  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
2453  Complexity = 1;
2454  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2455  Complexity = 4;
2456 
2457  if (AM.IndexReg.getNode())
2458  Complexity++;
2459 
2460  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or
2461  // a simple shift.
2462  if (AM.Scale > 1)
2463  Complexity++;
2464 
2465  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2466  // to a LEA. This is determined with some experimentation but is by no means
2467  // optimal (especially for code size consideration). LEA is nice because of
2468  // its three-address nature. Tweak the cost function again when we can run
2469  // convertToThreeAddress() at register allocation time.
2470  if (AM.hasSymbolicDisplacement()) {
2471  // For X86-64, always use LEA to materialize RIP-relative addresses.
2472  if (Subtarget->is64Bit())
2473  Complexity = 4;
2474  else
2475  Complexity += 2;
2476  }
2477 
2478  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
2479  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
2480  // duplicating flag-producing instructions later in the pipeline.
2481  if (N.getOpcode() == ISD::ADD) {
2482  auto isMathWithFlags = [](SDValue V) {
2483  switch (V.getOpcode()) {
2484  case X86ISD::ADD:
2485  case X86ISD::SUB:
2486  case X86ISD::ADC:
2487  case X86ISD::SBB:
2488  /* TODO: These opcodes can be added safely, but we may want to justify
2489  their inclusion for different reasons (better for reg-alloc).
2490  case X86ISD::SMUL:
2491  case X86ISD::UMUL:
2492  case X86ISD::OR:
2493  case X86ISD::XOR:
2494  case X86ISD::AND:
2495  */
2496  // Value 1 is the flag output of the node - verify it's not dead.
2497  return !SDValue(V.getNode(), 1).use_empty();
2498  default:
2499  return false;
2500  }
2501  };
2502  // TODO: This could be an 'or' rather than 'and' to make the transform more
2503  // likely to happen. We might want to factor in whether there's a
2504  // load folding opportunity for the math op that disappears with LEA.
2505  if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
2506  Complexity++;
2507  }
2508 
2509  if (AM.Disp)
2510  Complexity++;
2511 
2512  // If it isn't worth using an LEA, reject it.
2513  if (Complexity <= 2)
2514  return false;
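 // For example, "lea 8(%rbx,%rcx,4), %rax" scores base + index + scale +
 // disp == 4 and is kept, while a bare (%rbx) scores 1 and is rejected in
 // favor of ordinary register arithmetic.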
2515 
2516  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2517  return true;
2518 }
2519 
2520 /// This is only run on TargetGlobalTLSAddress nodes.
2521 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2522  SDValue &Scale, SDValue &Index,
2523  SDValue &Disp, SDValue &Segment) {
2524  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2525  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2526 
2527  X86ISelAddressMode AM;
2528  AM.GV = GA->getGlobal();
2529  AM.Disp += GA->getOffset();
2530  AM.SymbolFlags = GA->getTargetFlags();
2531 
2532  MVT VT = N.getSimpleValueType();
2533  if (VT == MVT::i32) {
2534  AM.Scale = 1;
2535  AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2536  }
2537 
2538  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2539  return true;
2540 }
2541 
2542 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2543  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2544  Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2545  N.getValueType());
2546  return true;
2547  }
2548 
2549  // Keep track of the original value type and whether this value was
2550  // truncated. If we see a truncation from pointer type to VT that truncates
2551  // bits that are known to be zero, we can use a narrow reference.
2552  EVT VT = N.getValueType();
2553  bool WasTruncated = false;
2554  if (N.getOpcode() == ISD::TRUNCATE) {
2555  WasTruncated = true;
2556  N = N.getOperand(0);
2557  }
2558 
2559  if (N.getOpcode() != X86ISD::Wrapper)
2560  return false;
2561 
2562  // We can only use non-GlobalValues as immediates if they were not truncated,
2563  // as we do not have any range information. If we have a GlobalValue and the
2564  // address was not truncated, we can select it as an operand directly.
2565  unsigned Opc = N.getOperand(0)->getOpcode();
2566  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2567  Op = N.getOperand(0);
2568  // We can only select the operand directly if we didn't have to look past a
2569  // truncate.
2570  return !WasTruncated;
2571  }
2572 
2573  // Check that the global's range fits into VT.
2574  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2575  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2576  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2577  return false;
2578 
2579  // Okay, we can use a narrow reference.
2580  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2581  GA->getOffset(), GA->getTargetFlags());
2582  return true;
2583 }
2584 
2585 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2586  SDValue &Base, SDValue &Scale,
2587  SDValue &Index, SDValue &Disp,
2588  SDValue &Segment) {
2589  assert(Root && P && "Unknown root/parent nodes");
2590  if (!ISD::isNON_EXTLoad(N.getNode()) ||
2591  !IsProfitableToFold(N, P, Root) ||
2592  !IsLegalToFold(N, P, Root, OptLevel))
2593  return false;
2594 
2595  return selectAddr(N.getNode(),
2596  N.getOperand(1), Base, Scale, Index, Disp, Segment);
2597 }
2598 
2599 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
2600  SDValue &Base, SDValue &Scale,
2601  SDValue &Index, SDValue &Disp,
2602  SDValue &Segment) {
2603  assert(Root && P && "Unknown root/parent nodes");
2604  if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
2605  !IsProfitableToFold(N, P, Root) ||
2606  !IsLegalToFold(N, P, Root, OptLevel))
2607  return false;
2608 
2609  return selectAddr(N.getNode(),
2610  N.getOperand(1), Base, Scale, Index, Disp, Segment);
2611 }
2612 
2613 /// Return an SDNode that returns the value of the global base register.
2614 /// Output instructions required to initialize the global base register,
2615 /// if necessary.
2616 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2617  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2618  auto &DL = MF->getDataLayout();
2619  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2620 }
2621 
2622 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2623  if (N->getOpcode() == ISD::TRUNCATE)
2624  N = N->getOperand(0).getNode();
2625  if (N->getOpcode() != X86ISD::Wrapper)
2626  return false;
2627 
2628  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2629  if (!GA)
2630  return false;
2631 
2632  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2633  return CR && CR->getSignedMin().sge(-1ull << Width) &&
2634  CR->getSignedMax().slt(1ull << Width);
2635 }
2636 
2637 static X86::CondCode getCondFromNode(SDNode *N) {
2638  assert(N->isMachineOpcode() && "Unexpected node");
2639  X86::CondCode CC = X86::COND_INVALID;
2640  unsigned Opc = N->getMachineOpcode();
2641  if (Opc == X86::JCC_1)
2642  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2643  else if (Opc == X86::SETCCr)
2644  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2645  else if (Opc == X86::SETCCm)
2646  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2647  else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2648  Opc == X86::CMOV64rr)
2649  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2650  else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2651  Opc == X86::CMOV64rm)
2652  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2653 
2654  return CC;
2655 }
2656 
2657 /// Test whether the given X86ISD::CMP node has any users that use a flag
2658 /// other than ZF.
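/// For example, a CMP whose only EFLAGS consumer is a JCC_1 or SETCC with
/// COND_E or COND_NE qualifies (returns true); any other condition code
/// reads more than ZF and makes this return false.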
2659 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2660  // Examine each user of the node.
2661  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2662  UI != UE; ++UI) {
2663  // Only check things that use the flags.
2664  if (UI.getUse().getResNo() != Flags.getResNo())
2665  continue;
2666  // Only examine CopyToReg uses that copy to EFLAGS.
2667  if (UI->getOpcode() != ISD::CopyToReg ||
2668  cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2669  return false;
2670  // Examine each user of the CopyToReg use.
2671  for (SDNode::use_iterator FlagUI = UI->use_begin(),
2672  FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2673  // Only examine the Flag result.
2674  if (FlagUI.getUse().getResNo() != 1) continue;
2675  // Anything unusual: assume conservatively.
2676  if (!FlagUI->isMachineOpcode()) return false;
2677  // Examine the condition code of the user.
2678  X86::CondCode CC = getCondFromNode(*FlagUI);
2679 
2680  switch (CC) {
2681  // Comparisons which only use the zero flag.
2682  case X86::COND_E: case X86::COND_NE:
2683  continue;
2684  // Anything else: assume conservatively.
2685  default:
2686  return false;
2687  }
2688  }
2689  }
2690  return true;
2691 }
2692 
2693 /// Test whether the given X86ISD::CMP node has any uses which require the SF
2694 /// flag to be accurate.
2695 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2696  // Examine each user of the node.
2697  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2698  UI != UE; ++UI) {
2699  // Only check things that use the flags.
2700  if (UI.getUse().getResNo() != Flags.getResNo())
2701  continue;
2702  // Only examine CopyToReg uses that copy to EFLAGS.
2703  if (UI->getOpcode() != ISD::CopyToReg ||
2704  cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2705  return false;
2706  // Examine each user of the CopyToReg use.
2707  for (SDNode::use_iterator FlagUI = UI->use_begin(),
2708  FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2709  // Only examine the Flag result.
2710  if (FlagUI.getUse().getResNo() != 1) continue;
2711  // Anything unusual: assume conservatively.
2712  if (!FlagUI->isMachineOpcode()) return false;
2713  // Examine the condition code of the user.
2714  X86::CondCode CC = getCondFromNode(*FlagUI);
2715 
2716  switch (CC) {
2717  // Comparisons which don't examine the SF flag.
2718  case X86::COND_A: case X86::COND_AE:
2719  case X86::COND_B: case X86::COND_BE:
2720  case X86::COND_E: case X86::COND_NE:
2721  case X86::COND_O: case X86::COND_NO:
2722  case X86::COND_P: case X86::COND_NP:
2723  continue;
2724  // Anything else: assume conservatively.
2725  default:
2726  return false;
2727  }
2728  }
2729  }
2730  return true;
2731 }
2732 
2733 static bool mayUseCarryFlag(X86::CondCode CC) {
2734  switch (CC) {
2735  // Comparisons which don't examine the CF flag.
2736  case X86::COND_O: case X86::COND_NO:
2737  case X86::COND_E: case X86::COND_NE:
2738  case X86::COND_S: case X86::COND_NS:
2739  case X86::COND_P: case X86::COND_NP:
2740  case X86::COND_L: case X86::COND_GE:
2741  case X86::COND_G: case X86::COND_LE:
2742  return false;
2743  // Anything else: assume conservatively.
2744  default:
2745  return true;
2746  }
2747 }
2748 
2749 /// Test whether the given node which sets flags has any uses which require the
2750 /// CF flag to be accurate.
2751 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2752  // Examine each user of the node.
2753  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2754  UI != UE; ++UI) {
2755  // Only check things that use the flags.
2756  if (UI.getUse().getResNo() != Flags.getResNo())
2757  continue;
2758 
2759  unsigned UIOpc = UI->getOpcode();
2760 
2761  if (UIOpc == ISD::CopyToReg) {
2762  // Only examine CopyToReg uses that copy to EFLAGS.
2763  if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2764  return false;
2765  // Examine each user of the CopyToReg use.
2766  for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2767  FlagUI != FlagUE; ++FlagUI) {
2768  // Only examine the Flag result.
2769  if (FlagUI.getUse().getResNo() != 1)
2770  continue;
2771  // Anything unusual: assume conservatively.
2772  if (!FlagUI->isMachineOpcode())
2773  return false;
2774  // Examine the condition code of the user.
2775  X86::CondCode CC = getCondFromNode(*FlagUI);
2776 
2777  if (mayUseCarryFlag(CC))
2778  return false;
2779  }
2780 
2781  // This CopyToReg is ok. Move on to the next user.
2782  continue;
2783  }
2784 
2785  // This might be an unselected node. So look for the pre-isel opcodes that
2786  // use flags.
2787  unsigned CCOpNo;
2788  switch (UIOpc) {
2789  default:
2790  // Something unusual. Be conservative.
2791  return false;
2792  case X86ISD::SETCC: CCOpNo = 0; break;
2793  case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2794  case X86ISD::CMOV: CCOpNo = 2; break;
2795  case X86ISD::BRCOND: CCOpNo = 2; break;
2796  }
2797 
2798  X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2799  if (mayUseCarryFlag(CC))
2800  return false;
2801  }
2802  return true;
2803 }
2804 
2805 /// Check whether or not the chain ending in StoreNode is suitable for doing
2806 /// the {load; op; store} to modify transformation.
2807 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2808  SDValue StoredVal, SelectionDAG *CurDAG,
2809  unsigned LoadOpNo,
2810  LoadSDNode *&LoadNode,
2811  SDValue &InputChain) {
2812  // Is the stored value result 0 of the operation?
2813  if (StoredVal.getResNo() != 0) return false;
2814 
2815  // Are there other uses of the operation other than the store?
2816  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2817 
2818  // Is the store non-extending and non-indexed?
2819  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2820  return false;
2821 
2822  SDValue Load = StoredVal->getOperand(LoadOpNo);
2823  // Is the stored value a non-extending and non-indexed load?
2824  if (!ISD::isNormalLoad(Load.getNode())) return false;
2825 
2826  // Return LoadNode by reference.
2827  LoadNode = cast<LoadSDNode>(Load);
2828 
2829  // Is store the only read of the loaded value?
2830  if (!Load.hasOneUse())
2831  return false;
2832 
2833  // Is the address of the store the same as the load?
2834  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2835  LoadNode->getOffset() != StoreNode->getOffset())
2836  return false;
2837 
2838  bool FoundLoad = false;
2839  SmallVector<SDValue, 4> ChainOps;
2840  SmallVector<const SDNode *, 4> LoopWorklist;
2841  SmallPtrSet<const SDNode *, 32> Visited;
2842  const unsigned int Max = 1024;
2843 
2844  // Visualization of Load-Op-Store fusion:
2845  // -------------------------
2846  // Legend:
2847  // *-lines = Chain operand dependencies.
2848  // |-lines = Normal operand dependencies.
2849  // Dependencies flow down and right. n-suffix references multiple nodes.
2850  //
2851  // C Xn C
2852  // * * *
2853  // * * *
2854  // Xn A-LD Yn TF Yn
2855  // * * \ | * |
2856  // * * \ | * |
2857  // * * \ | => A--LD_OP_ST
2858  // * * \| \
2859  // TF OP \
2860  // * | \ Zn
2861  // * | \
2862  // A-ST Zn
2863  //
2864 
2865  // This merge induces dependences from: #1: Xn -> LD, OP, Zn
2866  // #2: Yn -> LD
2867  // #3: ST -> Zn
2868 
2869  // Ensure the transform is safe by checking for the dual
2870  // dependencies to make sure we do not induce a loop.
2871 
2872  // As LD is a predecessor to both OP and ST we can do this by checking:
2873  // a). if LD is a predecessor to a member of Xn or Yn.
2874  // b). if a Zn is a predecessor to ST.
2875 
2876  // However, (b) can only occur through being a chain predecessor to
2877  // ST, which is the same as Zn being a member or predecessor of Xn,
2878  // which is a subset of LD being a predecessor of Xn. So it's
2879  // subsumed by check (a).
2880 
2881  SDValue Chain = StoreNode->getChain();
2882 
2883  // Gather X elements in ChainOps.
2884  if (Chain == Load.getValue(1)) {
2885  FoundLoad = true;
2886  ChainOps.push_back(Load.getOperand(0));
2887  } else if (Chain.getOpcode() == ISD::TokenFactor) {
2888  for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2889  SDValue Op = Chain.getOperand(i);
2890  if (Op == Load.getValue(1)) {
2891  FoundLoad = true;
2892  // Drop Load, but keep its chain. No cycle check necessary.
2893  ChainOps.push_back(Load.getOperand(0));
2894  continue;
2895  }
2896  LoopWorklist.push_back(Op.getNode());
2897  ChainOps.push_back(Op);
2898  }
2899  }
2900 
2901  if (!FoundLoad)
2902  return false;
2903 
2904  // Worklist is currently Xn. Add Yn to worklist.
2905  for (SDValue Op : StoredVal->ops())
2906  if (Op.getNode() != LoadNode)
2907  LoopWorklist.push_back(Op.getNode());
2908 
2909  // Check (a) if Load is a predecessor to Xn + Yn
2910  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2911  true))
2912  return false;
2913 
2914  InputChain =
2915  CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2916  return true;
2917 }
2918 
2919 // Change a chain of {load; op; store} of the same value into a simple op
2920 // through memory of that value, if the uses of the modified value and its
2921 // address are suitable.
2922 //
2923 // The tablegen memory operand pattern is currently not able to match
2924 // the case where the EFLAGS on the original operation are used.
2925 //
2926 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2927 // be transferred from a node in the pattern to the result node, probably with
2928 // a new keyword. For example, we have this
2929 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2930 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2931 // (implicit EFLAGS)]>;
2932 // but maybe need something like this
2933 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2934 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2935 // (transferrable EFLAGS)]>;
2936 //
2937 // Until then, we manually fold these and instruction select the operation
2938 // here.
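// For illustration, (store (X86ISD::ADD (load addr:$dst), 5), addr:$dst)
// is folded below into a single ADD32mi8, i.e. "addl $5" applied directly
// to the memory operand, whose EFLAGS def stands in for the flag result of
// the original ADD node.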
2939 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
2940  StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2941  SDValue StoredVal = StoreNode->getOperand(1);
2942  unsigned Opc = StoredVal->getOpcode();
2943 
2944  // Before we try to select anything, make sure this is a memory operand size
2945  // and opcode we can handle. Note that this must match the code below that
2946  // actually lowers the opcodes.
2947  EVT MemVT = StoreNode->getMemoryVT();
2948  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
2949  MemVT != MVT::i8)
2950  return false;
2951 
2952  bool IsCommutable = false;
2953  bool IsNegate = false;
2954  switch (Opc) {
2955  default:
2956  return false;
2957  case X86ISD::SUB:
2958  IsNegate = isNullConstant(StoredVal.getOperand(0));
2959  break;
2960  case X86ISD::SBB:
2961  break;
2962  case X86ISD::ADD:
2963  case X86ISD::ADC:
2964  case X86ISD::AND:
2965  case X86ISD::OR:
2966  case X86ISD::XOR:
2967  IsCommutable = true;
2968  break;
2969  }
2970 
2971  unsigned LoadOpNo = IsNegate ? 1 : 0;
2972  LoadSDNode *LoadNode = nullptr;
2973  SDValue InputChain;
2974  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2975  LoadNode, InputChain)) {
2976  if (!IsCommutable)
2977  return false;
2978 
2979  // This operation is commutable, try the other operand.
2980  LoadOpNo = 1;
2981  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2982  LoadNode, InputChain))
2983  return false;
2984  }
2985 
2986  SDValue Base, Scale, Index, Disp, Segment;
2987  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
2988  Segment))
2989  return false;
2990 
2991  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
2992  unsigned Opc8) {
2993  switch (MemVT.getSimpleVT().SimpleTy) {
2994  case MVT::i64:
2995  return Opc64;
2996  case MVT::i32:
2997  return Opc32;
2998  case MVT::i16:
2999  return Opc16;
3000  case MVT::i8:
3001  return Opc8;
3002  default:
3003  llvm_unreachable("Invalid size!");
3004  }
3005  };
3006 
3007  MachineSDNode *Result;
3008  switch (Opc) {
3009  case X86ISD::SUB:
3010  // Handle negate.
3011  if (IsNegate) {
3012  unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3013  X86::NEG8m);
3014  const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3015  Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3016  MVT::Other, Ops);
3017  break;
3018  }
3019  LLVM_FALLTHROUGH;
3020  case X86ISD::ADD:
3021  // Try to match inc/dec.
3022  if (!Subtarget->slowIncDec() || OptForSize) {
3023  bool IsOne = isOneConstant(StoredVal.getOperand(1));
3024  bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3025  // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
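 // e.g. with the address in %rdi, "addl $1, (%rdi)" becomes "incl (%rdi)"
 // and "addl $-1, (%rdi)" becomes "decl (%rdi)", dropping the immediate
 // byte.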
3026  if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3027  unsigned NewOpc =
3028  ((Opc == X86ISD::ADD) == IsOne)
3029  ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3030  : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3031  const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3032  Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3033  MVT::Other, Ops);
3034  break;
3035  }
3036  }
3037  LLVM_FALLTHROUGH;
3038  case X86ISD::ADC:
3039  case X86ISD::SBB:
3040  case X86ISD::AND:
3041  case X86ISD::OR:
3042  case X86ISD::XOR: {
3043  auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3044  switch (Opc) {
3045  case X86ISD::ADD:
3046  return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3047  X86::ADD8mr);
3048  case X86ISD::ADC:
3049  return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3050  X86::ADC8mr);
3051  case X86ISD::SUB:
3052  return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3053  X86::SUB8mr);
3054  case X86ISD::SBB:
3055  return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3056  X86::SBB8mr);
3057  case X86ISD::AND:
3058  return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3059  X86::AND8mr);
3060  case X86ISD::OR:
3061  return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3062  case X86ISD::XOR:
3063  return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3064  X86::XOR8mr);
3065  default:
3066  llvm_unreachable("Invalid opcode!");
3067  }
3068  };
3069  auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3070  switch (Opc) {
3071  case X86ISD::ADD:
3072  return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3073  case X86ISD::ADC:
3074  return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3075  case X86ISD::SUB:
3076  return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3077  case X86ISD::SBB:
3078  return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3079  case X86ISD::AND:
3080  return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3081  case X86ISD::OR:
3082  return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3083  case X86ISD::XOR:
3084  return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3085  default:
3086  llvm_unreachable("Invalid opcode!");
3087  }
3088  };
3089  auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3090  switch (Opc) {
3091  case X86ISD::ADD:
3092  return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3093  X86::ADD8mi);
3094  case X86ISD::ADC:
3095  return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3096  X86::ADC8mi);
3097  case X86ISD::SUB:
3098  return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3099  X86::SUB8mi);
3100  case X86ISD::SBB:
3101  return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3102  X86::SBB8mi);
3103  case X86ISD::AND:
3104  return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3105  X86::AND8mi);
3106  case X86ISD::OR:
3107  return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3108  X86::OR8mi);
3109  case X86ISD::XOR:
3110  return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3111  X86::XOR8mi);
3112  default:
3113  llvm_unreachable("Invalid opcode!");
3114  }
3115  };
3116 
3117  unsigned NewOpc = SelectRegOpcode(Opc);
3118  SDValue Operand = StoredVal->getOperand(1 - LoadOpNo);
3119 
3120  // See if the operand is a constant that we can fold into an immediate
3121  // operand.
3122  if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3123  int64_t OperandV = OperandC->getSExtValue();
3124 
3125  // Check if we can shrink the operand enough to fit in an immediate (or
3126  // fit into a smaller immediate) by negating it and switching the
3127  // operation.
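      // Illustrative example (editorial addition): ADD with immediate 128 does
      // not fit in a sign-extended imm8, but the equivalent SUB with -128 does,
      // so the negated form saves encoding space.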
3128  if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3129  ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3130  (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3131  isInt<32>(-OperandV))) &&
3132  hasNoCarryFlagUses(StoredVal.getValue(1))) {
3133  OperandV = -OperandV;
3134  Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3135  }
3136 
3137  // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3138  // the larger immediate operand.
3139  if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
3140  Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3141  NewOpc = SelectImm8Opcode(Opc);
3142  } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3143  Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3144  NewOpc = SelectImmOpcode(Opc);
3145  }
3146  }
3147 
3148  if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3149  SDValue CopyTo =
3150  CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3151  StoredVal.getOperand(2), SDValue());
3152 
3153  const SDValue Ops[] = {Base, Scale, Index, Disp,
3154  Segment, Operand, CopyTo, CopyTo.getValue(1)};
3155  Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3156  Ops);
3157  } else {
3158  const SDValue Ops[] = {Base, Scale, Index, Disp,
3159  Segment, Operand, InputChain};
3160  Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3161  Ops);
3162  }
3163  break;
3164  }
3165  default:
3166  llvm_unreachable("Invalid opcode!");
3167  }
3168 
3169  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3170  LoadNode->getMemOperand()};
3171  CurDAG->setNodeMemRefs(Result, MemOps);
3172 
3173  // Update Load Chain uses as well.
3174  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3175  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3176  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3177  CurDAG->RemoveDeadNode(Node);
3178  return true;
3179 }
3180 
3181 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3182 // Where Mask is one of the following patterns:
3183 // a) x & (1 << nbits) - 1
3184 // b) x & ~(-1 << nbits)
3185 // c) x & (-1 >> (32 - y))
3186 // d) x << (32 - y) >> (32 - y)
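// Illustrative example (editorial addition): with nbits == 4, patterns (a)-(c)
// all reduce to x & 0xF, and pattern (d) computes the same low-4-bit extract
// via the shift pair; each can be selected as BZHI(x, 4) or an equivalent BEXTR.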
3187 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3188  assert(
3189  (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3190  "Should be either an and-mask, or right-shift after clearing high bits.");
3191 
3192  // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3193  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3194  return false;
3195 
3196  MVT NVT = Node->getSimpleValueType(0);
3197 
3198  // Only supported for 32 and 64 bits.
3199  if (NVT != MVT::i32 && NVT != MVT::i64)
3200  return false;
3201 
3202  SDValue NBits;
3203 
3204  // If we have BMI2's BZHI, we are ok with multi-use patterns.
3205  // Else, if we only have BMI1's BEXTR, we require one-use.
3206  const bool CanHaveExtraUses = Subtarget->hasBMI2();
3207  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3208  return CanHaveExtraUses ||
3209  Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3210  };
3211  auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
3212  auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
3213 
3214  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3215  if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3216  assert(V.getSimpleValueType() == MVT::i32 &&
3217  V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3218  "Expected i64 -> i32 truncation");
3219  V = V.getOperand(0);
3220  }
3221  return V;
3222  };
3223 
3224  // a) x & ((1 << nbits) + (-1))
3225  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
3226  &NBits](SDValue Mask) -> bool {
3227  // Match `add`. Must only have one use!
3228  if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3229  return false;
3230  // We should be adding an all-ones constant (i.e. subtracting one).
3231  if (!isAllOnesConstant(Mask->getOperand(1)))
3232  return false;
3233  // Match `1 << nbits`. Might be truncated. Must only have one use!
3234  SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3235  if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3236  return false;
3237  if (!isOneConstant(M0->getOperand(0)))
3238  return false;
3239  NBits = M0->getOperand(1);
3240  return true;
3241  };
3242 
3243  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3244  V = peekThroughOneUseTruncation(V);
3245  return CurDAG->MaskedValueIsAllOnes(
3246  V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3247  NVT.getSizeInBits()));
3248  };
3249 
3250  // b) x & ~(-1 << nbits)
3251  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3252  &NBits](SDValue Mask) -> bool {
3253  // Match `~()`. Must only have one use!
3254  if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3255  return false;
3256  // The -1 only has to be all-ones for the final Node's NVT.
3257  if (!isAllOnes(Mask->getOperand(1)))
3258  return false;
3259  // Match `-1 << nbits`. Might be truncated. Must only have one use!
3260  SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3261  if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3262  return false;
3263  // The -1 only has to be all-ones for the final Node's NVT.
3264  if (!isAllOnes(M0->getOperand(0)))
3265  return false;
3266  NBits = M0->getOperand(1);
3267  return true;
3268  };
3269 
3270  // Match potentially-truncated (bitwidth - y)
3271  auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
3272  unsigned Bitwidth) {
3273  // Skip over a truncate of the shift amount.
3274  if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3275  ShiftAmt = ShiftAmt.getOperand(0);
3276  // The trunc should have been the only user of the real shift amount.
3277  if (!checkOneUse(ShiftAmt))
3278  return false;
3279  }
3280  // Match the shift amount as: (bitwidth - y). It should go away, too.
3281  if (ShiftAmt.getOpcode() != ISD::SUB)
3282  return false;
3283  auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3284  if (!V0 || V0->getZExtValue() != Bitwidth)
3285  return false;
3286  NBits = ShiftAmt.getOperand(1);
3287  return true;
3288  };
3289 
3290  // c) x & (-1 >> (32 - y))
3291  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
3292  matchShiftAmt](SDValue Mask) -> bool {
3293  // The mask itself may be truncated.
3294  Mask = peekThroughOneUseTruncation(Mask);
3295  unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3296  // Match `l>>`. Must only have one use!
3297  if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3298  return false;
3299  // We should be shifting truly all-ones constant.
3300  if (!isAllOnesConstant(Mask.getOperand(0)))
3301  return false;
3302  SDValue M1 = Mask.getOperand(1);
3303  // The shift amount should not be used externally.
3304  if (!checkOneUse(M1))
3305  return false;
3306  return matchShiftAmt(M1, Bitwidth);
3307  };
3308 
3309  SDValue X;
3310 
3311  // d) x << (32 - y) >> (32 - y)
3312  auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
3313  &X](SDNode *Node) -> bool {
3314  if (Node->getOpcode() != ISD::SRL)
3315  return false;
3316  SDValue N0 = Node->getOperand(0);
3317  if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
3318  return false;
3319  unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3320  SDValue N1 = Node->getOperand(1);
3321  SDValue N01 = N0->getOperand(1);
3322  // Both of the shifts must be by the exact same value.
3323  // There should not be any uses of the shift amount outside of the pattern.
3324  if (N1 != N01 || !checkTwoUse(N1))
3325  return false;
3326  if (!matchShiftAmt(N1, Bitwidth))
3327  return false;
3328  X = N0->getOperand(0);
3329  return true;
3330  };
3331 
3332  auto matchLowBitMask = [matchPatternA, matchPatternB,
3333  matchPatternC](SDValue Mask) -> bool {
3334  return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3335  };
3336 
3337  if (Node->getOpcode() == ISD::AND) {
3338  X = Node->getOperand(0);
3339  SDValue Mask = Node->getOperand(1);
3340 
3341  if (matchLowBitMask(Mask)) {
3342  // Great.
3343  } else {
3344  std::swap(X, Mask);
3345  if (!matchLowBitMask(Mask))
3346  return false;
3347  }
3348  } else if (!matchPatternD(Node))
3349  return false;
3350 
3351  SDLoc DL(Node);
3352 
3353  // Truncate the shift amount.
3354  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3355  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3356 
3357  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3358  // All the other bits are undefined; we do not care about them.
3359  SDValue ImplDef = SDValue(
3360  CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3361  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3362 
3363  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3364  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3365  NBits = SDValue(
3366  CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
3367  NBits, SRIdxVal), 0);
3368  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3369 
3370  if (Subtarget->hasBMI2()) {
3371  // Great, just emit the BZHI.
3372  if (NVT != MVT::i32) {
3373  // But we have to place the bit count into a wide-enough register first.
3374  NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3375  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3376  }
3377 
3378  SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3379  ReplaceNode(Node, Extract.getNode());
3380  SelectCode(Extract.getNode());
3381  return true;
3382  }
3383 
3384  // Else, if we do *NOT* have BMI2, let's find out whether 'X' is
3385  // *logically* shifted (potentially with a one-use trunc in between),
3386  // whether the truncation was the only use of the shift,
3387  // and if so, look past the one-use truncation.
3388  {
3389  SDValue RealX = peekThroughOneUseTruncation(X);
3390  // FIXME: only if the shift is one-use?
3391  if (RealX != X && RealX.getOpcode() == ISD::SRL)
3392  X = RealX;
3393  }
3394 
3395  MVT XVT = X.getSimpleValueType();
3396 
3397  // Else, emitting BEXTR requires one more step.
3398  // The 'control' of BEXTR has the pattern of:
3399  // [15...8 bit][ 7...0 bit] location
3400  // [ bit count][ shift] name
3401  // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
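 // Illustrative example (editorial addition): control 0x0403 (bit count 4,
 // shift 3) computes (x >> 3) & 0b1111.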
3402 
3403  // Shift NBits left by 8 bits, thus producing 'control'.
3404  // This makes the low 8 bits zero.
3405  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3406  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3407  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3408 
3409  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3410  // FIXME: only if the shift is one-use?
3411  if (X.getOpcode() == ISD::SRL) {
3412  SDValue ShiftAmt = X.getOperand(1);
3413  X = X.getOperand(0);
3414 
3415  assert(ShiftAmt.getValueType() == MVT::i8 &&
3416  "Expected shift amount to be i8");
3417 
3418  // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3419  // We could zext to i16 in some form, but we intentionally don't do that.
3420  SDValue OrigShiftAmt = ShiftAmt;
3421  ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3422  insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3423 
3424  // And now 'or' these low 8 bits of shift amount into the 'control'.
3425  Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3426  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3427  }
3428 
3429  // But we have to place the 'control' into a wide-enough register first.
3430  if (XVT != MVT::i32) {
3431  Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3432  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3433  }
3434 
3435  // And finally, form the BEXTR itself.
3436  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3437 
3438  // The 'X' was originally truncated. Re-apply that truncation now.
3439  if (XVT != NVT) {
3440  insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3441  Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3442  }
3443 
3444  ReplaceNode(Node, Extract.getNode());
3445  SelectCode(Extract.getNode());
3446 
3447  return true;
3448 }
3449 
3450 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
3451 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3452  MVT NVT = Node->getSimpleValueType(0);
3453  SDLoc dl(Node);
3454 
3455  SDValue N0 = Node->getOperand(0);
3456  SDValue N1 = Node->getOperand(1);
3457 
3458  // If we have TBM we can use an immediate for the control. If we have BMI
3459  // we should only do this if the BEXTR instruction is implemented well.
3460  // Otherwise moving the control into a register makes this more costly.
3461  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3462  // hoisting the move immediate would make it worthwhile with a less optimal
3463  // BEXTR?
3464  bool PreferBEXTR =
3465  Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
3466  if (!PreferBEXTR && !Subtarget->hasBMI2())
3467  return nullptr;
3468 
3469  // Must have a shift right.
3470  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3471  return nullptr;
3472 
3473  // Shift can't have additional users.
3474  if (!N0->hasOneUse())
3475  return nullptr;
3476 
3477  // Only supported for 32 and 64 bits.
3478  if (NVT != MVT::i32 && NVT != MVT::i64)
3479  return nullptr;
3480 
3481  // Shift amount and RHS of and must be constant.
3482  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3483  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3484  if (!MaskCst || !ShiftCst)
3485  return nullptr;
3486 
3487  // And RHS must be a mask.
3488  uint64_t Mask = MaskCst->getZExtValue();
3489  if (!isMask_64(Mask))
3490  return nullptr;
3491 
3492  uint64_t Shift = ShiftCst->getZExtValue();
3493  uint64_t MaskSize = countPopulation(Mask);
3494 
3495  // Don't interfere with something that can be handled by extracting AH.
3496  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3497  if (Shift == 8 && MaskSize == 8)
3498  return nullptr;
3499 
3500  // Make sure we are only using bits that were in the original value, not
3501  // shifted in.
3502  if (Shift + MaskSize > NVT.getSizeInBits())
3503  return nullptr;
3504 
3505  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
3506  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
3507  // does not fit into 32 bits. Load folding is not a sufficient reason.
3508  if (!PreferBEXTR && MaskSize <= 32)
3509  return nullptr;
3510 
3511  SDValue Control;
3512  unsigned ROpc, MOpc;
3513 
3514  if (!PreferBEXTR) {
3515  assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
3516  // If we can't make use of BEXTR then we can't fuse shift+mask stages.
3517  // Let's perform the mask first and apply the shift later. Note that we need
3518  // to widen the mask to account for the fact that we'll apply the shift afterwards!
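    // Illustrative example (editorial addition): for (x >> 8) & ((1ULL << 40) - 1),
    // i.e. Shift == 8 and MaskSize == 40, BZHI keeps the low 48 bits of x and
    // the SHR emitted below then shifts right by 8.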
3519  Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
3520  ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
3521  MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
3522  unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3523  Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3524  } else {
3525  // The 'control' of BEXTR has the pattern of:
3526  // [15...8 bit][ 7...0 bit] location
3527  // [ bit count][ shift] name
3528  // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
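    // Illustrative example (editorial addition): for (x >> 4) & 0xFF,
    // Shift == 4 and MaskSize == 8, giving control (8 << 8) | 4 == 0x0804.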
3529  Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3530  if (Subtarget->hasTBM()) {
3531  ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3532  MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3533  } else {
3534  assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
3535  // BMI requires the immediate to be placed in a register.
3536  ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3537  MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3538  unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3539  Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3540  }
3541  }
3542 
3543  MachineSDNode *NewNode;
3544  SDValue Input = N0->getOperand(0);
3545  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3546  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3547  SDValue Ops[] = {
3548  Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
3549  SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3550  NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3551  // Update the chain.
3552  ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3553  // Record the mem-refs
3554  CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3555  } else {
3556  NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
3557  }
3558 
3559  if (!PreferBEXTR) {
3560  // We still need to apply the shift.
3561  SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
3562  unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
3563  NewNode =
3564  CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
3565  }
3566 
3567  return NewNode;
3568 }
3569 
3570 // Emit a PCMPISTR(I/M) instruction.
3571 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3572  bool MayFoldLoad, const SDLoc &dl,
3573  MVT VT, SDNode *Node) {
3574  SDValue N0 = Node->getOperand(0);
3575  SDValue N1 = Node->getOperand(1);
3576  SDValue Imm = Node->getOperand(2);
3577  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3578  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3579 
3580  // Try to fold a load. No need to check alignment.
3581  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3582  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3583  SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3584  N1.getOperand(0) };
3585  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3586  MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3587  // Update the chain.
3588  ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3589  // Record the mem-refs
3590  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3591  return CNode;
3592  }
3593 
3594  SDValue Ops[] = { N0, N1, Imm };
3595  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3596  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3597  return CNode;
3598 }
3599 
3600 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
3601 // to emit a second instruction after this one. This is needed since we have two
3602 // copyToReg nodes glued before this and we need to continue that glue through.
3603 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3604  bool MayFoldLoad, const SDLoc &dl,
3605  MVT VT, SDNode *Node,
3606  SDValue &InFlag) {
3607  SDValue N0 = Node->getOperand(0);
3608  SDValue N2 = Node->getOperand(2);
3609  SDValue Imm = Node->getOperand(4);
3610  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3611  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3612 
3613  // Try to fold a load. No need to check alignment.
3614  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3615  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3616  SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3617  N2.getOperand(0), InFlag };
3618  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3619  MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3620  InFlag = SDValue(CNode, 3);
3621  // Update the chain.
3622  ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3623  // Record the mem-refs
3624  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3625  return CNode;
3626  }
3627 
3628  SDValue Ops[] = { N0, N2, Imm, InFlag };
3629  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3630  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3631  InFlag = SDValue(CNode, 2);
3632  return CNode;
3633 }
3634 
3635 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3636  EVT VT = N->getValueType(0);
3637 
3638  // Only handle scalar shifts.
3639  if (VT.isVector())
3640  return false;
3641 
3642  // Narrower shifts only mask to 5 bits in hardware.
3643  unsigned Size = VT == MVT::i64 ? 64 : 32;
3644 
3645  SDValue OrigShiftAmt = N->getOperand(1);
3646  SDValue ShiftAmt = OrigShiftAmt;
3647  SDLoc DL(N);
3648 
3649  // Skip over a truncate of the shift amount.
3650  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3651  ShiftAmt = ShiftAmt->getOperand(0);
3652 
3653  // This function is called after X86DAGToDAGISel::matchBitExtract(),
3654  // so we are not afraid that we might mess up the BZHI/BEXTR pattern.
3655 
3656  SDValue NewShiftAmt;
3657  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3658  SDValue Add0 = ShiftAmt->getOperand(0);
3659  SDValue Add1 = ShiftAmt->getOperand(1);
3660  // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3661  // to avoid the ADD/SUB.
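    // Illustrative note (editorial addition): hardware masks the shift amount
    // mod the operand size, so e.g. a 64-bit shift by (amt + 64) behaves
    // exactly like a shift by amt and the ADD can be dropped.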
3662  if (isa<ConstantSDNode>(Add1) &&
3663  cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3664  NewShiftAmt = Add0;
3665  // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3666  // generate a NEG instead of a SUB of a constant.
3667  } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3668  isa<ConstantSDNode>(Add0) &&
3669  cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3670  cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3671  // Insert a negate op.
3672  // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3673  // that uses it that's not a shift.
3674  EVT SubVT = ShiftAmt.getValueType();
3675  SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3676  SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3677  NewShiftAmt = Neg;
3678 
3679  // Insert these operands into a valid topological order so they can
3680  // get selected independently.
3681  insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3682  insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3683  } else
3684  return false;
3685  } else
3686  return false;
3687 
3688  if (NewShiftAmt.getValueType() != MVT::i8) {
3689  // Need to truncate the shift amount.
3690  NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3691  // Add to a correct topological ordering.
3692  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3693  }
3694 
3695  // Insert a new mask to keep the shift amount legal. This should be removed
3696  // by isel patterns.
3697  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3698  CurDAG->getConstant(Size - 1, DL, MVT::i8));
3699  // Place in a correct topological ordering.
3700  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3701 
3702  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3703  NewShiftAmt);
3704  if (UpdatedNode != N) {
3705  // If we found an existing node, we should replace ourselves with that node
3706  // and wait for it to be selected after its other users.
3707  ReplaceNode(N, UpdatedNode);
3708  return true;
3709  }
3710 
3711  // If the original shift amount is now dead, delete it so that we don't run
3712  // it through isel.
3713  if (OrigShiftAmt.getNode()->use_empty())
3714  CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3715 
3716  // Now that we've optimized the shift amount, defer to normal isel to get
3717  // load folding and legacy vs BMI2 selection without repeating it here.
3718  SelectCode(N);
3719  return true;
3720 }
3721 
3722 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3723  MVT NVT = N->getSimpleValueType(0);
3724  unsigned Opcode = N->getOpcode();
3725  SDLoc dl(N);
3726 
3727  // For operations of the form (x << C1) op C2, check if we can use a smaller
3728  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
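  // Illustrative example (editorial addition): (x << 8) | 0x7F00 becomes
  // ((x | 0x7F) << 8), shrinking the OR immediate from a 4-byte imm32 to a
  // 1-byte imm8.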
3729  SDValue Shift = N->getOperand(0);
3730  SDValue N1 = N->getOperand(1);
3731 
3732  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3733  if (!Cst)
3734  return false;
3735 
3736  int64_t Val = Cst->getSExtValue();
3737 
3738  // If we have an any_extend feeding the AND, look through it to see if there
3739  // is a shift behind it. But only if the AND doesn't use the extended bits.
3740  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
3741  bool FoundAnyExtend = false;
3742  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
3743  Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
3744  isUInt<32>(Val)) {
3745  FoundAnyExtend = true;
3746  Shift = Shift.getOperand(0);
3747  }
3748 
3749  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
3750  return false;
3751 
3752  // i8 is unshrinkable, i16 should be promoted to i32.
3753  if (NVT != MVT::i32 && NVT != MVT::i64)
3754  return false;
3755 
3756  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3757  if (!ShlCst)
3758  return false;
3759 
3760  uint64_t ShAmt = ShlCst->getZExtValue();
3761 
3762  // Make sure that we don't change the operation by removing bits.
3763  // This only matters for OR and XOR; AND is unaffected.
3764  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3765  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3766  return false;
3767 
3768  // Check the minimum bitwidth for the new constant.
3769  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3770  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3771  if (Opcode == ISD::AND) {
3772  // AND32ri is the same as AND64ri32 with zext imm.
3773  // Try this before sign extended immediates below.
3774  ShiftedVal = (uint64_t)Val >> ShAmt;
3775  if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3776  return true;
3777  // Also swap order when the AND can become MOVZX.
3778  if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
3779  return true;
3780  }
3781  ShiftedVal = Val >> ShAmt;
3782  if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
3783  (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
3784  return true;
3785  if (Opcode != ISD::AND) {
3786  // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3787  ShiftedVal = (uint64_t)Val >> ShAmt;
3788  if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3789  return true;
3790  }
3791  return false;
3792  };
3793 
3794  int64_t ShiftedVal;
3795  if (!CanShrinkImmediate(ShiftedVal))
3796  return false;
3797 
3798  // Ok, we can reorder to get a smaller immediate.
3799 
3800  // But it's possible the original immediate allowed an AND to become MOVZX.
3801  // Doing this check late defers the MaskedValueIsZero call as long as
3802  // possible.
3803  if (Opcode == ISD::AND) {
3804  // Find the smallest zext this could possibly be.
3805  unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3806  ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3807 
3808  // Figure out which bits need to be zero to achieve that mask.
3809  APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
3810  ZExtWidth);
3811  NeededMask &= ~Cst->getAPIntValue();
3812 
3813  if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3814  return false;
3815  }
3816 
3817  SDValue X = Shift.getOperand(0);
3818  if (FoundAnyExtend) {
3819  SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3820  insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3821  X = NewX;
3822  }
3823 
3824  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3825  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3826  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3827  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3828  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3829  Shift.getOperand(1));
3830  ReplaceNode(N, NewSHL.getNode());
3831  SelectCode(NewSHL.getNode());
3832  return true;
3833 }
3834 
3835 /// Convert vector increment or decrement to sub/add with an all-ones constant:
3836 /// add X, <1, 1...> --> sub X, <-1, -1...>
3837 /// sub X, <1, 1...> --> add X, <-1, -1...>
3838 /// The all-ones vector constant can be materialized using a pcmpeq instruction
3839 /// that is commonly recognized as an idiom (has no register dependency), so
3840 /// that's better/smaller than loading a splat 1 constant.
3841 bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
3842  assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
3843  "Unexpected opcode for increment/decrement transform");
3844 
3845  EVT VT = Node->getValueType(0);
3846  assert(VT.isVector() && "Should only be called for vectors.");
3847 
3848  SDValue X = Node->getOperand(0);
3849  SDValue OneVec = Node->getOperand(1);
3850 
3851  APInt SplatVal;
3852  if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
3853  return false;
3854 
3855  SDLoc DL(Node);
3856  SDValue OneConstant, AllOnesVec;
3857 
3858  APInt Ones = APInt::getAllOnesValue(32);
3859  assert(VT.getSizeInBits() % 32 == 0 &&
3860  "Expected bit count to be a multiple of 32");
3861  OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
3862  insertDAGNode(*CurDAG, X, OneConstant);
3863 
3864  unsigned NumElts = VT.getSizeInBits() / 32;
3865  assert(NumElts > 0 && "Expected to get non-empty vector.");
3866  AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
3867  DL, OneConstant);
3868  insertDAGNode(*CurDAG, X, AllOnesVec);
3869 
3870  AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
3871  insertDAGNode(*CurDAG, X, AllOnesVec);
3872 
3873  unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
3874  SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
3875 
3876  ReplaceNode(Node, NewNode.getNode());
3877  SelectCode(NewNode.getNode());
3878  return true;
3879 }
3880 
3881 /// If the high bits of an 'and' operand are known zero, try setting the
3882 /// high bits of an 'and' constant operand to produce a smaller encoding by
3883 /// creating a small, sign-extended negative immediate rather than a large
3884 /// positive one. This reverses a transform in SimplifyDemandedBits that
3885 /// shrinks mask constants by clearing bits. There is also a possibility that
3886 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3887 /// case, just replace the 'and'. Return 'true' if the node is replaced.
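/// Illustrative example (editorial addition): if bit 63 of the source is known
/// zero, the i64 mask 0x7FFFFFFFFFFFFFF0 (which cannot be encoded as an
/// immediate at all) can be replaced by -16, which fits in a sign-extended imm8.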
3888 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3889  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3890  // have immediate operands.
3891  MVT VT = And->getSimpleValueType(0);
3892  if (VT != MVT::i32 && VT != MVT::i64)
3893  return false;
3894 
3895  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
3896  if (!And1C)
3897  return false;
3898 
3899  // Bail out if the mask constant is already negative. It can't shrink any more.
3900  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
3901  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
3902  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
3903  // are negative too.
3904  APInt MaskVal = And1C->getAPIntValue();
3905  unsigned MaskLZ = MaskVal.countLeadingZeros();
3906  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
3907  return false;
3908 
3909  // Don't extend into the upper 32 bits of a 64 bit mask.
3910  if (VT == MVT::i64 && MaskLZ >= 32) {
3911  MaskLZ -= 32;
3912  MaskVal = MaskVal.trunc(32);
3913  }
3914 
3915  SDValue And0 = And->getOperand(0);
3916  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3917  APInt NegMaskVal = MaskVal | HighZeros;
3918 
3919  // If a negative constant would not allow a smaller encoding, there's no need
3920  // to continue. Only change the constant when we know it's a win.
3921  unsigned MinWidth = NegMaskVal.getMinSignedBits();
3922  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
3923  return false;
3924 
3925  // Extend masks if we truncated above.
3926  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
3927  NegMaskVal = NegMaskVal.zext(64);
3928  HighZeros = HighZeros.zext(64);
3929  }
3930 
3931  // The variable operand must be all zeros in the top bits to allow using the
3932  // new, negative constant as the mask.
3933  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3934  return false;
3935 
3936  // Check if the mask is -1. In that case, this is an unnecessary instruction
3937  // that escaped earlier analysis.
3938  if (NegMaskVal.isAllOnesValue()) {
3939  ReplaceNode(And, And0.getNode());
3940  return true;
3941  }
3942 
3943  // A negative mask allows a smaller encoding. Create a new 'and' node.
3944  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
3945  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
3946  ReplaceNode(And, NewAnd.getNode());
3947  SelectCode(NewAnd.getNode());
3948  return true;
3949 }
3950 
3951 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
3952  bool FoldedBCast, bool Masked) {
3953  if (Masked) {
3954  if (FoldedLoad) {
3955  switch (TestVT.SimpleTy) {
3956  default: llvm_unreachable("Unexpected VT!");
3957  case MVT::v16i8:
3958  return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
3959  case MVT::v8i16:
3960  return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
3961  case MVT::v4i32:
3962  return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
3963  case MVT::v2i64:
3964  return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
3965  case MVT::v32i8:
3966  return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
3967  case MVT::v16i16:
3968  return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
3969  case MVT::v8i32:
3970  return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
3971  case MVT::v4i64:
3972  return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
3973  case MVT::v64i8:
3974  return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
3975  case MVT::v32i16:
3976  return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
3977  case MVT::v16i32:
3978  return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
3979  case MVT::v8i64:
3980  return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
3981  }
3982  }
3983 
3984  if (FoldedBCast) {
3985  switch (TestVT.SimpleTy) {
3986  default: llvm_unreachable("Unexpected VT!");
3987  case MVT::v4i32:
3988  return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
3989  case MVT::v2i64:
3990  return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
3991  case MVT::v8i32:
3992  return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
3993  case MVT::v4i64:
3994  return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
3995  case MVT::v16i32:
3996  return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
3997  case MVT::v8i64:
3998  return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
3999  }
4000  }
4001 
4002  switch (TestVT.SimpleTy) {
4003  default: llvm_unreachable("Unexpected VT!");
4004  case MVT::v16i8:
4005  return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
4006  case MVT::v8i16:
4007  return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
4008  case MVT::v4i32:
4009  return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
4010  case MVT::v2i64:
4011  return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
4012  case MVT::v32i8:
4013  return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
4014  case MVT::v16i16:
4015  return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
4016  case MVT::v8i32:
4017  return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
4018  case MVT::v4i64:
4019  return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
4020  case MVT::v64i8:
4021  return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
4022  case MVT::v32i16:
4023  return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
4024  case MVT::v16i32:
4025  return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
4026  case MVT::v8i64:
4027  return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
4028  }
4029  }
4030 
4031  if (FoldedLoad) {
4032  switch (TestVT.SimpleTy) {
4033  default: llvm_unreachable("Unexpected VT!");
4034  case MVT::v16i8:
4035  return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
4036  case MVT::v8i16:
4037  return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
4038  case MVT::v4i32:
4039  return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
4040  case MVT::v2i64:
4041  return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
4042  case MVT::v32i8:
4043  return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
4044  case MVT::v16i16:
4045  return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
4046  case MVT::v8i32:
4047  return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
4048  case MVT::v4i64:
4049  return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
4050  case MVT::v64i8:
4051  return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
4052  case MVT::v32i16:
4053  return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
4054  case MVT::v16i32:
4055  return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
4056  case MVT::v8i64:
4057  return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
4058  }
4059  }
4060 
4061  if (FoldedBCast) {
4062  switch (TestVT.SimpleTy) {
4063  default: llvm_unreachable("Unexpected VT!");
4064  case MVT::v4i32:
4065  return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
4066  case MVT::v2i64:
4067  return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
4068  case MVT::v8i32:
4069  return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
4070  case MVT::v4i64:
4071  return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
4072  case MVT::v16i32:
4073  return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
4074  case MVT::v8i64:
4075  return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
4076  }
4077  }
4078 
4079  switch (TestVT.SimpleTy) {
4080  default: llvm_unreachable("Unexpected VT!");
4081  case MVT::v16i8:
4082  return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
4083  case MVT::v8i16:
4084  return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
4085  case MVT::v4i32:
4086  return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
4087  case MVT::v2i64:
4088  return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
4089  case MVT::v32i8:
4090  return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
4091  case MVT::v16i16:
4092  return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
4093  case MVT::v8i32:
4094  return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
4095  case MVT::v4i64:
4096  return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
4097  case MVT::v64i8:
4098  return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
4099  case MVT::v32i16:
4100  return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
4101  case MVT::v16i32:
4102  return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
4103  case MVT::v8i64:
4104  return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
4105  }
4106 }
4107 
4108 // Try to create VPTESTM instruction. If InMask is not null, it will be used
4109 // to form a masked operation.
4110 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4111  SDValue InMask) {
4112  assert(Subtarget->hasAVX512() && "Expected AVX512!");
4113  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4114  "Unexpected VT!");
4115 
4116  // Look for equal and not equal compares.
4117  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4118  if (CC != ISD::SETEQ && CC != ISD::SETNE)
4119  return false;
4120 
4121  SDValue SetccOp0 = Setcc.getOperand(0);
4122  SDValue SetccOp1 = Setcc.getOperand(1);
4123 
4124  // Canonicalize the all zero vector to the RHS.
4125  if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4126  std::swap(SetccOp0, SetccOp1);
4127 
4128  // See if we're comparing against zero.
4129  if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4130  return false;
4131 
4132  SDValue N0 = SetccOp0;
4133 
4134  MVT CmpVT = N0.getSimpleValueType();
4135  MVT CmpSVT = CmpVT.getVectorElementType();
4136 
4137  // Start with both operands the same. We'll try to refine this.
4138  SDValue Src0 = N0;
4139  SDValue Src1 = N0;
4140 
4141  {
4142  // Look through single use bitcasts.
4143  SDValue N0Temp = N0;
4144  if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4145  N0Temp = N0.getOperand(0);
4146 
4147  // Look for single use AND.
4148  if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4149  Src0 = N0Temp.getOperand(0);
4150  Src1 = N0Temp.getOperand(1);
4151  }
4152  }
4153 
4154  // Without VLX we need to widen the load.
4155  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4156 
4157  // We can only fold loads if the sources are unique.
4158  bool CanFoldLoads = Src0 != Src1;
4159 
4160  // Try to fold loads unless we need to widen.
4161  bool FoldedLoad = false;
4162  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
4163  if (!Widen && CanFoldLoads) {
4164  Load = Src1;
4165  FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
4166  Tmp4);
4167  if (!FoldedLoad) {
4168  // AND is commutative.
4169  Load = Src0;
4170  FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
4171  Tmp3, Tmp4);
4172  if (FoldedLoad)
4173  std::swap(Src0, Src1);
4174  }
4175  }
4176 
4177  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
4178  // Look through single use bitcasts.
4179  if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
4180  Parent = Src.getNode();
4181  Src = Src.getOperand(0);
4182  }
4183 
4184  if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
4185  auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
4186  if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
4187  return Src;
4188  }
4189 
4190  return SDValue();
4191  };
4192 
4193  // If we didn't fold a load, try to match a broadcast. There is no widening
4194  // limitation for this, but only 32- and 64-bit types are supported.
4195  bool FoldedBCast = false;
4196  if (!FoldedLoad && CanFoldLoads &&
4197  (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
4198  SDNode *ParentNode = N0.getNode();
4199  if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4200  FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4201  Tmp1, Tmp2, Tmp3, Tmp4);
4202  }
4203 
4204  // Try the other operand.
4205  if (!FoldedBCast) {
4206  SDNode *ParentNode = N0.getNode();
4207  if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4208  FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4209  Tmp1, Tmp2, Tmp3, Tmp4);
4210  if (FoldedBCast)
4211  std::swap(Src0, Src1);
4212  }
4213  }
4214  }
4215 
4216  auto getMaskRC = [](MVT MaskVT) {
4217  switch (MaskVT.SimpleTy) {
4218  default: llvm_unreachable("Unexpected VT!");
4219  case MVT::v2i1: return X86::VK2RegClassID;
4220  case MVT::v4i1: return X86::VK4RegClassID;
4221  case MVT::v8i1: return X86::VK8RegClassID;
4222  case MVT::v16i1: return X86::VK16RegClassID;
4223  case MVT::v32i1: return X86::VK32RegClassID;
4224  case MVT::v64i1: return X86::VK64RegClassID;
4225  }
4226  };
4227 
4228  bool IsMasked = InMask.getNode() != nullptr;
4229 
4230  SDLoc dl(Root);
4231 
4232  MVT ResVT = Setcc.getSimpleValueType();
4233  MVT MaskVT = ResVT;
4234  if (Widen) {
4235  // Widen the inputs using insert_subreg or copy_to_regclass.
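    // Illustrative note (editorial addition): without VLX only the 512-bit
    // VPTESTM forms exist, so e.g. a v4i32 compare is widened to v16i32
    // (Scale == 4) and performed in a zmm register.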
4236  unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4237  unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4238  unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4239  CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4240  MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4241  SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4242  CmpVT), 0);
4243  Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4244 
4245  assert(!FoldedLoad && "Shouldn't have folded the load");
4246  if (!FoldedBCast)
4247  Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4248 
4249  if (IsMasked) {
4250  // Widen the mask.
4251  unsigned RegClass = getMaskRC(MaskVT);
4252  SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4253  InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4254  dl, MaskVT, InMask, RC), 0);
4255  }
4256  }
4257 
4258  bool IsTestN = CC == ISD::SETEQ;
4259  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4260  IsMasked);
4261 
4262  MachineSDNode *CNode;
4263  if (FoldedLoad || FoldedBCast) {
4264  SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4265 
4266  if (IsMasked) {
4267  SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4268  Load.getOperand(0) };
4269  CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4270  } else {
4271  SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4272  Load.getOperand(0) };
4273  CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4274  }
4275 
4276  // Update the chain.
4277  ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4278  // Record the mem-refs
4279  CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
4280  } else {
4281  if (IsMasked)
4282  CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4283  else
4284  CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4285  }
4286 
4287  // If we widened, we need to shrink the mask VT.
4288  if (Widen) {
4289  unsigned RegClass = getMaskRC(ResVT);
4290  SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4291  CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4292  dl, ResVT, SDValue(CNode, 0), RC);
4293  }
4294 
4295  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4296  CurDAG->RemoveDeadNode(Root);
4297  return true;
4298 }
4299 
4300 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4301 // into vpternlog.
4302 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4303  assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4304 
4305  MVT NVT = N->getSimpleValueType(0);
4306 
4307  // Make sure we support VPTERNLOG.
4308  if (!NVT.isVector() || !Subtarget->hasAVX512())
4309  return false;
4310 
4311  // We need VLX for 128/256-bit.
4312  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4313  return false;
4314 
4315  SDValue N0 = N->getOperand(0);
4316  SDValue N1 = N->getOperand(1);
4317 
4318  // Canonicalize AND to LHS.
4319  if (N1.getOpcode() == ISD::AND)
4320  std::swap(N0, N1);
4321 
4322  if (N0.getOpcode() != ISD::AND ||
4323  N1.getOpcode() != X86ISD::ANDNP ||
4324  !N0.hasOneUse() || !N1.hasOneUse())
4325  return false;
4326 
4327  // ANDN is not commutable, so use it to pin down A and C.
4328  SDValue A = N1.getOperand(0);
4329  SDValue C = N1.getOperand(1);
4330 
4331  // AND is commutable, if one operand matches A, the other operand is B.
4332  // Otherwise this isn't a match.
4333  SDValue B;
4334  if (N0.getOperand(0) == A)
4335  B = N0.getOperand(1);
4336  else if (N0.getOperand(1) == A)
4337  B = N0.getOperand(0);
4338  else
4339  return false;
4340 
4341  SDLoc dl(N);
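  // Illustrative note (editorial addition): VPTERNLOG's imm8 is the truth
  // table of f(A, B, C); 0xCA (0b11001010) is A ? B : C, i.e. (A & B) | (~A & C).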
4342  SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
4343  SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
4344  ReplaceNode(N, Ternlog.getNode());
4345  SelectCode(Ternlog.getNode());
4346  return true;
4347 }
4348 
4349 void X86DAGToDAGISel::Select(SDNode *Node) {
4350  MVT NVT = Node->getSimpleValueType(0);
4351  unsigned Opcode = Node->getOpcode();
4352  SDLoc dl(Node);
4353 
4354  if (Node->isMachineOpcode()) {
4355  LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4356  Node->setNodeId(-1);
4357  return; // Already selected.
4358  }
4359 
4360  switch (Opcode) {
4361  default: break;
4362  case ISD::INTRINSIC_VOID: {
4363  unsigned IntNo = Node->getConstantOperandVal(1);
4364  switch (IntNo) {
4365  default: break;
4366  case Intrinsic::x86_sse3_monitor:
4367  case Intrinsic::x86_monitorx:
4368  case Intrinsic::x86_clzero: {
4369  bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4370 
4371  unsigned Opc = 0;
4372  switch (IntNo) {
4373  default: llvm_unreachable("Unexpected intrinsic!");
4374  case Intrinsic::x86_sse3_monitor:
4375  if (!Subtarget->hasSSE3())
4376  break;
4377  Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
4378  break;
4379  case Intrinsic::x86_monitorx:
4380  if (!Subtarget->hasMWAITX())
4381  break;
4382  Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
4383  break;
4384  case Intrinsic::x86_clzero:
4385  if (!Subtarget->hasCLZERO())
4386  break;
4387  Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
4388  break;
4389  }
4390 
4391  if (Opc) {
4392  unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
4393  SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4394  Node->getOperand(2), SDValue());
4395  SDValue InFlag = Chain.getValue(1);
4396 
4397  if (IntNo == Intrinsic::x86_sse3_monitor ||
4398  IntNo == Intrinsic::x86_monitorx) {
4399  // Copy the other two operands to ECX and EDX.
4400  Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4401  InFlag);
4402  InFlag = Chain.getValue(1);
4403  Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4404  InFlag);
4405  InFlag = Chain.getValue(1);
4406  }
4407 
4408  MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4409  { Chain, InFlag});
4410  ReplaceNode(Node, CNode);
4411  return;
4412  }
4413  }
4414  }
4415 
4416  break;
4417  }
4418  case ISD::BRIND: {
4419  if (Subtarget->isTargetNaCl())
4420  // NaCl has its own pass where jmp %r32 instructions are converted to
4421  // jmp %r64. We leave the instruction alone.
4422  break;
4423  if (Subtarget->isTarget64BitILP32()) {
4424  // Converts a 32-bit register to a 64-bit, zero-extended version of
4425  // it. This is needed because x86-64 can do many things, but jmp %r32
4426  // ain't one of them.
4427  const SDValue &Target = Node->getOperand(1);
4428  assert(Target.getValueType() == MVT::i32 && "Target should be i32 for ILP32");
4429  SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
4430  SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4431  Node->getOperand(0), ZextTarget);
4432  ReplaceNode(Node, Brind.getNode());
4433  SelectCode(ZextTarget.getNode());
4434  SelectCode(Brind.getNode());
4435  return;
4436  }
4437  break;
4438  }
4439  case X86ISD::GlobalBaseReg:
4440  ReplaceNode(Node, getGlobalBaseReg());
4441  return;
4442 
4443  case ISD::BITCAST:
4444  // Just drop all 128/256/512-bit bitcasts.
4445  if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
4446  NVT == MVT::f128) {
4447  ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4448  CurDAG->RemoveDeadNode(Node);
4449  return;
4450  }
4451  break;
4452 
4453  case ISD::VSELECT: {
4454  // Replace VSELECT with non-mask conditions with BLENDV.
4455  if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
4456  break;
4457 
4458  assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
4459  SDValue Blendv = CurDAG->getNode(
4460  X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
4461  Node->getOperand(1), Node->getOperand(2));
4462  ReplaceNode(Node, Blendv.getNode());
4463  SelectCode(Blendv.getNode());
4464  // We already called ReplaceUses.
4465  return;
4466  }
4467 
4468  case ISD::SRL:
4469  if (matchBitExtract(Node))
4470  return;
4471  LLVM_FALLTHROUGH;
4472  case ISD::SRA:
4473  case ISD::SHL:
4474  if (tryShiftAmountMod(Node))
4475  return;
4476  break;
4477 
4478  case ISD::AND:
4479  if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
4480  // Try to form a masked VPTESTM. Operands can be in either order.
4481  SDValue N0 = Node->getOperand(0);
4482  SDValue N1 = Node->getOperand(1);
4483  if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
4484  tryVPTESTM(Node, N0, N1))
4485  return;
4486  if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
4487  tryVPTESTM(Node, N1, N0))
4488  return;
4489  }
4490 
4491  if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4492  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4493  CurDAG->RemoveDeadNode(Node);
4494  return;
4495  }
4496  if (matchBitExtract(Node))
4497  return;
4498  if (AndImmShrink && shrinkAndImmediate(Node))
4499  return;
4500 
4501  LLVM_FALLTHROUGH;
4502  case ISD::OR:
4503  case ISD::XOR:
4504  if (tryShrinkShlLogicImm(Node))
4505  return;
4506 
4507  if (Opcode == ISD::OR && tryMatchBitSelect(Node))
4508  return;
4509 
4510  LLVM_FALLTHROUGH;
4511  case ISD::ADD:
4512  case ISD::SUB: {
4513  if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
4514  combineIncDecVector(Node))
4515  return;
4516 
4517  // Try to avoid folding immediates with multiple uses for optsize.
4518  // This code tries to select to register form directly to avoid going
4519  // through the isel table which might fold the immediate. We can't change
4520  // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
4521  // check the immediate use count without making the patterns
4522  // unavailable to the fast-isel table.
4523  if (!OptForSize)
4524  break;
4525 
4526  // Only handle i8/i16/i32/i64.
4527  if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
4528  break;
4529 
4530  SDValue N0 = Node->getOperand(0);
4531  SDValue N1 = Node->getOperand(1);
4532 
4533  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
4534  if (!Cst)
4535  break;
4536 
4537  int64_t Val = Cst->getSExtValue();
4538 
4539  // Make sure it's an immediate that is considered foldable.
4540  // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
4541  if (!isInt<8>(Val) && !isInt<32>(Val))
4542  break;
4543 
4544  // If this can match to INC/DEC, let it go.
4545  if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
4546  break;
4547 
4548  // Check if we should avoid folding this immediate.
4549  if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
4550  break;
4551 
4552  // We should not fold the immediate. So we need a register form instead.
4553  unsigned ROpc, MOpc;
4554  switch (NVT.SimpleTy) {
4555  default: llvm_unreachable("Unexpected VT!");
4556  case MVT::i8:
4557  switch (Opcode) {
4558  default: llvm_unreachable("Unexpected opcode!");
4559  case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
4560  case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
4561  case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
4562  case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
4563  case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
4564  }
4565  break;
4566  case MVT::i16:
4567  switch (Opcode) {
4568  default: llvm_unreachable("Unexpected opcode!");
4569  case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
4570  case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
4571  case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
4572  case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
4573  case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
4574  }
4575  break;
4576  case MVT::i32:
4577  switch (Opcode) {
4578  default: llvm_unreachable("Unexpected opcode!");
4579  case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
4580  case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
4581  case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
4582  case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
4583  case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
4584  }
4585  break;
4586  case MVT::i64:
4587  switch (Opcode) {
4588  default: llvm_unreachable("Unexpected opcode!");
4589  case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
4590  case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
4591  case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
4592  case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
4593  case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
4594  }
4595  break;
4596  }
4597 
4598  // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
4599 
4600  // If this is not a subtract, we can still try to fold a load.
4601  if (Opcode != ISD::SUB) {
4602  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4603  if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4604  SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4605  SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
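      // Machine-node results: 0 = ALU result (NVT), 1 = EFLAGS (i32),
      // 2 = chain (Other); the load's chain is rerouted to result 2 below.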
4606  MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4607  // Update the chain.
4608  ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
4609  // Record the mem-refs
4610  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
4611  ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4612  CurDAG->RemoveDeadNode(Node);
4613  return;
4614  }
4615  }
4616 
4617  CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
4618  return;
4619  }
4620 
4621  case X86ISD::SMUL:
4622  // i16/i32/i64 are handled with isel patterns.
4623  if (NVT != MVT::i8)
4624  break;
4625    LLVM_FALLTHROUGH;
4626  case X86ISD::UMUL: {
4627  SDValue N0 = Node->getOperand(0);
4628  SDValue N1 = Node->getOperand(1);
4629 
4630  unsigned LoReg, ROpc, MOpc;
4631  switch (NVT.SimpleTy) {
4632  default: llvm_unreachable("Unsupported VT!");
4633  case MVT::i8:
4634  LoReg = X86::AL;
4635  ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
4636  MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
4637  break;
4638  case MVT::i16:
4639  LoReg = X86::AX;
4640  ROpc = X86::MUL16r;
4641  MOpc = X86::MUL16m;
4642  break;
4643  case MVT::i32:
4644  LoReg = X86::EAX;
4645  ROpc = X86::MUL32r;
4646  MOpc = X86::MUL32m;
4647  break;
4648  case MVT::i64:
4649  LoReg = X86::RAX;
4650  ROpc = X86::MUL64r;
4651  MOpc = X86::MUL64m;
4652  break;
4653  }
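  // These one-operand MUL/IMUL forms read one source implicitly from
  // AL/AX/EAX/RAX (LoReg, set up by the CopyToReg below) and leave the
  // low half of the product in that same register.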
4654 
4655  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4656  bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4657  // Multiply is commutative.
4658  if (!FoldedLoad) {
4659  FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4660  if (FoldedLoad)
4661  std::swap(N0, N1);
4662  }
4663 
4664  SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4665  N0, SDValue()).getValue(1);
4666 
4667  MachineSDNode *CNode;
4668  if (FoldedLoad) {
4669  // i16/i32/i64 use an instruction that produces a low and high result even
4670  // though only the low result is used.
4671  SDVTList VTs;
4672  if (NVT == MVT::i8)
4673  VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4674  else
4675  VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4676 
4677  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4678  InFlag };
4679  CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4680 
4681  // Update the chain.
4682  ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
4683  // Record the mem-refs
4684  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4685  } else {
4686  // i16/i32/i64 use an instruction that produces a low and high result even
4687  // though only the low result is used.
4688  SDVTList VTs;
4689  if (NVT == MVT::i8)
4690  VTs = CurDAG->getVTList(NVT, MVT::i32);
4691  else
4692  VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4693 
4694  CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4695  }
4696 
4697  ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4698  ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
4699  CurDAG->RemoveDeadNode(Node);
4700  return;
4701  }
4702 
4703  case ISD::SMUL_LOHI:
4704  case ISD::UMUL_LOHI: {
4705  SDValue N0 = Node->getOperand(0);
4706  SDValue N1 = Node->getOperand(1);
4707 
4708  unsigned Opc, MOpc;
4709  bool isSigned = Opcode == ISD::SMUL_LOHI;
4710  if (!isSigned) {
4711  switch (NVT.SimpleTy) {
4712  default: llvm_unreachable("Unsupported VT!");
4713  case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
4714  case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
4715  }
4716  } else {
4717  switch (NVT.SimpleTy) {
4718  default: llvm_unreachable("Unsupported VT!");
4719  case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
4720  case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
4721  }
4722  }
4723 
4724  unsigned SrcReg, LoReg, HiReg;
4725  switch (Opc) {
4726  default: llvm_unreachable("Unknown MUL opcode!");
4727  case X86::IMUL32r:
4728  case X86::MUL32r:
4729  SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
4730  break;
4731  case X86::IMUL64r:
4732  case X86::MUL64r:
4733  SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
4734  break;
4735  }
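  // E.g. MUL32r multiplies EAX by its operand and leaves the 64-bit
  // product in EDX:EAX; the low and high halves are copied out of LoReg
  // and HiReg below, but only if they have uses.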
4736 
4737  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4738  bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4739  // Multiply is commutative.
4740  if (!foldedLoad) {
4741  foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4742  if (foldedLoad)
4743  std::swap(N0, N1);
4744  }
4745 
4746  SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
4747  N0, SDValue()).getValue(1);
4748  if (foldedLoad) {
4749  SDValue Chain;
4750  MachineSDNode *CNode = nullptr;
4751  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4752  InFlag };
4753  SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4754  CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4755  Chain = SDValue(CNode, 0);
4756  InFlag = SDValue(CNode, 1);
4757 
4758  // Update the chain.
4759  ReplaceUses(N1.getValue(1), Chain);
4760  // Record the mem-refs
4761  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4762  } else {
4763  SDValue Ops[] = { N1, InFlag };
4764  SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4765  SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4766  InFlag = SDValue(CNode, 0);
4767  }
4768 
4769  // Copy the low half of the result, if it is needed.
4770  if (!SDValue(Node, 0).use_empty()) {
4771  assert(LoReg && "Register for low half is not defined!");
4772  SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4773  NVT, InFlag);
4774  InFlag = ResLo.getValue(2);
4775  ReplaceUses(SDValue(Node, 0), ResLo);
4776  LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4777  dbgs() << '\n');
4778  }
4779  // Copy the high half of the result, if it is needed.
4780  if (!SDValue(Node, 1).use_empty()) {
4781  assert(HiReg && "Register for high half is not defined!");
4782  SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4783  NVT, InFlag);
4784  InFlag = ResHi.getValue(2);
4785  ReplaceUses(SDValue(Node, 1), ResHi);
4786  LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4787  dbgs() << '\n');
4788  }
4789 
4790  CurDAG->RemoveDeadNode(Node);
4791  return;
4792  }
4793 
4794  case ISD::SDIVREM:
4795  case ISD::UDIVREM: {
4796  SDValue N0 = Node->getOperand(0);
4797  SDValue N1 = Node->getOperand(1);
4798 
4799  unsigned Opc, MOpc;
4800  bool isSigned = Opcode == ISD::SDIVREM;
4801  if (!isSigned) {
4802  switch (NVT.SimpleTy) {
4803  default: llvm_unreachable("Unsupported VT!");
4804  case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
4805  case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
4806  case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
4807  case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
4808  }
4809  } else {
4810  switch (NVT.SimpleTy) {
4811  default: llvm_unreachable("Unsupported VT!");
4812  case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
4813  case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
4814  case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
4815  case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
4816  }
4817  }
4818 
4819  unsigned LoReg, HiReg, ClrReg;
4820  unsigned SExtOpcode;
4821  switch (NVT.SimpleTy) {
4822  default: llvm_unreachable("Unsupported VT!");
4823  case MVT::i8:
4824  LoReg = X86::AL; ClrReg = HiReg = X86::AH;
4825  SExtOpcode = 0; // Not used.
4826  break;
4827  case MVT::i16:
4828  LoReg = X86::AX; HiReg = X86::DX;
4829  ClrReg = X86::DX;
4830  SExtOpcode = X86::CWD;
4831  break;
4832  case MVT::i32:
4833  LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4834  SExtOpcode = X86::CDQ;
4835  break;
4836  case MVT::i64:
4837  LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4838  SExtOpcode = X86::CQO;
4839  break;
4840  }
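  // Hardware division takes a double-width dividend: e.g. DIV r/m32
  // divides EDX:EAX, leaving the quotient in EAX and the remainder in
  // EDX. The 8-bit form divides AX, with the quotient in AL and the
  // remainder in AH, which is why HiReg is AH for MVT::i8.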
4841 
4842  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4843  bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4844  bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4845 
4846  SDValue InFlag;
4847  if (NVT == MVT::i8) {
4848  // Special case for div8, just use a move with zero extension to AX to
4849  // clear the upper 8 bits (AH).
4850  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4851  MachineSDNode *Move;
4852  if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4853  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4854  unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
4855  : X86::MOVZX16rm8;
4856  Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
4857  Chain = SDValue(Move, 1);
4858  ReplaceUses(N0.getValue(1), Chain);
4859  // Record the mem-refs
4860  CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4861  } else {
4862  unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
4863  : X86::MOVZX16rr8;
4864  Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
4865  Chain = CurDAG->getEntryNode();
4866  }
4867  Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
4868  SDValue());
4869  InFlag = Chain.getValue(1);
4870  } else {
4871  InFlag =
4872  CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4873  LoReg, N0, SDValue()).getValue(1);
4874  if (isSigned && !signBitIsZero) {
4875  // Sign extend the low part into the high part.
4876  InFlag =
4877  SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
4878  } else {
4879  // Zero out the high part, effectively zero extending the input.
4880  SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
4881  switch (NVT.SimpleTy) {
4882  case MVT::i16:
4883  ClrNode =
4884  SDValue(CurDAG->getMachineNode(
4885  TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4886  CurDAG->getTargetConstant(X86::sub_16bit, dl,
4887  MVT::i32)),
4888  0);
4889  break;
4890  case MVT::i32:
4891  break;
4892  case MVT::i64:
4893  ClrNode =
4894  SDValue(CurDAG->getMachineNode(
4895  TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4896  CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4897  CurDAG->getTargetConstant(X86::sub_32bit, dl,
4898  MVT::i32)),
4899  0);
4900  break;
4901  default:
4902  llvm_unreachable("Unexpected division source");
4903  }
4904 
4905  InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4906  ClrNode, InFlag).getValue(1);
4907  }
4908  }
4909 
4910  if (foldedLoad) {
4911  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4912  InFlag };
4913  MachineSDNode *CNode =
4914  CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4915  InFlag = SDValue(CNode, 1);
4916  // Update the chain.
4917  ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4918  // Record the mem-refs
4919  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4920  } else {
4921  InFlag =
4922  SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
4923  }
4924 
4925  // Prevent use of AH in a REX instruction by explicitly copying it to
4926  // an ABCD_L register.
4927  //
4928  // The current assumption of the register allocator is that isel
4929  // won't generate explicit references to the GR8_ABCD_H registers. If
4930  // the allocator and/or the backend get enhanced to be more robust in
4931  // that regard, this can be, and should be, removed.
4932  if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
4933  SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
4934  unsigned AHExtOpcode =
4935  isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
4936 
4937  SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
4938  MVT::Glue, AHCopy, InFlag);
4939  SDValue Result(RNode, 0);
4940  InFlag = SDValue(RNode, 1);
4941 
4942  Result =
4943  CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
4944 
4945  ReplaceUses(SDValue(Node, 1), Result);
4946  LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4947  dbgs() << '\n');
4948  }
4949  // Copy the division (low) result, if it is needed.
4950  if (!SDValue(Node, 0).use_empty()) {
4951  SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4952  LoReg, NVT, InFlag);
4953  InFlag = Result.getValue(2);
4954  ReplaceUses(SDValue(Node, 0), Result);
4955  LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4956  dbgs() << '\n');
4957  }
4958  // Copy the remainder (high) result, if it is needed.
4959  if (!SDValue(Node, 1).use_empty()) {
4960  SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4961  HiReg, NVT, InFlag);
4962  InFlag = Result.getValue(2);
4963  ReplaceUses(SDValue(Node, 1), Result);
4964  LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4965  dbgs() << '\n');
4966  }
4967  CurDAG->RemoveDeadNode(Node);
4968  return;
4969  }
4970 
4971  case X86ISD::CMP: {
4972  SDValue N0 = Node->getOperand(0);
4973  SDValue N1 = Node->getOperand(1);
4974 
4975  // Optimizations for TEST compares.
4976  if (!isNullConstant(N1))
4977  break;
4978 
4979  // Save the original VT of the compare.
4980  MVT CmpVT = N0.getSimpleValueType();
4981 
4982  // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
4983  // by a test instruction. The test should be removed later by
4984  // analyzeCompare if we are using only the zero flag.
4985  // TODO: Should we check the users and use the BEXTR flags directly?
4986  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
4987  if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
4988  unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
4989  : X86::TEST32rr;
4990  SDValue BEXTR = SDValue(NewNode, 0);
4991  NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
4992  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4993  CurDAG->RemoveDeadNode(Node);
4994  return;
4995  }
4996  }
4997 
4998  // We can peek through truncates, but we need to be careful below.
4999  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5000  N0 = N0.getOperand(0);
5001 
5002  // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5003  // use a smaller encoding.
5004  // Look past the truncate if CMP is the only use of it.
5005  if (N0.getOpcode() == ISD::AND &&
5006  N0.getNode()->hasOneUse() &&
5007  N0.getValueType() != MVT::i8) {
5008  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5009  if (!C) break;
5010  uint64_t Mask = C->getZExtValue();
5011 
5012  // Check if we can replace AND+IMM64 with a shift. This is possible for
5013  // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
5014  // flag.
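    // Worked example: Mask = 0xFFFFFFFF00000000 has no 32-bit encoding,
    // but ~Mask is a low-bit mask, so countTrailingZeros(Mask) == 32 and
    // the AND with a 64-bit immediate can become
    //   shrq $32, %reg ; testq %reg, %reg
    // This preserves only the zero flag, hence the onlyUsesZeroFlag guard.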
5015  if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
5016  onlyUsesZeroFlag(SDValue(Node, 0))) {
5017  if (isMask_64(~Mask)) {
5018  unsigned TrailingZeros = countTrailingZeros(Mask);
5019  SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
5020  SDValue Shift =
5021  SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
5022  N0.getOperand(0), Imm), 0);
5023  MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5024  MVT::i32, Shift, Shift);
5025  ReplaceNode(Node, Test);
5026  return;
5027  }
5028  if (isMask_64(Mask)) {
5029  unsigned LeadingZeros = countLeadingZeros(Mask);
5030  SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
5031  SDValue Shift =
5032  SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
5033  N0.getOperand(0), Imm), 0);
5034  MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5035  MVT::i32, Shift, Shift);
5036  ReplaceNode(Node, Test);
5037  return;
5038  }
5039  }
5040 
5041  MVT VT;
5042  int SubRegOp;
5043  unsigned ROpc, MOpc;
5044 
5045  // For each of these checks we need to be careful if the sign flag is
5046  // being used. It is only safe to use the sign flag in two conditions,
5047  // either the sign bit in the shrunken mask is zero or the final test
5048  // size is equal to the original compare size.
5049 
5050  if (isUInt<8>(Mask) &&
5051  (!(Mask & 0x80) || CmpVT == MVT::i8 ||
5052  hasNoSignFlagUses(SDValue(Node, 0)))) {
5053  // For example, convert "testl %eax, $8" to "testb %al, $8"
5054  VT = MVT::i8;
5055  SubRegOp = X86::sub_8bit;
5056  ROpc = X86::TEST8ri;
5057  MOpc = X86::TEST8mi;
5058  } else if (OptForMinSize && isUInt<16>(Mask) &&
5059  (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
5060  hasNoSignFlagUses(SDValue(Node, 0)))) {
5061  // For example, convert "testl %eax, $32776" to "testw %ax, $32776".
5062  // NOTE: We only want to form TESTW instructions if optimizing for
5063  // min size. Otherwise we only save one byte and possibly get a
5064  // length-changing prefix penalty in the decoders.
5065  VT = MVT::i16;
5066  SubRegOp = X86::sub_16bit;
5067  ROpc = X86::TEST16ri;
5068  MOpc = X86::TEST16mi;
5069  } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
5070  ((!(Mask & 0x80000000) &&
5071  // Without minsize 16-bit Cmps can get here so we need to
5072  // be sure we calculate the correct sign flag if needed.
5073  (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
5074  CmpVT == MVT::i32 ||
5075  hasNoSignFlagUses(SDValue(Node, 0)))) {
5076  // For example, convert "testq %rax, $268468232" to "testl %eax, $268468232".
5077  // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
5078  // Otherwise, we find ourselves in a position where we have to do
5079  // promotion. If previous passes did not promote the and, we assume
5080  // they had a good reason not to and do not promote here.
5081  VT = MVT::i32;
5082  SubRegOp = X86::sub_32bit;
5083  ROpc = X86::TEST32ri;
5084  MOpc = X86::TEST32mi;
5085  } else {
5086  // No eligible transformation was found.
5087  break;
5088  }
5089 
5090  SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
5091  SDValue Reg = N0.getOperand(0);
5092 
5093  // Emit a testl or testw.
5094  MachineSDNode *NewNode;
5095  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5096  if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5097  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
5098  Reg.getOperand(0) };
5099  NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
5100  // Update the chain.
5101  ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
5102  // Record the mem-refs
5103  CurDAG->setNodeMemRefs(NewNode,
5104  {cast<LoadSDNode>(Reg)->getMemOperand()});
5105  } else {
5106  // Extract the subregister if necessary.
5107  if (N0.getValueType() != VT)
5108  Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
5109 
5110  NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
5111  }
5112  // Replace CMP with TEST.
5113  ReplaceNode(Node, NewNode);
5114  return;
5115  }
5116  break;
5117  }
5118  case X86ISD::PCMPISTR: {
5119  if (!Subtarget->hasSSE42())
5120  break;
5121 
5122  bool NeedIndex = !SDValue(Node, 0).use_empty();
5123  bool NeedMask = !SDValue(Node, 1).use_empty();
5124  // We can't fold a load if we are going to make two instructions.
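  // (PCMPISTRI returns its index in ECX and PCMPISTRM its mask in XMM0,
  // so when both results are live two instructions are emitted, and a
  // folded load would then be performed twice.)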
5125  bool MayFoldLoad = !NeedIndex || !NeedMask;
5126 
5127  MachineSDNode *CNode;
5128  if (NeedMask) {
5129  unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
5130  unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
5131  CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
5132  ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5133  }
5134  if (NeedIndex || !NeedMask) {
5135  unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
5136  unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
5137  CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
5138  ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5139  }
5140 
5141  // Connect the flag usage to the last instruction created.
5142  ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5143  CurDAG->RemoveDeadNode(Node);
5144  return;
5145  }
5146  case X86ISD::PCMPESTR: {
5147  if (!Subtarget->hasSSE42())
5148  break;
5149 
5150  // Copy the two implicit register inputs.
5151  SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
5152  Node->getOperand(1),
5153  SDValue()).getValue(1);
5154  InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
5155  Node->getOperand(3), InFlag).getValue(1);
5156 
5157  bool NeedIndex = !SDValue(Node, 0).use_empty();
5158  bool NeedMask = !SDValue(Node, 1).use_empty();
5159  // We can't fold a load if we are going to make two instructions.
5160  bool MayFoldLoad = !NeedIndex || !NeedMask;
5161 
5162  MachineSDNode *CNode;
5163  if (NeedMask) {
5164  unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
5165  unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
5166  CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
5167  InFlag);
5168  ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5169  }
5170  if (NeedIndex || !NeedMask) {
5171  unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
5172  unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
5173  CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
5174  ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5175  }
5176  // Connect the flag usage to the last instruction created.
5177  ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5178  CurDAG->RemoveDeadNode(Node);
5179  return;
5180  }
5181 
5182  case ISD::SETCC: {
5183  if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
5184  return;
5185 
5186  break;
5187  }
5188 
5189  case ISD::STORE:
5190  if (foldLoadStoreIntoMemOperand(Node))
5191  return;
5192  break;
5193  case ISD::FCEIL:
5194  case ISD::FFLOOR:
5195  case ISD::FTRUNC:
5196  case ISD::FNEARBYINT:
5197  case ISD::FRINT: {
5198  // Replace fp rounding ops with their X86-specific equivalents so we
5199  // don't need two sets of patterns.
5200  // FIXME: This can only happen when the nodes started as STRICT_* and have
5201  // been mutated into their non-STRICT equivalents. Eventually this
5202  // mutation will be removed and we should switch the STRICT_ nodes to a
5203  // strict version of RNDSCALE in PreProcessISelDAG.
5204  unsigned Imm;
5205  switch (Node->getOpcode()) {
5206  default: llvm_unreachable("Unexpected opcode!");
5207  case ISD::FCEIL: Imm = 0xA; break;
5208  case ISD::FFLOOR: Imm = 0x9; break;
5209  case ISD::FTRUNC: Imm = 0xB; break;
5210  case ISD::FNEARBYINT: Imm = 0xC; break;
5211  case ISD::FRINT: Imm = 0x4; break;
5212  }
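  // The immediate uses the SSE4.1 ROUND* / AVX-512 RNDSCALE encoding:
  // bits [1:0] pick the rounding mode (00 nearest, 01 down, 10 up,
  // 11 truncate), bit 2 selects the MXCSR mode instead, and bit 3
  // suppresses the precision (inexact) exception. So FRINT (0x4) follows
  // MXCSR and reports inexact, while FNEARBYINT (0xC) follows MXCSR with
  // inexact suppressed.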
5213  SDLoc dl(Node);
5214  SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0),
5215  Node->getOperand(0),
5216  CurDAG->getTargetConstant(Imm, dl, MVT::i8));
5217  ReplaceNode(Node, Res.getNode());
5218  SelectCode(Res.getNode());
5219  return;
5220  }
5221  }
5222 
5223  SelectCode(Node);
5224 }
5225 
5226 bool X86DAGToDAGISel::
5227 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
5228  std::vector<SDValue> &OutOps) {
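  // An X86 memory operand expands to five operands: base register, scale,
  // index register, displacement, and segment register.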
5229  SDValue Op0, Op1, Op2, Op3, Op4;
5230  switch (ConstraintID) {
5231  default:
5232  llvm_unreachable("Unexpected asm memory constraint");
5233  case InlineAsm::Constraint_i:
5234  // FIXME: It seems strange that 'i' is needed here since it's supposed to
5235  // be an immediate and not a memory constraint.
5236    LLVM_FALLTHROUGH;
5237  case InlineAsm::Constraint_o: // offsetable ??
5238  case InlineAsm::Constraint_v: // not offsetable ??
5239  case InlineAsm::Constraint_m: // memory
5240  case InlineAsm::Constraint_X:
5241  if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
5242  return true;
5243  break;
5244  }
5245 
5246  OutOps.push_back(Op0);
5247  OutOps.push_back(Op1);
5248  OutOps.push_back(Op2);
5249  OutOps.push_back(Op3);
5250  OutOps.push_back(Op4);
5251  return false;
5252 }
5253 
5254 /// This pass converts a legalized DAG into a X86-specific DAG,
5255 /// ready for instruction scheduling.
5256 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
5257  CodeGenOpt::Level OptLevel) {
5258  return new X86DAGToDAGISel(TM, OptLevel);
5259 }