1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized DAG to an X86 DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86.h"
15 #include "X86MachineFunctionInfo.h"
16 #include "X86RegisterInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Config/llvm-config.h"
24 #include "llvm/IR/ConstantRange.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/KnownBits.h"
36 #include <stdint.h>
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "x86-isel"
40 
41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42 
43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44  cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45  cl::Hidden);
46 
47 //===----------------------------------------------------------------------===//
48 // Pattern Matcher Implementation
49 //===----------------------------------------------------------------------===//
50 
51 namespace {
52  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
53  /// numbers for the leaves of the matched tree.
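 /// The matched form mirrors the x86 memory operand
 /// Segment:[Base + Scale*Index + Disp], where the base is either a register
 /// or a frame index and the displacement may additionally carry a single
 /// symbolic reference (global, constant pool, external symbol, jump table,
 /// MC symbol or block address).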
54  struct X86ISelAddressMode {
55  enum {
56  RegBase,
57  FrameIndexBase
58  } BaseType;
59 
60  // This is really a union, discriminated by BaseType!
61  SDValue Base_Reg;
62  int Base_FrameIndex;
63 
64  unsigned Scale;
65  SDValue IndexReg;
66  int32_t Disp;
67  SDValue Segment;
68  const GlobalValue *GV;
69  const Constant *CP;
70  const BlockAddress *BlockAddr;
71  const char *ES;
72  MCSymbol *MCSym;
73  int JT;
74  unsigned Align; // CP alignment.
75  unsigned char SymbolFlags; // X86II::MO_*
76  bool NegateIndex = false;
77 
78  X86ISelAddressMode()
79  : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
80  Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
81  MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
82 
83  bool hasSymbolicDisplacement() const {
84  return GV != nullptr || CP != nullptr || ES != nullptr ||
85  MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
86  }
87 
88  bool hasBaseOrIndexReg() const {
89  return BaseType == FrameIndexBase ||
90  IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
91  }
92 
93  /// Return true if this addressing mode is already RIP-relative.
94  bool isRIPRelative() const {
95  if (BaseType != RegBase) return false;
96  if (RegisterSDNode *RegNode =
97  dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
98  return RegNode->getReg() == X86::RIP;
99  return false;
100  }
101 
102  void setBaseReg(SDValue Reg) {
103  BaseType = RegBase;
104  Base_Reg = Reg;
105  }
106 
107 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
108  void dump(SelectionDAG *DAG = nullptr) {
109  dbgs() << "X86ISelAddressMode " << this << '\n';
110  dbgs() << "Base_Reg ";
111  if (Base_Reg.getNode())
112  Base_Reg.getNode()->dump(DAG);
113  else
114  dbgs() << "nul\n";
115  if (BaseType == FrameIndexBase)
116  dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
117  dbgs() << " Scale " << Scale << '\n'
118  << "IndexReg ";
119  if (NegateIndex)
120  dbgs() << "negate ";
121  if (IndexReg.getNode())
122  IndexReg.getNode()->dump(DAG);
123  else
124  dbgs() << "nul\n";
125  dbgs() << " Disp " << Disp << '\n'
126  << "GV ";
127  if (GV)
128  GV->dump();
129  else
130  dbgs() << "nul";
131  dbgs() << " CP ";
132  if (CP)
133  CP->dump();
134  else
135  dbgs() << "nul";
136  dbgs() << '\n'
137  << "ES ";
138  if (ES)
139  dbgs() << ES;
140  else
141  dbgs() << "nul";
142  dbgs() << " MCSym ";
143  if (MCSym)
144  dbgs() << MCSym;
145  else
146  dbgs() << "nul";
147  dbgs() << " JT" << JT << " Align" << Align << '\n';
148  }
149 #endif
150  };
151 }
152 
153 namespace {
154  //===--------------------------------------------------------------------===//
155  /// ISel - X86-specific code to select X86 machine instructions for
156  /// SelectionDAG operations.
157  ///
158  class X86DAGToDAGISel final : public SelectionDAGISel {
159  /// Keep a pointer to the X86Subtarget around so that we can
160  /// make the right decision when generating code for different targets.
161  const X86Subtarget *Subtarget;
162 
163  /// If true, selector should try to optimize for code size instead of
164  /// performance.
165  bool OptForSize;
166 
167  /// If true, selector should try to optimize for minimum code size.
168  bool OptForMinSize;
169 
170  /// Disable direct TLS access through segment registers.
171  bool IndirectTlsSegRefs;
172 
173  public:
174  explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
175  : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
176  OptForMinSize(false), IndirectTlsSegRefs(false) {}
177 
178  StringRef getPassName() const override {
179  return "X86 DAG->DAG Instruction Selection";
180  }
181 
182  bool runOnMachineFunction(MachineFunction &MF) override {
183  // Reset the subtarget each time through.
184  Subtarget = &MF.getSubtarget<X86Subtarget>();
185  IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
186  "indirect-tls-seg-refs");
187 
188  // OptFor[Min]Size are used in pattern predicates that isel is matching.
189  OptForSize = MF.getFunction().hasOptSize();
190  OptForMinSize = MF.getFunction().hasMinSize();
191  assert((!OptForMinSize || OptForSize) &&
192  "OptForMinSize implies OptForSize");
193 
194  SelectionDAGISel::runOnMachineFunction(MF);
195  return true;
196  }
197 
198  void EmitFunctionEntryCode() override;
199 
200  bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
201 
202  void PreprocessISelDAG() override;
203  void PostprocessISelDAG() override;
204 
205 // Include the pieces autogenerated from the target description.
206 #include "X86GenDAGISel.inc"
207 
208  private:
209  void Select(SDNode *N) override;
210 
211  bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
212  bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
213  bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
214  bool matchAddress(SDValue N, X86ISelAddressMode &AM);
215  bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
216  bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
217  bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218  unsigned Depth);
219  bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
220  bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
221  SDValue &Scale, SDValue &Index, SDValue &Disp,
222  SDValue &Segment);
223  bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
224  SDValue &Scale, SDValue &Index, SDValue &Disp,
225  SDValue &Segment);
226  bool selectMOV64Imm32(SDValue N, SDValue &Imm);
227  bool selectLEAAddr(SDValue N, SDValue &Base,
228  SDValue &Scale, SDValue &Index, SDValue &Disp,
229  SDValue &Segment);
230  bool selectLEA64_32Addr(SDValue N, SDValue &Base,
231  SDValue &Scale, SDValue &Index, SDValue &Disp,
232  SDValue &Segment);
233  bool selectTLSADDRAddr(SDValue N, SDValue &Base,
234  SDValue &Scale, SDValue &Index, SDValue &Disp,
235  SDValue &Segment);
236  bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
237  SDValue &Base, SDValue &Scale,
238  SDValue &Index, SDValue &Disp,
239  SDValue &Segment,
240  SDValue &NodeWithChain);
241  bool selectRelocImm(SDValue N, SDValue &Op);
242 
243  bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
244  SDValue &Base, SDValue &Scale,
245  SDValue &Index, SDValue &Disp,
246  SDValue &Segment);
247 
248  // Convenience method where P is also root.
249  bool tryFoldLoad(SDNode *P, SDValue N,
250  SDValue &Base, SDValue &Scale,
251  SDValue &Index, SDValue &Disp,
252  SDValue &Segment) {
253  return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
254  }
255 
256  /// Implement addressing mode selection for inline asm expressions.
257  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
258  unsigned ConstraintID,
259  std::vector<SDValue> &OutOps) override;
260 
261  void emitSpecialCodeForMain();
262 
263  inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
264  MVT VT, SDValue &Base, SDValue &Scale,
265  SDValue &Index, SDValue &Disp,
266  SDValue &Segment) {
267  if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
268  Base = CurDAG->getTargetFrameIndex(
269  AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
270  else if (AM.Base_Reg.getNode())
271  Base = AM.Base_Reg;
272  else
273  Base = CurDAG->getRegister(0, VT);
274 
275  Scale = getI8Imm(AM.Scale, DL);
276 
277  // Negate the index if needed.
278  if (AM.NegateIndex) {
279  unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
280  SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
281  AM.IndexReg), 0);
282  AM.IndexReg = Neg;
283  }
284 
285  if (AM.IndexReg.getNode())
286  Index = AM.IndexReg;
287  else
288  Index = CurDAG->getRegister(0, VT);
289 
290  // These are 32-bit even in 64-bit mode since RIP-relative offset
291  // is 32-bit.
292  if (AM.GV)
293  Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
294  MVT::i32, AM.Disp,
295  AM.SymbolFlags);
296  else if (AM.CP)
297  Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
298  AM.Align, AM.Disp, AM.SymbolFlags);
299  else if (AM.ES) {
300  assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
301  Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
302  } else if (AM.MCSym) {
303  assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
304  assert(AM.SymbolFlags == 0 && "oo");
305  Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
306  } else if (AM.JT != -1) {
307  assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
308  Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
309  } else if (AM.BlockAddr)
310  Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
311  AM.SymbolFlags);
312  else
313  Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
314 
315  if (AM.Segment.getNode())
316  Segment = AM.Segment;
317  else
318  Segment = CurDAG->getRegister(0, MVT::i16);
319  }
320 
321  // Utility function to determine whether we should avoid selecting
322  // immediate forms of instructions for better code size or not.
323  // At a high level, we'd like to avoid such instructions when
324  // we have similar constants used within the same basic block
325  // that can be kept in a register.
326  //
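 // For example (illustrative, not from the original comment): if the same
 // 32-bit constant is stored to several slots in one block, materializing it
 // once in a register and reusing that register avoids repeating the 4-byte
 // immediate in every store encoding.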
327  bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
328  uint32_t UseCount = 0;
329 
330  // Do not want to hoist if we're not optimizing for size.
331  // TODO: We'd like to remove this restriction.
332  // See the comment in X86InstrInfo.td for more info.
333  if (!OptForSize)
334  return false;
335 
336  // Walk all the users of the immediate.
337  for (SDNode::use_iterator UI = N->use_begin(),
338  UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
339 
340  SDNode *User = *UI;
341 
342  // This user is already selected. Count it as a legitimate use and
343  // move on.
344  if (User->isMachineOpcode()) {
345  UseCount++;
346  continue;
347  }
348 
349  // We want to count stores of immediates as real uses.
350  if (User->getOpcode() == ISD::STORE &&
351  User->getOperand(1).getNode() == N) {
352  UseCount++;
353  continue;
354  }
355 
356  // We don't currently match users that have > 2 operands (except
357  // for stores, which are handled above)
 358  // Those instructions won't match in ISel, for now, and would
359  // be counted incorrectly.
360  // This may change in the future as we add additional instruction
361  // types.
362  if (User->getNumOperands() != 2)
363  continue;
364 
365  // Immediates that are used for offsets as part of stack
366  // manipulation should be left alone. These are typically
367  // used to indicate SP offsets for argument passing and
368  // will get pulled into stores/pushes (implicitly).
369  if (User->getOpcode() == X86ISD::ADD ||
370  User->getOpcode() == ISD::ADD ||
371  User->getOpcode() == X86ISD::SUB ||
372  User->getOpcode() == ISD::SUB) {
373 
374  // Find the other operand of the add/sub.
375  SDValue OtherOp = User->getOperand(0);
376  if (OtherOp.getNode() == N)
377  OtherOp = User->getOperand(1);
378 
379  // Don't count if the other operand is SP.
380  RegisterSDNode *RegNode;
381  if (OtherOp->getOpcode() == ISD::CopyFromReg &&
382  (RegNode = dyn_cast_or_null<RegisterSDNode>(
383  OtherOp->getOperand(1).getNode())))
384  if ((RegNode->getReg() == X86::ESP) ||
385  (RegNode->getReg() == X86::RSP))
386  continue;
387  }
388 
389  // ... otherwise, count this and move on.
390  UseCount++;
391  }
392 
393  // If we have more than 1 use, then recommend for hoisting.
394  return (UseCount > 1);
395  }
396 
397  /// Return a target constant with the specified value of type i8.
398  inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
399  return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
400  }
401 
402  /// Return a target constant with the specified value, of type i32.
403  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
404  return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
405  }
406 
407  /// Return a target constant with the specified value, of type i64.
408  inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
409  return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
410  }
411 
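 // The next two helpers compute the lane immediate for VEXTRACT*/VINSERT*
 // nodes. For example, extracting elements [4..7] of a v8i32 (Index = 4)
 // with VecWidth = 128 gives (4 * 32) / 128 = 1, i.e. the upper 128-bit lane.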
412  SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
413  const SDLoc &DL) {
414  assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
415  uint64_t Index = N->getConstantOperandVal(1);
416  MVT VecVT = N->getOperand(0).getSimpleValueType();
417  return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
418  }
419 
420  SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
421  const SDLoc &DL) {
422  assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
423  uint64_t Index = N->getConstantOperandVal(2);
424  MVT VecVT = N->getSimpleValueType(0);
425  return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
426  }
427 
428  // Helper to detect unneeded and instructions on shift amounts. Called
429  // from PatFrags in tablegen.
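 // For example, in (shl X, (and Y, 31)) with a 32-bit shift (Width = 5 here),
 // the hardware already masks the count modulo 32, so an AND whose mask has
 // at least 5 trailing ones is redundant and can be ignored.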
430  bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
431  assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
432  const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
433 
434  if (Val.countTrailingOnes() >= Width)
435  return true;
436 
437  APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
438  return Mask.countTrailingOnes() >= Width;
439  }
440 
441  /// Return an SDNode that returns the value of the global base register.
442  /// Output instructions required to initialize the global base register,
443  /// if necessary.
444  SDNode *getGlobalBaseReg();
445 
446  /// Return a reference to the TargetMachine, casted to the target-specific
447  /// type.
448  const X86TargetMachine &getTargetMachine() const {
449  return static_cast<const X86TargetMachine &>(TM);
450  }
451 
452  /// Return a reference to the TargetInstrInfo, casted to the target-specific
453  /// type.
454  const X86InstrInfo *getInstrInfo() const {
455  return Subtarget->getInstrInfo();
456  }
457 
458  /// Address-mode matching performs shift-of-and to and-of-shift
459  /// reassociation in order to expose more scaled addressing
460  /// opportunities.
461  bool ComplexPatternFuncMutatesDAG() const override {
462  return true;
463  }
464 
465  bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
466 
467  /// Returns whether this is a relocatable immediate in the range
468  /// [-2^Width .. 2^Width-1].
469  template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
470  if (auto *CN = dyn_cast<ConstantSDNode>(N))
471  return isInt<Width>(CN->getSExtValue());
472  return isSExtAbsoluteSymbolRef(Width, N);
473  }
474 
475  // Indicates we should prefer to use a non-temporal load for this load.
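 // Non-temporal vector loads select to (V)MOVNTDQA, which needs SSE4.1 for
 // 128-bit, AVX2 for 256-bit and AVX-512 for 512-bit operands, and requires
 // the access to be naturally aligned; scalar 4- and 8-byte loads have no
 // non-temporal load instruction, hence the checks below.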
476  bool useNonTemporalLoad(LoadSDNode *N) const {
477  if (!N->isNonTemporal())
478  return false;
479 
480  unsigned StoreSize = N->getMemoryVT().getStoreSize();
481 
482  if (N->getAlignment() < StoreSize)
483  return false;
484 
485  switch (StoreSize) {
486  default: llvm_unreachable("Unsupported store size");
487  case 4:
488  case 8:
489  return false;
490  case 16:
491  return Subtarget->hasSSE41();
492  case 32:
493  return Subtarget->hasAVX2();
494  case 64:
495  return Subtarget->hasAVX512();
496  }
497  }
498 
499  bool foldLoadStoreIntoMemOperand(SDNode *Node);
500  MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
501  bool matchBitExtract(SDNode *Node);
502  bool shrinkAndImmediate(SDNode *N);
503  bool isMaskZeroExtended(SDNode *N) const;
504  bool tryShiftAmountMod(SDNode *N);
505  bool tryShrinkShlLogicImm(SDNode *N);
506  bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
507 
508  MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
509  const SDLoc &dl, MVT VT, SDNode *Node);
510  MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
511  const SDLoc &dl, MVT VT, SDNode *Node,
512  SDValue &InFlag);
513 
514  bool tryOptimizeRem8Extend(SDNode *N);
515 
516  bool onlyUsesZeroFlag(SDValue Flags) const;
517  bool hasNoSignFlagUses(SDValue Flags) const;
518  bool hasNoCarryFlagUses(SDValue Flags) const;
519  };
520 }
521 
522 
523 // Returns true if this masked compare can be implemented legally with this
524 // type.
525 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
526  unsigned Opcode = N->getOpcode();
527  if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
528  Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
529  // We can get 256-bit 8 element types here without VLX being enabled. When
530  // this happens we will use 512-bit operations and the mask will not be
531  // zero extended.
532  EVT OpVT = N->getOperand(0).getValueType();
533  if (OpVT.is256BitVector() || OpVT.is128BitVector())
534  return Subtarget->hasVLX();
535 
536  return true;
537  }
538  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
539  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
540  Opcode == X86ISD::FSETCCM_SAE)
541  return true;
542 
543  return false;
544 }
545 
546 // Returns true if we can assume the writer of the mask has zero extended it
547 // for us.
548 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
549  // If this is an AND, check if we have a compare on either side. As long as
550  // one side guarantees the mask is zero extended, the AND will preserve those
551  // zeros.
552  if (N->getOpcode() == ISD::AND)
553  return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
554  isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
555 
556  return isLegalMaskCompare(N, Subtarget);
557 }
558 
559 bool
560 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
561  if (OptLevel == CodeGenOpt::None) return false;
562 
563  if (!N.hasOneUse())
564  return false;
565 
566  if (N.getOpcode() != ISD::LOAD)
567  return true;
568 
569  // Don't fold non-temporal loads if we have an instruction for them.
570  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
571  return false;
572 
573  // If N is a load, do additional profitability checks.
574  if (U == Root) {
575  switch (U->getOpcode()) {
576  default: break;
577  case X86ISD::ADD:
578  case X86ISD::ADC:
579  case X86ISD::SUB:
580  case X86ISD::SBB:
581  case X86ISD::AND:
582  case X86ISD::XOR:
583  case X86ISD::OR:
584  case ISD::ADD:
585  case ISD::ADDCARRY:
586  case ISD::AND:
587  case ISD::OR:
588  case ISD::XOR: {
589  SDValue Op1 = U->getOperand(1);
590 
591  // If the other operand is a 8-bit immediate we should fold the immediate
592  // instead. This reduces code size.
593  // e.g.
594  // movl 4(%esp), %eax
595  // addl $4, %eax
596  // vs.
597  // movl $4, %eax
598  // addl 4(%esp), %eax
599  // The former is 2 bytes shorter. In case where the increment is 1, then
600  // the saving can be 4 bytes (by using incl %eax).
601  if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
602  if (Imm->getAPIntValue().isSignedIntN(8))
603  return false;
604 
605  // If this is a 64-bit AND with an immediate that fits in 32-bits,
606  // prefer using the smaller and over folding the load. This is needed to
607  // make sure immediates created by shrinkAndImmediate are always folded.
608  // Ideally we would narrow the load during DAG combine and get the
609  // best of both worlds.
610  if (U->getOpcode() == ISD::AND &&
611  Imm->getAPIntValue().getBitWidth() == 64 &&
612  Imm->getAPIntValue().isIntN(32))
613  return false;
614 
 615  // If this is really a zext_inreg that can be represented with a movzx
616  // instruction, prefer that.
617  // TODO: We could shrink the load and fold if it is non-volatile.
618  if (U->getOpcode() == ISD::AND &&
619  (Imm->getAPIntValue() == UINT8_MAX ||
620  Imm->getAPIntValue() == UINT16_MAX ||
621  Imm->getAPIntValue() == UINT32_MAX))
622  return false;
623 
 624  // ADD/SUB can negate the immediate and use the opposite operation
 625  // to fit 128 into a sign-extended 8-bit immediate.
626  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
627  (-Imm->getAPIntValue()).isSignedIntN(8))
628  return false;
629  }
630 
631  // If the other operand is a TLS address, we should fold it instead.
632  // This produces
633  // movl %gs:0, %eax
634  // leal i@NTPOFF(%eax), %eax
635  // instead of
636  // movl $i@NTPOFF, %eax
637  // addl %gs:0, %eax
638  // if the block also has an access to a second TLS address this will save
639  // a load.
640  // FIXME: This is probably also true for non-TLS addresses.
641  if (Op1.getOpcode() == X86ISD::Wrapper) {
642  SDValue Val = Op1.getOperand(0);
 643  if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
 644  return false;
645  }
646 
647  // Don't fold load if this matches the BTS/BTR/BTC patterns.
648  // BTS: (or X, (shl 1, n))
649  // BTR: (and X, (rotl -2, n))
650  // BTC: (xor X, (shl 1, n))
651  if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
652  if (U->getOperand(0).getOpcode() == ISD::SHL &&
 653  isOneConstant(U->getOperand(0).getOperand(0)))
 654  return false;
655 
656  if (U->getOperand(1).getOpcode() == ISD::SHL &&
 657  isOneConstant(U->getOperand(1).getOperand(0)))
 658  return false;
659  }
660  if (U->getOpcode() == ISD::AND) {
661  SDValue U0 = U->getOperand(0);
662  SDValue U1 = U->getOperand(1);
663  if (U0.getOpcode() == ISD::ROTL) {
664  auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
665  if (C && C->getSExtValue() == -2)
666  return false;
667  }
668 
669  if (U1.getOpcode() == ISD::ROTL) {
670  auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
671  if (C && C->getSExtValue() == -2)
672  return false;
673  }
674  }
675 
676  break;
677  }
678  case ISD::SHL:
679  case ISD::SRA:
680  case ISD::SRL:
681  // Don't fold a load into a shift by immediate. The BMI2 instructions
682  // support folding a load, but not an immediate. The legacy instructions
683  // support folding an immediate, but can't fold a load. Folding an
684  // immediate is preferable to folding a load.
685  if (isa<ConstantSDNode>(U->getOperand(1)))
686  return false;
687 
688  break;
689  }
690  }
691 
 692  // Prevent folding a load if this can be implemented with an insert_subreg or
693  // a move that implicitly zeroes.
694  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
695  isNullConstant(Root->getOperand(2)) &&
696  (Root->getOperand(0).isUndef() ||
 697  ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
 698  return false;
699 
700  return true;
701 }
702 
703 /// Replace the original chain operand of the call with
704 /// load's chain operand and move load below the call's chain operand.
705 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
706  SDValue Call, SDValue OrigChain) {
707  SmallVector<SDValue, 8> Ops;
708  SDValue Chain = OrigChain.getOperand(0);
709  if (Chain.getNode() == Load.getNode())
710  Ops.push_back(Load.getOperand(0));
711  else {
712  assert(Chain.getOpcode() == ISD::TokenFactor &&
713  "Unexpected chain operand");
714  for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
715  if (Chain.getOperand(i).getNode() == Load.getNode())
716  Ops.push_back(Load.getOperand(0));
717  else
718  Ops.push_back(Chain.getOperand(i));
719  SDValue NewChain =
720  CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
721  Ops.clear();
722  Ops.push_back(NewChain);
723  }
724  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
725  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
726  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
727  Load.getOperand(1), Load.getOperand(2));
728 
729  Ops.clear();
730  Ops.push_back(SDValue(Load.getNode(), 1));
731  Ops.append(Call->op_begin() + 1, Call->op_end());
732  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
733 }
734 
735 /// Return true if call address is a load and it can be
736 /// moved below CALLSEQ_START and the chains leading up to the call.
737 /// Return the CALLSEQ_START by reference as a second output.
738 /// In the case of a tail call, there isn't a callseq node between the call
739 /// chain and the load.
740 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
741  // The transformation is somewhat dangerous if the call's chain was glued to
742  // the call. After MoveBelowOrigChain the load is moved between the call and
743  // the chain, this can create a cycle if the load is not folded. So it is
744  // *really* important that we are sure the load will be folded.
745  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
746  return false;
747  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
748  if (!LD ||
749  LD->isVolatile() ||
 750  LD->getAddressingMode() != ISD::UNINDEXED ||
 751  LD->getExtensionType() != ISD::NON_EXTLOAD)
 752  return false;
753 
754  // Now let's find the callseq_start.
755  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
756  if (!Chain.hasOneUse())
757  return false;
758  Chain = Chain.getOperand(0);
759  }
760 
761  if (!Chain.getNumOperands())
762  return false;
763  // Since we are not checking for AA here, conservatively abort if the chain
764  // writes to memory. It's not safe to move the callee (a load) across a store.
765  if (isa<MemSDNode>(Chain.getNode()) &&
766  cast<MemSDNode>(Chain.getNode())->writeMem())
767  return false;
768  if (Chain.getOperand(0).getNode() == Callee.getNode())
769  return true;
770  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
771  Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
772  Callee.getValue(1).hasOneUse())
773  return true;
774  return false;
775 }
776 
777 void X86DAGToDAGISel::PreprocessISelDAG() {
778  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
779  E = CurDAG->allnodes_end(); I != E; ) {
780  SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
781 
782  // If this is a target specific AND node with no flag usages, turn it back
783  // into ISD::AND to enable test instruction matching.
784  if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
785  SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
786  N->getOperand(0), N->getOperand(1));
787  --I;
788  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
789  ++I;
790  CurDAG->DeleteNode(N);
791  continue;
792  }
793 
794  switch (N->getOpcode()) {
795  case ISD::SHL:
796  case ISD::SRA:
797  case ISD::SRL: {
798  // Replace vector shifts with their X86 specific equivalent so we don't
799  // need 2 sets of patterns.
800  if (!N->getValueType(0).isVector())
801  break;
802 
803  unsigned NewOpc;
804  switch (N->getOpcode()) {
805  default: llvm_unreachable("Unexpected opcode!");
806  case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
807  case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
808  case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
809  }
810  SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
811  N->getOperand(0), N->getOperand(1));
812  --I;
813  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
814  ++I;
815  CurDAG->DeleteNode(N);
816  continue;
817  }
 818  case ISD::ANY_EXTEND:
 819  case ISD::ANY_EXTEND_VECTOR_INREG: {
820  // Replace vector any extend with the zero extend equivalents so we don't
821  // need 2 sets of patterns. Ignore vXi1 extensions.
 822  if (!N->getValueType(0).isVector() ||
 823  N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
 824  break;
825 
 826  unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
 827  ? ISD::ZERO_EXTEND
 828  : ISD::ZERO_EXTEND_VECTOR_INREG;
 829 
830  SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
831  N->getOperand(0));
832  --I;
833  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
834  ++I;
835  CurDAG->DeleteNode(N);
836  continue;
837  }
838  case ISD::FCEIL:
839  case ISD::FFLOOR:
840  case ISD::FTRUNC:
841  case ISD::FNEARBYINT:
842  case ISD::FRINT: {
843  // Replace fp rounding with their X86 specific equivalent so we don't
844  // need 2 sets of patterns.
845  unsigned Imm;
846  switch (N->getOpcode()) {
847  default: llvm_unreachable("Unexpected opcode!");
848  case ISD::FCEIL: Imm = 0xA; break;
849  case ISD::FFLOOR: Imm = 0x9; break;
850  case ISD::FTRUNC: Imm = 0xB; break;
851  case ISD::FNEARBYINT: Imm = 0xC; break;
852  case ISD::FRINT: Imm = 0x4; break;
853  }
854  SDLoc dl(N);
855  SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
856  N->getValueType(0),
857  N->getOperand(0),
858  CurDAG->getConstant(Imm, dl, MVT::i8));
859  --I;
860  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
861  ++I;
862  CurDAG->DeleteNode(N);
863  continue;
864  }
865  case X86ISD::FANDN:
866  case X86ISD::FAND:
867  case X86ISD::FOR:
868  case X86ISD::FXOR: {
869  // Widen scalar fp logic ops to vector to reduce isel patterns.
870  // FIXME: Can we do this during lowering/combine.
871  MVT VT = N->getSimpleValueType(0);
872  if (VT.isVector() || VT == MVT::f128)
873  break;
874 
875  MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
876  SDLoc dl(N);
877  SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
878  N->getOperand(0));
879  SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
880  N->getOperand(1));
881 
882  SDValue Res;
883  if (Subtarget->hasSSE2()) {
884  EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
885  Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
886  Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
887  unsigned Opc;
888  switch (N->getOpcode()) {
889  default: llvm_unreachable("Unexpected opcode!");
890  case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
891  case X86ISD::FAND: Opc = ISD::AND; break;
892  case X86ISD::FOR: Opc = ISD::OR; break;
893  case X86ISD::FXOR: Opc = ISD::XOR; break;
894  }
895  Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
896  Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
897  } else {
898  Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
899  }
900  Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
901  CurDAG->getIntPtrConstant(0, dl));
902  --I;
903  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
904  ++I;
905  CurDAG->DeleteNode(N);
906  continue;
907  }
908  }
909 
910  if (OptLevel != CodeGenOpt::None &&
911  // Only do this when the target can fold the load into the call or
912  // jmp.
913  !Subtarget->useRetpolineIndirectCalls() &&
914  ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
915  (N->getOpcode() == X86ISD::TC_RETURN &&
916  (Subtarget->is64Bit() ||
917  !getTargetMachine().isPositionIndependent())))) {
918  /// Also try moving call address load from outside callseq_start to just
919  /// before the call to allow it to be folded.
920  ///
 921  ///     [Load chain]
 922  ///         ^
 923  ///         |
 924  ///       [Load]
 925  ///       ^    ^
 926  ///       |    |
 927  ///      /      \--
 928  ///     /          |
 929  ///[CALLSEQ_START] |
 930  ///     ^          |
 931  ///     |          |
 932  /// [LOAD/C2Reg]   |
 933  ///     |          |
 934  ///      \        /
 935  ///       \      /
 936  ///       [CALL]
937  bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
938  SDValue Chain = N->getOperand(0);
939  SDValue Load = N->getOperand(1);
940  if (!isCalleeLoad(Load, Chain, HasCallSeq))
941  continue;
942  moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
943  ++NumLoadMoved;
944  continue;
945  }
946 
 947  // Lower fpround and fpextend nodes that target the FP stack to be a store and
 948  // a load to the stack. This is a gross hack. We would like to simply mark
949  // these as being illegal, but when we do that, legalize produces these when
950  // it expands calls, then expands these in the same legalize pass. We would
951  // like dag combine to be able to hack on these between the call expansion
952  // and the node legalization. As such this pass basically does "really
953  // late" legalization of these inline with the X86 isel pass.
954  // FIXME: This should only happen when not compiled with -O0.
955  switch (N->getOpcode()) {
956  default: continue;
957  case ISD::FP_ROUND:
958  case ISD::FP_EXTEND:
959  {
960  MVT SrcVT = N->getOperand(0).getSimpleValueType();
961  MVT DstVT = N->getSimpleValueType(0);
962 
963  // If any of the sources are vectors, no fp stack involved.
964  if (SrcVT.isVector() || DstVT.isVector())
965  continue;
966 
967  // If the source and destination are SSE registers, then this is a legal
968  // conversion that should not be lowered.
969  const X86TargetLowering *X86Lowering =
970  static_cast<const X86TargetLowering *>(TLI);
971  bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
972  bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
973  if (SrcIsSSE && DstIsSSE)
974  continue;
975 
976  if (!SrcIsSSE && !DstIsSSE) {
977  // If this is an FPStack extension, it is a noop.
978  if (N->getOpcode() == ISD::FP_EXTEND)
979  continue;
980  // If this is a value-preserving FPStack truncation, it is a noop.
981  if (N->getConstantOperandVal(1))
982  continue;
983  }
984 
985  // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
986  // FPStack has extload and truncstore. SSE can fold direct loads into other
987  // operations. Based on this, decide what we want to do.
988  MVT MemVT;
989  if (N->getOpcode() == ISD::FP_ROUND)
990  MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
991  else
992  MemVT = SrcIsSSE ? SrcVT : DstVT;
993 
994  SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
995  SDLoc dl(N);
996 
997  // FIXME: optimize the case where the src/dest is a load or store?
998 
999  SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
1000  MemTmp, MachinePointerInfo(), MemVT);
1001  SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1002  MachinePointerInfo(), MemVT);
1003 
1004  // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1005  // extload we created. This will cause general havok on the dag because
1006  // anything below the conversion could be folded into other existing nodes.
1007  // To avoid invalidating 'I', back it up to the convert node.
1008  --I;
1009  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1010  break;
1011  }
1012 
 1013  // The sequence of events for lowering STRICT_FP versions of these nodes requires
 1014  // dealing with the chain differently, as there is already a preexisting chain.
1015  case ISD::STRICT_FP_ROUND:
1016  case ISD::STRICT_FP_EXTEND:
1017  {
1018  MVT SrcVT = N->getOperand(1).getSimpleValueType();
1019  MVT DstVT = N->getSimpleValueType(0);
1020 
1021  // If any of the sources are vectors, no fp stack involved.
1022  if (SrcVT.isVector() || DstVT.isVector())
1023  continue;
1024 
1025  // If the source and destination are SSE registers, then this is a legal
1026  // conversion that should not be lowered.
1027  const X86TargetLowering *X86Lowering =
1028  static_cast<const X86TargetLowering *>(TLI);
1029  bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1030  bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1031  if (SrcIsSSE && DstIsSSE)
1032  continue;
1033 
1034  if (!SrcIsSSE && !DstIsSSE) {
1035  // If this is an FPStack extension, it is a noop.
1036  if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1037  continue;
1038  // If this is a value-preserving FPStack truncation, it is a noop.
1039  if (N->getConstantOperandVal(2))
1040  continue;
1041  }
1042 
1043  // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1044  // FPStack has extload and truncstore. SSE can fold direct loads into other
1045  // operations. Based on this, decide what we want to do.
1046  MVT MemVT;
1047  if (N->getOpcode() == ISD::STRICT_FP_ROUND)
1048  MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1049  else
1050  MemVT = SrcIsSSE ? SrcVT : DstVT;
1051 
1052  SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1053  SDLoc dl(N);
1054 
1055  // FIXME: optimize the case where the src/dest is a load or store?
1056 
 1057  // Since the operation is StrictFP, use the preexisting chain.
1058  SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
1059  MemTmp, MachinePointerInfo(), MemVT);
1060  SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1061  MachinePointerInfo(), MemVT);
1062 
1063  // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1064  // extload we created. This will cause general havok on the dag because
1065  // anything below the conversion could be folded into other existing nodes.
1066  // To avoid invalidating 'I', back it up to the convert node.
1067  --I;
1068  CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1069  break;
1070  }
1071  }
1072 
1073 
1074  // Now that we did that, the node is dead. Increment the iterator to the
1075  // next node to process, then delete N.
1076  ++I;
1077  CurDAG->DeleteNode(N);
1078  }
1079 
1080  // The load+call transform above can leave some dead nodes in the graph. Make
 1081  // sure we remove them. It's possible some of the other transforms do too, so
 1082  // just remove dead nodes unconditionally.
1083  CurDAG->RemoveDeadNodes();
1084 }
1085 
1086 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
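// After an 8-bit divrem the quotient/remainder is read out of AL/AH through a
// MOVZX32rr8_NOREX or MOVSX32rr8_NOREX; if the low 8 bits of that result are
// then extended again, the second extend is redundant (informal summary of
// the pattern matched below).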
1087 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1088  unsigned Opc = N->getMachineOpcode();
1089  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1090  Opc != X86::MOVSX64rr8)
1091  return false;
1092 
1093  SDValue N0 = N->getOperand(0);
1094 
1095  // We need to be extracting the lower bit of an extend.
1096  if (!N0.isMachineOpcode() ||
1097  N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1098  N0.getConstantOperandVal(1) != X86::sub_8bit)
1099  return false;
1100 
1101  // We're looking for either a movsx or movzx to match the original opcode.
1102  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1103  : X86::MOVSX32rr8_NOREX;
1104  SDValue N00 = N0.getOperand(0);
1105  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1106  return false;
1107 
1108  if (Opc == X86::MOVSX64rr8) {
1109  // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1110  // to 64.
1111  MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1112  MVT::i64, N00);
1113  ReplaceUses(N, Extend);
1114  } else {
1115  // Ok we can drop this extend and just use the original extend.
1116  ReplaceUses(N, N00.getNode());
1117  }
1118 
1119  return true;
1120 }
1121 
1122 void X86DAGToDAGISel::PostprocessISelDAG() {
1123  // Skip peepholes at -O0.
1124  if (TM.getOptLevel() == CodeGenOpt::None)
1125  return;
1126 
1127  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1128 
1129  bool MadeChange = false;
1130  while (Position != CurDAG->allnodes_begin()) {
1131  SDNode *N = &*--Position;
1132  // Skip dead nodes and any non-machine opcodes.
1133  if (N->use_empty() || !N->isMachineOpcode())
1134  continue;
1135 
1136  if (tryOptimizeRem8Extend(N)) {
1137  MadeChange = true;
1138  continue;
1139  }
1140 
1141  // Look for a TESTrr+ANDrr pattern where both operands of the test are
1142  // the same. Rewrite to remove the AND.
1143  unsigned Opc = N->getMachineOpcode();
1144  if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1145  Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1146  N->getOperand(0) == N->getOperand(1) &&
1147  N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1148  N->getOperand(0).isMachineOpcode()) {
1149  SDValue And = N->getOperand(0);
1150  unsigned N0Opc = And.getMachineOpcode();
1151  if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1152  N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
1153  MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1154  MVT::i32,
1155  And.getOperand(0),
1156  And.getOperand(1));
1157  ReplaceUses(N, Test);
1158  MadeChange = true;
1159  continue;
1160  }
1161  if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1162  N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
1163  unsigned NewOpc;
1164  switch (N0Opc) {
1165  case X86::AND8rm: NewOpc = X86::TEST8mr; break;
1166  case X86::AND16rm: NewOpc = X86::TEST16mr; break;
1167  case X86::AND32rm: NewOpc = X86::TEST32mr; break;
1168  case X86::AND64rm: NewOpc = X86::TEST64mr; break;
1169  }
1170 
1171  // Need to swap the memory and register operand.
1172  SDValue Ops[] = { And.getOperand(1),
1173  And.getOperand(2),
1174  And.getOperand(3),
1175  And.getOperand(4),
1176  And.getOperand(5),
1177  And.getOperand(0),
1178  And.getOperand(6) /* Chain */ };
1179  MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1180  MVT::i32, MVT::Other, Ops);
1181  ReplaceUses(N, Test);
1182  MadeChange = true;
1183  continue;
1184  }
1185  }
1186 
1187  // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1188  // used. We're doing this late so we can prefer to fold the AND into masked
1189  // comparisons. Doing that can be better for the live range of the mask
1190  // register.
1191  if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1192  Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1193  N->getOperand(0) == N->getOperand(1) &&
1194  N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1195  N->getOperand(0).isMachineOpcode() &&
1196  onlyUsesZeroFlag(SDValue(N, 0))) {
1197  SDValue And = N->getOperand(0);
1198  unsigned N0Opc = And.getMachineOpcode();
1199  // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1200  // KAND instructions and KTEST use the same ISA feature.
1201  if (N0Opc == X86::KANDBrr ||
1202  (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1203  N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1204  unsigned NewOpc;
1205  switch (Opc) {
1206  default: llvm_unreachable("Unexpected opcode!");
1207  case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1208  case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1209  case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1210  case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1211  }
1212  MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1213  MVT::i32,
1214  And.getOperand(0),
1215  And.getOperand(1));
1216  ReplaceUses(N, KTest);
1217  MadeChange = true;
1218  continue;
1219  }
1220  }
1221 
 1222  // Attempt to remove vector moves that were inserted to zero upper bits.
1223  if (Opc != TargetOpcode::SUBREG_TO_REG)
1224  continue;
1225 
1226  unsigned SubRegIdx = N->getConstantOperandVal(2);
1227  if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1228  continue;
1229 
1230  SDValue Move = N->getOperand(1);
1231  if (!Move.isMachineOpcode())
1232  continue;
1233 
 1234  // Make sure it's one of the move opcodes we recognize.
1235  switch (Move.getMachineOpcode()) {
1236  default:
1237  continue;
1238  case X86::VMOVAPDrr: case X86::VMOVUPDrr:
1239  case X86::VMOVAPSrr: case X86::VMOVUPSrr:
1240  case X86::VMOVDQArr: case X86::VMOVDQUrr:
1241  case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
1242  case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
1243  case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
1244  case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
1245  case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
1246  case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1247  case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1248  case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
1249  case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
1250  case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1251  case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1252  break;
1253  }
1254 
1255  SDValue In = Move.getOperand(0);
1256  if (!In.isMachineOpcode() ||
1257  In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1258  continue;
1259 
1260  // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1261  // the SHA instructions which use a legacy encoding.
1262  uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1263  if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1264  (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1265  (TSFlags & X86II::EncodingMask) != X86II::XOP)
1266  continue;
1267 
1268  // Producing instruction is another vector instruction. We can drop the
1269  // move.
1270  CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1271  MadeChange = true;
1272  }
1273 
1274  if (MadeChange)
1275  CurDAG->RemoveDeadNodes();
1276 }
1277 
1278 
1279 /// Emit any code that needs to be executed only in the main function.
1280 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1281  if (Subtarget->isTargetCygMing()) {
 1282  TargetLowering::ArgListTy Args;
 1283  auto &DL = CurDAG->getDataLayout();
1284 
1285  TargetLowering::CallLoweringInfo CLI(*CurDAG);
1286  CLI.setChain(CurDAG->getRoot())
1287  .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1288  CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1289  std::move(Args));
1290  const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1291  std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1292  CurDAG->setRoot(Result.second);
1293  }
1294 }
1295 
1296 void X86DAGToDAGISel::EmitFunctionEntryCode() {
1297  // If this is main, emit special code for main.
1298  const Function &F = MF->getFunction();
1299  if (F.hasExternalLinkage() && F.getName() == "main")
1300  emitSpecialCodeForMain();
1301 }
1302 
1303 static bool isDispSafeForFrameIndex(int64_t Val) {
1304  // On 64-bit platforms, we can run into an issue where a frame index
1305  // includes a displacement that, when added to the explicit displacement,
1306  // will overflow the displacement field. Assuming that the frame index
1307  // displacement fits into a 31-bit integer (which is only slightly more
1308  // aggressive than the current fundamental assumption that it fits into
1309  // a 32-bit integer), a 31-bit disp should always be safe.
1310  return isInt<31>(Val);
1311 }
1312 
1313 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1314  X86ISelAddressMode &AM) {
1315  // If there's no offset to fold, we don't need to do any work.
1316  if (Offset == 0)
1317  return false;
1318 
1319  // Cannot combine ExternalSymbol displacements with integer offsets.
1320  if (AM.ES || AM.MCSym)
1321  return true;
1322 
1323  int64_t Val = AM.Disp + Offset;
1324  CodeModel::Model M = TM.getCodeModel();
1325  if (Subtarget->is64Bit()) {
 1326  if (!X86::isOffsetSuitableForCodeModel(Val, M,
 1327  AM.hasSymbolicDisplacement()))
1328  return true;
1329  // In addition to the checks required for a register base, check that
1330  // we do not try to use an unsafe Disp with a frame index.
1331  if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
 1332  !isDispSafeForFrameIndex(Val))
 1333  return true;
1334  }
1335  AM.Disp = Val;
1336  return false;
1337 
1338 }
1339 
1340 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
1341  SDValue Address = N->getOperand(1);
1342 
1343  // load gs:0 -> GS segment register.
1344  // load fs:0 -> FS segment register.
1345  //
1346  // This optimization is valid because the GNU TLS model defines that
1347  // gs:0 (or fs:0 on X86-64) contains its own address.
1348  // For more information see http://people.redhat.com/drepper/tls.pdf
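 // (In the X86 backend, address space 256 denotes a %gs-relative access and
 // 257 an %fs-relative one, which is what the switch below keys on.)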
1349  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
1350  if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
1351  !IndirectTlsSegRefs &&
1352  (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1353  Subtarget->isTargetFuchsia()))
1354  switch (N->getPointerInfo().getAddrSpace()) {
1355  case 256:
1356  AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1357  return false;
1358  case 257:
1359  AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1360  return false;
1361  // Address space 258 is not handled here, because it is not used to
1362  // address TLS areas.
1363  }
1364 
1365  return true;
1366 }
1367 
1368 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1369 /// mode. These wrap things that will resolve down into a symbol reference.
1370 /// If no match is possible, this returns true, otherwise it returns false.
1371 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1372  // If the addressing mode already has a symbol as the displacement, we can
1373  // never match another symbol.
1374  if (AM.hasSymbolicDisplacement())
1375  return true;
1376 
1377  bool IsRIPRelTLS = false;
1378  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1379  if (IsRIPRel) {
1380  SDValue Val = N.getOperand(0);
 1381  if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
 1382  IsRIPRelTLS = true;
1383  }
1384 
1385  // We can't use an addressing mode in the 64-bit large code model.
1386  // Global TLS addressing is an exception. In the medium code model,
 1387  // we can use a mode when RIP wrappers are present.
1388  // That signifies access to globals that are known to be "near",
1389  // such as the GOT itself.
1390  CodeModel::Model M = TM.getCodeModel();
1391  if (Subtarget->is64Bit() &&
1392  ((M == CodeModel::Large && !IsRIPRelTLS) ||
1393  (M == CodeModel::Medium && !IsRIPRel)))
1394  return true;
1395 
1396  // Base and index reg must be 0 in order to use %rip as base.
1397  if (IsRIPRel && AM.hasBaseOrIndexReg())
1398  return true;
1399 
1400  // Make a local copy in case we can't do this fold.
1401  X86ISelAddressMode Backup = AM;
1402 
1403  int64_t Offset = 0;
1404  SDValue N0 = N.getOperand(0);
1405  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1406  AM.GV = G->getGlobal();
1407  AM.SymbolFlags = G->getTargetFlags();
1408  Offset = G->getOffset();
1409  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1410  AM.CP = CP->getConstVal();
1411  AM.Align = CP->getAlignment();
1412  AM.SymbolFlags = CP->getTargetFlags();
1413  Offset = CP->getOffset();
1414  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1415  AM.ES = S->getSymbol();
1416  AM.SymbolFlags = S->getTargetFlags();
1417  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1418  AM.MCSym = S->getMCSymbol();
1419  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
1420  AM.JT = J->getIndex();
1421  AM.SymbolFlags = J->getTargetFlags();
1422  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1423  AM.BlockAddr = BA->getBlockAddress();
1424  AM.SymbolFlags = BA->getTargetFlags();
1425  Offset = BA->getOffset();
1426  } else
1427  llvm_unreachable("Unhandled symbol reference node.");
1428 
1429  if (foldOffsetIntoAddress(Offset, AM)) {
1430  AM = Backup;
1431  return true;
1432  }
1433 
1434  if (IsRIPRel)
1435  AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1436 
1437  // Commit the changes now that we know this fold is safe.
1438  return false;
1439 }
1440 
1441 /// Add the specified node to the specified addressing mode, returning true if
1442 /// it cannot be done. This just pattern matches for the addressing mode.
1443 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1444  if (matchAddressRecursively(N, AM, 0))
1445  return true;
1446 
1447  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1448  // a smaller encoding and avoids a scaled-index.
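 // For example, "lea (,%reg,2), %dst" needs a SIB byte plus a 32-bit
 // displacement, while "lea (%reg,%reg), %dst" encodes the same address in
 // fewer bytes (illustrative encoding note).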
1449  if (AM.Scale == 2 &&
1450  AM.BaseType == X86ISelAddressMode::RegBase &&
1451  AM.Base_Reg.getNode() == nullptr) {
1452  AM.Base_Reg = AM.IndexReg;
1453  AM.Scale = 1;
1454  }
1455 
1456  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1457  // because it has a smaller encoding.
1458  // TODO: Which other code models can use this?
1459  switch (TM.getCodeModel()) {
1460  default: break;
1461  case CodeModel::Small:
1462  case CodeModel::Kernel:
1463  if (Subtarget->is64Bit() &&
1464  AM.Scale == 1 &&
1465  AM.BaseType == X86ISelAddressMode::RegBase &&
1466  AM.Base_Reg.getNode() == nullptr &&
1467  AM.IndexReg.getNode() == nullptr &&
1468  AM.SymbolFlags == X86II::MO_NO_FLAG &&
1469  AM.hasSymbolicDisplacement())
1470  AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1471  break;
1472  }
1473 
1474  return false;
1475 }
1476 
1477 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1478  unsigned Depth) {
1479  // Add an artificial use to this node so that we can keep track of
1480  // it if it gets CSE'd with a different node.
1481  HandleSDNode Handle(N);
1482 
1483  X86ISelAddressMode Backup = AM;
1484  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1485  !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1486  return false;
1487  AM = Backup;
1488 
1489  // Try again after commuting the operands.
1490  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
1491  !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
1492  return false;
1493  AM = Backup;
1494 
1495  // If we couldn't fold both operands into the address at the same time,
1496  // see if we can just put each operand into a register and fold at least
1497  // the add.
1498  if (AM.BaseType == X86ISelAddressMode::RegBase &&
1499  !AM.Base_Reg.getNode() &&
1500  !AM.IndexReg.getNode()) {
1501  N = Handle.getValue();
1502  AM.Base_Reg = N.getOperand(0);
1503  AM.IndexReg = N.getOperand(1);
1504  AM.Scale = 1;
1505  return false;
1506  }
1507  N = Handle.getValue();
1508  return true;
1509 }
1510 
1511 // Insert a node into the DAG at least before the Pos node's position. This
1512 // will reposition the node as needed, and will assign it a node ID that is <=
1513 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1514 // IDs! The selection DAG must no longer depend on their uniqueness when this
1515 // is used.
1516 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1517  if (N->getNodeId() == -1 ||
 1518  (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
 1519  SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
 1520  DAG.RepositionNode(Pos->getIterator(), N.getNode());
1521  // Mark Node as invalid for pruning as after this it may be a successor to a
1522  // selected node but otherwise be in the same position of Pos.
1523  // Conservatively mark it with the same -abs(Id) to assure node id
1524  // invariant is preserved.
1525  N->setNodeId(Pos->getNodeId());
 1526  SelectionDAGISel::InvalidateNodeId(N.getNode());
 1527  }
1528 }
1529 
1530 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1531 // safe. This allows us to convert the shift and and into an h-register
1532 // extract and a scaled index. Returns false if the simplification is
1533 // performed.
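// For example, with C1 = 2: (X >> 6) & 0x3fc becomes ((X >> 8) & 0xff) << 2,
// so the masked value can be used as the index register with scale 4
// (illustrative).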
1534 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1535  uint64_t Mask,
1536  SDValue Shift, SDValue X,
1537  X86ISelAddressMode &AM) {
1538  if (Shift.getOpcode() != ISD::SRL ||
1539  !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1540  !Shift.hasOneUse())
1541  return true;
1542 
1543  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1544  if (ScaleLog <= 0 || ScaleLog >= 4 ||
1545  Mask != (0xffu << ScaleLog))
1546  return true;
1547 
1548  MVT VT = N.getSimpleValueType();
1549  SDLoc DL(N);
1550  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
1551  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
1552  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
1553  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
1554  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
1555  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
1556 
1557  // Insert the new nodes into the topological ordering. We must do this in
1558  // a valid topological ordering as nothing is going to go back and re-sort
1559  // these nodes. We continually insert before 'N' in sequence as this is
1560  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1561  // hierarchy left to express.
1562  insertDAGNode(DAG, N, Eight);
1563  insertDAGNode(DAG, N, Srl);
1564  insertDAGNode(DAG, N, NewMask);
1565  insertDAGNode(DAG, N, And);
1566  insertDAGNode(DAG, N, ShlCount);
1567  insertDAGNode(DAG, N, Shl);
1568  DAG.ReplaceAllUsesWith(N, Shl);
1569  DAG.RemoveDeadNode(N.getNode());
1570  AM.IndexReg = And;
1571  AM.Scale = (1 << ScaleLog);
1572  return false;
1573 }
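// --- Standalone illustration, not part of this file; the helper name below is
// made up. A minimal check of the identity the transform above relies on:
// for C1 in [1,3],
//   (X >> (8 - C1)) & (0xff << C1)  ==  ((X >> 8) & 0xff) << C1,
// so the AND can become an h-register extract and the SHL can be absorbed
// into the addressing-mode scale.
#include <cassert>
#include <cstdint>
inline void checkHRegExtractRewrite() {
  for (unsigned C1 = 1; C1 <= 3; ++C1)
    for (uint64_t X : {0x0ull, 0x1234ull, 0xffffull, 0xdeadbeefull}) {
      uint64_t Before = (X >> (8 - C1)) & (0xffull << C1);
      uint64_t After  = ((X >> 8) & 0xffull) << C1;
      assert(Before == After && "rewrite must preserve the value");
    }
}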
1574 
1575 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1576 // allows us to fold the shift into this addressing mode. Returns false if the
1577 // transform succeeded.
1578 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
1579  X86ISelAddressMode &AM) {
1580  SDValue Shift = N.getOperand(0);
1581 
1582  // Use a signed mask so that shifting right will insert sign bits. These
1583  // bits will be removed when we shift the result left so it doesn't matter
1584  // what we use. This might allow a smaller immediate encoding.
1585  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
1586 
1587  // If we have an any_extend feeding the AND, look through it to see if there
1588  // is a shift behind it. But only if the AND doesn't use the extended bits.
1589  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
1590  bool FoundAnyExtend = false;
1591  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
1592  Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
1593  isUInt<32>(Mask)) {
1594  FoundAnyExtend = true;
1595  Shift = Shift.getOperand(0);
1596  }
1597 
1598  if (Shift.getOpcode() != ISD::SHL ||
1599  !isa<ConstantSDNode>(Shift.getOperand(1)))
1600  return true;
1601 
1602  SDValue X = Shift.getOperand(0);
1603 
1604  // Not likely to be profitable if either the AND or SHIFT node has more
1605  // than one use (unless all uses are for address computation). Besides,
1606  // isel mechanism requires their node ids to be reused.
1607  if (!N.hasOneUse() || !Shift.hasOneUse())
1608  return true;
1609 
1610  // Verify that the shift amount is something we can fold.
1611  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1612  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
1613  return true;
1614 
1615  MVT VT = N.getSimpleValueType();
1616  SDLoc DL(N);
1617  if (FoundAnyExtend) {
1618  SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
1619  insertDAGNode(DAG, N, NewX);
1620  X = NewX;
1621  }
1622 
1623  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
1624  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
1625  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
1626 
1627  // Insert the new nodes into the topological ordering. We must do this in
1628  // a valid topological ordering as nothing is going to go back and re-sort
1629  // these nodes. We continually insert before 'N' in sequence as this is
1630  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1631  // hierarchy left to express.
1632  insertDAGNode(DAG, N, NewMask);
1633  insertDAGNode(DAG, N, NewAnd);
1634  insertDAGNode(DAG, N, NewShift);
1635  DAG.ReplaceAllUsesWith(N, NewShift);
1636  DAG.RemoveDeadNode(N.getNode());
1637 
1638  AM.Scale = 1 << ShiftAmt;
1639  AM.IndexReg = NewAnd;
1640  return false;
1641 }
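// --- Standalone illustration, not part of this file. Compile-time spot check
// (with arbitrarily chosen values) of the rewrite performed above:
// "(X << C1) & C2" equals "(X & (C2 >> C1)) << C1", which lets the SHL
// migrate outward where it can be folded into the scale.
static_assert(((0xABCDu << 2) & 0xFF0u) == ((0xABCDu & (0xFF0u >> 2)) << 2),
              "swapping the shift and the adjusted mask preserves the value");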
1642 
1643 // Implement some heroics to detect shifts of masked values where the mask can
1644 // be replaced by extending the shift and undoing that in the addressing mode
1645 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1646 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1647 // the addressing mode. This results in code such as:
1648 //
1649 // int f(short *y, int *lookup_table) {
1650 // ...
1651 // return *y + lookup_table[*y >> 11];
1652 // }
1653 //
1654 // Turning into:
1655 // movzwl (%rdi), %eax
1656 // movl %eax, %ecx
1657 // shrl $11, %ecx
1658 // addl (%rsi,%rcx,4), %eax
1659 //
1660 // Instead of:
1661 // movzwl (%rdi), %eax
1662 // movl %eax, %ecx
1663 // shrl $9, %ecx
1664 // andl $124, %ecx
1665 // addl (%rsi,%rcx), %eax
1666 //
1667 // Note that this function assumes the mask is provided as a mask *after* the
1668 // value is shifted. The input chain may or may not match that, but computing
1669 // such a mask is trivial.
1670 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1671  uint64_t Mask,
1672  SDValue Shift, SDValue X,
1673  X86ISelAddressMode &AM) {
1674  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1675  !isa<ConstantSDNode>(Shift.getOperand(1)))
1676  return true;
1677 
1678  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1679  unsigned MaskLZ = countLeadingZeros(Mask);
1680  unsigned MaskTZ = countTrailingZeros(Mask);
1681 
1682  // The amount of shift we're trying to fit into the addressing mode is taken
1683  // from the trailing zeros of the mask.
1684  unsigned AMShiftAmt = MaskTZ;
1685 
1686  // There is nothing we can do here unless the mask is removing some bits.
1687  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1688  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1689 
1690  // We also need to ensure that mask is a continuous run of bits.
1691  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
1692 
1693  // Scale the leading zero count down based on the actual size of the value.
1694  // Also scale it down based on the size of the shift.
1695  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1696  if (MaskLZ < ScaleDown)
1697  return true;
1698  MaskLZ -= ScaleDown;
1699 
1700  // The final check is to ensure that any masked out high bits of X are
1701  // already known to be zero. Otherwise, the mask has a semantic impact
1702  // other than masking out a couple of low bits. Unfortunately, because of
1703  // the mask, zero extensions will be removed from operands in some cases.
1704  // This code works extra hard to look through extensions because we can
1705  // replace them with zero extensions cheaply if necessary.
1706  bool ReplacingAnyExtend = false;
1707  if (X.getOpcode() == ISD::ANY_EXTEND) {
1708  unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1709  X.getOperand(0).getSimpleValueType().getSizeInBits();
1710  // Assume that we'll replace the any-extend with a zero-extend, and
1711  // narrow the search to the extended value.
1712  X = X.getOperand(0);
1713  MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1714  ReplacingAnyExtend = true;
1715  }
1716  APInt MaskedHighBits =
1717  APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1718  KnownBits Known = DAG.computeKnownBits(X);
1719  if (MaskedHighBits != Known.Zero) return true;
1720 
1721  // We've identified a pattern that can be transformed into a single shift
1722  // and an addressing mode. Make it so.
1723  MVT VT = N.getSimpleValueType();
1724  if (ReplacingAnyExtend) {
1725  assert(X.getValueType() != VT);
1726  // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1727  SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1728  insertDAGNode(DAG, N, NewX);
1729  X = NewX;
1730  }
1731  SDLoc DL(N);
1732  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1733  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1734  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1735  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1736 
1737  // Insert the new nodes into the topological ordering. We must do this in
1738  // a valid topological ordering as nothing is going to go back and re-sort
1739  // these nodes. We continually insert before 'N' in sequence as this is
1740  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1741  // hierarchy left to express.
1742  insertDAGNode(DAG, N, NewSRLAmt);
1743  insertDAGNode(DAG, N, NewSRL);
1744  insertDAGNode(DAG, N, NewSHLAmt);
1745  insertDAGNode(DAG, N, NewSHL);
1746  DAG.ReplaceAllUsesWith(N, NewSHL);
1747  DAG.RemoveDeadNode(N.getNode());
1748 
1749  AM.Scale = 1 << AMShiftAmt;
1750  AM.IndexReg = NewSRL;
1751  return false;
1752 }
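// --- Standalone illustration, not part of this file; the helper name is made
// up. Concrete check of the comment's lookup_table example: for a
// zero-extended 16-bit value, the canonicalized "(x >> 9) & 124" equals
// "(x >> 11) << 2", so the shift-by-2 can live in the addressing-mode scale
// instead of an explicit AND.
#include <cassert>
#include <cstdint>
inline void checkScaleRewrite() {
  for (uint32_t x = 0; x <= 0xffffu; ++x)
    assert(((x >> 9) & 124u) == ((x >> 11) << 2));
}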
1753 
1754 // Transform "(X >> SHIFT) & (MASK << C1)" to
1755 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1756 // matched to a BEXTR later. Returns false if the simplification is performed.
1757 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1758  uint64_t Mask,
1759  SDValue Shift, SDValue X,
1760  X86ISelAddressMode &AM,
1761  const X86Subtarget &Subtarget) {
1762  if (Shift.getOpcode() != ISD::SRL ||
1763  !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1764  !Shift.hasOneUse() || !N.hasOneUse())
1765  return true;
1766 
1767  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1768  if (!Subtarget.hasTBM() &&
1769  !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
1770  return true;
1771 
1772  // We need to ensure that mask is a continuous run of bits.
1773  if (!isShiftedMask_64(Mask)) return true;
1774 
1775  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1776 
1777  // The amount of shift we're trying to fit into the addressing mode is taken
1778  // from the trailing zeros of the mask.
1779  unsigned AMShiftAmt = countTrailingZeros(Mask);
1780 
1781  // There is nothing we can do here unless the mask is removing some bits.
1782  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1783  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1784 
1785  MVT VT = N.getSimpleValueType();
1786  SDLoc DL(N);
1787  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1788  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1789  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
1790  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
1791  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1792  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
1793 
1794  // Insert the new nodes into the topological ordering. We must do this in
1795  // a valid topological ordering as nothing is going to go back and re-sort
1796  // these nodes. We continually insert before 'N' in sequence as this is
1797  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1798  // hierarchy left to express.
1799  insertDAGNode(DAG, N, NewSRLAmt);
1800  insertDAGNode(DAG, N, NewSRL);
1801  insertDAGNode(DAG, N, NewMask);
1802  insertDAGNode(DAG, N, NewAnd);
1803  insertDAGNode(DAG, N, NewSHLAmt);
1804  insertDAGNode(DAG, N, NewSHL);
1805  DAG.ReplaceAllUsesWith(N, NewSHL);
1806  DAG.RemoveDeadNode(N.getNode());
1807 
1808  AM.Scale = 1 << AMShiftAmt;
1809  AM.IndexReg = NewAnd;
1810  return false;
1811 }
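// --- Standalone illustration, not part of this file. Spot check (arbitrary
// values: X = 0xDEAD, SHIFT = 4, C1 = 2, MASK = 0x3f) of the rewrite above:
// "(X >> SHIFT) & (MASK << C1)" equals "((X >> (SHIFT + C1)) & MASK) << C1".
static_assert(((0xDEADu >> 4) & (0x3Fu << 2)) ==
                  (((0xDEADu >> (4 + 2)) & 0x3Fu) << 2),
              "pulling the low shift out of the mask preserves the value");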
1812 
1813 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
1814  unsigned Depth) {
1815  SDLoc dl(N);
1816  LLVM_DEBUG({
1817  dbgs() << "MatchAddress: ";
1818  AM.dump(CurDAG);
1819  });
1820  // Limit recursion.
1821  if (Depth > 5)
1822  return matchAddressBase(N, AM);
1823 
1824  // If this is already a %rip relative address, we can only merge immediates
1825  // into it. Instead of handling this in every case, we handle it here.
1826  // RIP relative addressing: %rip + 32-bit displacement!
1827  if (AM.isRIPRelative()) {
1828  // FIXME: JumpTable and ExternalSymbol address currently don't like
1829  // displacements. It isn't very important, but this should be fixed for
1830  // consistency.
1831  if (!(AM.ES || AM.MCSym) && AM.JT != -1)
1832  return true;
1833 
1834  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
1835  if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
1836  return false;
1837  return true;
1838  }
1839 
1840  switch (N.getOpcode()) {
1841  default: break;
1842  case ISD::LOCAL_RECOVER: {
1843  if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
1844  if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
1845  // Use the symbol and don't prefix it.
1846  AM.MCSym = ESNode->getMCSymbol();
1847  return false;
1848  }
1849  break;
1850  }
1851  case ISD::Constant: {
1852  uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1853  if (!foldOffsetIntoAddress(Val, AM))
1854  return false;
1855  break;
1856  }
1857 
1858  case X86ISD::Wrapper:
1859  case X86ISD::WrapperRIP:
1860  if (!matchWrapper(N, AM))
1861  return false;
1862  break;
1863 
1864  case ISD::LOAD:
1865  if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
1866  return false;
1867  break;
1868 
1869  case ISD::FrameIndex:
1870  if (AM.BaseType == X86ISelAddressMode::RegBase &&
1871  AM.Base_Reg.getNode() == nullptr &&
1872  (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
1873  AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1874  AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1875  return false;
1876  }
1877  break;
1878 
1879  case ISD::SHL:
1880  if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
1881  break;
1882 
1883  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1884  unsigned Val = CN->getZExtValue();
1885  // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1886  // that the base operand remains free for further matching. If
1887  // the base doesn't end up getting used, a post-processing step
1888  // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1889  if (Val == 1 || Val == 2 || Val == 3) {
1890  AM.Scale = 1 << Val;
1891  SDValue ShVal = N.getOperand(0);
1892 
1893  // Okay, we know that we have a scale by now. However, if the scaled
1894  // value is an add of something and a constant, we can fold the
1895  // constant into the disp field here.
1896  if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1897  AM.IndexReg = ShVal.getOperand(0);
1898  ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
1899  uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1900  if (!foldOffsetIntoAddress(Disp, AM))
1901  return false;
1902  }
1903 
1904  AM.IndexReg = ShVal;
1905  return false;
1906  }
1907  }
1908  break;
1909 
1910  case ISD::SRL: {
1911  // Scale must not be used already.
1912  if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
1913 
1914  // We only handle up to 64-bit values here as those are what matter for
1915  // addressing mode optimizations.
1916  assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
1917  "Unexpected value size!");
1918 
1919  SDValue And = N.getOperand(0);
1920  if (And.getOpcode() != ISD::AND) break;
1921  SDValue X = And.getOperand(0);
1922 
1923  // The mask used for the transform is expected to be post-shift, but we
1924  // found the shift first so just apply the shift to the mask before passing
1925  // it down.
1926  if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1927  !isa<ConstantSDNode>(And.getOperand(1)))
1928  break;
1929  uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1930 
1931  // Try to fold the mask and shift into the scale, and return false if we
1932  // succeed.
1933  if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1934  return false;
1935  break;
1936  }
1937 
1938  case ISD::SMUL_LOHI:
1939  case ISD::UMUL_LOHI:
1940  // A mul_lohi where we need the low part can be folded as a plain multiply.
1941  if (N.getResNo() != 0) break;
1942  LLVM_FALLTHROUGH;
1943  case ISD::MUL:
1944  case X86ISD::MUL_IMM:
1945  // X*[3,5,9] -> X+X*[2,4,8]
1946  if (AM.BaseType == X86ISelAddressMode::RegBase &&
1947  AM.Base_Reg.getNode() == nullptr &&
1948  AM.IndexReg.getNode() == nullptr) {
1949  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
1950  if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
1951  CN->getZExtValue() == 9) {
1952  AM.Scale = unsigned(CN->getZExtValue())-1;
1953 
1954  SDValue MulVal = N.getOperand(0);
1955  SDValue Reg;
1956 
1957  // Okay, we know that we have a scale by now. However, if the scaled
1958  // value is an add of something and a constant, we can fold the
1959  // constant into the disp field here.
1960  if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
1961  isa<ConstantSDNode>(MulVal.getOperand(1))) {
1962  Reg = MulVal.getOperand(0);
1963  ConstantSDNode *AddVal =
1964  cast<ConstantSDNode>(MulVal.getOperand(1));
1965  uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1966  if (foldOffsetIntoAddress(Disp, AM))
1967  Reg = N.getOperand(0);
1968  } else {
1969  Reg = N.getOperand(0);
1970  }
1971 
1972  AM.IndexReg = AM.Base_Reg = Reg;
1973  return false;
1974  }
1975  }
1976  break;
1977 
1978  case ISD::SUB: {
1979  // Given A-B, if A can be completely folded into the address with the
1980  // index field left unused, use -B as the index.
1981  // This is a win if A has multiple parts that can be folded into
1982  // the address. Also, this saves a mov if the base register has
1983  // other uses, since it avoids a two-address sub instruction, however
1984  // it costs an additional mov if the index register has other uses.
1985 
1986  // Add an artificial use to this node so that we can keep track of
1987  // it if it gets CSE'd with a different node.
1988  HandleSDNode Handle(N);
1989 
1990  // Test if the LHS of the sub can be folded.
1991  X86ISelAddressMode Backup = AM;
1992  if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
1993  N = Handle.getValue();
1994  AM = Backup;
1995  break;
1996  }
1997  N = Handle.getValue();
1998  // Test if the index field is free for use.
1999  if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2000  AM = Backup;
2001  break;
2002  }
2003 
2004  int Cost = 0;
2005  SDValue RHS = N.getOperand(1);
2006  // If the RHS involves a register with multiple uses, this
2007  // transformation incurs an extra mov, due to the neg instruction
2008  // clobbering its operand.
2009  if (!RHS.getNode()->hasOneUse() ||
2010  RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2011  RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2012  RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2013  (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2014  RHS.getOperand(0).getValueType() == MVT::i32))
2015  ++Cost;
2016  // If the base is a register with multiple uses, this
2017  // transformation may save a mov.
2018  if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2019  !AM.Base_Reg.getNode()->hasOneUse()) ||
2020  AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2021  --Cost;
2022  // If the folded LHS was interesting, this transformation saves
2023  // address arithmetic.
2024  if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2025  ((AM.Disp != 0) && (Backup.Disp == 0)) +
2026  (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2027  --Cost;
2028  // If it doesn't look like it may be an overall win, don't do it.
2029  if (Cost >= 0) {
2030  AM = Backup;
2031  break;
2032  }
2033 
2034  // Ok, the transformation is legal and appears profitable. Go for it.
2035  // Negation will be emitted later to avoid creating dangling nodes if this
2036  // was an unprofitable LEA.
2037  AM.IndexReg = RHS;
2038  AM.NegateIndex = true;
2039  AM.Scale = 1;
2040  return false;
2041  }
2042 
2043  case ISD::ADD:
2044  if (!matchAdd(N, AM, Depth))
2045  return false;
2046  break;
2047 
2048  case ISD::OR:
2049  // We want to look through a transform in InstCombine and DAGCombiner that
2050  // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
2051  // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
2052  // An 'lea' can then be used to match the shift (multiply) and add:
2053  // and $1, %esi
2054  // lea (%rsi, %rdi, 8), %rax
2055  if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
2056  !matchAdd(N, AM, Depth))
2057  return false;
2058  break;
2059 
2060  case ISD::AND: {
2061  // Perform some heroic transforms on an and of a constant-count shift
2062  // with a constant to enable use of the scaled offset field.
2063 
2064  // Scale must not be used already.
2065  if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2066 
2067  // We only handle up to 64-bit values here as those are what matter for
2068  // addressing mode optimizations.
2069  assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2070  "Unexpected value size!");
2071 
2072  if (!isa<ConstantSDNode>(N.getOperand(1)))
2073  break;
2074 
2075  if (N.getOperand(0).getOpcode() == ISD::SRL) {
2076  SDValue Shift = N.getOperand(0);
2077  SDValue X = Shift.getOperand(0);
2078 
2079  uint64_t Mask = N.getConstantOperandVal(1);
2080 
2081  // Try to fold the mask and shift into an extract and scale.
2082  if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2083  return false;
2084 
2085  // Try to fold the mask and shift directly into the scale.
2086  if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2087  return false;
2088 
2089  // Try to fold the mask and shift into BEXTR and scale.
2090  if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2091  return false;
2092  }
2093 
2094  // Try to swap the mask and shift to place shifts which can be done as
2095  // a scale on the outside of the mask.
2096  if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2097  return false;
2098 
2099  break;
2100  }
2101  case ISD::ZERO_EXTEND: {
2102  // Try to widen a zexted shift left to the same size as its use, so we can
2103  // match the shift as a scale factor.
2104  if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2105  break;
2106  if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
2107  break;
2108 
2109  // Give up if the shift is not a valid scale factor [1,2,3].
2110  SDValue Shl = N.getOperand(0);
2111  auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
2112  if (!ShAmtC || ShAmtC->getZExtValue() > 3)
2113  break;
2114 
2115  // The narrow shift must only shift out zero bits (it must be 'nuw').
2116  // That makes it safe to widen to the destination type.
2117  APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
2118  ShAmtC->getZExtValue());
2119  if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
2120  break;
2121 
2122  // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
2123  MVT VT = N.getSimpleValueType();
2124  SDLoc DL(N);
2125  SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
2126  SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
2127 
2128  // Convert the shift to scale factor.
2129  AM.Scale = 1 << ShAmtC->getZExtValue();
2130  AM.IndexReg = Zext;
2131 
2132  insertDAGNode(*CurDAG, N, Zext);
2133  insertDAGNode(*CurDAG, N, NewShl);
2134  CurDAG->ReplaceAllUsesWith(N, NewShl);
2135  CurDAG->RemoveDeadNode(N.getNode());
2136  return false;
2137  }
2138  }
2139 
2140  return matchAddressBase(N, AM);
2141 }
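// --- Standalone illustration, not part of this file; the function name is
// made up. What the recursive matcher ultimately enables: every piece of the
// address arithmetic below fits one X86ISelAddressMode (Base = p, Index = i,
// Scale = 4, Disp = 16), so a typical compilation folds it into a single
// memory operand, e.g.
//   movl 16(%rdi,%rsi,4), %eax
// (exact registers and instruction choice depend on the compiler).
int loadElement(const int *p, long i) {
  return p[i + 4]; // address = p + i*4 + 16
}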
2142 
2143 /// Helper for MatchAddress. Add the specified node to the
2144 /// specified addressing mode without any further recursion.
2145 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2146  // Is the base register already occupied?
2147  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2148  // If so, check to see if the scale index register is set.
2149  if (!AM.IndexReg.getNode()) {
2150  AM.IndexReg = N;
2151  AM.Scale = 1;
2152  return false;
2153  }
2154 
2155  // Otherwise, we cannot select it.
2156  return true;
2157  }
2158 
2159  // Default, generate it as a register.
2160  AM.BaseType = X86ISelAddressMode::RegBase;
2161  AM.Base_Reg = N;
2162  return false;
2163 }
2164 
2165 /// Helper for selectVectorAddr. Handles things that can be folded into a
2166 /// gather scatter address. The index register and scale should have already
2167 /// been handled.
2168 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2169  // TODO: Support other operations.
2170  switch (N.getOpcode()) {
2171  case ISD::Constant: {
2172  uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2173  if (!foldOffsetIntoAddress(Val, AM))
2174  return false;
2175  break;
2176  }
2177  case X86ISD::Wrapper:
2178  if (!matchWrapper(N, AM))
2179  return false;
2180  break;
2181  }
2182 
2183  return matchAddressBase(N, AM);
2184 }
2185 
2186 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
2187  SDValue &Scale, SDValue &Index,
2188  SDValue &Disp, SDValue &Segment) {
2189  X86ISelAddressMode AM;
2190  auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
2191  AM.IndexReg = Mgs->getIndex();
2192  AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
2193 
2194  unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2195  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2196  if (AddrSpace == 256)
2197  AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2198  if (AddrSpace == 257)
2199  AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2200  if (AddrSpace == 258)
2201  AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2202 
2203  SDLoc DL(N);
2204  MVT VT = N.getSimpleValueType();
2205 
2206  // Try to match into the base and displacement fields.
2207  if (matchVectorAddress(N, AM))
2208  return false;
2209 
2210  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2211  return true;
2212 }
2213 
2214 /// Returns true if it is able to pattern match an addressing mode.
2215 /// It returns the operands which make up the maximal addressing mode it can
2216 /// match by reference.
2217 ///
2218 /// Parent is the parent node of the addr operand that is being matched. It
2219 /// is always a load, store, atomic node, or null. It is only null when
2220 /// checking memory operands for inline asm nodes.
2221 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2222  SDValue &Scale, SDValue &Index,
2223  SDValue &Disp, SDValue &Segment) {
2224  X86ISelAddressMode AM;
2225 
2226  if (Parent &&
2227  // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2228  // that are not a MemSDNode, and thus don't have proper addrspace info.
2229  Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2230  Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2231  Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2232  Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2233  Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2234  Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2235  Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2236  unsigned AddrSpace =
2237  cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2238  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2239  if (AddrSpace == 256)
2240  AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2241  if (AddrSpace == 257)
2242  AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2243  if (AddrSpace == 258)
2244  AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2245  }
2246 
2247  // Save the DL and VT before calling matchAddress, it can invalidate N.
2248  SDLoc DL(N);
2249  MVT VT = N.getSimpleValueType();
2250 
2251  if (matchAddress(N, AM))
2252  return false;
2253 
2254  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2255  return true;
2256 }
2257 
2258 // We can only fold a load if all nodes between it and the root node have a
2259 // single use. If there are additional uses, we could end up duplicating the
2260 // load.
2261 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
2262  while (User != Root) {
2263  if (!User->hasOneUse())
2264  return false;
2265  User = *User->use_begin();
2266  }
2267 
2268  return true;
2269 }
2270 
2271 /// Match a scalar SSE load. In particular, we want to match a load whose top
2272 /// elements are either undef or zeros. The load flavor is derived from the
2273 /// type of N, which is either v4f32 or v2f64.
2274 ///
2275 /// We also return:
2276 /// PatternChainNode: this is the matched node that has a chain input and
2277 /// output.
2278 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
2279  SDValue N, SDValue &Base,
2280  SDValue &Scale, SDValue &Index,
2281  SDValue &Disp, SDValue &Segment,
2282  SDValue &PatternNodeWithChain) {
2283  if (!hasSingleUsesFromRoot(Root, Parent))
2284  return false;
2285 
2286  // We can allow a full vector load here since narrowing a load is ok.
2287  if (ISD::isNON_EXTLoad(N.getNode())) {
2288  PatternNodeWithChain = N;
2289  if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2290  IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
2291  LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2292  return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2293  Segment);
2294  }
2295  }
2296 
2297  // We can also match the special zero extended load opcode.
2298  if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
2299  PatternNodeWithChain = N;
2300  if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2301  IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
2302  auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
2303  return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
2304  Segment);
2305  }
2306  }
2307 
2308  // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
2309  // once. Otherwise the load might get duplicated and the chain output of the
2310  // duplicate load will not be observed by all dependencies.
2311  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
2312  PatternNodeWithChain = N.getOperand(0);
2313  if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2314  IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2315  IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2316  LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2317  return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2318  Segment);
2319  }
2320  }
2321 
2322  // Also handle the case where we explicitly require zeros in the top
2323  // elements. This is a vector shuffle from the zero vector.
2324  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
2325  // Check to see if the top elements are all zeros (or bitcast of zeros).
2326  N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
2327  N.getOperand(0).getNode()->hasOneUse()) {
2328  PatternNodeWithChain = N.getOperand(0).getOperand(0);
2329  if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2330  IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2331  IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2332  // Okay, this is a zero extending load. Fold it.
2333  LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2334  return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2335  Segment);
2336  }
2337  }
2338 
2339  return false;
2340 }
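// --- Standalone illustration, not part of this file; the function name is
// made up. The kind of source pattern selectScalarSSELoad targets: a scalar
// load feeding a scalar SSE op, where only the low element matters, so the
// load can usually be folded into the op's memory operand, e.g.
//   addss (%rdi), %xmm0
// (whether the fold actually happens is subject to the profitability and
// legality checks above).
#include <immintrin.h>
__m128 addFromMemory(__m128 x, const float *p) {
  return _mm_add_ss(x, _mm_load_ss(p));
}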
2341 
2342 
2343 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2344  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2345  uint64_t ImmVal = CN->getZExtValue();
2346  if (!isUInt<32>(ImmVal))
2347  return false;
2348 
2349  Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
2350  return true;
2351  }
2352 
2353  // In static codegen with small code model, we can get the address of a label
2354  // into a register with 'movl'
2355  if (N->getOpcode() != X86ISD::Wrapper)
2356  return false;
2357 
2358  N = N.getOperand(0);
2359 
2360  // At least GNU as does not accept 'movl' for TPOFF relocations.
2361  // FIXME: We could use 'movl' when we know we are targeting MC.
2362  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2363  return false;
2364 
2365  Imm = N;
2366  if (N->getOpcode() != ISD::TargetGlobalAddress)
2367  return TM.getCodeModel() == CodeModel::Small;
2368 
2369  Optional<ConstantRange> CR =
2370  cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2371  if (!CR)
2372  return TM.getCodeModel() == CodeModel::Small;
2373 
2374  return CR->getUnsignedMax().ult(1ull << 32);
2375 }
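// --- Standalone illustration, not part of this file; function names are made
// up. Why selectMOV64Imm32 checks isUInt<32>: a 64-bit constant that fits in
// an unsigned 32-bit immediate can be materialized with a plain 'movl'
// (writing a 32-bit register zeroes the upper half), avoiding the longer
// 'movabsq' encoding needed for the second case below.
unsigned long long smallImm() { return 0x12345678ULL; }         // movl suffices
unsigned long long largeImm() { return 0x0123456789ABCDEFULL; } // needs movabsq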
2376 
2377 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2378  SDValue &Scale, SDValue &Index,
2379  SDValue &Disp, SDValue &Segment) {
2380  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2381  SDLoc DL(N);
2382 
2383  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2384  return false;
2385 
2386  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2387  if (RN && RN->getReg() == 0)
2388  Base = CurDAG->getRegister(0, MVT::i64);
2389  else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2390  // Base could already be %rip, particularly in the x32 ABI.
2391  SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2392  MVT::i64), 0);
2393  Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2394  Base);
2395  }
2396 
2397  RN = dyn_cast<RegisterSDNode>(Index);
2398  if (RN && RN->getReg() == 0)
2399  Index = CurDAG->getRegister(0, MVT::i64);
2400  else {
2401  assert(Index.getValueType() == MVT::i32 &&
2402  "Expect to be extending 32-bit registers for use in LEA");
2403  SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2404  MVT::i64), 0);
2405  Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2406  Index);
2407  }
2408 
2409  return true;
2410 }
2411 
2412 /// Calls SelectAddr and determines if the maximal addressing
2413 /// mode it matches can be cost effectively emitted as an LEA instruction.
2414 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2415  SDValue &Base, SDValue &Scale,
2416  SDValue &Index, SDValue &Disp,
2417  SDValue &Segment) {
2418  X86ISelAddressMode AM;
2419 
2420  // Save the DL and VT before calling matchAddress, it can invalidate N.
2421  SDLoc DL(N);
2422  MVT VT = N.getSimpleValueType();
2423 
2424  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2425  // segments.
2426  SDValue Copy = AM.Segment;
2427  SDValue T = CurDAG->getRegister(0, MVT::i32);
2428  AM.Segment = T;
2429  if (matchAddress(N, AM))
2430  return false;
2431  assert (T == AM.Segment);
2432  AM.Segment = Copy;
2433 
2434  unsigned Complexity = 0;
2435  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
2436  Complexity = 1;
2437  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2438  Complexity = 4;
2439 
2440  if (AM.IndexReg.getNode())
2441  Complexity++;
2442 
2443  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
2444  // a simple shift.
2445  if (AM.Scale > 1)
2446  Complexity++;
2447 
2448  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2449  // to a LEA. This is determined with some experimentation but is by no means
2450  // optimal (especially for code size consideration). LEA is nice because of
2451  // its three-address nature. Tweak the cost function again when we can run
2452  // convertToThreeAddress() at register allocation time.
2453  if (AM.hasSymbolicDisplacement()) {
2454  // For X86-64, always use LEA to materialize RIP-relative addresses.
2455  if (Subtarget->is64Bit())
2456  Complexity = 4;
2457  else
2458  Complexity += 2;
2459  }
2460 
2461  if (AM.Disp)
2462  Complexity++;
2463 
2464  // If it isn't worth using an LEA, reject it.
2465  if (Complexity <= 2)
2466  return false;
2467 
2468  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2469  return true;
2470 }
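// --- Standalone illustration, not part of this file; the function name is
// made up. An address computation that clears the complexity threshold above
// (register base + scaled index + displacement) and is therefore typically
// emitted as a single three-address LEA rather than a mov/shift/add sequence,
// e.g.
//   leal 12(%rdi,%rsi,4), %eax
// (exact output depends on the compiler and optimization level).
int addressArithmetic(int a, int b) {
  return a + b * 4 + 12;
}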
2471 
2472 /// This is only run on TargetGlobalTLSAddress nodes.
2473 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2474  SDValue &Scale, SDValue &Index,
2475  SDValue &Disp, SDValue &Segment) {
2476  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2477  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2478 
2479  X86ISelAddressMode AM;
2480  AM.GV = GA->getGlobal();
2481  AM.Disp += GA->getOffset();
2482  AM.SymbolFlags = GA->getTargetFlags();
2483 
2484  MVT VT = N.getSimpleValueType();
2485  if (VT == MVT::i32) {
2486  AM.Scale = 1;
2487  AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2488  }
2489 
2490  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2491  return true;
2492 }
2493 
2494 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2495  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2496  Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2497  N.getValueType());
2498  return true;
2499  }
2500 
2501  // Keep track of the original value type and whether this value was
2502  // truncated. If we see a truncation from pointer type to VT that truncates
2503  // bits that are known to be zero, we can use a narrow reference.
2504  EVT VT = N.getValueType();
2505  bool WasTruncated = false;
2506  if (N.getOpcode() == ISD::TRUNCATE) {
2507  WasTruncated = true;
2508  N = N.getOperand(0);
2509  }
2510 
2511  if (N.getOpcode() != X86ISD::Wrapper)
2512  return false;
2513 
2514  // We can only use non-GlobalValues as immediates if they were not truncated,
2515  // as we do not have any range information. If we have a GlobalValue and the
2516  // address was not truncated, we can select it as an operand directly.
2517  unsigned Opc = N.getOperand(0)->getOpcode();
2518  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2519  Op = N.getOperand(0);
2520  // We can only select the operand directly if we didn't have to look past a
2521  // truncate.
2522  return !WasTruncated;
2523  }
2524 
2525  // Check that the global's range fits into VT.
2526  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2527  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2528  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2529  return false;
2530 
2531  // Okay, we can use a narrow reference.
2532  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2533  GA->getOffset(), GA->getTargetFlags());
2534  return true;
2535 }
2536 
2537 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2538  SDValue &Base, SDValue &Scale,
2539  SDValue &Index, SDValue &Disp,
2540  SDValue &Segment) {
2541  if (!ISD::isNON_EXTLoad(N.getNode()) ||
2542  !IsProfitableToFold(N, P, Root) ||
2543  !IsLegalToFold(N, P, Root, OptLevel))
2544  return false;
2545 
2546  return selectAddr(N.getNode(),
2547  N.getOperand(1), Base, Scale, Index, Disp, Segment);
2548 }
2549 
2550 /// Return an SDNode that returns the value of the global base register.
2551 /// Output instructions required to initialize the global base register,
2552 /// if necessary.
2553 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2554  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2555  auto &DL = MF->getDataLayout();
2556  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2557 }
2558 
2559 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2560  if (N->getOpcode() == ISD::TRUNCATE)
2561  N = N->getOperand(0).getNode();
2562  if (N->getOpcode() != X86ISD::Wrapper)
2563  return false;
2564 
2565  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2566  if (!GA)
2567  return false;
2568 
2569  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2570  return CR && CR->getSignedMin().sge(-1ull << Width) &&
2571  CR->getSignedMax().slt(1ull << Width);
2572 }
2573 
2574 static X86::CondCode getCondFromNode(SDNode *N) {
2575  assert(N->isMachineOpcode() && "Unexpected node");
2576  X86::CondCode CC = X86::COND_INVALID;
2577  unsigned Opc = N->getMachineOpcode();
2578  if (Opc == X86::JCC_1)
2579  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2580  else if (Opc == X86::SETCCr)
2581  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2582  else if (Opc == X86::SETCCm)
2583  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2584  else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2585  Opc == X86::CMOV64rr)
2586  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2587  else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2588  Opc == X86::CMOV64rm)
2589  CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2590 
2591  return CC;
2592 }
2593 
2594 /// Test whether the given X86ISD::CMP node has any users that use a flag
2595 /// other than ZF.
2596 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2597  // Examine each user of the node.
2598  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2599  UI != UE; ++UI) {
2600  // Only check things that use the flags.
2601  if (UI.getUse().getResNo() != Flags.getResNo())
2602  continue;
2603  // Only examine CopyToReg uses that copy to EFLAGS.
2604  if (UI->getOpcode() != ISD::CopyToReg ||
2605  cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2606  return false;
2607  // Examine each user of the CopyToReg use.
2608  for (SDNode::use_iterator FlagUI = UI->use_begin(),
2609  FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2610  // Only examine the Flag result.
2611  if (FlagUI.getUse().getResNo() != 1) continue;
2612  // Anything unusual: assume conservatively.
2613  if (!FlagUI->isMachineOpcode()) return false;
2614  // Examine the condition code of the user.
2615  X86::CondCode CC = getCondFromNode(*FlagUI);
2616 
2617  switch (CC) {
2618  // Comparisons which only use the zero flag.
2619  case X86::COND_E: case X86::COND_NE:
2620  continue;
2621  // Anything else: assume conservatively.
2622  default:
2623  return false;
2624  }
2625  }
2626  }
2627  return true;
2628 }
2629 
2630 /// Test whether the given X86ISD::CMP node has any uses which require the SF
2631 /// flag to be accurate.
2632 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2633  // Examine each user of the node.
2634  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2635  UI != UE; ++UI) {
2636  // Only check things that use the flags.
2637  if (UI.getUse().getResNo() != Flags.getResNo())
2638  continue;
2639  // Only examine CopyToReg uses that copy to EFLAGS.
2640  if (UI->getOpcode() != ISD::CopyToReg ||
2641  cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2642  return false;
2643  // Examine each user of the CopyToReg use.
2644  for (SDNode::use_iterator FlagUI = UI->use_begin(),
2645  FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2646  // Only examine the Flag result.
2647  if (FlagUI.getUse().getResNo() != 1) continue;
2648  // Anything unusual: assume conservatively.
2649  if (!FlagUI->isMachineOpcode()) return false;
2650  // Examine the condition code of the user.
2651  X86::CondCode CC = getCondFromNode(*FlagUI);
2652 
2653  switch (CC) {
2654  // Comparisons which don't examine the SF flag.
2655  case X86::COND_A: case X86::COND_AE:
2656  case X86::COND_B: case X86::COND_BE:
2657  case X86::COND_E: case X86::COND_NE:
2658  case X86::COND_O: case X86::COND_NO:
2659  case X86::COND_P: case X86::COND_NP:
2660  continue;
2661  // Anything else: assume conservatively.
2662  default:
2663  return false;
2664  }
2665  }
2666  }
2667  return true;
2668 }
2669 
2670 static bool mayUseCarryFlag(X86::CondCode CC) {
2671  switch (CC) {
2672  // Comparisons which don't examine the CF flag.
2673  case X86::COND_O: case X86::COND_NO:
2674  case X86::COND_E: case X86::COND_NE:
2675  case X86::COND_S: case X86::COND_NS:
2676  case X86::COND_P: case X86::COND_NP:
2677  case X86::COND_L: case X86::COND_GE:
2678  case X86::COND_G: case X86::COND_LE:
2679  return false;
2680  // Anything else: assume conservatively.
2681  default:
2682  return true;
2683  }
2684 }
2685 
2686 /// Test whether the given node which sets flags has any uses which require the
2687 /// CF flag to be accurate.
2688  bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2689  // Examine each user of the node.
2690  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2691  UI != UE; ++UI) {
2692  // Only check things that use the flags.
2693  if (UI.getUse().getResNo() != Flags.getResNo())
2694  continue;
2695 
2696  unsigned UIOpc = UI->getOpcode();
2697 
2698  if (UIOpc == ISD::CopyToReg) {
2699  // Only examine CopyToReg uses that copy to EFLAGS.
2700  if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2701  return false;
2702  // Examine each user of the CopyToReg use.
2703  for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2704  FlagUI != FlagUE; ++FlagUI) {
2705  // Only examine the Flag result.
2706  if (FlagUI.getUse().getResNo() != 1)
2707  continue;
2708  // Anything unusual: assume conservatively.
2709  if (!FlagUI->isMachineOpcode())
2710  return false;
2711  // Examine the condition code of the user.
2712  X86::CondCode CC = getCondFromNode(*FlagUI);
2713 
2714  if (mayUseCarryFlag(CC))
2715  return false;
2716  }
2717 
2718  // This CopyToReg is ok. Move on to the next user.
2719  continue;
2720  }
2721 
2722  // This might be an unselected node. So look for the pre-isel opcodes that
2723  // use flags.
2724  unsigned CCOpNo;
2725  switch (UIOpc) {
2726  default:
2727  // Something unusual. Be conservative.
2728  return false;
2729  case X86ISD::SETCC: CCOpNo = 0; break;
2730  case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2731  case X86ISD::CMOV: CCOpNo = 2; break;
2732  case X86ISD::BRCOND: CCOpNo = 2; break;
2733  }
2734 
2735  X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2736  if (mayUseCarryFlag(CC))
2737  return false;
2738  }
2739  return true;
2740 }
2741 
2742 /// Check whether or not the chain ending in StoreNode is suitable for doing
2743 /// the {load; op; store} to modify transformation.
2744 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2745  SDValue StoredVal, SelectionDAG *CurDAG,
2746  unsigned LoadOpNo,
2747  LoadSDNode *&LoadNode,
2748  SDValue &InputChain) {
2749  // Is the stored value result 0 of the operation?
2750  if (StoredVal.getResNo() != 0) return false;
2751 
2752  // Are there other uses of the operation other than the store?
2753  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2754 
2755  // Is the store non-extending and non-indexed?
2756  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2757  return false;
2758 
2759  SDValue Load = StoredVal->getOperand(LoadOpNo);
2760  // Is the stored value a non-extending and non-indexed load?
2761  if (!ISD::isNormalLoad(Load.getNode())) return false;
2762 
2763  // Return LoadNode by reference.
2764  LoadNode = cast<LoadSDNode>(Load);
2765 
2766  // Is store the only read of the loaded value?
2767  if (!Load.hasOneUse())
2768  return false;
2769 
2770  // Is the address of the store the same as the load?
2771  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2772  LoadNode->getOffset() != StoreNode->getOffset())
2773  return false;
2774 
2775  bool FoundLoad = false;
2776  SmallVector<SDValue, 4> ChainOps;
2777  SmallVector<const SDNode *, 4> LoopWorklist;
2778  SmallPtrSet<const SDNode *, 16> Visited;
2779  const unsigned int Max = 1024;
2780 
2781  // Visualization of Load-Op-Store fusion:
2782  // -------------------------
2783  // Legend:
2784  // *-lines = Chain operand dependencies.
2785  // |-lines = Normal operand dependencies.
2786  // Dependencies flow down and right. n-suffix references multiple nodes.
2787  //
2788  // C Xn C
2789  // * * *
2790  // * * *
2791  // Xn A-LD Yn TF Yn
2792  // * * \ | * |
2793  // * * \ | * |
2794  // * * \ | => A--LD_OP_ST
2795  // * * \| \
2796  // TF OP \
2797  // * | \ Zn
2798  // * | \
2799  // A-ST Zn
2800  //
2801 
2802  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
2803  // #2: Yn -> LD
2804  // #3: ST -> Zn
2805 
2806  // Ensure the transform is safe by checking for the dual
2807  // dependencies to make sure we do not induce a loop.
2808 
2809  // As LD is a predecessor to both OP and ST we can do this by checking:
2810  // a). if LD is a predecessor to a member of Xn or Yn.
2811  // b). if a Zn is a predecessor to ST.
2812 
2813  // However, (b) can only occur through being a chain predecessor to
2814  // ST, which is the same as Zn being a member or predecessor of Xn,
2815  // which is a subset of LD being a predecessor of Xn. So it's
2816  // subsumed by check (a).
2817 
2818  SDValue Chain = StoreNode->getChain();
2819 
2820  // Gather X elements in ChainOps.
2821  if (Chain == Load.getValue(1)) {
2822  FoundLoad = true;
2823  ChainOps.push_back(Load.getOperand(0));
2824  } else if (Chain.getOpcode() == ISD::TokenFactor) {
2825  for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2826  SDValue Op = Chain.getOperand(i);
2827  if (Op == Load.getValue(1)) {
2828  FoundLoad = true;
2829  // Drop Load, but keep its chain. No cycle check necessary.
2830  ChainOps.push_back(Load.getOperand(0));
2831  continue;
2832  }
2833  LoopWorklist.push_back(Op.getNode());
2834  ChainOps.push_back(Op);
2835  }
2836  }
2837 
2838  if (!FoundLoad)
2839  return false;
2840 
2841  // Worklist is currently Xn. Add Yn to worklist.
2842  for (SDValue Op : StoredVal->ops())
2843  if (Op.getNode() != LoadNode)
2844  LoopWorklist.push_back(Op.getNode());
2845 
2846  // Check (a) if Load is a predecessor to Xn + Yn
2847  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2848  true))
2849  return false;
2850 
2851  InputChain =
2852  CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2853  return true;
2854 }
2855 
2856 // Change a chain of {load; op; store} of the same value into a simple op
2857 // through memory of that value, if the uses of the modified value and its
2858 // address are suitable.
2859 //
2860 // The tablegen memory operand pattern is currently not able to match
2861 // the case where the EFLAGS on the original operation are used.
2862 //
2863 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2864 // be transferred from a node in the pattern to the result node, probably with
2865 // a new keyword. For example, we have this
2866 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2867 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2868 // (implicit EFLAGS)]>;
2869 // but maybe need something like this
2870 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2871 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2872 // (transferrable EFLAGS)]>;
2873 //
2874 // Until then, we manually fold these and instruction select the operation
2875 // here.
2876 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
2877  StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2878  SDValue StoredVal = StoreNode->getOperand(1);
2879  unsigned Opc = StoredVal->getOpcode();
2880 
2881  // Before we try to select anything, make sure this is memory operand size
2882  // and opcode we can handle. Note that this must match the code below that
2883  // actually lowers the opcodes.
2884  EVT MemVT = StoreNode->getMemoryVT();
2885  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
2886  MemVT != MVT::i8)
2887  return false;
2888 
2889  bool IsCommutable = false;
2890  bool IsNegate = false;
2891  switch (Opc) {
2892  default:
2893  return false;
2894  case X86ISD::SUB:
2895  IsNegate = isNullConstant(StoredVal.getOperand(0));
2896  break;
2897  case X86ISD::SBB:
2898  break;
2899  case X86ISD::ADD:
2900  case X86ISD::ADC:
2901  case X86ISD::AND:
2902  case X86ISD::OR:
2903  case X86ISD::XOR:
2904  IsCommutable = true;
2905  break;
2906  }
2907 
2908  unsigned LoadOpNo = IsNegate ? 1 : 0;
2909  LoadSDNode *LoadNode = nullptr;
2910  SDValue InputChain;
2911  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2912  LoadNode, InputChain)) {
2913  if (!IsCommutable)
2914  return false;
2915 
2916  // This operation is commutable, try the other operand.
2917  LoadOpNo = 1;
2918  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2919  LoadNode, InputChain))
2920  return false;
2921  }
2922 
2923  SDValue Base, Scale, Index, Disp, Segment;
2924  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
2925  Segment))
2926  return false;
2927 
2928  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
2929  unsigned Opc8) {
2930  switch (MemVT.getSimpleVT().SimpleTy) {
2931  case MVT::i64:
2932  return Opc64;
2933  case MVT::i32:
2934  return Opc32;
2935  case MVT::i16:
2936  return Opc16;
2937  case MVT::i8:
2938  return Opc8;
2939  default:
2940  llvm_unreachable("Invalid size!");
2941  }
2942  };
2943 
2944  MachineSDNode *Result;
2945  switch (Opc) {
2946  case X86ISD::SUB:
2947  // Handle negate.
2948  if (IsNegate) {
2949  unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
2950  X86::NEG8m);
2951  const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
2952  Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
2953  MVT::Other, Ops);
2954  break;
2955  }
2956  LLVM_FALLTHROUGH;
2957  case X86ISD::ADD:
2958  // Try to match inc/dec.
2959  if (!Subtarget->slowIncDec() || OptForSize) {
2960  bool IsOne = isOneConstant(StoredVal.getOperand(1));
2961  bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
2962  // ADD/SUB by 1/-1 can use inc/dec when the carry flag isn't used.
2963  if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
2964  unsigned NewOpc =
2965  ((Opc == X86ISD::ADD) == IsOne)
2966  ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
2967  : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
2968  const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
2969  Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
2970  MVT::Other, Ops);
2971  break;
2972  }
2973  }
2974  LLVM_FALLTHROUGH;
2975  case X86ISD::ADC:
2976  case X86ISD::SBB:
2977  case X86ISD::AND:
2978  case X86ISD::OR:
2979  case X86ISD::XOR: {
2980  auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
2981  switch (Opc) {
2982  case X86ISD::ADD:
2983  return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
2984  X86::ADD8mr);
2985  case X86ISD::ADC:
2986  return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
2987  X86::ADC8mr);
2988  case X86ISD::SUB:
2989  return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
2990  X86::SUB8mr);
2991  case X86ISD::SBB:
2992  return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
2993  X86::SBB8mr);
2994  case X86ISD::AND:
2995  return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
2996  X86::AND8mr);
2997  case X86ISD::OR:
2998  return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
2999  case X86ISD::XOR:
3000  return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3001  X86::XOR8mr);
3002  default:
3003  llvm_unreachable("Invalid opcode!");
3004  }
3005  };
3006  auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3007  switch (Opc) {
3008  case X86ISD::ADD:
3009  return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3010  case X86ISD::ADC:
3011  return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3012  case X86ISD::SUB:
3013  return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3014  case X86ISD::SBB:
3015  return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3016  case X86ISD::AND:
3017  return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3018  case X86ISD::OR:
3019  return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3020  case X86ISD::XOR:
3021  return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3022  default:
3023  llvm_unreachable("Invalid opcode!");
3024  }
3025  };
3026  auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3027  switch (Opc) {
3028  case X86ISD::ADD:
3029  return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3030  X86::ADD8mi);
3031  case X86ISD::ADC:
3032  return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3033  X86::ADC8mi);
3034  case X86ISD::SUB:
3035  return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3036  X86::SUB8mi);
3037  case X86ISD::SBB:
3038  return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3039  X86::SBB8mi);
3040  case X86ISD::AND:
3041  return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3042  X86::AND8mi);
3043  case X86ISD::OR:
3044  return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3045  X86::OR8mi);
3046  case X86ISD::XOR:
3047  return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3048  X86::XOR8mi);
3049  default:
3050  llvm_unreachable("Invalid opcode!");
3051  }
3052  };
3053 
3054  unsigned NewOpc = SelectRegOpcode(Opc);
3055  SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3056 
3057  // See if the operand is a constant that we can fold into an immediate
3058  // operand.
3059  if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3060  int64_t OperandV = OperandC->getSExtValue();
3061 
3062  // Check if we can shrink the operand enough to fit in an immediate (or
3063  // fit into a smaller immediate) by negating it and switching the
3064  // operation.
3065  if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3066  ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3067  (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3068  isInt<32>(-OperandV))) &&
3069  hasNoCarryFlagUses(StoredVal.getValue(1))) {
3070  OperandV = -OperandV;
3071  Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3072  }
3073 
3074  // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3075  // the larger immediate operand.
3076  if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
3077  Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3078  NewOpc = SelectImm8Opcode(Opc);
3079  } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3080  Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3081  NewOpc = SelectImmOpcode(Opc);
3082  }
3083  }
3084 
3085  if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3086  SDValue CopyTo =
3087  CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3088  StoredVal.getOperand(2), SDValue());
3089 
3090  const SDValue Ops[] = {Base, Scale, Index, Disp,
3091  Segment, Operand, CopyTo, CopyTo.getValue(1)};
3092  Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3093  Ops);
3094  } else {
3095  const SDValue Ops[] = {Base, Scale, Index, Disp,
3096  Segment, Operand, InputChain};
3097  Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3098  Ops);
3099  }
3100  break;
3101  }
3102  default:
3103  llvm_unreachable("Invalid opcode!");
3104  }
3105 
3106  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3107  LoadNode->getMemOperand()};
3108  CurDAG->setNodeMemRefs(Result, MemOps);
3109 
3110  // Update Load Chain uses as well.
3111  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3112  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3113  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3114  CurDAG->RemoveDeadNode(Node);
3115  return true;
3116 }
3117 
3118 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3119 // Where Mask is one of the following patterns:
3120 // a) x & (1 << nbits) - 1
3121 // b) x & ~(-1 << nbits)
3122 // c) x & (-1 >> (32 - y))
3123 // d) x << (32 - y) >> (32 - y)
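// Both target instructions implement this directly: BZHI takes the bit count
// n in a register and keeps the low n bits of x, while BEXTR takes a packed
// 'control' operand (start bit in bits 7:0, field length in bits 15:8) and
// extracts that bit field. For example, all four patterns above with n == 5
// reduce to x & 0x1f.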
3124 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3125  assert(
3126  (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3127  "Should be either an and-mask, or right-shift after clearing high bits.");
3128 
 3129  // BEXTR is a BMI instruction and BZHI is a BMI2 instruction. We need at least one.
3130  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3131  return false;
3132 
3133  MVT NVT = Node->getSimpleValueType(0);
3134 
3135  // Only supported for 32 and 64 bits.
3136  if (NVT != MVT::i32 && NVT != MVT::i64)
3137  return false;
3138 
3139  unsigned Size = NVT.getSizeInBits();
3140 
3141  SDValue NBits;
3142 
 3143  // If we have BMI2's BZHI, we are ok with multi-use patterns.
3144  // Else, if we only have BMI1's BEXTR, we require one-use.
3145  const bool CanHaveExtraUses = Subtarget->hasBMI2();
3146  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3147  return CanHaveExtraUses ||
3148  Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3149  };
3150  auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
3151  auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
3152 
3153  // a) x & ((1 << nbits) + (-1))
3154  auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool {
3155  // Match `add`. Must only have one use!
3156  if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3157  return false;
 3158  // We should be adding an all-ones constant (i.e. subtracting one).
3159  if (!isAllOnesConstant(Mask->getOperand(1)))
3160  return false;
3161  // Match `1 << nbits`. Must only have one use!
3162  SDValue M0 = Mask->getOperand(0);
3163  if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3164  return false;
3165  if (!isOneConstant(M0->getOperand(0)))
3166  return false;
3167  NBits = M0->getOperand(1);
3168  return true;
3169  };
3170 
3171  // b) x & ~(-1 << nbits)
3172  auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool {
3173  // Match `~()`. Must only have one use!
3174  if (!isBitwiseNot(Mask) || !checkOneUse(Mask))
3175  return false;
3176  // Match `-1 << nbits`. Must only have one use!
3177  SDValue M0 = Mask->getOperand(0);
3178  if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3179  return false;
3180  if (!isAllOnesConstant(M0->getOperand(0)))
3181  return false;
3182  NBits = M0->getOperand(1);
3183  return true;
3184  };
3185 
3186  // Match potentially-truncated (bitwidth - y)
3187  auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) {
3188  // Skip over a truncate of the shift amount.
3189  if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3190  ShiftAmt = ShiftAmt.getOperand(0);
3191  // The trunc should have been the only user of the real shift amount.
3192  if (!checkOneUse(ShiftAmt))
3193  return false;
3194  }
3195  // Match the shift amount as: (bitwidth - y). It should go away, too.
3196  if (ShiftAmt.getOpcode() != ISD::SUB)
3197  return false;
3198  auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3199  if (!V0 || V0->getZExtValue() != Size)
3200  return false;
3201  NBits = ShiftAmt.getOperand(1);
3202  return true;
3203  };
3204 
3205  // c) x & (-1 >> (32 - y))
3206  auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool {
3207  // Match `l>>`. Must only have one use!
3208  if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3209  return false;
 3210  // We should be shifting an all-ones constant.
3211  if (!isAllOnesConstant(Mask.getOperand(0)))
3212  return false;
3213  SDValue M1 = Mask.getOperand(1);
3214  // The shift amount should not be used externally.
3215  if (!checkOneUse(M1))
3216  return false;
3217  return matchShiftAmt(M1);
3218  };
3219 
3220  SDValue X;
3221 
3222  // d) x << (32 - y) >> (32 - y)
3223  auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt,
3224  &X](SDNode *Node) -> bool {
3225  if (Node->getOpcode() != ISD::SRL)
3226  return false;
3227  SDValue N0 = Node->getOperand(0);
3228  if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
3229  return false;
3230  SDValue N1 = Node->getOperand(1);
3231  SDValue N01 = N0->getOperand(1);
3232  // Both of the shifts must be by the exact same value.
3233  // There should not be any uses of the shift amount outside of the pattern.
3234  if (N1 != N01 || !checkTwoUse(N1))
3235  return false;
3236  if (!matchShiftAmt(N1))
3237  return false;
3238  X = N0->getOperand(0);
3239  return true;
3240  };
3241 
3242  auto matchLowBitMask = [&matchPatternA, &matchPatternB,
3243  &matchPatternC](SDValue Mask) -> bool {
3244  // FIXME: pattern c.
3245  return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3246  };
3247 
3248  if (Node->getOpcode() == ISD::AND) {
3249  X = Node->getOperand(0);
3250  SDValue Mask = Node->getOperand(1);
3251 
3252  if (matchLowBitMask(Mask)) {
3253  // Great.
3254  } else {
3255  std::swap(X, Mask);
3256  if (!matchLowBitMask(Mask))
3257  return false;
3258  }
3259  } else if (!matchPatternD(Node))
3260  return false;
3261 
3262  SDLoc DL(Node);
3263 
 3264  // If we do *NOT* have BMI2, let's find out if 'X' is *logically* shifted
 3265  // (potentially with a one-use trunc in between),
3266  // and if so look past one-use truncation.
3267  MVT XVT = NVT;
3268  if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE &&
3269  X.hasOneUse() && X.getOperand(0).getOpcode() == ISD::SRL) {
3270  assert(NVT == MVT::i32 && "Expected target valuetype to be i32");
3271  X = X.getOperand(0);
3272  XVT = X.getSimpleValueType();
3273  assert(XVT == MVT::i64 && "Expected truncation from i64");
3274  }
3275 
3276  // Truncate the shift amount.
3277  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3278  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3279 
3280  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3281  // All the other bits are undefined, we do not care about them.
3282  SDValue ImplDef = SDValue(
3283  CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3284  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3285  NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
3286  NBits);
3287  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3288 
3289  if (Subtarget->hasBMI2()) {
 3290  // Great, just emit the BZHI.
3291  if (XVT != MVT::i32) {
 3292  // But we have to place the bit count into the wide-enough register first.
3293  NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, NBits);
3294  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3295  }
3296 
3297  SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits);
3298  ReplaceNode(Node, Extract.getNode());
3299  SelectCode(Extract.getNode());
3300  return true;
3301  }
3302 
3303  // Else, emitting BEXTR requires one more step.
3304  // The 'control' of BEXTR has the pattern of:
3305  // [15...8 bit][ 7...0 bit] location
3306  // [ bit count][ shift] name
 3307  // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
3308 
3309  // Shift NBits left by 8 bits, thus producing 'control'.
 3310  // This leaves the low 8 bits zero.
3311  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3312  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3313  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3314 
3315  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3316  if (X.getOpcode() == ISD::SRL) {
3317  SDValue ShiftAmt = X.getOperand(1);
3318  X = X.getOperand(0);
3319 
3320  assert(ShiftAmt.getValueType() == MVT::i8 &&
3321  "Expected shift amount to be i8");
3322 
3323  // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3324  // We could zext to i16 in some form, but we intentionally don't do that.
3325  SDValue OrigShiftAmt = ShiftAmt;
3326  ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3327  insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3328 
3329  // And now 'or' these low 8 bits of shift amount into the 'control'.
3330  Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3331  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3332  }
3333 
 3334  // But we have to place the 'control' into the wide-enough register first.
3335  if (XVT != MVT::i32) {
3336  Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3337  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3338  }
3339 
3340  // And finally, form the BEXTR itself.
3341  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3342 
 3343  // 'X' was originally truncated; truncate the result back down now.
3344  if (XVT != NVT) {
3345  insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3346  Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3347  }
3348 
3349  ReplaceNode(Node, Extract.getNode());
3350  SelectCode(Extract.getNode());
3351 
3352  return true;
3353 }
3354 
3355 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
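// For example, (x >> 4) & 0xff has Shift == 4 and a popcount-8 mask, so it
// maps to a control value of 4 | (8 << 8) == 0x0804 (built below).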
3356 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3357  MVT NVT = Node->getSimpleValueType(0);
3358  SDLoc dl(Node);
3359 
3360  SDValue N0 = Node->getOperand(0);
3361  SDValue N1 = Node->getOperand(1);
3362 
3363  // If we have TBM we can use an immediate for the control. If we have BMI
3364  // we should only do this if the BEXTR instruction is implemented well.
3365  // Otherwise moving the control into a register makes this more costly.
3366  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3367  // hoisting the move immediate would make it worthwhile with a less optimal
3368  // BEXTR?
3369  if (!Subtarget->hasTBM() &&
3370  !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
3371  return nullptr;
3372 
3373  // Must have a shift right.
3374  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3375  return nullptr;
3376 
3377  // Shift can't have additional users.
3378  if (!N0->hasOneUse())
3379  return nullptr;
3380 
3381  // Only supported for 32 and 64 bits.
3382  if (NVT != MVT::i32 && NVT != MVT::i64)
3383  return nullptr;
3384 
3385  // Shift amount and RHS of and must be constant.
3386  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3387  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3388  if (!MaskCst || !ShiftCst)
3389  return nullptr;
3390 
3391  // And RHS must be a mask.
3392  uint64_t Mask = MaskCst->getZExtValue();
3393  if (!isMask_64(Mask))
3394  return nullptr;
3395 
3396  uint64_t Shift = ShiftCst->getZExtValue();
3397  uint64_t MaskSize = countPopulation(Mask);
3398 
3399  // Don't interfere with something that can be handled by extracting AH.
3400  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3401  if (Shift == 8 && MaskSize == 8)
3402  return nullptr;
3403 
3404  // Make sure we are only using bits that were in the original value, not
3405  // shifted in.
3406  if (Shift + MaskSize > NVT.getSizeInBits())
3407  return nullptr;
3408 
3409  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3410  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3411  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3412 
 3413  // BMI requires the immediate to be placed in a register.
3414  if (!Subtarget->hasTBM()) {
3415  ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3416  MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3417  unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3418  New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
3419  }
3420 
3421  MachineSDNode *NewNode;
3422  SDValue Input = N0->getOperand(0);
3423  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3424  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3425  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
3426  SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3427  NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3428  // Update the chain.
3429  ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3430  // Record the mem-refs
3431  CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3432  } else {
3433  NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
3434  }
3435 
3436  return NewNode;
3437 }
3438 
 3439 // Emit a PCMPISTR(I/M) instruction.
3440 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3441  bool MayFoldLoad, const SDLoc &dl,
3442  MVT VT, SDNode *Node) {
3443  SDValue N0 = Node->getOperand(0);
3444  SDValue N1 = Node->getOperand(1);
3445  SDValue Imm = Node->getOperand(2);
3446  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3447  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3448 
3449  // Try to fold a load. No need to check alignment.
3450  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3451  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3452  SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3453  N1.getOperand(0) };
3454  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3455  MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3456  // Update the chain.
3457  ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3458  // Record the mem-refs
3459  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3460  return CNode;
3461  }
3462 
3463  SDValue Ops[] = { N0, N1, Imm };
3464  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3465  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3466  return CNode;
3467 }
3468 
 3469 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
3470 // to emit a second instruction after this one. This is needed since we have two
3471 // copyToReg nodes glued before this and we need to continue that glue through.
3472 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3473  bool MayFoldLoad, const SDLoc &dl,
3474  MVT VT, SDNode *Node,
3475  SDValue &InFlag) {
3476  SDValue N0 = Node->getOperand(0);
3477  SDValue N2 = Node->getOperand(2);
3478  SDValue Imm = Node->getOperand(4);
3479  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3480  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3481 
3482  // Try to fold a load. No need to check alignment.
3483  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3484  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3485  SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3486  N2.getOperand(0), InFlag };
3487  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3488  MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3489  InFlag = SDValue(CNode, 3);
3490  // Update the chain.
3491  ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3492  // Record the mem-refs
3493  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3494  return CNode;
3495  }
3496 
3497  SDValue Ops[] = { N0, N2, Imm, InFlag };
3498  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3499  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3500  InFlag = SDValue(CNode, 2);
3501  return CNode;
3502 }
3503 
3504 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3505  EVT VT = N->getValueType(0);
3506 
3507  // Only handle scalar shifts.
3508  if (VT.isVector())
3509  return false;
3510 
3511  // Narrower shifts only mask to 5 bits in hardware.
3512  unsigned Size = VT == MVT::i64 ? 64 : 32;
3513 
3514  SDValue OrigShiftAmt = N->getOperand(1);
3515  SDValue ShiftAmt = OrigShiftAmt;
3516  SDLoc DL(N);
3517 
3518  // Skip over a truncate of the shift amount.
3519  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3520  ShiftAmt = ShiftAmt->getOperand(0);
3521 
3522  // This function is called after X86DAGToDAGISel::matchBitExtract(),
 3523  // so we are not afraid that we might mess up the BZHI/BEXTR patterns.
3524 
3525  SDValue NewShiftAmt;
3526  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3527  SDValue Add0 = ShiftAmt->getOperand(0);
3528  SDValue Add1 = ShiftAmt->getOperand(1);
3529  // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3530  // to avoid the ADD/SUB.
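  // e.g. an i64 shift by (y + 64) selects the same instruction as a shift
  // by y, because the hardware masks the count to the low 6 bits.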
3531  if (isa<ConstantSDNode>(Add1) &&
3532  cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3533  NewShiftAmt = Add0;
3534  // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3535  // generate a NEG instead of a SUB of a constant.
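  // e.g. for i32, a count of (32 - y) is congruent to -y mod 32, so the
  // SUB can be replaced by a cheaper NEG of y.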
3536  } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3537  isa<ConstantSDNode>(Add0) &&
3538  cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3539  cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3540  // Insert a negate op.
3541  // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3542  // that uses it that's not a shift.
3543  EVT SubVT = ShiftAmt.getValueType();
3544  SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3545  SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3546  NewShiftAmt = Neg;
3547 
3548  // Insert these operands into a valid topological order so they can
3549  // get selected independently.
3550  insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3551  insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3552  } else
3553  return false;
3554  } else
3555  return false;
3556 
3557  if (NewShiftAmt.getValueType() != MVT::i8) {
3558  // Need to truncate the shift amount.
3559  NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3560  // Add to a correct topological ordering.
3561  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3562  }
3563 
3564  // Insert a new mask to keep the shift amount legal. This should be removed
3565  // by isel patterns.
3566  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3567  CurDAG->getConstant(Size - 1, DL, MVT::i8));
3568  // Place in a correct topological ordering.
3569  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3570 
3571  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3572  NewShiftAmt);
3573  if (UpdatedNode != N) {
3574  // If we found an existing node, we should replace ourselves with that node
3575  // and wait for it to be selected after its other users.
3576  ReplaceNode(N, UpdatedNode);
3577  return true;
3578  }
3579 
3580  // If the original shift amount is now dead, delete it so that we don't run
3581  // it through isel.
3582  if (OrigShiftAmt.getNode()->use_empty())
3583  CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3584 
3585  // Now that we've optimized the shift amount, defer to normal isel to get
3586  // load folding and legacy vs BMI2 selection without repeating it here.
3587  SelectCode(N);
3588  return true;
3589 }
3590 
3591 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3592  MVT NVT = N->getSimpleValueType(0);
3593  unsigned Opcode = N->getOpcode();
3594  SDLoc dl(N);
3595 
3596  // For operations of the form (x << C1) op C2, check if we can use a smaller
3597  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
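  // For example, (x << 8) & 0xff00 becomes (x & 0xff) << 8: the 0xff mask
  // can then be selected as a MOVZX (or an imm8) instead of a wider immediate.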
3598  SDValue Shift = N->getOperand(0);
3599  SDValue N1 = N->getOperand(1);
3600 
 3601  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
 3602  if (!Cst)
3603  return false;
3604 
3605  int64_t Val = Cst->getSExtValue();
3606 
3607  // If we have an any_extend feeding the AND, look through it to see if there
3608  // is a shift behind it. But only if the AND doesn't use the extended bits.
3609  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
3610  bool FoundAnyExtend = false;
3611  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
3612  Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
3613  isUInt<32>(Val)) {
3614  FoundAnyExtend = true;
3615  Shift = Shift.getOperand(0);
3616  }
3617 
3618  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
3619  return false;
3620 
3621  // i8 is unshrinkable, i16 should be promoted to i32.
3622  if (NVT != MVT::i32 && NVT != MVT::i64)
3623  return false;
3624 
3625  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3626  if (!ShlCst)
3627  return false;
3628 
3629  uint64_t ShAmt = ShlCst->getZExtValue();
3630 
3631  // Make sure that we don't change the operation by removing bits.
3632  // This only matters for OR and XOR, AND is unaffected.
3633  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3634  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3635  return false;
3636 
3637  // Check the minimum bitwidth for the new constant.
3638  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3639  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3640  if (Opcode == ISD::AND) {
3641  // AND32ri is the same as AND64ri32 with zext imm.
3642  // Try this before sign extended immediates below.
3643  ShiftedVal = (uint64_t)Val >> ShAmt;
3644  if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3645  return true;
3646  // Also swap order when the AND can become MOVZX.
3647  if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
3648  return true;
3649  }
3650  ShiftedVal = Val >> ShAmt;
3651  if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
3652  (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
3653  return true;
3654  if (Opcode != ISD::AND) {
3655  // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3656  ShiftedVal = (uint64_t)Val >> ShAmt;
3657  if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3658  return true;
3659  }
3660  return false;
3661  };
3662 
3663  int64_t ShiftedVal;
3664  if (!CanShrinkImmediate(ShiftedVal))
3665  return false;
3666 
3667  // Ok, we can reorder to get a smaller immediate.
3668 
 3669  // But it's possible the original immediate allowed an AND to become MOVZX.
 3670  // Do this check late so the MaskedValueIsZero call happens as late as
 3671  // possible.
3672  if (Opcode == ISD::AND) {
3673  // Find the smallest zext this could possibly be.
3674  unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3675  ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3676 
3677  // Figure out which bits need to be zero to achieve that mask.
3678  APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
3679  ZExtWidth);
3680  NeededMask &= ~Cst->getAPIntValue();
3681 
3682  if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3683  return false;
3684  }
3685 
3686  SDValue X = Shift.getOperand(0);
3687  if (FoundAnyExtend) {
3688  SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3689  insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3690  X = NewX;
3691  }
3692 
3693  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3694  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3695  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3696  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3697  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3698  Shift.getOperand(1));
3699  ReplaceNode(N, NewSHL.getNode());
3700  SelectCode(NewSHL.getNode());
3701  return true;
3702 }
3703 
3704 /// If the high bits of an 'and' operand are known zero, try setting the
3705 /// high bits of an 'and' constant operand to produce a smaller encoding by
3706 /// creating a small, sign-extended negative immediate rather than a large
3707 /// positive one. This reverses a transform in SimplifyDemandedBits that
3708 /// shrinks mask constants by clearing bits. There is also a possibility that
3709 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3710 /// case, just replace the 'and'. Return 'true' if the node is replaced.
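/// For example, if the top four bits of the other operand are known zero, an
/// i32 mask of 0x0ffffff0 (a 4-byte immediate) can be widened to 0xfffffff0,
/// i.e. -16, which encodes as a sign-extended 8-bit immediate.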
3711 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3712  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3713  // have immediate operands.
3714  MVT VT = And->getSimpleValueType(0);
3715  if (VT != MVT::i32 && VT != MVT::i64)
3716  return false;
3717 
3718  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
3719  if (!And1C)
3720  return false;
3721 
 3722  // Bail out if the mask constant is already negative. It can't shrink any further.
3723  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
3724  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
3725  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
3726  // are negative too.
3727  APInt MaskVal = And1C->getAPIntValue();
3728  unsigned MaskLZ = MaskVal.countLeadingZeros();
3729  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
3730  return false;
3731 
3732  // Don't extend into the upper 32 bits of a 64 bit mask.
3733  if (VT == MVT::i64 && MaskLZ >= 32) {
3734  MaskLZ -= 32;
3735  MaskVal = MaskVal.trunc(32);
3736  }
3737 
3738  SDValue And0 = And->getOperand(0);
3739  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3740  APInt NegMaskVal = MaskVal | HighZeros;
3741 
3742  // If a negative constant would not allow a smaller encoding, there's no need
3743  // to continue. Only change the constant when we know it's a win.
3744  unsigned MinWidth = NegMaskVal.getMinSignedBits();
3745  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
3746  return false;
3747 
3748  // Extend masks if we truncated above.
3749  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
3750  NegMaskVal = NegMaskVal.zext(64);
3751  HighZeros = HighZeros.zext(64);
3752  }
3753 
3754  // The variable operand must be all zeros in the top bits to allow using the
3755  // new, negative constant as the mask.
3756  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3757  return false;
3758 
3759  // Check if the mask is -1. In that case, this is an unnecessary instruction
3760  // that escaped earlier analysis.
3761  if (NegMaskVal.isAllOnesValue()) {
3762  ReplaceNode(And, And0.getNode());
3763  return true;
3764  }
3765 
3766  // A negative mask allows a smaller encoding. Create a new 'and' node.
3767  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
3768  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
3769  ReplaceNode(And, NewAnd.getNode());
3770  SelectCode(NewAnd.getNode());
3771  return true;
3772 }
3773 
3774 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
3775  bool FoldedBCast, bool Masked) {
3776  if (Masked) {
3777  if (FoldedLoad) {
3778  switch (TestVT.SimpleTy) {
3779  default: llvm_unreachable("Unexpected VT!");
3780  case MVT::v16i8:
3781  return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
3782  case MVT::v8i16:
3783  return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
3784  case MVT::v4i32:
3785  return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
3786  case MVT::v2i64:
3787  return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
3788  case MVT::v32i8:
3789  return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
3790  case MVT::v16i16:
3791  return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
3792  case MVT::v8i32:
3793  return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
3794  case MVT::v4i64:
3795  return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
3796  case MVT::v64i8:
3797  return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
3798  case MVT::v32i16:
3799  return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
3800  case MVT::v16i32:
3801  return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
3802  case MVT::v8i64:
3803  return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
3804  }
3805  }
3806 
3807  if (FoldedBCast) {
3808  switch (TestVT.SimpleTy) {
3809  default: llvm_unreachable("Unexpected VT!");
3810  case MVT::v4i32:
3811  return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
3812  case MVT::v2i64:
3813  return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
3814  case MVT::v8i32:
3815  return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
3816  case MVT::v4i64:
3817  return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
3818  case MVT::v16i32:
3819  return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
3820  case MVT::v8i64:
3821  return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
3822  }
3823  }
3824 
3825  switch (TestVT.SimpleTy) {
3826  default: llvm_unreachable("Unexpected VT!");
3827  case MVT::v16i8:
3828  return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
3829  case MVT::v8i16:
3830  return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
3831  case MVT::v4i32:
3832  return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
3833  case MVT::v2i64:
3834  return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
3835  case MVT::v32i8:
3836  return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
3837  case MVT::v16i16:
3838  return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
3839  case MVT::v8i32:
3840  return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
3841  case MVT::v4i64:
3842  return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
3843  case MVT::v64i8:
3844  return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
3845  case MVT::v32i16:
3846  return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
3847  case MVT::v16i32:
3848  return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
3849  case MVT::v8i64:
3850  return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
3851  }
3852  }
3853 
3854  if (FoldedLoad) {
3855  switch (TestVT.SimpleTy) {
3856  default: llvm_unreachable("Unexpected VT!");
3857  case MVT::v16i8:
3858  return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
3859  case MVT::v8i16:
3860  return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
3861  case MVT::v4i32:
3862  return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
3863  case MVT::v2i64:
3864  return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
3865  case MVT::v32i8:
3866  return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
3867  case MVT::v16i16:
3868  return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
3869  case MVT::v8i32:
3870  return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
3871  case MVT::v4i64:
3872  return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
3873  case MVT::v64i8:
3874  return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
3875  case MVT::v32i16:
3876  return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
3877  case MVT::v16i32:
3878  return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
3879  case MVT::v8i64:
3880  return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
3881  }
3882  }
3883 
3884  if (FoldedBCast) {
3885  switch (TestVT.SimpleTy) {
3886  default: llvm_unreachable("Unexpected VT!");
3887  case MVT::v4i32:
3888  return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
3889  case MVT::v2i64:
3890  return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
3891  case MVT::v8i32:
3892  return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
3893  case MVT::v4i64:
3894  return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
3895  case MVT::v16i32:
3896  return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
3897  case MVT::v8i64:
3898  return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
3899  }
3900  }
3901 
3902  switch (TestVT.SimpleTy) {
3903  default: llvm_unreachable("Unexpected VT!");
3904  case MVT::v16i8:
3905  return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
3906  case MVT::v8i16:
3907  return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
3908  case MVT::v4i32:
3909  return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
3910  case MVT::v2i64:
3911  return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
3912  case MVT::v32i8:
3913  return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
3914  case MVT::v16i16:
3915  return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
3916  case MVT::v8i32:
3917  return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
3918  case MVT::v4i64:
3919  return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
3920  case MVT::v64i8:
3921  return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
3922  case MVT::v32i16:
3923  return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
3924  case MVT::v16i32:
3925  return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
3926  case MVT::v8i64:
3927  return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
3928  }
3929 }
3930 
3931 // Try to create VPTESTM instruction. If InMask is not null, it will be used
3932 // to form a masked operation.
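// VPTESTM sets each result mask bit to ((Src0 & Src1) != 0) for its element,
// so a (setcc (and X, Y), 0, ne) producing a vXi1 result maps onto it
// directly; the SETEQ form uses VPTESTNM instead.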
3933 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
3934  SDValue InMask) {
3935  assert(Subtarget->hasAVX512() && "Expected AVX512!");
3937  "Unexpected VT!");
3938 
3939  // Look for equal and not equal compares.
3940  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
3941  if (CC != ISD::SETEQ && CC != ISD::SETNE)
3942  return false;
3943 
3944  // See if we're comparing against zero. This should have been canonicalized
3945  // to RHS during lowering.
 3946  if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
 3947  return false;
3948 
3949  SDValue N0 = Setcc.getOperand(0);
3950 
3951  MVT CmpVT = N0.getSimpleValueType();
3952  MVT CmpSVT = CmpVT.getVectorElementType();
3953 
3954  // Start with both operands the same. We'll try to refine this.
3955  SDValue Src0 = N0;
3956  SDValue Src1 = N0;
3957 
3958  {
3959  // Look through single use bitcasts.
3960  SDValue N0Temp = N0;
3961  if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
3962  N0Temp = N0.getOperand(0);
3963 
3964  // Look for single use AND.
3965  if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
3966  Src0 = N0Temp.getOperand(0);
3967  Src1 = N0Temp.getOperand(1);
3968  }
3969  }
3970 
3971  // Without VLX we need to widen the load.
3972  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
3973 
3974  // We can only fold loads if the sources are unique.
3975  bool CanFoldLoads = Src0 != Src1;
3976 
3977  // Try to fold loads unless we need to widen.
3978  bool FoldedLoad = false;
3979  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
3980  if (!Widen && CanFoldLoads) {
3981  Load = Src1;
3982  FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
3983  Tmp4);
3984  if (!FoldedLoad) {
 3985  // And is commutative.
3986  Load = Src0;
3987  FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
3988  Tmp3, Tmp4);
3989  if (FoldedLoad)
3990  std::swap(Src0, Src1);
3991  }
3992  }
3993 
3994  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
3995  // Look through single use bitcasts.
3996  if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
3997  Src = Src.getOperand(0);
3998 
3999  if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
4000  Parent = Src.getNode();
4001  Src = Src.getOperand(0);
4002  if (Src.getSimpleValueType() == CmpSVT)
4003  return Src;
4004  }
4005 
4006  return SDValue();
4007  };
4008 
4009  // If we didn't fold a load, try to match broadcast. No widening limitation
4010  // for this. But only 32 and 64 bit types are supported.
4011  bool FoldedBCast = false;
4012  if (!FoldedLoad && CanFoldLoads &&
4013  (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
4014  SDNode *ParentNode = nullptr;
4015  if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4016  FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
4017  Tmp1, Tmp2, Tmp3, Tmp4);
4018  }
4019 
4020  // Try the other operand.
4021  if (!FoldedBCast) {
4022  if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4023  FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
4024  Tmp1, Tmp2, Tmp3, Tmp4);
4025  if (FoldedBCast)
4026  std::swap(Src0, Src1);
4027  }
4028  }
4029  }
4030 
4031  auto getMaskRC = [](MVT MaskVT) {
4032  switch (MaskVT.SimpleTy) {
4033  default: llvm_unreachable("Unexpected VT!");
4034  case MVT::v2i1: return X86::VK2RegClassID;
4035  case MVT::v4i1: return X86::VK4RegClassID;
4036  case MVT::v8i1: return X86::VK8RegClassID;
4037  case MVT::v16i1: return X86::VK16RegClassID;
4038  case MVT::v32i1: return X86::VK32RegClassID;
4039  case MVT::v64i1: return X86::VK64RegClassID;
4040  }
4041  };
4042 
4043  bool IsMasked = InMask.getNode() != nullptr;
4044 
4045  SDLoc dl(Root);
4046 
4047  MVT ResVT = Setcc.getSimpleValueType();
4048  MVT MaskVT = ResVT;
4049  if (Widen) {
4050  // Widen the inputs using insert_subreg or copy_to_regclass.
4051  unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4052  unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4053  unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4054  CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4055  MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4056  SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4057  CmpVT), 0);
4058  Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4059 
4060  assert(!FoldedLoad && "Shouldn't have folded the load");
4061  if (!FoldedBCast)
4062  Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4063 
4064  if (IsMasked) {
4065  // Widen the mask.
4066  unsigned RegClass = getMaskRC(MaskVT);
4067  SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4068  InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4069  dl, MaskVT, InMask, RC), 0);
4070  }
4071  }
4072 
4073  bool IsTestN = CC == ISD::SETEQ;
4074  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4075  IsMasked);
4076 
4077  MachineSDNode *CNode;
4078  if (FoldedLoad || FoldedBCast) {
4079  SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4080 
4081  if (IsMasked) {
4082  SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4083  Load.getOperand(0) };
4084  CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4085  } else {
4086  SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4087  Load.getOperand(0) };
4088  CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4089  }
4090 
4091  // Update the chain.
4092  ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4093  // Record the mem-refs
4094  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
4095  } else {
4096  if (IsMasked)
4097  CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4098  else
4099  CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4100  }
4101 
4102  // If we widened, we need to shrink the mask VT.
4103  if (Widen) {
4104  unsigned RegClass = getMaskRC(ResVT);
4105  SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4106  CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4107  dl, ResVT, SDValue(CNode, 0), RC);
4108  }
4109 
4110  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4111  CurDAG->RemoveDeadNode(Root);
4112  return true;
4113 }
4114 
4115 void X86DAGToDAGISel::Select(SDNode *Node) {
4116  MVT NVT = Node->getSimpleValueType(0);
4117  unsigned Opcode = Node->getOpcode();
4118  SDLoc dl(Node);
4119 
4120  if (Node->isMachineOpcode()) {
4121  LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4122  Node->setNodeId(-1);
4123  return; // Already selected.
4124  }
4125 
4126  switch (Opcode) {
4127  default: break;
4128  case ISD::INTRINSIC_VOID: {
4129  unsigned IntNo = Node->getConstantOperandVal(1);
4130  switch (IntNo) {
4131  default: break;
4132  case Intrinsic::x86_sse3_monitor:
4133  case Intrinsic::x86_monitorx:
4134  case Intrinsic::x86_clzero: {
4135  bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4136 
4137  unsigned Opc = 0;
4138  switch (IntNo) {
4139  case Intrinsic::x86_sse3_monitor:
4140  if (!Subtarget->hasSSE3())
4141  break;
4142  Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
4143  break;
4144  case Intrinsic::x86_monitorx:
4145  if (!Subtarget->hasMWAITX())
4146  break;
4147  Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
4148  break;
4149  case Intrinsic::x86_clzero:
4150  if (!Subtarget->hasCLZERO())
4151  break;
4152  Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
4153  break;
4154  }
4155 
4156  if (Opc) {
4157  unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
4158  SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4159  Node->getOperand(2), SDValue());
4160  SDValue InFlag = Chain.getValue(1);
4161 
4162  if (IntNo == Intrinsic::x86_sse3_monitor ||
4163  IntNo == Intrinsic::x86_monitorx) {
4164  // Copy the other two operands to ECX and EDX.
4165  Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4166  InFlag);
4167  InFlag = Chain.getValue(1);
4168  Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4169  InFlag);
4170  InFlag = Chain.getValue(1);
4171  }
4172 
4173  MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4174  { Chain, InFlag});
4175  ReplaceNode(Node, CNode);
4176  return;
4177  }
4178  }
4179  }
4180 
4181  break;
4182  }
4183  case ISD::BRIND: {
4184  if (Subtarget->isTargetNaCl())
 4185  // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
4186  // leave the instruction alone.
4187  break;
4188  if (Subtarget->isTarget64BitILP32()) {
4189  // Converts a 32-bit register to a 64-bit, zero-extended version of
4190  // it. This is needed because x86-64 can do many things, but jmp %r32
4191  // ain't one of them.
4192  const SDValue &Target = Node->getOperand(1);
 4193  assert(Target.getSimpleValueType() == llvm::MVT::i32);
 4194  SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
4195  SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4196  Node->getOperand(0), ZextTarget);
4197  ReplaceNode(Node, Brind.getNode());
4198  SelectCode(ZextTarget.getNode());
4199  SelectCode(Brind.getNode());
4200  return;
4201  }
4202  break;
4203  }
4204  case X86ISD::GlobalBaseReg:
4205  ReplaceNode(Node, getGlobalBaseReg());
4206  return;
4207 
4208  case ISD::BITCAST:
4209  // Just drop all 128/256/512-bit bitcasts.
4210  if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
4211  NVT == MVT::f128) {
4212  ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4213  CurDAG->RemoveDeadNode(Node);
4214  return;
4215  }
4216  break;
4217 
4218  case ISD::VSELECT: {
 4219  // Replace VSELECT with non-mask conditions with BLENDV.
 4220  if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
 4221  break;
4222 
4223  assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
4224  SDValue Blendv = CurDAG->getNode(
4225  X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
4226  Node->getOperand(1), Node->getOperand(2));
4227  ReplaceNode(Node, Blendv.getNode());
4228  SelectCode(Blendv.getNode());
4229  // We already called ReplaceUses.
4230  return;
4231  }
4232 
4233  case ISD::SRL:
4234  if (matchBitExtract(Node))
4235  return;
 4236  LLVM_FALLTHROUGH;
 4237  case ISD::SRA:
4238  case ISD::SHL:
4239  if (tryShiftAmountMod(Node))
4240  return;
4241  break;
4242 
4243  case ISD::AND:
4244  if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
4245  // Try to form a masked VPTESTM. Operands can be in either order.
4246  SDValue N0 = Node->getOperand(0);
4247  SDValue N1 = Node->getOperand(1);
4248  if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
4249  tryVPTESTM(Node, N0, N1))
4250  return;
4251  if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
4252  tryVPTESTM(Node, N1, N0))
4253  return;
4254  }
4255 
4256  if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4257  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4258  CurDAG->RemoveDeadNode(Node);
4259  return;
4260  }
4261  if (matchBitExtract(Node))
4262  return;
4263  if (AndImmShrink && shrinkAndImmediate(Node))
4264  return;
4265 
 4266  LLVM_FALLTHROUGH;
 4267  case ISD::OR:
4268  case ISD::XOR:
4269  if (tryShrinkShlLogicImm(Node))
4270  return;
4271  break;
4272 
4273  case X86ISD::SMUL:
4274  // i16/i32/i64 are handled with isel patterns.
4275  if (NVT != MVT::i8)
4276  break;
 4277  LLVM_FALLTHROUGH;
 4278  case X86ISD::UMUL: {
4279  SDValue N0 = Node->getOperand(0);
4280  SDValue N1 = Node->getOperand(1);
4281 
4282  unsigned LoReg, ROpc, MOpc;
4283  switch (NVT.SimpleTy) {
4284  default: llvm_unreachable("Unsupported VT!");
4285  case MVT::i8:
4286  LoReg = X86::AL;
4287  ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
4288  MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
4289  break;
4290  case MVT::i16:
4291  LoReg = X86::AX;
4292  ROpc = X86::MUL16r;
4293  MOpc = X86::MUL16m;
4294  break;
4295  case MVT::i32:
4296  LoReg = X86::EAX;
4297  ROpc = X86::MUL32r;
4298  MOpc = X86::MUL32m;
4299  break;
4300  case MVT::i64:
4301  LoReg = X86::RAX;
4302  ROpc = X86::MUL64r;
4303  MOpc = X86::MUL64m;
4304  break;
4305  }
4306 
4307  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4308  bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
 4309  // Multiply is commutative.
4310  if (!FoldedLoad) {
4311  FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4312  if (FoldedLoad)
4313  std::swap(N0, N1);
4314  }
4315 
4316  SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4317  N0, SDValue()).getValue(1);
4318 
4319  MachineSDNode *CNode;
4320  if (FoldedLoad) {
4321  // i16/i32/i64 use an instruction that produces a low and high result even
4322  // though only the low result is used.
4323  SDVTList VTs;
4324  if (NVT == MVT::i8)
4325  VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4326  else
4327  VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4328 
4329  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4330  InFlag };
4331  CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4332 
4333  // Update the chain.
4334  ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
4335  // Record the mem-refs
4336  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4337  } else {
4338  // i16/i32/i64 use an instruction that produces a low and high result even
4339  // though only the low result is used.
4340  SDVTList VTs;
4341  if (NVT == MVT::i8)
4342  VTs = CurDAG->getVTList(NVT, MVT::i32);
4343  else
4344  VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4345 
4346  CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4347  }
4348 
4349  ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4350  ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
4351  CurDAG->RemoveDeadNode(Node);
4352  return;
4353  }
4354 
4355  case ISD::SMUL_LOHI:
4356  case ISD::UMUL_LOHI: {
4357  SDValue N0 = Node->getOperand(0);
4358  SDValue N1 = Node->getOperand(1);
4359 
4360  unsigned Opc, MOpc;
4361  bool isSigned = Opcode == ISD::SMUL_LOHI;
4362  if (!isSigned) {
4363  switch (NVT.SimpleTy) {
4364  default: llvm_unreachable("Unsupported VT!");
4365  case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
4366  case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
4367  }
4368  } else {
4369  switch (NVT.SimpleTy) {
4370  default: llvm_unreachable("Unsupported VT!");
4371  case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
4372  case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
4373  }
4374  }
4375 
4376  unsigned SrcReg, LoReg, HiReg;
4377  switch (Opc) {
4378  default: llvm_unreachable("Unknown MUL opcode!");
4379  case X86::IMUL32r:
4380  case X86::MUL32r:
4381  SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
4382  break;
4383  case X86::IMUL64r:
4384  case X86::MUL64r:
4385  SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
4386  break;
4387  }
4388 
4389  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4390  bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
 4391  // Multiply is commutative.
4392  if (!foldedLoad) {
4393  foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4394  if (foldedLoad)
4395  std::swap(N0, N1);
4396  }
4397 
4398  SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
4399  N0, SDValue()).getValue(1);
4400  if (foldedLoad) {
4401  SDValue Chain;
4402  MachineSDNode *CNode = nullptr;
4403  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4404  InFlag };
4405  SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4406  CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4407  Chain = SDValue(CNode, 0);
4408  InFlag = SDValue(CNode, 1);
4409 
4410  // Update the chain.
4411  ReplaceUses(N1.getValue(1), Chain);
4412  // Record the mem-refs
4413  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4414  } else {
4415  SDValue Ops[] = { N1, InFlag };
4416  SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4417  SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4418  InFlag = SDValue(CNode, 0);
4419  }
4420 
4421  // Copy the low half of the result, if it is needed.
4422  if (!SDValue(Node, 0).use_empty()) {
4423  assert(LoReg && "Register for low half is not defined!");
4424  SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4425  NVT, InFlag);
4426  InFlag = ResLo.getValue(2);
4427  ReplaceUses(SDValue(Node, 0), ResLo);
4428  LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4429  dbgs() << '\n');
4430  }
4431  // Copy the high half of the result, if it is needed.
4432  if (!SDValue(Node, 1).use_empty()) {
4433  assert(HiReg && "Register for high half is not defined!");
4434  SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4435  NVT, InFlag);
4436  InFlag = ResHi.getValue(2);
4437  ReplaceUses(SDValue(Node, 1), ResHi);
4438  LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4439  dbgs() << '\n');
4440  }
4441 
4442  CurDAG->RemoveDeadNode(Node);
4443  return;
4444  }
4445 
4446  case ISD::SDIVREM:
4447  case ISD::UDIVREM: {
4448  SDValue N0 = Node->getOperand(0);
4449  SDValue N1 = Node->getOperand(1);
4450 
4451  unsigned Opc, MOpc;
4452  bool isSigned = Opcode == ISD::SDIVREM;
4453  if (!isSigned) {
4454  switch (NVT.SimpleTy) {
4455  default: llvm_unreachable("Unsupported VT!");
4456  case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
4457  case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
4458  case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
4459  case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
4460  }
4461  } else {
4462  switch (NVT.SimpleTy) {
4463  default: llvm_unreachable("Unsupported VT!");
4464  case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
4465  case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
4466  case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
4467  case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
4468  }
4469  }
4470 
4471  unsigned LoReg, HiReg, ClrReg;
4472  unsigned SExtOpcode;
4473  switch (NVT.SimpleTy) {
4474  default: llvm_unreachable("Unsupported VT!");
4475  case MVT::i8:
4476  LoReg = X86::AL; ClrReg = HiReg = X86::AH;
4477  SExtOpcode = X86::CBW;
4478  break;
4479  case MVT::i16:
4480  LoReg = X86::AX; HiReg = X86::DX;
4481  ClrReg = X86::DX;
4482  SExtOpcode = X86::CWD;
4483  break;
4484  case MVT::i32:
4485  LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4486  SExtOpcode = X86::CDQ;
4487  break;
4488  case MVT::i64:
4489  LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4490  SExtOpcode = X86::CQO;
4491  break;
4492  }
4493 
4494  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4495  bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4496  bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4497 
4498  SDValue InFlag;
4499  if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
4500  // Special case for div8, just use a move with zero extension to AX to
4501  // clear the upper 8 bits (AH).
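  // (8-bit DIV/IDIV divides AX by its operand and leaves the quotient in AL
  // and the remainder in AH, which is why zero-extending the dividend into
  // AX is sufficient here.)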
4502  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4503  MachineSDNode *Move;
4504  if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4505  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4506  Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
4507  MVT::Other, Ops);
4508  Chain = SDValue(Move, 1);
4509  ReplaceUses(N0.getValue(1), Chain);
4510  // Record the mem-refs
4511  CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4512  } else {
4513  Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
4514  Chain = CurDAG->getEntryNode();
4515  }
4516  Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
4517  SDValue());
4518  InFlag = Chain.getValue(1);
4519  } else {
4520  InFlag =
4521  CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4522  LoReg, N0, SDValue()).getValue(1);
4523  if (isSigned && !signBitIsZero) {
4524  // Sign extend the low part into the high part.
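  // (SExtOpcode was chosen above as CBW/CWD/CDQ/CQO, which sign-extends
  // AL/AX/EAX/RAX into the corresponding high half.)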
4525  InFlag =
4526  SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
4527  } else {
4528  // Zero out the high part, effectively zero extending the input.
4529  SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
4530  switch (NVT.SimpleTy) {
4531  case MVT::i16:
4532  ClrNode =
4533  SDValue(CurDAG->getMachineNode(
4534  TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4535  CurDAG->getTargetConstant(X86::sub_16bit, dl,
4536  MVT::i32)),
4537  0);
4538  break;
4539  case MVT::i32:
4540  break;
4541  case MVT::i64:
4542  ClrNode =
4543  SDValue(CurDAG->getMachineNode(
4544  TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4545  CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4546  CurDAG->getTargetConstant(X86::sub_32bit, dl,
4547  MVT::i32)),
4548  0);
4549  break;
4550  default:
4551  llvm_unreachable("Unexpected division source");
4552  }
4553 
4554  InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4555  ClrNode, InFlag).getValue(1);
4556  }
4557  }
4558 
4559  if (foldedLoad) {
4560  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4561  InFlag };
4562  MachineSDNode *CNode =
4563  CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4564  InFlag = SDValue(CNode, 1);
4565  // Update the chain.
4566  ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4567  // Record the mem-refs
4568  CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4569  } else {
4570  InFlag =
4571  SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
4572  }
4573 
4574  // Prevent use of AH in a REX instruction by explicitly copying it to
4575  // an ABCD_L register.
4576  //
4577  // The current assumption of the register allocator is that isel
4578  // won't generate explicit references to the GR8_ABCD_H registers. If
4579  // the allocator and/or the backend get enhanced to be more robust in
4580  // that regard, this can be, and should be, removed.
4581  if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
4582  SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
4583  unsigned AHExtOpcode =
4584  isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
4585 
4586  SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
4587  MVT::Glue, AHCopy, InFlag);
4588  SDValue Result(RNode, 0);
4589  InFlag = SDValue(RNode, 1);
4590 
4591  Result =
4592  CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
4593 
4594  ReplaceUses(SDValue(Node, 1), Result);
4595  LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4596  dbgs() << '\n');
4597  }
4598  // Copy the division (low) result, if it is needed.
4599  if (!SDValue(Node, 0).use_empty()) {
4600  SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4601  LoReg, NVT, InFlag);
4602  InFlag = Result.getValue(2);
4603  ReplaceUses(SDValue(Node, 0), Result);
4604  LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4605  dbgs() << '\n');
4606  }
4607  // Copy the remainder (high) result, if it is needed.
4608  if (!SDValue(Node, 1).use_empty()) {
4609  SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4610  HiReg, NVT, InFlag);
4611  InFlag = Result.getValue(2);
4612  ReplaceUses(SDValue(Node, 1), Result);
4613  LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4614  dbgs() << '\n');
4615  }
4616  CurDAG->RemoveDeadNode(Node);
4617  return;
4618  }
4619 
4620  case X86ISD::CMP: {
4621  SDValue N0 = Node->getOperand(0);
4622  SDValue N1 = Node->getOperand(1);
4623 
4624  // Optimizations for TEST compares.
4625  if (!isNullConstant(N1))
4626  break;
4627 
4628  // Save the original VT of the compare.
4629  MVT CmpVT = N0.getSimpleValueType();
4630 
4631  // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
4632  // by a test instruction. The test should be removed later by
4633  // analyzeCompare if we are using only the zero flag.
4634  // TODO: Should we check the users and use the BEXTR flags directly?
4635  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
4636  if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
4637  unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
4638  : X86::TEST32rr;
4639  SDValue BEXTR = SDValue(NewNode, 0);
4640  NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
4641  ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4642  CurDAG->RemoveDeadNode(Node);
4643  return;
4644  }
4645  }
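  // Hypothetical example of the transform above: for
  //   (X86cmp (and (srl %rax, 8), 0xff), 0)
  // the matcher can produce something like
  //   movl   $0x0808, %ecx       // control: start bit 8, length 8
  //   bextrq %rcx, %rax, %rcx
  //   testq  %rcx, %rcx
  // and the test may later be folded away when only ZF is consumed.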
4646 
4647  // We can peek through truncates, but we need to be careful below.
4648  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
4649  N0 = N0.getOperand(0);
4650 
4651  // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
4652  // use a smaller encoding.
4653  // Look past the truncate if CMP is the only use of it.
4654  if (N0.getOpcode() == ISD::AND &&
4655  N0.getNode()->hasOneUse() &&
4656  N0.getValueType() != MVT::i8) {
4657  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4658  if (!C) break;
4659  uint64_t Mask = C->getZExtValue();
4660 
4661  // Check if we can replace AND+IMM64 with a shift. This is possible for
4662  // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
4663  // flag.
4664  if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
4665  onlyUsesZeroFlag(SDValue(Node, 0))) {
4666  if (isMask_64(~Mask)) {
4667  unsigned TrailingZeros = countTrailingZeros(Mask);
4668  SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
4669  SDValue Shift =
4670  SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
4671  N0.getOperand(0), Imm), 0);
4672  MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
4673  MVT::i32, Shift, Shift);
4674  ReplaceNode(Node, Test);
4675  return;
4676  }
4677  if (isMask_64(Mask)) {
4678  unsigned LeadingZeros = countLeadingZeros(Mask);
4679  SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
4680  SDValue Shift =
4681  SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
4682  N0.getOperand(0), Imm), 0);
4683  MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
4684  MVT::i32, Shift, Shift);
4685  ReplaceNode(Node, Test);
4686  return;
4687  }
4688  }
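  // Worked example of the shift rewrite above (only ZF is used):
  //   testq $0xFFFFFFFF00000000, %rax   // mask does not fit a sign-extended imm32
  // becomes
  //   shrq  $32, %rax
  //   testq %rax, %rax
  // since ZF is set exactly when all of the masked bits are zero.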
4689 
4690  MVT VT;
4691  int SubRegOp;
4692  unsigned ROpc, MOpc;
4693 
4694  // For each of these checks we need to be careful if the sign flag is
4695  // being used. It is only safe to use the sign flag in two conditions,
4696  // either the sign bit in the shrunken mask is zero or the final test
4697  // size is equal to the original compare size.
4698 
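  // Concrete case for the caveat above: narrowing "testl $0x80, %eax" to
  // "testb $0x80, %al" moves the tested bit into the sign position of the
  // narrower test, so SF no longer matches the original 32-bit compare even
  // though ZF is unaffected. Hence the !(Mask & 0x80) / hasNoSignFlagUses
  // style checks below.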
4699  if (isUInt<8>(Mask) &&
4700  (!(Mask & 0x80) || CmpVT == MVT::i8 ||
4701  hasNoSignFlagUses(SDValue(Node, 0)))) {
4702  // For example, convert "testl %eax, $8" to "testb %al, $8"
4703  VT = MVT::i8;
4704  SubRegOp = X86::sub_8bit;
4705  ROpc = X86::TEST8ri;
4706  MOpc = X86::TEST8mi;
4707  } else if (OptForMinSize && isUInt<16>(Mask) &&
4708  (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
4709  hasNoSignFlagUses(SDValue(Node, 0)))) {
4710  // For example, "testl %eax, $32776" to "testw %ax, $32776".
4711  // NOTE: We only want to form TESTW instructions if optimizing for
4712  // min size. Otherwise we only save one byte and possibly get a length
4713  // changing prefix penalty in the decoders.
4714  VT = MVT::i16;
4715  SubRegOp = X86::sub_16bit;
4716  ROpc = X86::TEST16ri;
4717  MOpc = X86::TEST16mi;
4718  } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
4719  ((!(Mask & 0x80000000) &&
4720  // Without minsize 16-bit Cmps can get here so we need to
4721  // be sure we calculate the correct sign flag if needed.
4722  (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
4723  CmpVT == MVT::i32 ||
4724  hasNoSignFlagUses(SDValue(Node, 0)))) {
4725  // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
4726  // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
4727  // Otherwise, we find ourselves in a position where we have to do
4728  // promotion. If previous passes did not promote the and, we assume
4729  // they had a good reason not to and do not promote here.
4730  VT = MVT::i32;
4731  SubRegOp = X86::sub_32bit;
4732  ROpc = X86::TEST32ri;
4733  MOpc = X86::TEST32mi;
4734  } else {
4735  // No eligible transformation was found.
4736  break;
4737  }
4738 
4739  SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
4740  SDValue Reg = N0.getOperand(0);
4741 
4742  // Emit a testl or testw.
4743  MachineSDNode *NewNode;
4744  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4745  if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4746  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4747  Reg.getOperand(0) };
4748  NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
4749  // Update the chain.
4750  ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
4751  // Record the mem-refs
4752  CurDAG->setNodeMemRefs(NewNode,
4753  {cast<LoadSDNode>(Reg)->getMemOperand()});
4754  } else {
4755  // Extract the subregister if necessary.
4756  if (N0.getValueType() != VT)
4757  Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
4758 
4759  NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
4760  }
4761  // Replace CMP with TEST.
4762  ReplaceNode(Node, NewNode);
4763  return;
4764  }
4765  break;
4766  }
4767  case X86ISD::PCMPISTR: {
4768  if (!Subtarget->hasSSE42())
4769  break;
4770 
4771  bool NeedIndex = !SDValue(Node, 0).use_empty();
4772  bool NeedMask = !SDValue(Node, 1).use_empty();
4773  // We can't fold a load if we are going to make two instructions.
4774  bool MayFoldLoad = !NeedIndex || !NeedMask;
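  // If both results are live, two pcmpistri/pcmpistrm instructions are built
  // from this single node, so folding the memory operand would make each of
  // them re-read memory; keeping the load separate lets one load feed both.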
4775 
4776  MachineSDNode *CNode;
4777  if (NeedMask) {
4778  unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
4779  unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
4780  CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
4781  ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4782  }
4783  if (NeedIndex || !NeedMask) {
4784  unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
4785  unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
4786  CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
4787  ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4788  }
4789 
4790  // Connect the flag usage to the last instruction created.
4791  ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
4792  CurDAG->RemoveDeadNode(Node);
4793  return;
4794  }
4795  case X86ISD::PCMPESTR: {
4796  if (!Subtarget->hasSSE42())
4797  break;
4798 
4799  // Copy the two implicit register inputs.
4800  SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
4801  Node->getOperand(1),
4802  SDValue()).getValue(1);
4803  InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
4804  Node->getOperand(3), InFlag).getValue(1);
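  // PCMPESTRI/PCMPESTRM take their explicit string lengths in EAX and EDX,
  // so operands 1 and 3 of the node are copied into those registers and the
  // glue is threaded into the compare emitted below.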
4805 
4806  bool NeedIndex = !SDValue(Node, 0).use_empty();
4807  bool NeedMask = !SDValue(Node, 1).use_empty();
4808  // We can't fold a load if we are going to make two instructions.
4809  bool MayFoldLoad = !NeedIndex || !NeedMask;
4810 
4811  MachineSDNode *CNode;
4812  if (NeedMask) {
4813  unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
4814  unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
4815  CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
4816  InFlag);
4817  ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4818  }
4819  if (NeedIndex || !NeedMask) {
4820  unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
4821  unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
4822  CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
4823  ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4824  }
4825  // Connect the flag usage to the last instruction created.
4826  ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
4827  CurDAG->RemoveDeadNode(Node);
4828  return;
4829  }
4830 
4831  case ISD::SETCC: {
4832  if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
4833  return;
4834 
4835  break;
4836  }
4837 
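  // foldLoadStoreIntoMemOperand targets the {load; op; store} idiom on the
  // same address, e.g. turning a load of (%rdi), an add of 1 and a store back
  // into a single read-modify-write "addl $1, (%rdi)" when the chains and
  // addressing mode allow it.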
4838  case ISD::STORE:
4839  if (foldLoadStoreIntoMemOperand(Node))
4840  return;
4841  break;
4842  case ISD::FCEIL:
4843  case ISD::FFLOOR:
4844  case ISD::FTRUNC:
4845  case ISD::FNEARBYINT:
4846  case ISD::FRINT: {
4847  // Replace fp rounding ops with their X86-specific equivalents so we
4848  // don't need two sets of patterns.
4849  // FIXME: This can only happen when the nodes started as STRICT_* and have
4850  // been mutated into their non-STRICT equivalents. Eventually this
4851  // mutation will be removed and we should switch the STRICT_ nodes to a
4852  // strict version of RNDSCALE in PreProcessISelDAG.
4853  unsigned Imm;
4854  switch (Node->getOpcode()) {
4855  default: llvm_unreachable("Unexpected opcode!");
4856  case ISD::FCEIL: Imm = 0xA; break;
4857  case ISD::FFLOOR: Imm = 0x9; break;
4858  case ISD::FTRUNC: Imm = 0xB; break;
4859  case ISD::FNEARBYINT: Imm = 0xC; break;
4860  case ISD::FRINT: Imm = 0x4; break;
4861  }
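  // These immediates follow the SSE4.1 ROUNDPS/VRNDSCALE control encoding:
  // bits [1:0] select the rounding mode, bit 2 means "use MXCSR.RC instead",
  // and bit 3 suppresses the inexact exception. E.g. FCEIL -> 0xA rounds
  // toward +inf with inexact suppressed, while FRINT -> 0x4 defers to MXCSR
  // and still reports inexact, as rint is expected to.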
4862  SDLoc dl(Node);
4863  SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
4864  Node->getValueType(0),
4865  Node->getOperand(0),
4866  CurDAG->getConstant(Imm, dl, MVT::i8));
4867  ReplaceNode(Node, Res.getNode());
4868  SelectCode(Res.getNode());
4869  return;
4870  }
4871  }
4872 
4873  SelectCode(Node);
4874 }
4875 
4876 bool X86DAGToDAGISel::
4877 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
4878  std::vector<SDValue> &OutOps) {
4879  SDValue Op0, Op1, Op2, Op3, Op4;
4880  switch (ConstraintID) {
4881  default:
4882  llvm_unreachable("Unexpected asm memory constraint");
4883  case InlineAsm::Constraint_i:
4884  // FIXME: It seems strange that 'i' is needed here since it's supposed to
4885  // be an immediate and not a memory constraint.
4886  LLVM_FALLTHROUGH;
4887  case InlineAsm::Constraint_o: // offsetable ??
4888  case InlineAsm::Constraint_v: // not offsetable ??
4889  case InlineAsm::Constraint_m: // memory
4890  case InlineAsm::Constraint_X:
4891  if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
4892  return true;
4893  break;
4894  }
4895 
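  // selectAddr decomposed the operand into the five-part x86 memory form
  // (base, scale, index, displacement, segment); exactly these five SDValues
  // are handed back, in that order.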
4896  OutOps.push_back(Op0);
4897  OutOps.push_back(Op1);
4898  OutOps.push_back(Op2);
4899  OutOps.push_back(Op3);
4900  OutOps.push_back(Op4);
4901  return false;
4902 }
4903 
4904 /// This pass converts a legalized DAG into a X86-specific DAG,
4905 /// ready for instruction scheduling.
4906 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
4907  CodeGenOpt::Level OptLevel) {
4908  return new X86DAGToDAGISel(TM, OptLevel);
4909 }