X86ISelDAGToDAG.cpp
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86.h"
16#include "X86RegisterInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
22#include "llvm/Config/llvm-config.h"
24#include "llvm/IR/Function.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/Debug.h"
33#include <cstdint>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "x86-isel"
38#define PASS_NAME "X86 DAG->DAG Instruction Selection"
39
40STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
41
42static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
43 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
44 cl::Hidden);
45
47 "x86-promote-anyext-load", cl::init(true),
48 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
49
50extern cl::opt<bool> IndirectBranchTracking;
51
52//===----------------------------------------------------------------------===//
53// Pattern Matcher Implementation
54//===----------------------------------------------------------------------===//
55
56namespace {
57 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
58 /// numbers for the leaves of the matched tree.
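 /// A fully general x86 address has the form Segment:[Base + Scale*Index + Disp],
 /// where Disp may instead be a symbolic displacement (global, constant pool,
 /// external symbol, jump table or block address).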
59 struct X86ISelAddressMode {
60 enum {
61 RegBase,
62 FrameIndexBase
63 } BaseType = RegBase;
64
65 // This is really a union, discriminated by BaseType!
66 SDValue Base_Reg;
67 int Base_FrameIndex = 0;
68
69 unsigned Scale = 1;
70 SDValue IndexReg;
71 int32_t Disp = 0;
72 SDValue Segment;
73 const GlobalValue *GV = nullptr;
74 const Constant *CP = nullptr;
75 const BlockAddress *BlockAddr = nullptr;
76 const char *ES = nullptr;
77 MCSymbol *MCSym = nullptr;
78 int JT = -1;
79 Align Alignment; // CP alignment.
80 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
81 bool NegateIndex = false;
82
83 X86ISelAddressMode() = default;
84
85 bool hasSymbolicDisplacement() const {
86 return GV != nullptr || CP != nullptr || ES != nullptr ||
87 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
88 }
89
90 bool hasBaseOrIndexReg() const {
91 return BaseType == FrameIndexBase ||
92 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
93 }
94
95 /// Return true if this addressing mode is already RIP-relative.
96 bool isRIPRelative() const {
97 if (BaseType != RegBase) return false;
98 if (RegisterSDNode *RegNode =
99 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
100 return RegNode->getReg() == X86::RIP;
101 return false;
102 }
103
104 void setBaseReg(SDValue Reg) {
105 BaseType = RegBase;
106 Base_Reg = Reg;
107 }
108
109#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
110 void dump(SelectionDAG *DAG = nullptr) {
111 dbgs() << "X86ISelAddressMode " << this << '\n';
112 dbgs() << "Base_Reg ";
113 if (Base_Reg.getNode())
114 Base_Reg.getNode()->dump(DAG);
115 else
116 dbgs() << "nul\n";
117 if (BaseType == FrameIndexBase)
118 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
119 dbgs() << " Scale " << Scale << '\n'
120 << "IndexReg ";
121 if (NegateIndex)
122 dbgs() << "negate ";
123 if (IndexReg.getNode())
124 IndexReg.getNode()->dump(DAG);
125 else
126 dbgs() << "nul\n";
127 dbgs() << " Disp " << Disp << '\n'
128 << "GV ";
129 if (GV)
130 GV->dump();
131 else
132 dbgs() << "nul";
133 dbgs() << " CP ";
134 if (CP)
135 CP->dump();
136 else
137 dbgs() << "nul";
138 dbgs() << '\n'
139 << "ES ";
140 if (ES)
141 dbgs() << ES;
142 else
143 dbgs() << "nul";
144 dbgs() << " MCSym ";
145 if (MCSym)
146 dbgs() << MCSym;
147 else
148 dbgs() << "nul";
149 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
150 }
151#endif
152 };
153}
154
155namespace {
156 //===--------------------------------------------------------------------===//
157 /// ISel - X86-specific code to select X86 machine instructions for
158 /// SelectionDAG operations.
159 ///
160 class X86DAGToDAGISel final : public SelectionDAGISel {
161 /// Keep a pointer to the X86Subtarget around so that we can
162 /// make the right decision when generating code for different targets.
163 const X86Subtarget *Subtarget;
164
165 /// If true, selector should try to optimize for minimum code size.
166 bool OptForMinSize;
167
168 /// Disable direct TLS access through segment registers.
169 bool IndirectTlsSegRefs;
170
171 public:
172 static char ID;
173
174 X86DAGToDAGISel() = delete;
175
176 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
177 : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr),
178 OptForMinSize(false), IndirectTlsSegRefs(false) {}
179
180 bool runOnMachineFunction(MachineFunction &MF) override {
181 // Reset the subtarget each time through.
182 Subtarget = &MF.getSubtarget<X86Subtarget>();
183 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
184 "indirect-tls-seg-refs");
185
186 // OptFor[Min]Size are used in pattern predicates that isel is matching.
187 OptForMinSize = MF.getFunction().hasMinSize();
188 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
189 "OptForMinSize implies OptForSize");
190
191 SelectionDAGISel::runOnMachineFunction(MF);
192 return true;
193 }
194
195 void emitFunctionEntryCode() override;
196
197 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
198
199 void PreprocessISelDAG() override;
200 void PostprocessISelDAG() override;
201
202// Include the pieces autogenerated from the target description.
203#include "X86GenDAGISel.inc"
204
205 private:
206 void Select(SDNode *N) override;
207
208 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
209 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
210 bool AllowSegmentRegForX32 = false);
211 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
212 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
213 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
214 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
215 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
216 unsigned Depth);
217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
220 unsigned Depth);
221 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
222 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
223 SDValue &Scale, SDValue &Index, SDValue &Disp,
224 SDValue &Segment);
225 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
226 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
227 SDValue &Index, SDValue &Disp, SDValue &Segment);
228 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
229 bool selectLEAAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
236 SDValue &Scale, SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238 bool selectRelocImm(SDValue N, SDValue &Op);
239
240 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment);
244
245 // Convenience method where P is also root.
246 bool tryFoldLoad(SDNode *P, SDValue N,
247 SDValue &Base, SDValue &Scale,
248 SDValue &Index, SDValue &Disp,
249 SDValue &Segment) {
250 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
251 }
252
253 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
254 SDValue &Base, SDValue &Scale,
255 SDValue &Index, SDValue &Disp,
256 SDValue &Segment);
257
258 bool isProfitableToFormMaskedOp(SDNode *N) const;
259
260 /// Implement addressing mode selection for inline asm expressions.
261 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
262 InlineAsm::ConstraintCode ConstraintID,
263 std::vector<SDValue> &OutOps) override;
264
265 void emitSpecialCodeForMain();
266
267 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
268 MVT VT, SDValue &Base, SDValue &Scale,
269 SDValue &Index, SDValue &Disp,
270 SDValue &Segment) {
271 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
272 Base = CurDAG->getTargetFrameIndex(
273 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
274 else if (AM.Base_Reg.getNode())
275 Base = AM.Base_Reg;
276 else
277 Base = CurDAG->getRegister(0, VT);
278
279 Scale = getI8Imm(AM.Scale, DL);
280
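// Prefer the APX NDD (new data destination) form of an opcode when the
// subtarget supports it.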
281#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
282 // Negate the index if needed.
283 if (AM.NegateIndex) {
284 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
285 : GET_ND_IF_ENABLED(X86::NEG32r);
286 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
287 AM.IndexReg), 0);
288 AM.IndexReg = Neg;
289 }
290
291 if (AM.IndexReg.getNode())
292 Index = AM.IndexReg;
293 else
294 Index = CurDAG->getRegister(0, VT);
295
296 // These are 32-bit even in 64-bit mode since RIP-relative offset
297 // is 32-bit.
298 if (AM.GV)
299 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
300 MVT::i32, AM.Disp,
301 AM.SymbolFlags);
302 else if (AM.CP)
303 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
304 AM.Disp, AM.SymbolFlags);
305 else if (AM.ES) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
307 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
308 } else if (AM.MCSym) {
309 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
310 assert(AM.SymbolFlags == 0 && "oo");
311 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
312 } else if (AM.JT != -1) {
313 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
314 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
315 } else if (AM.BlockAddr)
316 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
317 AM.SymbolFlags);
318 else
319 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
320
321 if (AM.Segment.getNode())
322 Segment = AM.Segment;
323 else
324 Segment = CurDAG->getRegister(0, MVT::i16);
325 }
326
327 // Utility function to determine whether we should avoid selecting
328 // immediate forms of instructions for better code size or not.
329 // At a high level, we'd like to avoid such instructions when
330 // we have similar constants used within the same basic block
331 // that can be kept in a register.
332 //
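 // For example, if the same 32-bit constant feeds several ALU instructions in
 // a block, materializing it once in a register can be smaller than repeating
 // the immediate encoding in each instruction.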
333 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
334 uint32_t UseCount = 0;
335
336 // Do not want to hoist if we're not optimizing for size.
337 // TODO: We'd like to remove this restriction.
338 // See the comment in X86InstrInfo.td for more info.
339 if (!CurDAG->shouldOptForSize())
340 return false;
341
342 // Walk all the users of the immediate.
343 for (const SDNode *User : N->uses()) {
344 if (UseCount >= 2)
345 break;
346
347 // This user is already selected. Count it as a legitimate use and
348 // move on.
349 if (User->isMachineOpcode()) {
350 UseCount++;
351 continue;
352 }
353
354 // We want to count stores of immediates as real uses.
355 if (User->getOpcode() == ISD::STORE &&
356 User->getOperand(1).getNode() == N) {
357 UseCount++;
358 continue;
359 }
360
361 // We don't currently match users that have > 2 operands (except
362 // for stores, which are handled above)
363 // Those instructions won't match in ISel, for now, and would
364 // be counted incorrectly.
365 // This may change in the future as we add additional instruction
366 // types.
367 if (User->getNumOperands() != 2)
368 continue;
369
370 // If this is a sign-extended 8-bit integer immediate used in an ALU
371 // instruction, there is probably an opcode encoding to save space.
372 auto *C = dyn_cast<ConstantSDNode>(N);
373 if (C && isInt<8>(C->getSExtValue()))
374 continue;
375
376 // Immediates that are used for offsets as part of stack
377 // manipulation should be left alone. These are typically
378 // used to indicate SP offsets for argument passing and
379 // will get pulled into stores/pushes (implicitly).
380 if (User->getOpcode() == X86ISD::ADD ||
381 User->getOpcode() == ISD::ADD ||
382 User->getOpcode() == X86ISD::SUB ||
383 User->getOpcode() == ISD::SUB) {
384
385 // Find the other operand of the add/sub.
386 SDValue OtherOp = User->getOperand(0);
387 if (OtherOp.getNode() == N)
388 OtherOp = User->getOperand(1);
389
390 // Don't count if the other operand is SP.
391 RegisterSDNode *RegNode;
392 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
393 (RegNode = dyn_cast_or_null<RegisterSDNode>(
394 OtherOp->getOperand(1).getNode())))
395 if ((RegNode->getReg() == X86::ESP) ||
396 (RegNode->getReg() == X86::RSP))
397 continue;
398 }
399
400 // ... otherwise, count this and move on.
401 UseCount++;
402 }
403
404 // If we have more than 1 use, then recommend for hoisting.
405 return (UseCount > 1);
406 }
407
408 /// Return a target constant with the specified value of type i8.
409 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
410 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
411 }
412
413 /// Return a target constant with the specified value, of type i32.
414 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
415 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
416 }
417
418 /// Return a target constant with the specified value, of type i64.
419 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
420 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
421 }
422
423 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
424 const SDLoc &DL) {
425 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
426 uint64_t Index = N->getConstantOperandVal(1);
427 MVT VecVT = N->getOperand(0).getSimpleValueType();
428 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
429 }
430
431 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
432 const SDLoc &DL) {
433 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
434 uint64_t Index = N->getConstantOperandVal(2);
435 MVT VecVT = N->getSimpleValueType(0);
436 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
437 }
438
439 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
440 const SDLoc &DL) {
441 assert(VecWidth == 128 && "Unexpected vector width");
442 uint64_t Index = N->getConstantOperandVal(2);
443 MVT VecVT = N->getSimpleValueType(0);
444 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
445 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
446 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
447 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
448 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
449 }
450
451 SDValue getSBBZero(SDNode *N) {
452 SDLoc dl(N);
453 MVT VT = N->getSimpleValueType(0);
454
455 // Create zero.
456 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
457 SDValue Zero = SDValue(
458 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
459 if (VT == MVT::i64) {
460 Zero = SDValue(
461 CurDAG->getMachineNode(
462 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
463 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
464 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
465 0);
466 }
467
468 // Copy flags to the EFLAGS register and glue it to next node.
469 unsigned Opcode = N->getOpcode();
470 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
471 "Unexpected opcode for SBB materialization");
472 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
473 SDValue EFLAGS =
474 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
475 N->getOperand(FlagOpIndex), SDValue());
476
477 // Create a 64-bit instruction if the result is 64-bits otherwise use the
478 // 32-bit version.
479 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
480 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
481 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
482 return SDValue(
483 CurDAG->getMachineNode(Opc, dl, VTs,
484 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
485 0);
486 }
487
488 // Helper to detect unneeded AND instructions on shift amounts. Called
489 // from PatFrags in tablegen.
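 // For example, (shl X, (and Y, 31)) with a 32-bit shift: the hardware already
 // masks the shift amount to 5 bits, so the AND is redundant once its mask
 // covers at least Width low bits.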
490 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
491 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
492 const APInt &Val = N->getConstantOperandAPInt(1);
493
494 if (Val.countr_one() >= Width)
495 return true;
496
497 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
498 return Mask.countr_one() >= Width;
499 }
500
501 /// Return an SDNode that returns the value of the global base register.
502 /// Output instructions required to initialize the global base register,
503 /// if necessary.
504 SDNode *getGlobalBaseReg();
505
506 /// Return a reference to the TargetMachine, casted to the target-specific
507 /// type.
508 const X86TargetMachine &getTargetMachine() const {
509 return static_cast<const X86TargetMachine &>(TM);
510 }
511
512 /// Return a reference to the TargetInstrInfo, casted to the target-specific
513 /// type.
514 const X86InstrInfo *getInstrInfo() const {
515 return Subtarget->getInstrInfo();
516 }
517
518 /// Return a condition code of the given SDNode
519 X86::CondCode getCondFromNode(SDNode *N) const;
520
521 /// Address-mode matching performs shift-of-and to and-of-shift
522 /// reassociation in order to expose more scaled addressing
523 /// opportunities.
524 bool ComplexPatternFuncMutatesDAG() const override {
525 return true;
526 }
527
528 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
529
530 // Indicates we should prefer to use a non-temporal load for this load.
531 bool useNonTemporalLoad(LoadSDNode *N) const {
532 if (!N->isNonTemporal())
533 return false;
534
535 unsigned StoreSize = N->getMemoryVT().getStoreSize();
536
537 if (N->getAlign().value() < StoreSize)
538 return false;
539
540 switch (StoreSize) {
541 default: llvm_unreachable("Unsupported store size");
542 case 4:
543 case 8:
544 return false;
545 case 16:
546 return Subtarget->hasSSE41();
547 case 32:
548 return Subtarget->hasAVX2();
549 case 64:
550 return Subtarget->hasAVX512();
551 }
552 }
553
554 bool foldLoadStoreIntoMemOperand(SDNode *Node);
555 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
556 bool matchBitExtract(SDNode *Node);
557 bool shrinkAndImmediate(SDNode *N);
558 bool isMaskZeroExtended(SDNode *N) const;
559 bool tryShiftAmountMod(SDNode *N);
560 bool tryShrinkShlLogicImm(SDNode *N);
561 bool tryVPTERNLOG(SDNode *N);
562 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
563 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
564 uint8_t Imm);
565 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
566 bool tryMatchBitSelect(SDNode *N);
567
568 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
569 const SDLoc &dl, MVT VT, SDNode *Node);
570 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
571 const SDLoc &dl, MVT VT, SDNode *Node,
572 SDValue &InGlue);
573
574 bool tryOptimizeRem8Extend(SDNode *N);
575
576 bool onlyUsesZeroFlag(SDValue Flags) const;
577 bool hasNoSignFlagUses(SDValue Flags) const;
578 bool hasNoCarryFlagUses(SDValue Flags) const;
579 };
580}
581
582char X86DAGToDAGISel::ID = 0;
583
584INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
585
586// Returns true if this masked compare can be implemented legally with this
587// type.
588static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
589 unsigned Opcode = N->getOpcode();
590 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
591 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
592 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
593 // We can get 256-bit 8 element types here without VLX being enabled. When
594 // this happens we will use 512-bit operations and the mask will not be
595 // zero extended.
596 EVT OpVT = N->getOperand(0).getValueType();
597 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
598 // second operand.
599 if (Opcode == X86ISD::STRICT_CMPM)
600 OpVT = N->getOperand(1).getValueType();
601 if (OpVT.is256BitVector() || OpVT.is128BitVector())
602 return Subtarget->hasVLX();
603
604 return true;
605 }
606 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
607 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
608 Opcode == X86ISD::FSETCCM_SAE)
609 return true;
610
611 return false;
612}
613
614// Returns true if we can assume the writer of the mask has zero extended it
615// for us.
616bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
617 // If this is an AND, check if we have a compare on either side. As long as
618 // one side guarantees the mask is zero extended, the AND will preserve those
619 // zeros.
620 if (N->getOpcode() == ISD::AND)
621 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
622 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
623
624 return isLegalMaskCompare(N, Subtarget);
625}
626
627bool
628X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
629 if (OptLevel == CodeGenOptLevel::None)
630 return false;
631
632 if (!N.hasOneUse())
633 return false;
634
635 if (N.getOpcode() != ISD::LOAD)
636 return true;
637
638 // Don't fold non-temporal loads if we have an instruction for them.
639 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
640 return false;
641
642 // If N is a load, do additional profitability checks.
643 if (U == Root) {
644 switch (U->getOpcode()) {
645 default: break;
646 case X86ISD::ADD:
647 case X86ISD::ADC:
648 case X86ISD::SUB:
649 case X86ISD::SBB:
650 case X86ISD::AND:
651 case X86ISD::XOR:
652 case X86ISD::OR:
653 case ISD::ADD:
654 case ISD::UADDO_CARRY:
655 case ISD::AND:
656 case ISD::OR:
657 case ISD::XOR: {
658 SDValue Op1 = U->getOperand(1);
659
660 // If the other operand is an 8-bit immediate, we should fold the immediate
661 // instead. This reduces code size.
662 // e.g.
663 // movl 4(%esp), %eax
664 // addl $4, %eax
665 // vs.
666 // movl $4, %eax
667 // addl 4(%esp), %eax
668 // The former is 2 bytes shorter. In the case where the increment is 1,
669 // the saving can be 4 bytes (by using incl %eax).
670 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
671 if (Imm->getAPIntValue().isSignedIntN(8))
672 return false;
673
674 // If this is a 64-bit AND with an immediate that fits in 32-bits,
675 // prefer using the smaller and over folding the load. This is needed to
676 // make sure immediates created by shrinkAndImmediate are always folded.
677 // Ideally we would narrow the load during DAG combine and get the
678 // best of both worlds.
679 if (U->getOpcode() == ISD::AND &&
680 Imm->getAPIntValue().getBitWidth() == 64 &&
681 Imm->getAPIntValue().isIntN(32))
682 return false;
683
684 // If this is really a zext_inreg that can be represented with a movzx
685 // instruction, prefer that.
686 // TODO: We could shrink the load and fold if it is non-volatile.
687 if (U->getOpcode() == ISD::AND &&
688 (Imm->getAPIntValue() == UINT8_MAX ||
689 Imm->getAPIntValue() == UINT16_MAX ||
690 Imm->getAPIntValue() == UINT32_MAX))
691 return false;
692
693 // ADD/SUB can negate the immediate and use the opposite operation
694 // to fit 128 into a sign-extended 8-bit immediate.
695 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
696 (-Imm->getAPIntValue()).isSignedIntN(8))
697 return false;
698
699 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
700 (-Imm->getAPIntValue()).isSignedIntN(8) &&
701 hasNoCarryFlagUses(SDValue(U, 1)))
702 return false;
703 }
704
705 // If the other operand is a TLS address, we should fold it instead.
706 // This produces
707 // movl %gs:0, %eax
708 // leal i@NTPOFF(%eax), %eax
709 // instead of
710 // movl $i@NTPOFF, %eax
711 // addl %gs:0, %eax
712 // if the block also has an access to a second TLS address this will save
713 // a load.
714 // FIXME: This is probably also true for non-TLS addresses.
715 if (Op1.getOpcode() == X86ISD::Wrapper) {
716 SDValue Val = Op1.getOperand(0);
717 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
718 return false;
719 }
720
721 // Don't fold load if this matches the BTS/BTR/BTC patterns.
722 // BTS: (or X, (shl 1, n))
723 // BTR: (and X, (rotl -2, n))
724 // BTC: (xor X, (shl 1, n))
725 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
726 if (U->getOperand(0).getOpcode() == ISD::SHL &&
727 isOneConstant(U->getOperand(0).getOperand(0)))
728 return false;
729
730 if (U->getOperand(1).getOpcode() == ISD::SHL &&
731 isOneConstant(U->getOperand(1).getOperand(0)))
732 return false;
733 }
734 if (U->getOpcode() == ISD::AND) {
735 SDValue U0 = U->getOperand(0);
736 SDValue U1 = U->getOperand(1);
737 if (U0.getOpcode() == ISD::ROTL) {
738 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
739 if (C && C->getSExtValue() == -2)
740 return false;
741 }
742
743 if (U1.getOpcode() == ISD::ROTL) {
744 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
745 if (C && C->getSExtValue() == -2)
746 return false;
747 }
748 }
749
750 break;
751 }
752 case ISD::SHL:
753 case ISD::SRA:
754 case ISD::SRL:
755 // Don't fold a load into a shift by immediate. The BMI2 instructions
756 // support folding a load, but not an immediate. The legacy instructions
757 // support folding an immediate, but can't fold a load. Folding an
758 // immediate is preferable to folding a load.
759 if (isa<ConstantSDNode>(U->getOperand(1)))
760 return false;
761
762 break;
763 }
764 }
765
766 // Prevent folding a load if this can be implemented with an insert_subreg or
767 // a move that implicitly zeroes.
768 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
769 isNullConstant(Root->getOperand(2)) &&
770 (Root->getOperand(0).isUndef() ||
771 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
772 return false;
773
774 return true;
775}
776
777// Indicates it is profitable to form an AVX512 masked operation. Returning
778// false will favor a masked register-register move or vblendm, and the
779// operation will be selected separately.
780bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
781 assert(
782 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
783 "Unexpected opcode!");
784
785 // If the operation has additional users, the operation will be duplicated.
786 // Check the use count to prevent that.
787 // FIXME: Are there cheap opcodes we might want to duplicate?
788 return N->getOperand(1).hasOneUse();
789}
790
791/// Replace the original chain operand of the call with
792/// load's chain operand and move load below the call's chain operand.
793static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
794 SDValue Call, SDValue OrigChain) {
795 SmallVector<SDValue, 8> Ops;
796 SDValue Chain = OrigChain.getOperand(0);
797 if (Chain.getNode() == Load.getNode())
798 Ops.push_back(Load.getOperand(0));
799 else {
800 assert(Chain.getOpcode() == ISD::TokenFactor &&
801 "Unexpected chain operand");
802 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
803 if (Chain.getOperand(i).getNode() == Load.getNode())
804 Ops.push_back(Load.getOperand(0));
805 else
806 Ops.push_back(Chain.getOperand(i));
807 SDValue NewChain =
808 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
809 Ops.clear();
810 Ops.push_back(NewChain);
811 }
812 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
813 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
814 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
815 Load.getOperand(1), Load.getOperand(2));
816
817 Ops.clear();
818 Ops.push_back(SDValue(Load.getNode(), 1));
819 Ops.append(Call->op_begin() + 1, Call->op_end());
820 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
821}
822
823/// Return true if call address is a load and it can be
824/// moved below CALLSEQ_START and the chains leading up to the call.
825/// Return the CALLSEQ_START by reference as a second output.
826/// In the case of a tail call, there isn't a callseq node between the call
827/// chain and the load.
828static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
829 // The transformation is somewhat dangerous if the call's chain was glued to
830 // the call. After MoveBelowOrigChain the load is moved between the call and
831 // the chain; this can create a cycle if the load is not folded. So it is
832 // *really* important that we are sure the load will be folded.
833 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
834 return false;
835 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
836 if (!LD ||
837 !LD->isSimple() ||
838 LD->getAddressingMode() != ISD::UNINDEXED ||
839 LD->getExtensionType() != ISD::NON_EXTLOAD)
840 return false;
841
842 // Now let's find the callseq_start.
843 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
844 if (!Chain.hasOneUse())
845 return false;
846 Chain = Chain.getOperand(0);
847 }
848
849 if (!Chain.getNumOperands())
850 return false;
851 // Since we are not checking for AA here, conservatively abort if the chain
852 // writes to memory. It's not safe to move the callee (a load) across a store.
853 if (isa<MemSDNode>(Chain.getNode()) &&
854 cast<MemSDNode>(Chain.getNode())->writeMem())
855 return false;
856 if (Chain.getOperand(0).getNode() == Callee.getNode())
857 return true;
858 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
859 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
860 Callee.getValue(1).hasOneUse())
861 return true;
862 return false;
863}
864
865static bool isEndbrImm64(uint64_t Imm) {
866// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
867// e.g. 0xF3660F1EFA, 0xF3670F1EFA
868 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
869 return false;
870
871 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
872 0x65, 0x66, 0x67, 0xf0, 0xf2};
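 // Walk the remaining bytes: optional legacy prefixes may appear, but the
 // sequence must eventually be terminated by the mandatory 0xF3 prefix for the
 // bytes to decode as an ENDBR64.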
873 int i = 24; // the low 24 bits (0x0F1EFA) have already been matched
874 while (i < 64) {
875 uint8_t Byte = (Imm >> i) & 0xFF;
876 if (Byte == 0xF3)
877 return true;
878 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
879 return false;
880 i += 8;
881 }
882
883 return false;
884}
885
886static bool needBWI(MVT VT) {
887 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
888}
889
890void X86DAGToDAGISel::PreprocessISelDAG() {
891 bool MadeChange = false;
892 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
893 E = CurDAG->allnodes_end(); I != E; ) {
894 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
895
896 // This is for CET enhancement.
897 //
898 // ENDBR32 and ENDBR64 have specific opcodes:
899 // ENDBR32: F3 0F 1E FB
900 // ENDBR64: F3 0F 1E FA
901 // We want to ensure that attackers cannot find unintended ENDBR32/64
902 // opcode matches in the binary.
903 // Here's an example:
904 // If the compiler had to generate asm for the following code:
905 // a = 0xF30F1EFA
906 // it could, for example, generate:
907 // mov 0xF30F1EFA, dword ptr[a]
908 // In such a case, the binary would include a gadget that starts
909 // with a fake ENDBR64 opcode. Therefore, we split such generation
910 // into multiple operations so that it does not show up in the binary.
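// Concretely, the constant is rebuilt as NOT(~Imm), so the 0xF30F1EFA byte
// pattern is never encoded directly as an immediate operand.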
911 if (N->getOpcode() == ISD::Constant) {
912 MVT VT = N->getSimpleValueType(0);
913 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
914 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
915 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
916 // Check that the cf-protection-branch is enabled.
917 Metadata *CFProtectionBranch =
918 MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
919 if (CFProtectionBranch || IndirectBranchTracking) {
920 SDLoc dl(N);
921 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
922 Complement = CurDAG->getNOT(dl, Complement, VT);
923 --I;
924 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
925 ++I;
926 MadeChange = true;
927 continue;
928 }
929 }
930 }
931
932 // If this is a target specific AND node with no flag usages, turn it back
933 // into ISD::AND to enable test instruction matching.
934 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
935 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
936 N->getOperand(0), N->getOperand(1));
937 --I;
938 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
939 ++I;
940 MadeChange = true;
941 continue;
942 }
943
944 // Convert vector increment or decrement to sub/add with an all-ones
945 // constant:
946 // add X, <1, 1...> --> sub X, <-1, -1...>
947 // sub X, <1, 1...> --> add X, <-1, -1...>
948 // The all-ones vector constant can be materialized using a pcmpeq
949 // instruction that is commonly recognized as an idiom (has no register
950 // dependency), so that's better/smaller than loading a splat 1 constant.
951 //
952 // But don't do this if it would inhibit a potentially profitable load
953 // folding opportunity for the other operand. That only occurs with the
954 // intersection of:
955 // (1) The other operand (op0) is load foldable.
956 // (2) The op is an add (otherwise, we are *creating* an add and can still
957 // load fold the other op).
958 // (3) The target has AVX (otherwise, we have a destructive add and can't
959 // load fold the other op without killing the constant op).
960 // (4) The constant 1 vector has multiple uses (so it is profitable to load
961 // into a register anyway).
962 auto mayPreventLoadFold = [&]() {
963 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
964 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
965 !N->getOperand(1).hasOneUse();
966 };
967 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
968 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
969 APInt SplatVal;
970 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
971 SplatVal.isOne()) {
972 SDLoc DL(N);
973
974 MVT VT = N->getSimpleValueType(0);
975 unsigned NumElts = VT.getSizeInBits() / 32;
976 SDValue AllOnes =
977 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
978 AllOnes = CurDAG->getBitcast(VT, AllOnes);
979
980 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
981 SDValue Res =
982 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
983 --I;
984 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
985 ++I;
986 MadeChange = true;
987 continue;
988 }
989 }
990
991 switch (N->getOpcode()) {
992 case X86ISD::VBROADCAST: {
993 MVT VT = N->getSimpleValueType(0);
994 // Emulate v32i16/v64i8 broadcast without BWI.
995 if (!Subtarget->hasBWI() && needBWI(VT)) {
996 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
997 SDLoc dl(N);
998 SDValue NarrowBCast =
999 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1000 SDValue Res =
1001 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1002 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1003 unsigned Index = NarrowVT.getVectorMinNumElements();
1004 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1005 CurDAG->getIntPtrConstant(Index, dl));
1006
1007 --I;
1008 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1009 ++I;
1010 MadeChange = true;
1011 continue;
1012 }
1013
1014 break;
1015 }
1016 case X86ISD::VBROADCAST_LOAD: {
1017 MVT VT = N->getSimpleValueType(0);
1018 // Emulate v32i16/v64i8 broadcast without BWI.
1019 if (!Subtarget->hasBWI() && needBWI(VT)) {
1020 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1021 auto *MemNode = cast<MemSDNode>(N);
1022 SDLoc dl(N);
1023 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1024 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1025 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1026 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1027 MemNode->getMemOperand());
1028 SDValue Res =
1029 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1030 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1031 unsigned Index = NarrowVT.getVectorMinNumElements();
1032 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1033 CurDAG->getIntPtrConstant(Index, dl));
1034
1035 --I;
1036 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1037 CurDAG->ReplaceAllUsesWith(N, To);
1038 ++I;
1039 MadeChange = true;
1040 continue;
1041 }
1042
1043 break;
1044 }
1045 case ISD::LOAD: {
1046 // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1047 // load, then just extract the lower subvector and avoid the second load.
1048 auto *Ld = cast<LoadSDNode>(N);
1049 MVT VT = N->getSimpleValueType(0);
1050 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1051 !(VT.is128BitVector() || VT.is256BitVector()))
1052 break;
1053
1054 MVT MaxVT = VT;
1055 SDNode *MaxLd = nullptr;
1056 SDValue Ptr = Ld->getBasePtr();
1057 SDValue Chain = Ld->getChain();
1058 for (SDNode *User : Ptr->uses()) {
1059 auto *UserLd = dyn_cast<LoadSDNode>(User);
1060 MVT UserVT = User->getSimpleValueType(0);
1061 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1062 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1063 !User->hasAnyUseOfValue(1) &&
1064 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1065 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1066 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1067 MaxLd = User;
1068 MaxVT = UserVT;
1069 }
1070 }
1071 if (MaxLd) {
1072 SDLoc dl(N);
1073 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1074 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1075 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1076 SDValue(MaxLd, 0),
1077 CurDAG->getIntPtrConstant(0, dl));
1078 SDValue Res = CurDAG->getBitcast(VT, Extract);
1079
1080 --I;
1081 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1082 CurDAG->ReplaceAllUsesWith(N, To);
1083 ++I;
1084 MadeChange = true;
1085 continue;
1086 }
1087 break;
1088 }
1089 case ISD::VSELECT: {
1090 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1091 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1092 if (EleVT == MVT::i1)
1093 break;
1094
1095 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1096 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1097 "We can't replace VSELECT with BLENDV in vXi16!");
1098 SDValue R;
1099 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1100 EleVT.getSizeInBits()) {
1101 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1102 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1103 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1104 } else {
1105 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1106 N->getOperand(0), N->getOperand(1),
1107 N->getOperand(2));
1108 }
1109 --I;
1110 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1111 ++I;
1112 MadeChange = true;
1113 continue;
1114 }
1115 case ISD::FP_ROUND:
1116 case ISD::STRICT_FP_ROUND:
1117 case ISD::FP_TO_SINT:
1118 case ISD::FP_TO_UINT:
1119 case ISD::STRICT_FP_TO_SINT:
1120 case ISD::STRICT_FP_TO_UINT: {
1121 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1122 // don't need 2 sets of patterns.
1123 if (!N->getSimpleValueType(0).isVector())
1124 break;
1125
1126 unsigned NewOpc;
1127 switch (N->getOpcode()) {
1128 default: llvm_unreachable("Unexpected opcode!");
1129 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1130 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1131 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1132 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1133 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1134 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1135 }
1136 SDValue Res;
1137 if (N->isStrictFPOpcode())
1138 Res =
1139 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1140 {N->getOperand(0), N->getOperand(1)});
1141 else
1142 Res =
1143 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1144 N->getOperand(0));
1145 --I;
1146 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1147 ++I;
1148 MadeChange = true;
1149 continue;
1150 }
1151 case ISD::SHL:
1152 case ISD::SRA:
1153 case ISD::SRL: {
1154 // Replace vector shifts with their X86 specific equivalent so we don't
1155 // need 2 sets of patterns.
1156 if (!N->getValueType(0).isVector())
1157 break;
1158
1159 unsigned NewOpc;
1160 switch (N->getOpcode()) {
1161 default: llvm_unreachable("Unexpected opcode!");
1162 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1163 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1164 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1165 }
1166 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1167 N->getOperand(0), N->getOperand(1));
1168 --I;
1169 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1170 ++I;
1171 MadeChange = true;
1172 continue;
1173 }
1174 case ISD::ANY_EXTEND:
1175 case ISD::ANY_EXTEND_VECTOR_INREG: {
1176 // Replace vector any extend with the zero extend equivalents so we don't
1177 // need 2 sets of patterns. Ignore vXi1 extensions.
1178 if (!N->getValueType(0).isVector())
1179 break;
1180
1181 unsigned NewOpc;
1182 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1183 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1184 "Unexpected opcode for mask vector!");
1185 NewOpc = ISD::SIGN_EXTEND;
1186 } else {
1187 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1188 ? ISD::ZERO_EXTEND
1189 : ISD::ZERO_EXTEND_VECTOR_INREG;
1190 }
1191
1192 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1193 N->getOperand(0));
1194 --I;
1195 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1196 ++I;
1197 MadeChange = true;
1198 continue;
1199 }
1200 case ISD::FCEIL:
1201 case ISD::STRICT_FCEIL:
1202 case ISD::FFLOOR:
1203 case ISD::STRICT_FFLOOR:
1204 case ISD::FTRUNC:
1205 case ISD::STRICT_FTRUNC:
1206 case ISD::FROUNDEVEN:
1207 case ISD::STRICT_FROUNDEVEN:
1208 case ISD::FNEARBYINT:
1209 case ISD::STRICT_FNEARBYINT:
1210 case ISD::FRINT:
1211 case ISD::STRICT_FRINT: {
1212 // Replace fp rounding with their X86 specific equivalent so we don't
1213 // need 2 sets of patterns.
1214 unsigned Imm;
1215 switch (N->getOpcode()) {
1216 default: llvm_unreachable("Unexpected opcode!");
1217 case ISD::STRICT_FCEIL:
1218 case ISD::FCEIL: Imm = 0xA; break;
1219 case ISD::STRICT_FFLOOR:
1220 case ISD::FFLOOR: Imm = 0x9; break;
1221 case ISD::STRICT_FTRUNC:
1222 case ISD::FTRUNC: Imm = 0xB; break;
1223 case ISD::STRICT_FROUNDEVEN:
1224 case ISD::FROUNDEVEN: Imm = 0x8; break;
1225 case ISD::STRICT_FNEARBYINT:
1226 case ISD::FNEARBYINT: Imm = 0xC; break;
1227 case ISD::STRICT_FRINT:
1228 case ISD::FRINT: Imm = 0x4; break;
1229 }
1230 SDLoc dl(N);
1231 bool IsStrict = N->isStrictFPOpcode();
1232 SDValue Res;
1233 if (IsStrict)
1234 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1235 {N->getValueType(0), MVT::Other},
1236 {N->getOperand(0), N->getOperand(1),
1237 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1238 else
1239 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1240 N->getOperand(0),
1241 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1242 --I;
1243 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1244 ++I;
1245 MadeChange = true;
1246 continue;
1247 }
1248 case X86ISD::FANDN:
1249 case X86ISD::FAND:
1250 case X86ISD::FOR:
1251 case X86ISD::FXOR: {
1252 // Widen scalar fp logic ops to vector to reduce isel patterns.
1253 // FIXME: Can we do this during lowering/combine.
1254 MVT VT = N->getSimpleValueType(0);
1255 if (VT.isVector() || VT == MVT::f128)
1256 break;
1257
1258 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1259 : VT == MVT::f32 ? MVT::v4f32
1260 : MVT::v8f16;
1261
1262 SDLoc dl(N);
1263 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1264 N->getOperand(0));
1265 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1266 N->getOperand(1));
1267
1268 SDValue Res;
1269 if (Subtarget->hasSSE2()) {
1270 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1271 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1272 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1273 unsigned Opc;
1274 switch (N->getOpcode()) {
1275 default: llvm_unreachable("Unexpected opcode!");
1276 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1277 case X86ISD::FAND: Opc = ISD::AND; break;
1278 case X86ISD::FOR: Opc = ISD::OR; break;
1279 case X86ISD::FXOR: Opc = ISD::XOR; break;
1280 }
1281 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1282 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1283 } else {
1284 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1285 }
1286 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1287 CurDAG->getIntPtrConstant(0, dl));
1288 --I;
1289 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1290 ++I;
1291 MadeChange = true;
1292 continue;
1293 }
1294 }
1295
1296 if (OptLevel != CodeGenOptLevel::None &&
1297 // Only do this when the target can fold the load into the call or
1298 // jmp.
1299 !Subtarget->useIndirectThunkCalls() &&
1300 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1301 (N->getOpcode() == X86ISD::TC_RETURN &&
1302 (Subtarget->is64Bit() ||
1303 !getTargetMachine().isPositionIndependent())))) {
1304 /// Also try moving call address load from outside callseq_start to just
1305 /// before the call to allow it to be folded.
1306 ///
1307 /// [Load chain]
1308 /// ^
1309 /// |
1310 /// [Load]
1311 /// ^ ^
1312 /// | |
1313 /// / \--
1314 /// / |
1315 ///[CALLSEQ_START] |
1316 /// ^ |
1317 /// | |
1318 /// [LOAD/C2Reg] |
1319 /// | |
1320 /// \ /
1321 /// \ /
1322 /// [CALL]
1323 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1324 SDValue Chain = N->getOperand(0);
1325 SDValue Load = N->getOperand(1);
1326 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1327 continue;
1328 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1329 ++NumLoadMoved;
1330 MadeChange = true;
1331 continue;
1332 }
1333
1334 // Lower fpround and fpextend nodes that target the FP stack to be a store
1335 // and a load to the stack. This is a gross hack. We would like to simply mark
1336 // these as being illegal, but when we do that, legalize produces these when
1337 // it expands calls, then expands these in the same legalize pass. We would
1338 // like dag combine to be able to hack on these between the call expansion
1339 // and the node legalization. As such this pass basically does "really
1340 // late" legalization of these inline with the X86 isel pass.
1341 // FIXME: This should only happen when not compiled with -O0.
1342 switch (N->getOpcode()) {
1343 default: continue;
1344 case ISD::FP_ROUND:
1345 case ISD::FP_EXTEND:
1346 {
1347 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1348 MVT DstVT = N->getSimpleValueType(0);
1349
1350 // If any of the sources are vectors, no fp stack involved.
1351 if (SrcVT.isVector() || DstVT.isVector())
1352 continue;
1353
1354 // If the source and destination are SSE registers, then this is a legal
1355 // conversion that should not be lowered.
1356 const X86TargetLowering *X86Lowering =
1357 static_cast<const X86TargetLowering *>(TLI);
1358 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1359 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1360 if (SrcIsSSE && DstIsSSE)
1361 continue;
1362
1363 if (!SrcIsSSE && !DstIsSSE) {
1364 // If this is an FPStack extension, it is a noop.
1365 if (N->getOpcode() == ISD::FP_EXTEND)
1366 continue;
1367 // If this is a value-preserving FPStack truncation, it is a noop.
1368 if (N->getConstantOperandVal(1))
1369 continue;
1370 }
1371
1372 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1373 // FPStack has extload and truncstore. SSE can fold direct loads into other
1374 // operations. Based on this, decide what we want to do.
1375 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1376 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1377 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1378 MachinePointerInfo MPI =
1379 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1380 SDLoc dl(N);
1381
1382 // FIXME: optimize the case where the src/dest is a load or store?
1383
1384 SDValue Store = CurDAG->getTruncStore(
1385 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1386 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1387 MemTmp, MPI, MemVT);
1388
1389 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1390 // extload we created. This will cause general havoc on the DAG because
1391 // anything below the conversion could be folded into other existing nodes.
1392 // To avoid invalidating 'I', back it up to the convert node.
1393 --I;
1394 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1395 break;
1396 }
1397
1398 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1399 // dealing with the chain differently, as there is already a preexisting chain.
1400 case ISD::STRICT_FP_ROUND:
1401 case ISD::STRICT_FP_EXTEND:
1402 {
1403 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1404 MVT DstVT = N->getSimpleValueType(0);
1405
1406 // If any of the sources are vectors, no fp stack involved.
1407 if (SrcVT.isVector() || DstVT.isVector())
1408 continue;
1409
1410 // If the source and destination are SSE registers, then this is a legal
1411 // conversion that should not be lowered.
1412 const X86TargetLowering *X86Lowering =
1413 static_cast<const X86TargetLowering *>(TLI);
1414 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1415 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1416 if (SrcIsSSE && DstIsSSE)
1417 continue;
1418
1419 if (!SrcIsSSE && !DstIsSSE) {
1420 // If this is an FPStack extension, it is a noop.
1421 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1422 continue;
1423 // If this is a value-preserving FPStack truncation, it is a noop.
1424 if (N->getConstantOperandVal(2))
1425 continue;
1426 }
1427
1428 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1429 // FPStack has extload and truncstore. SSE can fold direct loads into other
1430 // operations. Based on this, decide what we want to do.
1431 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1432 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1433 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1434 MachinePointerInfo MPI =
1435 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1436 SDLoc dl(N);
1437
1438 // FIXME: optimize the case where the src/dest is a load or store?
1439
1440 // Since the operation is StrictFP, use the preexisting chain.
1441 SDValue Store, Result;
1442 if (!SrcIsSSE) {
1443 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1444 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1445 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1446 MPI, /*Align*/ std::nullopt,
1447 MachineMemOperand::MOStore);
1448 if (N->getFlags().hasNoFPExcept()) {
1449 SDNodeFlags Flags = Store->getFlags();
1450 Flags.setNoFPExcept(true);
1451 Store->setFlags(Flags);
1452 }
1453 } else {
1454 assert(SrcVT == MemVT && "Unexpected VT!");
1455 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1456 MPI);
1457 }
1458
1459 if (!DstIsSSE) {
1460 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1461 SDValue Ops[] = {Store, MemTmp};
1462 Result = CurDAG->getMemIntrinsicNode(
1463 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1464 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1465 if (N->getFlags().hasNoFPExcept()) {
1466 SDNodeFlags Flags = Result->getFlags();
1467 Flags.setNoFPExcept(true);
1468 Result->setFlags(Flags);
1469 }
1470 } else {
1471 assert(DstVT == MemVT && "Unexpected VT!");
1472 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1473 }
1474
1475 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1476 // extload we created. This will cause general havoc on the DAG because
1477 // anything below the conversion could be folded into other existing nodes.
1478 // To avoid invalidating 'I', back it up to the convert node.
1479 --I;
1480 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1481 break;
1482 }
1483 }
1484
1485
1486 // Now that we did that, the node is dead. Increment the iterator to the
1487 // next node to process, then delete N.
1488 ++I;
1489 MadeChange = true;
1490 }
1491
1492 // Remove any dead nodes that may have been left behind.
1493 if (MadeChange)
1494 CurDAG->RemoveDeadNodes();
1495}
1496
1497// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1498bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1499 unsigned Opc = N->getMachineOpcode();
1500 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1501 Opc != X86::MOVSX64rr8)
1502 return false;
1503
1504 SDValue N0 = N->getOperand(0);
1505
1506 // We need to be extracting the low byte of an extend.
1507 if (!N0.isMachineOpcode() ||
1508 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1509 N0.getConstantOperandVal(1) != X86::sub_8bit)
1510 return false;
1511
1512 // We're looking for either a movsx or movzx to match the original opcode.
1513 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1514 : X86::MOVSX32rr8_NOREX;
1515 SDValue N00 = N0.getOperand(0);
1516 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1517 return false;
1518
1519 if (Opc == X86::MOVSX64rr8) {
1520 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1521 // to 64.
1522 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1523 MVT::i64, N00);
1524 ReplaceUses(N, Extend);
1525 } else {
1526 // Ok we can drop this extend and just use the original extend.
1527 ReplaceUses(N, N00.getNode());
1528 }
1529
1530 return true;
1531}
1532
1533void X86DAGToDAGISel::PostprocessISelDAG() {
1534 // Skip peepholes at -O0.
1535 if (TM.getOptLevel() == CodeGenOptLevel::None)
1536 return;
1537
1538 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1539
1540 bool MadeChange = false;
1541 while (Position != CurDAG->allnodes_begin()) {
1542 SDNode *N = &*--Position;
1543 // Skip dead nodes and any non-machine opcodes.
1544 if (N->use_empty() || !N->isMachineOpcode())
1545 continue;
1546
1547 if (tryOptimizeRem8Extend(N)) {
1548 MadeChange = true;
1549 continue;
1550 }
1551
1552 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1553 // the same. Rewrite to remove the AND.
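 // i.e. (TESTrr (ANDrr X, Y), (ANDrr X, Y)) -> (TESTrr X, Y), since TEST
 // already performs the AND when computing the flags.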
1554 unsigned Opc = N->getMachineOpcode();
1555 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1556 Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1557 N->getOperand(0) == N->getOperand(1) &&
1558 N->getOperand(0)->hasNUsesOfValue(2, N->getOperand(0).getResNo()) &&
1559 N->getOperand(0).isMachineOpcode()) {
1560 SDValue And = N->getOperand(0);
1561 unsigned N0Opc = And.getMachineOpcode();
1562 if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1563 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) &&
1564 !And->hasAnyUseOfValue(1)) {
1565 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1566 MVT::i32,
1567 And.getOperand(0),
1568 And.getOperand(1));
1569 ReplaceUses(N, Test);
1570 MadeChange = true;
1571 continue;
1572 }
1573 if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1574 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) &&
1575 !And->hasAnyUseOfValue(1)) {
1576 unsigned NewOpc;
1577 switch (N0Opc) {
1578 case X86::AND8rm: NewOpc = X86::TEST8mr; break;
1579 case X86::AND16rm: NewOpc = X86::TEST16mr; break;
1580 case X86::AND32rm: NewOpc = X86::TEST32mr; break;
1581 case X86::AND64rm: NewOpc = X86::TEST64mr; break;
1582 }
1583
1584 // Need to swap the memory and register operand.
1585 SDValue Ops[] = { And.getOperand(1),
1586 And.getOperand(2),
1587 And.getOperand(3),
1588 And.getOperand(4),
1589 And.getOperand(5),
1590 And.getOperand(0),
1591 And.getOperand(6) /* Chain */ };
1592 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1593 MVT::i32, MVT::Other, Ops);
1594 CurDAG->setNodeMemRefs(
1595 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1596 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1597 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1598 MadeChange = true;
1599 continue;
1600 }
1601 }
1602
1603 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1604 // used. We're doing this late so we can prefer to fold the AND into masked
1605 // comparisons. Doing that can be better for the live range of the mask
1606 // register.
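 // i.e. (KORTEST (KAND A, B), (KAND A, B)) -> (KTEST A, B) when only the zero
 // flag of the result is consumed.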
1607 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1608 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1609 N->getOperand(0) == N->getOperand(1) &&
1610 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1611 N->getOperand(0).isMachineOpcode() &&
1612 onlyUsesZeroFlag(SDValue(N, 0))) {
1613 SDValue And = N->getOperand(0);
1614 unsigned N0Opc = And.getMachineOpcode();
1615 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1616 // KAND instructions and KTEST use the same ISA feature.
1617 if (N0Opc == X86::KANDBrr ||
1618 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1619 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1620 unsigned NewOpc;
1621 switch (Opc) {
1622 default: llvm_unreachable("Unexpected opcode!");
1623 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1624 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1625 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1626 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1627 }
1628 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1629 MVT::i32,
1630 And.getOperand(0),
1631 And.getOperand(1));
1632 ReplaceUses(N, KTest);
1633 MadeChange = true;
1634 continue;
1635 }
1636 }
1637
1638 // Attempt to remove vector moves that were inserted to zero upper bits.
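 // For example (illustrative): (SUBREG_TO_REG 0, (VMOVAPSrr %x), sub_xmm) where
 // %x was produced by a VEX/EVEX/XOP-encoded instruction; that instruction has
 // already zeroed the upper bits, so the move is dropped and %x is used directly.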
1639 if (Opc != TargetOpcode::SUBREG_TO_REG)
1640 continue;
1641
1642 unsigned SubRegIdx = N->getConstantOperandVal(2);
1643 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1644 continue;
1645
1646 SDValue Move = N->getOperand(1);
1647 if (!Move.isMachineOpcode())
1648 continue;
1649
1650 // Make sure it's one of the move opcodes we recognize.
1651 switch (Move.getMachineOpcode()) {
1652 default:
1653 continue;
1654 case X86::VMOVAPDrr: case X86::VMOVUPDrr:
1655 case X86::VMOVAPSrr: case X86::VMOVUPSrr:
1656 case X86::VMOVDQArr: case X86::VMOVDQUrr:
1657 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
1658 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
1659 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
1660 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
1661 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
1662 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1663 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1664 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
1665 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
1666 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1667 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1668 break;
1669 }
1670
1671 SDValue In = Move.getOperand(0);
1672 if (!In.isMachineOpcode() ||
1673 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1674 continue;
1675
1676 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1677 // the SHA instructions which use a legacy encoding.
1678 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1679 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1680 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1681 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1682 continue;
1683
1684 // Producing instruction is another vector instruction. We can drop the
1685 // move.
1686 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1687 MadeChange = true;
1688 }
1689
1690 if (MadeChange)
1691 CurDAG->RemoveDeadNodes();
1692}
1693
1694
1695/// Emit any code that needs to be executed only in the main function.
1696void X86DAGToDAGISel::emitSpecialCodeForMain() {
1697 if (Subtarget->isTargetCygMing()) {
1698 TargetLowering::ArgListTy Args;
1699 auto &DL = CurDAG->getDataLayout();
1700
1701 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1702 CLI.setChain(CurDAG->getRoot())
1703 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1704 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1705 std::move(Args));
1706 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1707 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1708 CurDAG->setRoot(Result.second);
1709 }
1710}
1711
1712void X86DAGToDAGISel::emitFunctionEntryCode() {
1713 // If this is main, emit special code for main.
1714 const Function &F = MF->getFunction();
1715 if (F.hasExternalLinkage() && F.getName() == "main")
1716 emitSpecialCodeForMain();
1717}
1718
1719static bool isDispSafeForFrameIndex(int64_t Val) {
1720 // On 64-bit platforms, we can run into an issue where a frame index
1721 // includes a displacement that, when added to the explicit displacement,
1722 // will overflow the displacement field. Assuming that the frame index
1723 // displacement fits into a 31-bit integer (which is only slightly more
1724 // aggressive than the current fundamental assumption that it fits into
1725 // a 32-bit integer), a 31-bit disp should always be safe.
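 // For illustration: isInt<31> accepts displacements in [-2^30, 2^30 - 1], so a
 // frame object's own offset can still be added without overflowing the signed
 // 32-bit displacement field.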
1726 return isInt<31>(Val);
1727}
1728
1729bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1730 X86ISelAddressMode &AM) {
1731 // We may have already matched a displacement and the caller just added the
1732 // symbolic displacement. So we still need to do the checks even if Offset
1733 // is zero.
1734
1735 int64_t Val = AM.Disp + Offset;
1736
1737 // Cannot combine ExternalSymbol displacements with integer offsets.
1738 if (Val != 0 && (AM.ES || AM.MCSym))
1739 return true;
1740
1741 CodeModel::Model M = TM.getCodeModel();
1742 if (Subtarget->is64Bit()) {
1743 if (Val != 0 &&
1744 !X86::isOffsetSuitableForCodeModel(Val, M,
1745 AM.hasSymbolicDisplacement()))
1746 return true;
1747 // In addition to the checks required for a register base, check that
1748 // we do not try to use an unsafe Disp with a frame index.
1749 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1750 !isDispSafeForFrameIndex(Val))
1751 return true;
1752 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1753 // 64 bits. Instructions with 32-bit register addresses perform this zero
1754 // extension for us and we can safely ignore the high bits of Offset.
1755 // Instructions with only a 32-bit immediate address do not, though: they
1756 // sign extend instead. This means only the low 2GB of address space
1757 // is directly addressable; we need indirect addressing for the high 2GB of
1758 // address space.
1759 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1760 // implicit zero extension of instructions would cover up any problem.
1761 // However, we have asserts elsewhere that get triggered if we do, so keep
1762 // the checks for now.
1763 // TODO: We would actually be able to accept these, as well as the same
1764 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1765 // to get an address size override to be emitted. However, this
1766 // pseudo-register is not part of any register class and therefore causes
1767 // MIR verification to fail.
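 // For illustration: with no base or index register, an x32 constant address of
 // 0x7fffffff still passes the isUInt<31> check, while 0x80000000 does not; as a
 // bare 32-bit immediate the latter would be sign-extended to a negative 64-bit
 // address instead of staying in the low 4GB.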
1768 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1769 !AM.hasBaseOrIndexReg())
1770 return true;
1771 }
1772 AM.Disp = Val;
1773 return false;
1774}
1775
1776bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1777 bool AllowSegmentRegForX32) {
1778 SDValue Address = N->getOperand(1);
1779
1780 // load gs:0 -> GS segment register.
1781 // load fs:0 -> FS segment register.
1782 //
1783 // This optimization is generally valid because the GNU TLS model defines that
1784 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1785 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1786 // zero-extended to 64 bits and then added to the base address, which gives
1787 // unwanted results when the register holds a negative value.
1788 // For more information see http://people.redhat.com/drepper/tls.pdf
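 // For illustration: a load of address 0 in address space X86AS::GS or X86AS::FS
 // is selected here as a %gs:0 / %fs:0 access with no base or index register.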
1789 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1790 !IndirectTlsSegRefs &&
1791 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1792 Subtarget->isTargetFuchsia())) {
1793 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1794 return true;
1795 switch (N->getPointerInfo().getAddrSpace()) {
1796 case X86AS::GS:
1797 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1798 return false;
1799 case X86AS::FS:
1800 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1801 return false;
1802 // Address space X86AS::SS is not handled here, because it is not used to
1803 // address TLS areas.
1804 }
1805 }
1806
1807 return true;
1808}
1809
1810/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1811/// mode. These wrap things that will resolve down into a symbol reference.
1812/// If no match is possible, this returns true, otherwise it returns false.
1813bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1814 // If the addressing mode already has a symbol as the displacement, we can
1815 // never match another symbol.
1816 if (AM.hasSymbolicDisplacement())
1817 return true;
1818
1819 bool IsRIPRelTLS = false;
1820 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1821 if (IsRIPRel) {
1822 SDValue Val = N.getOperand(0);
1823 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1824 IsRIPRelTLS = true;
1825 }
1826
1827 // We can't use an addressing mode in the 64-bit large code model.
1828 // Global TLS addressing is an exception. In the medium code model,
1829 // we can use such a mode when RIP wrappers are present.
1830 // That signifies access to globals that are known to be "near",
1831 // such as the GOT itself.
1832 CodeModel::Model M = TM.getCodeModel();
1833 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1834 return true;
1835
1836 // Base and index reg must be 0 in order to use %rip as base.
1837 if (IsRIPRel && AM.hasBaseOrIndexReg())
1838 return true;
1839
1840 // Make a local copy in case we can't do this fold.
1841 X86ISelAddressMode Backup = AM;
1842
1843 int64_t Offset = 0;
1844 SDValue N0 = N.getOperand(0);
1845 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1846 AM.GV = G->getGlobal();
1847 AM.SymbolFlags = G->getTargetFlags();
1848 Offset = G->getOffset();
1849 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1850 AM.CP = CP->getConstVal();
1851 AM.Alignment = CP->getAlign();
1852 AM.SymbolFlags = CP->getTargetFlags();
1853 Offset = CP->getOffset();
1854 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1855 AM.ES = S->getSymbol();
1856 AM.SymbolFlags = S->getTargetFlags();
1857 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1858 AM.MCSym = S->getMCSymbol();
1859 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1860 AM.JT = J->getIndex();
1861 AM.SymbolFlags = J->getTargetFlags();
1862 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1863 AM.BlockAddr = BA->getBlockAddress();
1864 AM.SymbolFlags = BA->getTargetFlags();
1865 Offset = BA->getOffset();
1866 } else
1867 llvm_unreachable("Unhandled symbol reference node.");
1868
1869 // Can't use an addressing mode with large globals.
1870 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1871 TM.isLargeGlobalValue(AM.GV)) {
1872 AM = Backup;
1873 return true;
1874 }
1875
1876 if (foldOffsetIntoAddress(Offset, AM)) {
1877 AM = Backup;
1878 return true;
1879 }
1880
1881 if (IsRIPRel)
1882 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1883
1884 // Commit the changes now that we know this fold is safe.
1885 return false;
1886}
1887
1888/// Add the specified node to the specified addressing mode, returning true if
1889/// it cannot be done. This just pattern matches for the addressing mode.
1890bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1891 if (matchAddressRecursively(N, AM, 0))
1892 return true;
1893
1894 // Post-processing: Make a second attempt to fold a load, if we now know
1895 // that there will not be any other register. This is only performed for
1896 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1897 // any foldable load the first time.
1898 if (Subtarget->isTarget64BitILP32() &&
1899 AM.BaseType == X86ISelAddressMode::RegBase &&
1900 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1901 SDValue Save_Base_Reg = AM.Base_Reg;
1902 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1903 AM.Base_Reg = SDValue();
1904 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1905 AM.Base_Reg = Save_Base_Reg;
1906 }
1907 }
1908
1909 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1910 // a smaller encoding and avoids a scaled-index.
1911 if (AM.Scale == 2 &&
1912 AM.BaseType == X86ISelAddressMode::RegBase &&
1913 AM.Base_Reg.getNode() == nullptr) {
1914 AM.Base_Reg = AM.IndexReg;
1915 AM.Scale = 1;
1916 }
1917
1918 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1919 // because it has a smaller encoding.
1920 if (TM.getCodeModel() != CodeModel::Large &&
1921 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
1922 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1923 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1924 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1925 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1926 }
1927
1928 return false;
1929}
1930
1931bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1932 unsigned Depth) {
1933 // Add an artificial use to this node so that we can keep track of
1934 // it if it gets CSE'd with a different node.
1935 HandleSDNode Handle(N);
1936
1937 X86ISelAddressMode Backup = AM;
1938 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1939 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1940 return false;
1941 AM = Backup;
1942
1943 // Try again after commuting the operands.
1944 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1945 Depth + 1) &&
1946 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1947 return false;
1948 AM = Backup;
1949
1950 // If we couldn't fold both operands into the address at the same time,
1951 // see if we can just put each operand into a register and fold at least
1952 // the add.
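 // That is, fall back to a plain [base + index] form: base = LHS, index = RHS,
 // Scale = 1.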
1953 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1954 !AM.Base_Reg.getNode() &&
1955 !AM.IndexReg.getNode()) {
1956 N = Handle.getValue();
1957 AM.Base_Reg = N.getOperand(0);
1958 AM.IndexReg = N.getOperand(1);
1959 AM.Scale = 1;
1960 return false;
1961 }
1962 N = Handle.getValue();
1963 return true;
1964}
1965
1966// Insert a node into the DAG at least before the Pos node's position. This
1967// will reposition the node as needed, and will assign it a node ID that is <=
1968// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1969// IDs! The selection DAG must no longer depend on their uniqueness when this
1970// is used.
1971static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1972 if (N->getNodeId() == -1 ||
1973 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1974 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1975 DAG.RepositionNode(Pos->getIterator(), N.getNode());
1976 // Mark Node as invalid for pruning as after this it may be a successor to a
1977 // selected node but otherwise be in the same position of Pos.
1978 // Conservatively mark it with the same -abs(Id) to assure node id
1979 // invariant is preserved.
1980 N->setNodeId(Pos->getNodeId());
1981 SelectionDAGISel::InvalidateNodeId(N.getNode());
1982 }
1983}
1984
1985// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1986// safe. This allows us to convert the shift and and into an h-register
1987// extract and a scaled index. Returns false if the simplification is
1988// performed.
1989static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1990 uint64_t Mask,
1991 SDValue Shift, SDValue X,
1992 X86ISelAddressMode &AM) {
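 // For illustration: with C1 == 2 the incoming pattern is (X >> 6) & 0x3fc; the
 // code below rewrites it as ((X >> 8) & 0xff) << 2, i.e. a byte extract whose
 // result is used as an index with Scale == 4.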
1993 if (Shift.getOpcode() != ISD::SRL ||
1994 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1995 !Shift.hasOneUse())
1996 return true;
1997
1998 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1999 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2000 Mask != (0xffu << ScaleLog))
2001 return true;
2002
2003 MVT XVT = X.getSimpleValueType();
2004 MVT VT = N.getSimpleValueType();
2005 SDLoc DL(N);
2006 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2007 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2008 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2009 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2010 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2011 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2012 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2013
2014 // Insert the new nodes into the topological ordering. We must do this in
2015 // a valid topological ordering as nothing is going to go back and re-sort
2016 // these nodes. We continually insert before 'N' in sequence as this is
2017 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2018 // hierarchy left to express.
2019 insertDAGNode(DAG, N, Eight);
2020 insertDAGNode(DAG, N, NewMask);
2021 insertDAGNode(DAG, N, Srl);
2022 insertDAGNode(DAG, N, And);
2023 insertDAGNode(DAG, N, Ext);
2024 insertDAGNode(DAG, N, ShlCount);
2025 insertDAGNode(DAG, N, Shl);
2026 DAG.ReplaceAllUsesWith(N, Shl);
2027 DAG.RemoveDeadNode(N.getNode());
2028 AM.IndexReg = Ext;
2029 AM.Scale = (1 << ScaleLog);
2030 return false;
2031}
2032
2033// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2034// allows us to fold the shift into this addressing mode. Returns false if the
2035// transform succeeded.
2036static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2037 X86ISelAddressMode &AM) {
2038 SDValue Shift = N.getOperand(0);
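 // For illustration: with C1 == 2 and C2 == 0xfc, (X << 2) & 0xfc becomes
 // (X & 0x3f) << 2, and the trailing shift is then absorbed into Scale == 4.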
2039
2040 // Use a signed mask so that shifting right will insert sign bits. These
2041 // bits will be removed when we shift the result left so it doesn't matter
2042 // what we use. This might allow a smaller immediate encoding.
2043 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2044
2045 // If we have an any_extend feeding the AND, look through it to see if there
2046 // is a shift behind it. But only if the AND doesn't use the extended bits.
2047 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2048 bool FoundAnyExtend = false;
2049 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2050 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2051 isUInt<32>(Mask)) {
2052 FoundAnyExtend = true;
2053 Shift = Shift.getOperand(0);
2054 }
2055
2056 if (Shift.getOpcode() != ISD::SHL ||
2057 !isa<ConstantSDNode>(Shift.getOperand(1)))
2058 return true;
2059
2060 SDValue X = Shift.getOperand(0);
2061
2062 // Not likely to be profitable if either the AND or SHIFT node has more
2063 // than one use (unless all uses are for address computation). Besides,
2064 // isel mechanism requires their node ids to be reused.
2065 if (!N.hasOneUse() || !Shift.hasOneUse())
2066 return true;
2067
2068 // Verify that the shift amount is something we can fold.
2069 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2070 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2071 return true;
2072
2073 MVT VT = N.getSimpleValueType();
2074 SDLoc DL(N);
2075 if (FoundAnyExtend) {
2076 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2077 insertDAGNode(DAG, N, NewX);
2078 X = NewX;
2079 }
2080
2081 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
2082 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2083 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2084
2085 // Insert the new nodes into the topological ordering. We must do this in
2086 // a valid topological ordering as nothing is going to go back and re-sort
2087 // these nodes. We continually insert before 'N' in sequence as this is
2088 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2089 // hierarchy left to express.
2090 insertDAGNode(DAG, N, NewMask);
2091 insertDAGNode(DAG, N, NewAnd);
2092 insertDAGNode(DAG, N, NewShift);
2093 DAG.ReplaceAllUsesWith(N, NewShift);
2094 DAG.RemoveDeadNode(N.getNode());
2095
2096 AM.Scale = 1 << ShiftAmt;
2097 AM.IndexReg = NewAnd;
2098 return false;
2099}
2100
2101// Implement some heroics to detect shifts of masked values where the mask can
2102// be replaced by extending the shift and undoing that in the addressing mode
2103// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2104// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2105// the addressing mode. This results in code such as:
2106//
2107// int f(short *y, int *lookup_table) {
2108// ...
2109// return *y + lookup_table[*y >> 11];
2110// }
2111//
2112// Turning into:
2113// movzwl (%rdi), %eax
2114// movl %eax, %ecx
2115// shrl $11, %ecx
2116// addl (%rsi,%rcx,4), %eax
2117//
2118// Instead of:
2119// movzwl (%rdi), %eax
2120// movl %eax, %ecx
2121// shrl $9, %ecx
2122// andl $124, %rcx
2123// addl (%rsi,%rcx), %eax
2124//
2125// Note that this function assumes the mask is provided as a mask *after* the
2126// value is shifted. The input chain may or may not match that, but computing
2127// such a mask is trivial.
2128static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2129 uint64_t Mask,
2130 SDValue Shift, SDValue X,
2131 X86ISelAddressMode &AM) {
2132 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2133 !isa<ConstantSDNode>(Shift.getOperand(1)))
2134 return true;
2135
2136 // We need to ensure that the mask is a contiguous run of bits.
2137 unsigned MaskIdx, MaskLen;
2138 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2139 return true;
2140 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2141
2142 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2143
2144 // The amount of shift we're trying to fit into the addressing mode is taken
2145 // from the shifted mask index (number of trailing zeros of the mask).
2146 unsigned AMShiftAmt = MaskIdx;
2147
2148 // There is nothing we can do here unless the mask is removing some bits.
2149 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2150 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2151
2152 // Scale the leading zero count down based on the actual size of the value.
2153 // Also scale it down based on the size of the shift.
2154 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2155 if (MaskLZ < ScaleDown)
2156 return true;
2157 MaskLZ -= ScaleDown;
2158
2159 // The final check is to ensure that any masked out high bits of X are
2160 // already known to be zero. Otherwise, the mask has a semantic impact
2161 // other than masking out a couple of low bits. Unfortunately, because of
2162 // the mask, zero extensions will be removed from operands in some cases.
2163 // This code works extra hard to look through extensions because we can
2164 // replace them with zero extensions cheaply if necessary.
2165 bool ReplacingAnyExtend = false;
2166 if (X.getOpcode() == ISD::ANY_EXTEND) {
2167 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2168 X.getOperand(0).getSimpleValueType().getSizeInBits();
2169 // Assume that we'll replace the any-extend with a zero-extend, and
2170 // narrow the search to the extended value.
2171 X = X.getOperand(0);
2172 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2173 ReplacingAnyExtend = true;
2174 }
2175 APInt MaskedHighBits =
2176 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2177 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2178 return true;
2179
2180 // We've identified a pattern that can be transformed into a single shift
2181 // and an addressing mode. Make it so.
2182 MVT VT = N.getSimpleValueType();
2183 if (ReplacingAnyExtend) {
2184 assert(X.getValueType() != VT);
2185 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2186 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2187 insertDAGNode(DAG, N, NewX);
2188 X = NewX;
2189 }
2190
2191 MVT XVT = X.getSimpleValueType();
2192 SDLoc DL(N);
2193 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2194 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2195 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2196 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2197 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2198
2199 // Insert the new nodes into the topological ordering. We must do this in
2200 // a valid topological ordering as nothing is going to go back and re-sort
2201 // these nodes. We continually insert before 'N' in sequence as this is
2202 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2203 // hierarchy left to express.
2204 insertDAGNode(DAG, N, NewSRLAmt);
2205 insertDAGNode(DAG, N, NewSRL);
2206 insertDAGNode(DAG, N, NewExt);
2207 insertDAGNode(DAG, N, NewSHLAmt);
2208 insertDAGNode(DAG, N, NewSHL);
2209 DAG.ReplaceAllUsesWith(N, NewSHL);
2210 DAG.RemoveDeadNode(N.getNode());
2211
2212 AM.Scale = 1 << AMShiftAmt;
2213 AM.IndexReg = NewExt;
2214 return false;
2215}
2216
2217// Transform "(X >> SHIFT) & (MASK << C1)" to
2218// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2219// matched to a BEXTR later. Returns false if the simplification is performed.
2220static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2221 uint64_t Mask,
2222 SDValue Shift, SDValue X,
2223 X86ISelAddressMode &AM,
2224 const X86Subtarget &Subtarget) {
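 // For illustration: with SHIFT == 4, MASK == 0xff and C1 == 2, the pattern
 // (X >> 4) & 0x3fc becomes ((X >> 6) & 0xff) << 2; the SRL/AND pair is a BEXTR
 // candidate and the final shift becomes Scale == 4.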
2225 if (Shift.getOpcode() != ISD::SRL ||
2226 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2227 !Shift.hasOneUse() || !N.hasOneUse())
2228 return true;
2229
2230 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2231 if (!Subtarget.hasTBM() &&
2232 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2233 return true;
2234
2235 // We need to ensure that the mask is a contiguous run of bits.
2236 unsigned MaskIdx, MaskLen;
2237 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2238 return true;
2239
2240 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2241
2242 // The amount of shift we're trying to fit into the addressing mode is taken
2243 // from the shifted mask index (number of trailing zeros of the mask).
2244 unsigned AMShiftAmt = MaskIdx;
2245
2246 // There is nothing we can do here unless the mask is removing some bits.
2247 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2248 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2249
2250 MVT XVT = X.getSimpleValueType();
2251 MVT VT = N.getSimpleValueType();
2252 SDLoc DL(N);
2253 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2254 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2255 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2256 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2257 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2258 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2259 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2260
2261 // Insert the new nodes into the topological ordering. We must do this in
2262 // a valid topological ordering as nothing is going to go back and re-sort
2263 // these nodes. We continually insert before 'N' in sequence as this is
2264 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2265 // hierarchy left to express.
2266 insertDAGNode(DAG, N, NewSRLAmt);
2267 insertDAGNode(DAG, N, NewSRL);
2268 insertDAGNode(DAG, N, NewMask);
2269 insertDAGNode(DAG, N, NewAnd);
2270 insertDAGNode(DAG, N, NewExt);
2271 insertDAGNode(DAG, N, NewSHLAmt);
2272 insertDAGNode(DAG, N, NewSHL);
2273 DAG.ReplaceAllUsesWith(N, NewSHL);
2274 DAG.RemoveDeadNode(N.getNode());
2275
2276 AM.Scale = 1 << AMShiftAmt;
2277 AM.IndexReg = NewExt;
2278 return false;
2279}
2280
2281// Attempt to peek further into a scaled index register, collecting additional
2282 // extensions / offsets / etc. Returns \p N if we can't peek any further.
2283SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2284 X86ISelAddressMode &AM,
2285 unsigned Depth) {
2286 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2287 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2288 "Illegal index scale");
2289
2290 // Limit recursion.
2291 if (Depth >= SelectionDAG::MaxRecursionDepth)
2292 return N;
2293
2294 EVT VT = N.getValueType();
2295 unsigned Opc = N.getOpcode();
2296
2297 // index: add(x,c) -> index: x, disp + c
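 // e.g. with Scale == 4, an index of (x + 3) becomes index x with 12 folded
 // into the displacement.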
2298 if (CurDAG->isBaseWithConstantOffset(N)) {
2299 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2300 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2301 if (!foldOffsetIntoAddress(Offset, AM))
2302 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2303 }
2304
2305 // index: add(x,x) -> index: x, scale * 2
2306 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2307 if (AM.Scale <= 4) {
2308 AM.Scale *= 2;
2309 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2310 }
2311 }
2312
2313 // index: shl(x,i) -> index: x, scale * (1 << i)
2314 if (Opc == X86ISD::VSHLI) {
2315 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2316 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2317 if ((AM.Scale * ScaleAmt) <= 8) {
2318 AM.Scale *= ScaleAmt;
2319 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2320 }
2321 }
2322
2323 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2324 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2325 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2326 SDValue Src = N.getOperand(0);
2327 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2328 Src.hasOneUse()) {
2329 if (CurDAG->isBaseWithConstantOffset(Src)) {
2330 SDValue AddSrc = Src.getOperand(0);
2331 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2332 uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2333 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2334 SDLoc DL(N);
2335 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2336 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2337 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2338 insertDAGNode(*CurDAG, N, ExtSrc);
2339 insertDAGNode(*CurDAG, N, ExtVal);
2340 insertDAGNode(*CurDAG, N, ExtAdd);
2341 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2342 CurDAG->RemoveDeadNode(N.getNode());
2343 return ExtSrc;
2344 }
2345 }
2346 }
2347 }
2348
2349 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2350 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2351 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2352 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2353 SDValue Src = N.getOperand(0);
2354 unsigned SrcOpc = Src.getOpcode();
2355 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2356 CurDAG->isADDLike(Src)) &&
2357 Src.hasOneUse()) {
2358 if (CurDAG->isBaseWithConstantOffset(Src)) {
2359 SDValue AddSrc = Src.getOperand(0);
2360 uint64_t Offset = Src.getConstantOperandVal(1);
2361 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2362 SDLoc DL(N);
2363 SDValue Res;
2364 // If we're also scaling, see if we can use that as well.
2365 if (AddSrc.getOpcode() == ISD::SHL &&
2366 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2367 SDValue ShVal = AddSrc.getOperand(0);
2368 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2369 APInt HiBits =
2370 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2371 uint64_t ScaleAmt = 1ULL << ShAmt;
2372 if ((AM.Scale * ScaleAmt) <= 8 &&
2373 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2374 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2375 AM.Scale *= ScaleAmt;
2376 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2377 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2378 AddSrc.getOperand(1));
2379 insertDAGNode(*CurDAG, N, ExtShVal);
2380 insertDAGNode(*CurDAG, N, ExtShift);
2381 AddSrc = ExtShift;
2382 Res = ExtShVal;
2383 }
2384 }
2385 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2386 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2387 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2388 insertDAGNode(*CurDAG, N, ExtSrc);
2389 insertDAGNode(*CurDAG, N, ExtVal);
2390 insertDAGNode(*CurDAG, N, ExtAdd);
2391 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2392 CurDAG->RemoveDeadNode(N.getNode());
2393 return Res ? Res : ExtSrc;
2394 }
2395 }
2396 }
2397 }
2398
2399 // TODO: Handle extensions, shifted masks etc.
2400 return N;
2401}
2402
2403bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2404 unsigned Depth) {
2405 SDLoc dl(N);
2406 LLVM_DEBUG({
2407 dbgs() << "MatchAddress: ";
2408 AM.dump(CurDAG);
2409 });
2410 // Limit recursion.
2411 if (Depth >= SelectionDAG::MaxRecursionDepth)
2412 return matchAddressBase(N, AM);
2413
2414 // If this is already a %rip relative address, we can only merge immediates
2415 // into it. Instead of handling this in every case, we handle it here.
2416 // RIP relative addressing: %rip + 32-bit displacement!
2417 if (AM.isRIPRelative()) {
2418 // FIXME: JumpTable and ExternalSymbol address currently don't like
2419 // displacements. It isn't very important, but this should be fixed for
2420 // consistency.
2421 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2422 return true;
2423
2424 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2425 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2426 return false;
2427 return true;
2428 }
2429
2430 switch (N.getOpcode()) {
2431 default: break;
2432 case ISD::LOCAL_RECOVER: {
2433 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2434 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2435 // Use the symbol and don't prefix it.
2436 AM.MCSym = ESNode->getMCSymbol();
2437 return false;
2438 }
2439 break;
2440 }
2441 case ISD::Constant: {
2442 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2443 if (!foldOffsetIntoAddress(Val, AM))
2444 return false;
2445 break;
2446 }
2447
2448 case X86ISD::Wrapper:
2449 case X86ISD::WrapperRIP:
2450 if (!matchWrapper(N, AM))
2451 return false;
2452 break;
2453
2454 case ISD::LOAD:
2455 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2456 return false;
2457 break;
2458
2459 case ISD::FrameIndex:
2460 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2461 AM.Base_Reg.getNode() == nullptr &&
2462 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2463 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2464 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2465 return false;
2466 }
2467 break;
2468
2469 case ISD::SHL:
2470 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2471 break;
2472
2473 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2474 unsigned Val = CN->getZExtValue();
2475 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2476 // that the base operand remains free for further matching. If
2477 // the base doesn't end up getting used, a post-processing step
2478 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2479 if (Val == 1 || Val == 2 || Val == 3) {
2480 SDValue ShVal = N.getOperand(0);
2481 AM.Scale = 1 << Val;
2482 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2483 return false;
2484 }
2485 }
2486 break;
2487
2488 case ISD::SRL: {
2489 // Scale must not be used already.
2490 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2491
2492 // We only handle up to 64-bit values here as those are what matter for
2493 // addressing mode optimizations.
2494 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2495 "Unexpected value size!");
2496
2497 SDValue And = N.getOperand(0);
2498 if (And.getOpcode() != ISD::AND) break;
2499 SDValue X = And.getOperand(0);
2500
2501 // The mask used for the transform is expected to be post-shift, but we
2502 // found the shift first so just apply the shift to the mask before passing
2503 // it down.
2504 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2505 !isa<ConstantSDNode>(And.getOperand(1)))
2506 break;
2507 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2508
2509 // Try to fold the mask and shift into the scale, and return false if we
2510 // succeed.
2511 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2512 return false;
2513 break;
2514 }
2515
2516 case ISD::SMUL_LOHI:
2517 case ISD::UMUL_LOHI:
2518 // A mul_lohi where we need the low part can be folded as a plain multiply.
2519 if (N.getResNo() != 0) break;
2520 [[fallthrough]];
2521 case ISD::MUL:
2522 case X86ISD::MUL_IMM:
2523 // X*[3,5,9] -> X+X*[2,4,8]
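 // e.g. x*5 becomes base = x, index = x, Scale = 4, i.e. an lea (%reg,%reg,4) form.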
2524 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2525 AM.Base_Reg.getNode() == nullptr &&
2526 AM.IndexReg.getNode() == nullptr) {
2527 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2528 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2529 CN->getZExtValue() == 9) {
2530 AM.Scale = unsigned(CN->getZExtValue())-1;
2531
2532 SDValue MulVal = N.getOperand(0);
2533 SDValue Reg;
2534
2535 // Okay, we know that we have a scale by now. However, if the scaled
2536 // value is an add of something and a constant, we can fold the
2537 // constant into the disp field here.
2538 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2539 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2540 Reg = MulVal.getOperand(0);
2541 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2542 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2543 if (foldOffsetIntoAddress(Disp, AM))
2544 Reg = N.getOperand(0);
2545 } else {
2546 Reg = N.getOperand(0);
2547 }
2548
2549 AM.IndexReg = AM.Base_Reg = Reg;
2550 return false;
2551 }
2552 }
2553 break;
2554
2555 case ISD::SUB: {
2556 // Given A-B, if A can be completely folded into the address while leaving
2557 // the index field unused, use -B as the index.
2558 // This is a win if A has multiple parts that can be folded into
2559 // the address. Also, this saves a mov if the base register has
2560 // other uses, since it avoids a two-address sub instruction; however,
2561 // it costs an additional mov if the index register has other uses.
2562
2563 // Add an artificial use to this node so that we can keep track of
2564 // it if it gets CSE'd with a different node.
2565 HandleSDNode Handle(N);
2566
2567 // Test if the LHS of the sub can be folded.
2568 X86ISelAddressMode Backup = AM;
2569 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2570 N = Handle.getValue();
2571 AM = Backup;
2572 break;
2573 }
2574 N = Handle.getValue();
2575 // Test if the index field is free for use.
2576 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2577 AM = Backup;
2578 break;
2579 }
2580
2581 int Cost = 0;
2582 SDValue RHS = N.getOperand(1);
2583 // If the RHS involves a register with multiple uses, this
2584 // transformation incurs an extra mov, due to the neg instruction
2585 // clobbering its operand.
2586 if (!RHS.getNode()->hasOneUse() ||
2587 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2588 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2589 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2590 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2591 RHS.getOperand(0).getValueType() == MVT::i32))
2592 ++Cost;
2593 // If the base is a register with multiple uses, this
2594 // transformation may save a mov.
2595 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2596 !AM.Base_Reg.getNode()->hasOneUse()) ||
2597 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2598 --Cost;
2599 // If the folded LHS was interesting, this transformation saves
2600 // address arithmetic.
2601 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2602 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2603 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2604 --Cost;
2605 // If it doesn't look like it may be an overall win, don't do it.
2606 if (Cost >= 0) {
2607 AM = Backup;
2608 break;
2609 }
2610
2611 // Ok, the transformation is legal and appears profitable. Go for it.
2612 // Negation will be emitted later to avoid creating dangling nodes if this
2613 // was an unprofitable LEA.
2614 AM.IndexReg = RHS;
2615 AM.NegateIndex = true;
2616 AM.Scale = 1;
2617 return false;
2618 }
2619
2620 case ISD::OR:
2621 case ISD::XOR:
2622 // See if we can treat the OR/XOR node as an ADD node.
2623 if (!CurDAG->isADDLike(N))
2624 break;
2625 [[fallthrough]];
2626 case ISD::ADD:
2627 if (!matchAdd(N, AM, Depth))
2628 return false;
2629 break;
2630
2631 case ISD::AND: {
2632 // Perform some heroic transforms on an and of a constant-count shift
2633 // with a constant to enable use of the scaled offset field.
2634
2635 // Scale must not be used already.
2636 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2637
2638 // We only handle up to 64-bit values here as those are what matter for
2639 // addressing mode optimizations.
2640 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2641 "Unexpected value size!");
2642
2643 if (!isa<ConstantSDNode>(N.getOperand(1)))
2644 break;
2645
2646 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2647 SDValue Shift = N.getOperand(0);
2648 SDValue X = Shift.getOperand(0);
2649
2650 uint64_t Mask = N.getConstantOperandVal(1);
2651
2652 // Try to fold the mask and shift into an extract and scale.
2653 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2654 return false;
2655
2656 // Try to fold the mask and shift directly into the scale.
2657 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2658 return false;
2659
2660 // Try to fold the mask and shift into BEXTR and scale.
2661 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2662 return false;
2663 }
2664
2665 // Try to swap the mask and shift to place shifts which can be done as
2666 // a scale on the outside of the mask.
2667 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2668 return false;
2669
2670 break;
2671 }
2672 case ISD::ZERO_EXTEND: {
2673 // Try to widen a zexted shift left to the same size as its use, so we can
2674 // match the shift as a scale factor.
2675 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2676 break;
2677
2678 SDValue Src = N.getOperand(0);
2679
2680 // See if we can match a zext(addlike(x,c)).
2681 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2682 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2683 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2684 if (Index != N) {
2685 AM.IndexReg = Index;
2686 return false;
2687 }
2688
2689 // Peek through mask: zext(and(shl(x,c1),c2))
2690 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2691 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2692 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2693 Mask = MaskC->getAPIntValue();
2694 Src = Src.getOperand(0);
2695 }
2696
2697 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
2698 // Give up if the shift is not a valid scale factor [1,2,3].
2699 SDValue ShlSrc = Src.getOperand(0);
2700 SDValue ShlAmt = Src.getOperand(1);
2701 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2702 if (!ShAmtC)
2703 break;
2704 unsigned ShAmtV = ShAmtC->getZExtValue();
2705 if (ShAmtV > 3)
2706 break;
2707
2708 // The narrow shift must only shift out zero bits (it must be 'nuw').
2709 // That makes it safe to widen to the destination type.
2710 APInt HighZeros =
2711 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2712 if (!Src->getFlags().hasNoUnsignedWrap() &&
2713 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2714 break;
2715
2716 // zext (shl nuw i8 %x, C1) to i32
2717 // --> shl (zext i8 %x to i32), (zext C1)
2718 // zext (and (shl nuw i8 %x, C1), C2) to i32
2719 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2720 MVT SrcVT = ShlSrc.getSimpleValueType();
2721 MVT VT = N.getSimpleValueType();
2722 SDLoc DL(N);
2723
2724 SDValue Res = ShlSrc;
2725 if (!Mask.isAllOnes()) {
2726 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2727 insertDAGNode(*CurDAG, N, Res);
2728 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2729 insertDAGNode(*CurDAG, N, Res);
2730 }
2731 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2732 insertDAGNode(*CurDAG, N, Zext);
2733 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2734 insertDAGNode(*CurDAG, N, NewShl);
2735 CurDAG->ReplaceAllUsesWith(N, NewShl);
2736 CurDAG->RemoveDeadNode(N.getNode());
2737
2738 // Convert the shift to scale factor.
2739 AM.Scale = 1 << ShAmtV;
2740 // Re-match the index through Zext here: if matchIndexRecursively were not
2741 // called, Zext might be replaced by other nodes but still be passed to a
2742 // builder method later.
2743 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2744 return false;
2745 }
2746
2747 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2748 // Try to fold the mask and shift into an extract and scale.
2749 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2750 Src.getOperand(0), AM))
2751 return false;
2752
2753 // Try to fold the mask and shift directly into the scale.
2754 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2755 Src.getOperand(0), AM))
2756 return false;
2757
2758 // Try to fold the mask and shift into BEXTR and scale.
2759 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2760 Src.getOperand(0), AM, *Subtarget))
2761 return false;
2762 }
2763
2764 break;
2765 }
2766 }
2767
2768 return matchAddressBase(N, AM);
2769}
2770
2771/// Helper for MatchAddress. Add the specified node to the
2772/// specified addressing mode without any further recursion.
2773bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2774 // Is the base register already occupied?
2775 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2776 // If so, check to see if the scale index register is set.
2777 if (!AM.IndexReg.getNode()) {
2778 AM.IndexReg = N;
2779 AM.Scale = 1;
2780 return false;
2781 }
2782
2783 // Otherwise, we cannot select it.
2784 return true;
2785 }
2786
2787 // Default, generate it as a register.
2788 AM.BaseType = X86ISelAddressMode::RegBase;
2789 AM.Base_Reg = N;
2790 return false;
2791}
2792
2793bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2794 X86ISelAddressMode &AM,
2795 unsigned Depth) {
2796 SDLoc dl(N);
2797 LLVM_DEBUG({
2798 dbgs() << "MatchVectorAddress: ";
2799 AM.dump(CurDAG);
2800 });
2801 // Limit recursion.
2802 if (Depth >= SelectionDAG::MaxRecursionDepth)
2803 return matchAddressBase(N, AM);
2804
2805 // TODO: Support other operations.
2806 switch (N.getOpcode()) {
2807 case ISD::Constant: {
2808 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2809 if (!foldOffsetIntoAddress(Val, AM))
2810 return false;
2811 break;
2812 }
2813 case X86ISD::Wrapper:
2814 if (!matchWrapper(N, AM))
2815 return false;
2816 break;
2817 case ISD::ADD: {
2818 // Add an artificial use to this node so that we can keep track of
2819 // it if it gets CSE'd with a different node.
2820 HandleSDNode Handle(N);
2821
2822 X86ISelAddressMode Backup = AM;
2823 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2824 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2825 Depth + 1))
2826 return false;
2827 AM = Backup;
2828
2829 // Try again after commuting the operands.
2830 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2831 Depth + 1) &&
2832 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2833 Depth + 1))
2834 return false;
2835 AM = Backup;
2836
2837 N = Handle.getValue();
2838 break;
2839 }
2840 }
2841
2842 return matchAddressBase(N, AM);
2843}
2844
2845/// Helper for selectVectorAddr. Handles things that can be folded into a
2846/// gather/scatter address. The index register and scale should have already
2847/// been handled.
2848bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2849 return matchVectorAddressRecursively(N, AM, 0);
2850}
2851
2852bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2853 SDValue IndexOp, SDValue ScaleOp,
2854 SDValue &Base, SDValue &Scale,
2855 SDValue &Index, SDValue &Disp,
2856 SDValue &Segment) {
2857 X86ISelAddressMode AM;
2858 AM.Scale = ScaleOp->getAsZExtVal();
2859
2860 // Attempt to match index patterns, as long as we're not relying on implicit
2861 // sign-extension, which is performed BEFORE scale.
2862 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2863 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2864 else
2865 AM.IndexReg = IndexOp;
2866
2867 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2868 if (AddrSpace == X86AS::GS)
2869 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2870 if (AddrSpace == X86AS::FS)
2871 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2872 if (AddrSpace == X86AS::SS)
2873 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2874
2875 SDLoc DL(BasePtr);
2876 MVT VT = BasePtr.getSimpleValueType();
2877
2878 // Try to match into the base and displacement fields.
2879 if (matchVectorAddress(BasePtr, AM))
2880 return false;
2881
2882 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2883 return true;
2884}
2885
2886/// Returns true if it is able to pattern match an addressing mode.
2887/// It returns the operands which make up the maximal addressing mode it can
2888/// match by reference.
2889///
2890/// Parent is the parent node of the addr operand that is being matched. It
2891/// is always a load, store, atomic node, or null. It is only null when
2892/// checking memory operands for inline asm nodes.
2893bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2894 SDValue &Scale, SDValue &Index,
2895 SDValue &Disp, SDValue &Segment) {
2896 X86ISelAddressMode AM;
2897
2898 if (Parent &&
2899 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2900 // that are not a MemSDNode, and thus don't have proper addrspace info.
2901 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2902 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2903 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2904 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2905 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2906 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2907 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2908 unsigned AddrSpace =
2909 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2910 if (AddrSpace == X86AS::GS)
2911 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2912 if (AddrSpace == X86AS::FS)
2913 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2914 if (AddrSpace == X86AS::SS)
2915 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2916 }
2917
2918 // Save the DL and VT before calling matchAddress, it can invalidate N.
2919 SDLoc DL(N);
2920 MVT VT = N.getSimpleValueType();
2921
2922 if (matchAddress(N, AM))
2923 return false;
2924
2925 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2926 return true;
2927}
2928
2929bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2930 // Cannot use 32 bit constants to reference objects in kernel/large code
2931 // model.
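 // (Kernel-model globals live in the negative 2GB of the address space and the
 // large model makes no placement guarantees, so a zero-extended 32-bit 'movl'
 // immediate may not reach them.)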
2932 if (TM.getCodeModel() == CodeModel::Kernel ||
2933 TM.getCodeModel() == CodeModel::Large)
2934 return false;
2935
2936 // In static codegen with small code model, we can get the address of a label
2937 // into a register with 'movl'
2938 if (N->getOpcode() != X86ISD::Wrapper)
2939 return false;
2940
2941 N = N.getOperand(0);
2942
2943 // At least GNU as does not accept 'movl' for TPOFF relocations.
2944 // FIXME: We could use 'movl' when we know we are targeting MC.
2945 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2946 return false;
2947
2948 Imm = N;
2949 // Small/medium code model can reference non-TargetGlobalAddress objects with
2950 // 32 bit constants.
2951 if (N->getOpcode() != ISD::TargetGlobalAddress) {
2952 return TM.getCodeModel() == CodeModel::Small ||
2953 TM.getCodeModel() == CodeModel::Medium;
2954 }
2955
2956 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
2957 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
2958 return CR->getUnsignedMax().ult(1ull << 32);
2959
2960 return !TM.isLargeGlobalValue(GV);
2961}
2962
2963bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2964 SDValue &Scale, SDValue &Index,
2965 SDValue &Disp, SDValue &Segment) {
2966 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2967 SDLoc DL(N);
2968
2969 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2970 return false;
2971
2972 auto *RN = dyn_cast<RegisterSDNode>(Base);
2973 if (RN && RN->getReg() == 0)
2974 Base = CurDAG->getRegister(0, MVT::i64);
2975 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2976 // Base could already be %rip, particularly in the x32 ABI.
2977 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2978 MVT::i64), 0);
2979 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2980 Base);
2981 }
2982
2983 RN = dyn_cast<RegisterSDNode>(Index);
2984 if (RN && RN->getReg() == 0)
2985 Index = CurDAG->getRegister(0, MVT::i64);
2986 else {
2987 assert(Index.getValueType() == MVT::i32 &&
2988 "Expect to be extending 32-bit registers for use in LEA");
2989 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2990 MVT::i64), 0);
2991 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2992 Index);
2993 }
2994
2995 return true;
2996}
2997
2998/// Calls SelectAddr and determines if the maximal addressing
2999/// mode it matches can be cost effectively emitted as an LEA instruction.
3000bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3001 SDValue &Base, SDValue &Scale,
3002 SDValue &Index, SDValue &Disp,
3003 SDValue &Segment) {
3004 X86ISelAddressMode AM;
3005
3006 // Save the DL and VT before calling matchAddress, it can invalidate N.
3007 SDLoc DL(N);
3008 MVT VT = N.getSimpleValueType();
3009
3010 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3011 // segments.
3012 SDValue Copy = AM.Segment;
3013 SDValue T = CurDAG->getRegister(0, MVT::i32);
3014 AM.Segment = T;
3015 if (matchAddress(N, AM))
3016 return false;
3017 assert (T == AM.Segment);
3018 AM.Segment = Copy;
3019
3020 unsigned Complexity = 0;
3021 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3022 Complexity = 1;
3023 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3024 Complexity = 4;
3025
3026 if (AM.IndexReg.getNode())
3027 Complexity++;
3028
3029 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to use
3030 // a simple shift.
3031 if (AM.Scale > 1)
3032 Complexity++;
3033
3034 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3035 // to a LEA. This is determined with some experimentation but is by no means
3036 // optimal (especially for code size consideration). LEA is nice because of
3037 // its three-address nature. Tweak the cost function again when we can run
3038 // convertToThreeAddress() at register allocation time.
3039 if (AM.hasSymbolicDisplacement()) {
3040 // For X86-64, always use LEA to materialize RIP-relative addresses.
3041 if (Subtarget->is64Bit())
3042 Complexity = 4;
3043 else
3044 Complexity += 2;
3045 }
3046
3047 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3048 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3049 // duplicating flag-producing instructions later in the pipeline.
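 // For illustration: if one operand is an X86ISD::SUB whose flag result feeds a
 // branch, selecting this add as LEA leaves EFLAGS from the SUB intact rather
 // than requiring the flag-producing instruction to be duplicated.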
3050 if (N.getOpcode() == ISD::ADD) {
3051 auto isMathWithFlags = [](SDValue V) {
3052 switch (V.getOpcode()) {
3053 case X86ISD::ADD:
3054 case X86ISD::SUB:
3055 case X86ISD::ADC:
3056 case X86ISD::SBB:
3057 case X86ISD::SMUL:
3058 case X86ISD::UMUL:
3059 /* TODO: These opcodes can be added safely, but we may want to justify
3060 their inclusion for different reasons (better for reg-alloc).
3061 case X86ISD::OR:
3062 case X86ISD::XOR:
3063 case X86ISD::AND:
3064 */
3065 // Value 1 is the flag output of the node - verify it's not dead.
3066 return !SDValue(V.getNode(), 1).use_empty();
3067 default:
3068 return false;
3069 }
3070 };
3071 // TODO: We might want to factor in whether there's a load folding
3072 // opportunity for the math op that disappears with LEA.
3073 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3074 Complexity++;
3075 }
3076
3077 if (AM.Disp)
3078 Complexity++;
3079
3080 // If it isn't worth using an LEA, reject it.
3081 if (Complexity <= 2)
3082 return false;
3083
3084 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3085 return true;
3086}
3087
3088/// This is only run on TargetGlobalTLSAddress nodes.
3089bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3090 SDValue &Scale, SDValue &Index,
3091 SDValue &Disp, SDValue &Segment) {
3092 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3093 N.getOpcode() == ISD::TargetExternalSymbol);
3094
3095 X86ISelAddressMode AM;
3096 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3097 AM.GV = GA->getGlobal();
3098 AM.Disp += GA->getOffset();
3099 AM.SymbolFlags = GA->getTargetFlags();
3100 } else {
3101 auto *SA = cast<ExternalSymbolSDNode>(N);
3102 AM.ES = SA->getSymbol();
3103 AM.SymbolFlags = SA->getTargetFlags();
3104 }
3105
3106 if (Subtarget->is32Bit()) {
3107 AM.Scale = 1;
3108 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3109 }
3110
3111 MVT VT = N.getSimpleValueType();
3112 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3113 return true;
3114}
3115
3116bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3117 // Keep track of the original value type and whether this value was
3118 // truncated. If we see a truncation from pointer type to VT that truncates
3119 // bits that are known to be zero, we can use a narrow reference.
3120 EVT VT = N.getValueType();
3121 bool WasTruncated = false;
3122 if (N.getOpcode() == ISD::TRUNCATE) {
3123 WasTruncated = true;
3124 N = N.getOperand(0);
3125 }
3126
3127 if (N.getOpcode() != X86ISD::Wrapper)
3128 return false;
3129
3130 // We can only use non-GlobalValues as immediates if they were not truncated,
3131 // as we do not have any range information. If we have a GlobalValue and the
3132 // address was not truncated, we can select it as an operand directly.
3133 unsigned Opc = N.getOperand(0)->getOpcode();
3134 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3135 Op = N.getOperand(0);
3136 // We can only select the operand directly if we didn't have to look past a
3137 // truncate.
3138 return !WasTruncated;
3139 }
3140
3141 // Check that the global's range fits into VT.
3142 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3143 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3144 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3145 return false;
3146
3147 // Okay, we can use a narrow reference.
3148 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3149 GA->getOffset(), GA->getTargetFlags());
3150 return true;
3151}
3152
3153bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3154 SDValue &Base, SDValue &Scale,
3155 SDValue &Index, SDValue &Disp,
3156 SDValue &Segment) {
3157 assert(Root && P && "Unknown root/parent nodes");
3158 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3159 !IsProfitableToFold(N, P, Root) ||
3160 !IsLegalToFold(N, P, Root, OptLevel))
3161 return false;
3162
3163 return selectAddr(N.getNode(),
3164 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3165}
3166
3167bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3168 SDValue &Base, SDValue &Scale,
3169 SDValue &Index, SDValue &Disp,
3170 SDValue &Segment) {
3171 assert(Root && P && "Unknown root/parent nodes");
3172 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3173 !IsProfitableToFold(N, P, Root) ||
3174 !IsLegalToFold(N, P, Root, OptLevel))
3175 return false;
3176
3177 return selectAddr(N.getNode(),
3178 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3179}
3180
3181/// Return an SDNode that returns the value of the global base register.
3182/// Output instructions required to initialize the global base register,
3183/// if necessary.
3184SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3185 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3186 auto &DL = MF->getDataLayout();
3187 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3188}
3189
3190bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3191 if (N->getOpcode() == ISD::TRUNCATE)
3192 N = N->getOperand(0).getNode();
3193 if (N->getOpcode() != X86ISD::Wrapper)
3194 return false;
3195
3196 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3197 if (!GA)
3198 return false;
3199
3200 auto *GV = GA->getGlobal();
3201 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3202 if (CR)
3203 return CR->getSignedMin().sge(-1ull << Width) &&
3204 CR->getSignedMax().slt(1ull << Width);
3205 // In the kernel code model, globals are in the negative 2GB of the address
3206 // space, so globals can be a sign extended 32-bit immediate.
3207 // In other code models, small globals are in the low 2GB of the address
3208 // space, so sign extending them is equivalent to zero extending them.
3209 return Width == 32 && !TM.isLargeGlobalValue(GV);
3210}
3211
3212X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3213 assert(N->isMachineOpcode() && "Unexpected node");
3214 unsigned Opc = N->getMachineOpcode();
3215 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3216 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3217 if (CondNo < 0)
3218 return X86::COND_INVALID;
3219
3220 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3221}
3222
3223/// Test whether the given X86ISD::CMP node has any users that use a flag
3224/// other than ZF.
3225bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3226 // Examine each user of the node.
3227 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3228 UI != UE; ++UI) {
3229 // Only check things that use the flags.
3230 if (UI.getUse().getResNo() != Flags.getResNo())
3231 continue;
3232 // Only examine CopyToReg uses that copy to EFLAGS.
3233 if (UI->getOpcode() != ISD::CopyToReg ||
3234 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3235 return false;
3236 // Examine each user of the CopyToReg use.
3237 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3238 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3239 // Only examine the Flag result.
3240 if (FlagUI.getUse().getResNo() != 1) continue;
3241 // Anything unusual: assume conservatively.
3242 if (!FlagUI->isMachineOpcode()) return false;
3243 // Examine the condition code of the user.
3244 X86::CondCode CC = getCondFromNode(*FlagUI);
3245
3246 switch (CC) {
3247 // Comparisons which only use the zero flag.
3248 case X86::COND_E: case X86::COND_NE:
3249 continue;
3250 // Anything else: assume conservatively.
3251 default:
3252 return false;
3253 }
3254 }
3255 }
3256 return true;
3257}
3258
3259/// Test whether the given X86ISD::CMP node has any uses which require the SF
3260/// flag to be accurate.
3261bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3262 // Examine each user of the node.
3263 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3264 UI != UE; ++UI) {
3265 // Only check things that use the flags.
3266 if (UI.getUse().getResNo() != Flags.getResNo())
3267 continue;
3268 // Only examine CopyToReg uses that copy to EFLAGS.
3269 if (UI->getOpcode() != ISD::CopyToReg ||
3270 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3271 return false;
3272 // Examine each user of the CopyToReg use.
3273 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3274 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3275 // Only examine the Flag result.
3276 if (FlagUI.getUse().getResNo() != 1) continue;
3277 // Anything unusual: assume conservatively.
3278 if (!FlagUI->isMachineOpcode()) return false;
3279 // Examine the condition code of the user.
3280 X86::CondCode CC = getCondFromNode(*FlagUI);
3281
3282 switch (CC) {
3283 // Comparisons which don't examine the SF flag.
3284 case X86::COND_A: case X86::COND_AE:
3285 case X86::COND_B: case X86::COND_BE:
3286 case X86::COND_E: case X86::COND_NE:
3287 case X86::COND_O: case X86::COND_NO:
3288 case X86::COND_P: case X86::COND_NP:
3289 continue;
3290 // Anything else: assume conservatively.
3291 default:
3292 return false;
3293 }
3294 }
3295 }
3296 return true;
3297}
3298
3299static bool mayUseCarryFlag(X86::CondCode CC) {
3300 switch (CC) {
3301 // Comparisons which don't examine the CF flag.
3302 case X86::COND_O: case X86::COND_NO:
3303 case X86::COND_E: case X86::COND_NE:
3304 case X86::COND_S: case X86::COND_NS:
3305 case X86::COND_P: case X86::COND_NP:
3306 case X86::COND_L: case X86::COND_GE:
3307 case X86::COND_G: case X86::COND_LE:
3308 return false;
3309 // Anything else: assume conservatively.
3310 default:
3311 return true;
3312 }
3313}
3314
3315/// Test whether the given node which sets flags has any uses which require the
3316/// CF flag to be accurate.
3317bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3318 // Examine each user of the node.
3319 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3320 UI != UE; ++UI) {
3321 // Only check things that use the flags.
3322 if (UI.getUse().getResNo() != Flags.getResNo())
3323 continue;
3324
3325 unsigned UIOpc = UI->getOpcode();
3326
3327 if (UIOpc == ISD::CopyToReg) {
3328 // Only examine CopyToReg uses that copy to EFLAGS.
3329 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3330 return false;
3331 // Examine each user of the CopyToReg use.
3332 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3333 FlagUI != FlagUE; ++FlagUI) {
3334 // Only examine the Flag result.
3335 if (FlagUI.getUse().getResNo() != 1)
3336 continue;
3337 // Anything unusual: assume conservatively.
3338 if (!FlagUI->isMachineOpcode())
3339 return false;
3340 // Examine the condition code of the user.
3341 X86::CondCode CC = getCondFromNode(*FlagUI);
3342
3343 if (mayUseCarryFlag(CC))
3344 return false;
3345 }
3346
3347 // This CopyToReg is ok. Move on to the next user.
3348 continue;
3349 }
3350
3351 // This might be an unselected node. So look for the pre-isel opcodes that
3352 // use flags.
3353 unsigned CCOpNo;
3354 switch (UIOpc) {
3355 default:
3356 // Something unusual. Be conservative.
3357 return false;
3358 case X86ISD::SETCC: CCOpNo = 0; break;
3359 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3360 case X86ISD::CMOV: CCOpNo = 2; break;
3361 case X86ISD::BRCOND: CCOpNo = 2; break;
3362 }
3363
3364 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
3365 if (mayUseCarryFlag(CC))
3366 return false;
3367 }
3368 return true;
3369}
3370
3371/// Check whether or not the chain ending in StoreNode is suitable for doing
3372/// the {load; op; store} to modify transformation.
3373static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3374 SDValue StoredVal, SelectionDAG *CurDAG,
3375 unsigned LoadOpNo,
3376 LoadSDNode *&LoadNode,
3377 SDValue &InputChain) {
3378 // Is the stored value result 0 of the operation?
3379 if (StoredVal.getResNo() != 0) return false;
3380
3381 // Are there other uses of the operation other than the store?
3382 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3383
3384 // Is the store non-extending and non-indexed?
3385 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3386 return false;
3387
3388 SDValue Load = StoredVal->getOperand(LoadOpNo);
3389 // Is the stored value a non-extending and non-indexed load?
3390 if (!ISD::isNormalLoad(Load.getNode())) return false;
3391
3392 // Return LoadNode by reference.
3393 LoadNode = cast<LoadSDNode>(Load);
3394
3395 // Is store the only read of the loaded value?
3396 if (!Load.hasOneUse())
3397 return false;
3398
3399 // Is the address of the store the same as the load?
3400 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3401 LoadNode->getOffset() != StoreNode->getOffset())
3402 return false;
3403
3404 bool FoundLoad = false;
3405 SmallVector<SDValue, 4> ChainOps;
3406 SmallVector<const SDNode *, 4> LoopWorklist;
3407 SmallPtrSet<const SDNode *, 16> Visited;
3408 const unsigned int Max = 1024;
3409
3410 // Visualization of Load-Op-Store fusion:
3411 // -------------------------
3412 // Legend:
3413 // *-lines = Chain operand dependencies.
3414 // |-lines = Normal operand dependencies.
3415 // Dependencies flow down and right. n-suffix references multiple nodes.
3416 //
3417 // C Xn C
3418 // * * *
3419 // * * *
3420 // Xn A-LD Yn TF Yn
3421 // * * \ | * |
3422 // * * \ | * |
3423 // * * \ | => A--LD_OP_ST
3424 // * * \| \
3425 // TF OP \
3426 // * | \ Zn
3427 // * | \
3428 // A-ST Zn
3429 //
3430
3431 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3432 // #2: Yn -> LD
3433 // #3: ST -> Zn
3434
3435 // Ensure the transform is safe by checking for the dual
3436 // dependencies to make sure we do not induce a loop.
3437
3438 // As LD is a predecessor to both OP and ST we can do this by checking:
3439 // a). if LD is a predecessor to a member of Xn or Yn.
3440 // b). if a Zn is a predecessor to ST.
3441
3442 // However, (b) can only occur through being a chain predecessor to
3443 // ST, which is the same as Zn being a member or predecessor of Xn,
3444 // which is a subset of LD being a predecessor of Xn. So it's
3445 // subsumed by check (a).
3446
3447 SDValue Chain = StoreNode->getChain();
3448
3449 // Gather X elements in ChainOps.
3450 if (Chain == Load.getValue(1)) {
3451 FoundLoad = true;
3452 ChainOps.push_back(Load.getOperand(0));
3453 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3454 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3455 SDValue Op = Chain.getOperand(i);
3456 if (Op == Load.getValue(1)) {
3457 FoundLoad = true;
3458 // Drop Load, but keep its chain. No cycle check necessary.
3459 ChainOps.push_back(Load.getOperand(0));
3460 continue;
3461 }
3462 LoopWorklist.push_back(Op.getNode());
3463 ChainOps.push_back(Op);
3464 }
3465 }
3466
3467 if (!FoundLoad)
3468 return false;
3469
3470 // Worklist is currently Xn. Add Yn to worklist.
3471 for (SDValue Op : StoredVal->ops())
3472 if (Op.getNode() != LoadNode)
3473 LoopWorklist.push_back(Op.getNode());
3474
3475 // Check (a) if Load is a predecessor to Xn + Yn
3476 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3477 true))
3478 return false;
3479
3480 InputChain =
3481 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3482 return true;
3483}
3484
3485// Change a chain of {load; op; store} of the same value into a simple op
3486// through memory of that value, if the uses of the modified value and its
3487// address are suitable.
3488//
3489// The tablegen memory operand pattern is currently not able to match
3490// the case where the EFLAGS on the original operation are used.
3491//
3492// To move this to tablegen, we'll need to improve tablegen to allow flags to
3493// be transferred from a node in the pattern to the result node, probably with
3494// a new keyword. For example, we have this
3495// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3496// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3497// (implicit EFLAGS)]>;
3498// but maybe need something like this
3499// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3500// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3501// (transferrable EFLAGS)]>;
3502//
3503// Until then, we manually fold these and instruction select the operation
3504// here.
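// For example, a chain that loads an i64 from an address, adds 1 to it, and
// stores the result back to the same address can be selected as a single
// INC64m (provided the carry flag of the add is unused), as matched below.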
3505bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3506 auto *StoreNode = cast<StoreSDNode>(Node);
3507 SDValue StoredVal = StoreNode->getOperand(1);
3508 unsigned Opc = StoredVal->getOpcode();
3509
3510 // Before we try to select anything, make sure this is a memory operand size
3511 // and opcode we can handle. Note that this must match the code below that
3512 // actually lowers the opcodes.
3513 EVT MemVT = StoreNode->getMemoryVT();
3514 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3515 MemVT != MVT::i8)
3516 return false;
3517
3518 bool IsCommutable = false;
3519 bool IsNegate = false;
3520 switch (Opc) {
3521 default:
3522 return false;
3523 case X86ISD::SUB:
3524 IsNegate = isNullConstant(StoredVal.getOperand(0));
3525 break;
3526 case X86ISD::SBB:
3527 break;
3528 case X86ISD::ADD:
3529 case X86ISD::ADC:
3530 case X86ISD::AND:
3531 case X86ISD::OR:
3532 case X86ISD::XOR:
3533 IsCommutable = true;
3534 break;
3535 }
3536
3537 unsigned LoadOpNo = IsNegate ? 1 : 0;
3538 LoadSDNode *LoadNode = nullptr;
3539 SDValue InputChain;
3540 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3541 LoadNode, InputChain)) {
3542 if (!IsCommutable)
3543 return false;
3544
3545 // This operation is commutable, try the other operand.
3546 LoadOpNo = 1;
3547 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3548 LoadNode, InputChain))
3549 return false;
3550 }
3551
3552 SDValue Base, Scale, Index, Disp, Segment;
3553 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3554 Segment))
3555 return false;
3556
3557 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3558 unsigned Opc8) {
3559 switch (MemVT.getSimpleVT().SimpleTy) {
3560 case MVT::i64:
3561 return Opc64;
3562 case MVT::i32:
3563 return Opc32;
3564 case MVT::i16:
3565 return Opc16;
3566 case MVT::i8:
3567 return Opc8;
3568 default:
3569 llvm_unreachable("Invalid size!");
3570 }
3571 };
3572
3573 MachineSDNode *Result;
3574 switch (Opc) {
3575 case X86ISD::SUB:
3576 // Handle negate.
3577 if (IsNegate) {
3578 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3579 X86::NEG8m);
3580 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3581 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3582 MVT::Other, Ops);
3583 break;
3584 }
3585 [[fallthrough]];
3586 case X86ISD::ADD:
3587 // Try to match inc/dec.
3588 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3589 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3590 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3591 // ADD/SUB by 1/-1 can use INC/DEC when the carry flag isn't used.
3592 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3593 unsigned NewOpc =
3594 ((Opc == X86ISD::ADD) == IsOne)
3595 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3596 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3597 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3598 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3599 MVT::Other, Ops);
3600 break;
3601 }
3602 }
3603 [[fallthrough]];
3604 case X86ISD::ADC:
3605 case X86ISD::SBB:
3606 case X86ISD::AND:
3607 case X86ISD::OR:
3608 case X86ISD::XOR: {
3609 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3610 switch (Opc) {
3611 case X86ISD::ADD:
3612 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3613 X86::ADD8mr);
3614 case X86ISD::ADC:
3615 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3616 X86::ADC8mr);
3617 case X86ISD::SUB:
3618 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3619 X86::SUB8mr);
3620 case X86ISD::SBB:
3621 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3622 X86::SBB8mr);
3623 case X86ISD::AND:
3624 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3625 X86::AND8mr);
3626 case X86ISD::OR:
3627 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3628 case X86ISD::XOR:
3629 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3630 X86::XOR8mr);
3631 default:
3632 llvm_unreachable("Invalid opcode!");
3633 }
3634 };
3635 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3636 switch (Opc) {
3637 case X86ISD::ADD:
3638 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3639 X86::ADD8mi);
3640 case X86ISD::ADC:
3641 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3642 X86::ADC8mi);
3643 case X86ISD::SUB:
3644 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3645 X86::SUB8mi);
3646 case X86ISD::SBB:
3647 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3648 X86::SBB8mi);
3649 case X86ISD::AND:
3650 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3651 X86::AND8mi);
3652 case X86ISD::OR:
3653 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3654 X86::OR8mi);
3655 case X86ISD::XOR:
3656 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3657 X86::XOR8mi);
3658 default:
3659 llvm_unreachable("Invalid opcode!");
3660 }
3661 };
3662
3663 unsigned NewOpc = SelectRegOpcode(Opc);
3664 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3665
3666 // See if the operand is a constant that we can fold into an immediate
3667 // operand.
3668 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3669 int64_t OperandV = OperandC->getSExtValue();
3670
3671 // Check if we can shrink the operand enough to fit in an immediate (or
3672 // fit into a smaller immediate) by negating it and switching the
3673 // operation.
3674 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3675 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3676 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3677 isInt<32>(-OperandV))) &&
3678 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3679 OperandV = -OperandV;
3680 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3681 }
3682
3683 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3684 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3685 NewOpc = SelectImmOpcode(Opc);
3686 }
3687 }
3688
3689 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3690 SDValue CopyTo =
3691 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3692 StoredVal.getOperand(2), SDValue());
3693
3694 const SDValue Ops[] = {Base, Scale, Index, Disp,
3695 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3696 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3697 Ops);
3698 } else {
3699 const SDValue Ops[] = {Base, Scale, Index, Disp,
3700 Segment, Operand, InputChain};
3701 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3702 Ops);
3703 }
3704 break;
3705 }
3706 default:
3707 llvm_unreachable("Invalid opcode!");
3708 }
3709
3710 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3711 LoadNode->getMemOperand()};
3712 CurDAG->setNodeMemRefs(Result, MemOps);
3713
3714 // Update Load Chain uses as well.
3715 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3716 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3717 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3718 CurDAG->RemoveDeadNode(Node);
3719 return true;
3720}
3721
3722// See if this is an X & Mask that we can match to BEXTR/BZHI.
3723// Where Mask is one of the following patterns:
3724// a) x & (1 << nbits) - 1
3725// b) x & ~(-1 << nbits)
3726// c) x & (-1 >> (32 - y))
3727// d) x << (32 - y) >> (32 - y)
3728// e) (1 << nbits) - 1
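// For example, with BMI2, `x & ((1 << nbits) - 1)` (pattern a) keeps the low
// nbits bits of x and is selected to BZHI; with only BMI1 the same pattern is
// selected to BEXTR with a starting bit of zero.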
3729bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3730 assert(
3731 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3732 Node->getOpcode() == ISD::SRL) &&
3733 "Should be either an and-mask, or right-shift after clearing high bits.");
3734
3735 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3736 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3737 return false;
3738
3739 MVT NVT = Node->getSimpleValueType(0);
3740
3741 // Only supported for 32 and 64 bits.
3742 if (NVT != MVT::i32 && NVT != MVT::i64)
3743 return false;
3744
3745 SDValue NBits;
3746 bool NegateNBits;
3747
3748 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3749 // Else, if we only have BMI1's BEXTR, we require one-use.
3750 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3751 auto checkUses = [AllowExtraUsesByDefault](
3752 SDValue Op, unsigned NUses,
3753 std::optional<bool> AllowExtraUses) {
3754 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3755 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3756 };
3757 auto checkOneUse = [checkUses](SDValue Op,
3758 std::optional<bool> AllowExtraUses =
3759 std::nullopt) {
3760 return checkUses(Op, 1, AllowExtraUses);
3761 };
3762 auto checkTwoUse = [checkUses](SDValue Op,
3763 std::optional<bool> AllowExtraUses =
3764 std::nullopt) {
3765 return checkUses(Op, 2, AllowExtraUses);
3766 };
3767
3768 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3769 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3770 assert(V.getSimpleValueType() == MVT::i32 &&
3771 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3772 "Expected i64 -> i32 truncation");
3773 V = V.getOperand(0);
3774 }
3775 return V;
3776 };
3777
3778 // a) x & ((1 << nbits) + (-1))
3779 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3780 &NegateNBits](SDValue Mask) -> bool {
3781 // Match `add`. Must only have one use!
3782 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3783 return false;
3784 // We should be adding an all-ones constant (i.e. subtracting one).
3785 if (!isAllOnesConstant(Mask->getOperand(1)))
3786 return false;
3787 // Match `1 << nbits`. Might be truncated. Must only have one use!
3788 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3789 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3790 return false;
3791 if (!isOneConstant(M0->getOperand(0)))
3792 return false;
3793 NBits = M0->getOperand(1);
3794 NegateNBits = false;
3795 return true;
3796 };
3797
3798 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3799 V = peekThroughOneUseTruncation(V);
3800 return CurDAG->MaskedValueIsAllOnes(
3801 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3802 NVT.getSizeInBits()));
3803 };
3804
3805 // b) x & ~(-1 << nbits)
3806 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3807 &NBits, &NegateNBits](SDValue Mask) -> bool {
3808 // Match `~()`. Must only have one use!
3809 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3810 return false;
3811 // The -1 only has to be all-ones for the final Node's NVT.
3812 if (!isAllOnes(Mask->getOperand(1)))
3813 return false;
3814 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3815 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3816 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3817 return false;
3818 // The -1 only has to be all-ones for the final Node's NVT.
3819 if (!isAllOnes(M0->getOperand(0)))
3820 return false;
3821 NBits = M0->getOperand(1);
3822 NegateNBits = false;
3823 return true;
3824 };
3825
3826 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3827 // or leave the shift amount as-is, but then we'll have to negate it.
3828 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3829 unsigned Bitwidth) {
3830 NBits = ShiftAmt;
3831 NegateNBits = true;
3832 // Skip over a truncate of the shift amount, if any.
3833 if (NBits.getOpcode() == ISD::TRUNCATE)
3834 NBits = NBits.getOperand(0);
3835 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3836 // If it doesn't match, that's fine, we'll just negate it ourselves.
3837 if (NBits.getOpcode() != ISD::SUB)
3838 return;
3839 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3840 if (!V0 || V0->getZExtValue() != Bitwidth)
3841 return;
3842 NBits = NBits.getOperand(1);
3843 NegateNBits = false;
3844 };
3845
3846 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3847 // or
3848 // c) x & (-1 >> (32 - y))
3849 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3850 canonicalizeShiftAmt](SDValue Mask) -> bool {
3851 // The mask itself may be truncated.
3852 Mask = peekThroughOneUseTruncation(Mask);
3853 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3854 // Match `l>>`. Must only have one use!
3855 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3856 return false;
3857 // We should be shifting a truly all-ones constant.
3858 if (!isAllOnesConstant(Mask.getOperand(0)))
3859 return false;
3860 SDValue M1 = Mask.getOperand(1);
3861 // The shift amount should not be used externally.
3862 if (!checkOneUse(M1))
3863 return false;
3864 canonicalizeShiftAmt(M1, Bitwidth);
3865 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3866 // is no extra use of the mask. Clearly, there was one since we are here.
3867 // But at the same time, if we need to negate the shift amount,
3868 // then we don't want the mask to stick around, else it's unprofitable.
3869 return !NegateNBits;
3870 };
3871
3872 SDValue X;
3873
3874 // d) x << z >> z but then we'll have to subtract z from bitwidth
3875 // or
3876 // d) x << (32 - y) >> (32 - y)
3877 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3878 AllowExtraUsesByDefault, &NegateNBits,
3879 &X](SDNode *Node) -> bool {
3880 if (Node->getOpcode() != ISD::SRL)
3881 return false;
3882 SDValue N0 = Node->getOperand(0);
3883 if (N0->getOpcode() != ISD::SHL)
3884 return false;
3885 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3886 SDValue N1 = Node->getOperand(1);
3887 SDValue N01 = N0->getOperand(1);
3888 // Both of the shifts must be by the exact same value.
3889 if (N1 != N01)
3890 return false;
3891 canonicalizeShiftAmt(N1, Bitwidth);
3892 // There should not be any external uses of the inner shift / shift amount.
3893 // Note that while we are generally okay with external uses given BMI2,
3894 // iff we need to negate the shift amount, we are not okay with extra uses.
3895 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3896 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3897 return false;
3898 X = N0->getOperand(0);
3899 return true;
3900 };
3901
3902 auto matchLowBitMask = [matchPatternA, matchPatternB,
3903 matchPatternC](SDValue Mask) -> bool {
3904 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3905 };
3906
3907 if (Node->getOpcode() == ISD::AND) {
3908 X = Node->getOperand(0);
3909 SDValue Mask = Node->getOperand(1);
3910
3911 if (matchLowBitMask(Mask)) {
3912 // Great.
3913 } else {
3914 std::swap(X, Mask);
3915 if (!matchLowBitMask(Mask))
3916 return false;
3917 }
3918 } else if (matchLowBitMask(SDValue(Node, 0))) {
3919 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
3920 } else if (!matchPatternD(Node))
3921 return false;
3922
3923 // If we need to negate the shift amount, require BMI2 BZHI support.
3924 // It's just too unprofitable for BMI1 BEXTR.
3925 if (NegateNBits && !Subtarget->hasBMI2())
3926 return false;
3927
3928 SDLoc DL(Node);
3929
3930 // Truncate the shift amount.
3931 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3932 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3933
3934 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3935 // All the other bits are undefined, we do not care about them.
3936 SDValue ImplDef = SDValue(
3937 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3938 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3939
3940 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3941 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3942 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3943 MVT::i32, ImplDef, NBits, SRIdxVal),
3944 0);
3945 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3946
3947 // We might have matched the number of high bits to be cleared,
3948 // but we want the number of low bits to be kept, so negate it.
3949 if (NegateNBits) {
3950 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
3951 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
3952
3953 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
3954 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3955 }
3956
3957 if (Subtarget->hasBMI2()) {
3958 // Great, just emit the BZHI.
3959 if (NVT != MVT::i32) {
3960 // But have to place the bit count into the wide-enough register first.
3961 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3962 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3963 }
3964
3965 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3966 ReplaceNode(Node, Extract.getNode());
3967 SelectCode(Extract.getNode());
3968 return true;
3969 }
3970
3971 // Else, if we do *NOT* have BMI2, let's find out if 'X' is
3972 // *logically* shifted (potentially with a one-use trunc in between),
3973 // and the truncation was the only use of the shift,
3974 // and if so look past the one-use truncation.
3975 {
3976 SDValue RealX = peekThroughOneUseTruncation(X);
3977 // FIXME: only if the shift is one-use?
3978 if (RealX != X && RealX.getOpcode() == ISD::SRL)
3979 X = RealX;
3980 }
3981
3982 MVT XVT = X.getSimpleValueType();
3983
3984 // Else, emitting BEXTR requires one more step.
3985 // The 'control' of BEXTR has the pattern of:
3986 // [15...8 bit][ 7...0 bit] location
3987 // [ bit count][ shift] name
3988 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
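  // For example, a control value of 0x0804 (bit count 8, shift 4) extracts
  // bits [11:4], i.e. computes (x >> 4) & 0xff.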
3989
3990 // Shift NBits left by 8 bits, thus producing 'control'.
3991 // This makes the low 8 bits zero.
3992 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3993 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
3994 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3995 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3996
3997 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3998 // FIXME: only if the shift is one-use?
3999 if (X.getOpcode() == ISD::SRL) {
4000 SDValue ShiftAmt = X.getOperand(1);
4001 X = X.getOperand(0);
4002
4003 assert(ShiftAmt.getValueType() == MVT::i8 &&
4004 "Expected shift amount to be i8");
4005
4006 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4007 // We could zext to i16 in some form, but we intentionally don't do that.
4008 SDValue OrigShiftAmt = ShiftAmt;
4009 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4010 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4011
4012 // And now 'or' these low 8 bits of shift amount into the 'control'.
4013 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4014 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4015 }
4016
4017 // But have to place the 'control' into the wide-enough register first.
4018 if (XVT != MVT::i32) {
4019 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4020 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4021 }
4022
4023 // And finally, form the BEXTR itself.
4024 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4025
4026 // The 'X' was originally truncated. Do that now.
4027 if (XVT != NVT) {
4028 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4029 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4030 }
4031
4032 ReplaceNode(Node, Extract.getNode());
4033 SelectCode(Extract.getNode());
4034
4035 return true;
4036}
4037
4038// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4039MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4040 MVT NVT = Node->getSimpleValueType(0);
4041 SDLoc dl(Node);
4042
4043 SDValue N0 = Node->getOperand(0);
4044 SDValue N1 = Node->getOperand(1);
4045
4046 // If we have TBM we can use an immediate for the control. If we have BMI
4047 // we should only do this if the BEXTR instruction is implemented well.
4048 // Otherwise moving the control into a register makes this more costly.
4049 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4050 // hoisting the move immediate would make it worthwhile with a less optimal
4051 // BEXTR?
4052 bool PreferBEXTR =
4053 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4054 if (!PreferBEXTR && !Subtarget->hasBMI2())
4055 return nullptr;
4056
4057 // Must have a shift right.
4058 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4059 return nullptr;
4060
4061 // Shift can't have additional users.
4062 if (!N0->hasOneUse())
4063 return nullptr;
4064
4065 // Only supported for 32 and 64 bits.
4066 if (NVT != MVT::i32 && NVT != MVT::i64)
4067 return nullptr;
4068
4069 // Shift amount and RHS of and must be constant.
4070 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4071 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4072 if (!MaskCst || !ShiftCst)
4073 return nullptr;
4074
4075 // And RHS must be a mask.
4076 uint64_t Mask = MaskCst->getZExtValue();
4077 if (!isMask_64(Mask))
4078 return nullptr;
4079
4080 uint64_t Shift = ShiftCst->getZExtValue();
4081 uint64_t MaskSize = llvm::popcount(Mask);
4082
4083 // Don't interfere with something that can be handled by extracting AH.
4084 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4085 if (Shift == 8 && MaskSize == 8)
4086 return nullptr;
4087
4088 // Make sure we are only using bits that were in the original value, not
4089 // shifted in.
4090 if (Shift + MaskSize > NVT.getSizeInBits())
4091 return nullptr;
4092
4093 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4094 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4095 // does not fit into 32 bits. Load folding is not a sufficient reason.
4096 if (!PreferBEXTR && MaskSize <= 32)
4097 return nullptr;
4098
4099 SDValue Control;
4100 unsigned ROpc, MOpc;
4101
4102#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4103 if (!PreferBEXTR) {
4104 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4105 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4106 // Let's perform the mask first, and apply shift later. Note that we need to
4107 // widen the mask to account for the fact that we'll apply shift afterwards!
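    // For example (illustrative, i64): for (x >> 8) & ((1ULL << 37) - 1) we
    // have Shift = 8 and MaskSize = 37, so BZHI keeps the low 45 bits and the
    // SHR by 8 is emitted afterwards (see below).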
4108 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4109 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4110 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4111 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4112 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4113 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4114 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4115 } else {
4116 // The 'control' of BEXTR has the pattern of:
4117 // [15...8 bit][ 7...0 bit] location
4118 // [ bit count][ shift] name
4119 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
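    // For example, (x >> 4) & 0x3f gives Shift = 4 and MaskSize = 6, so
    // Control = 0x0604.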
4120 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4121 if (Subtarget->hasTBM()) {
4122 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4123 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4124 } else {
4125 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4126 // BMI requires the immediate to be placed in a register.
4127 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4128 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4129 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4130 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4131 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4132 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4133 }
4134 }
4135
4136 MachineSDNode *NewNode;
4137 SDValue Input = N0->getOperand(0);
4138 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4139 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4140 SDValue Ops[] = {
4141 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4142 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4143 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4144 // Update the chain.
4145 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4146 // Record the mem-refs
4147 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4148 } else {
4149 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4150 }
4151
4152 if (!PreferBEXTR) {
4153 // We still need to apply the shift.
4154 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4155 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4156 : GET_ND_IF_ENABLED(X86::SHR32ri);
4157 NewNode =
4158 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4159 }
4160
4161 return NewNode;
4162}
4163
4164// Emit a PCMPISTR(I/M) instruction.
4165MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4166 bool MayFoldLoad, const SDLoc &dl,
4167 MVT VT, SDNode *Node) {
4168 SDValue N0 = Node->getOperand(0);
4169 SDValue N1 = Node->getOperand(1);
4170 SDValue Imm = Node->getOperand(2);
4171 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4172 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4173
4174 // Try to fold a load. No need to check alignment.
4175 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4176 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4177 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4178 N1.getOperand(0) };
4179 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4180 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4181 // Update the chain.
4182 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4183 // Record the mem-refs
4184 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4185 return CNode;
4186 }
4187
4188 SDValue Ops[] = { N0, N1, Imm };
4189 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4190 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4191 return CNode;
4192}
4193
4194// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4195// to emit a second instruction after this one. This is needed since we have two
4196// copyToReg nodes glued before this and we need to continue that glue through.
4197MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4198 bool MayFoldLoad, const SDLoc &dl,
4199 MVT VT, SDNode *Node,
4200 SDValue &InGlue) {
4201 SDValue N0 = Node->getOperand(0);
4202 SDValue N2 = Node->getOperand(2);
4203 SDValue Imm = Node->getOperand(4);
4204 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4205 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4206
4207 // Try to fold a load. No need to check alignment.
4208 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4209 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4210 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4211 N2.getOperand(0), InGlue };
4212 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4213 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4214 InGlue = SDValue(CNode, 3);
4215 // Update the chain.
4216 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4217 // Record the mem-refs
4218 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4219 return CNode;
4220 }
4221
4222 SDValue Ops[] = { N0, N2, Imm, InGlue };
4223 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4224 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4225 InGlue = SDValue(CNode, 2);
4226 return CNode;
4227}
4228
4229bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4230 EVT VT = N->getValueType(0);
4231
4232 // Only handle scalar shifts.
4233 if (VT.isVector())
4234 return false;
4235
4236 // Narrower shifts only mask to 5 bits in hardware.
4237 unsigned Size = VT == MVT::i64 ? 64 : 32;
4238
4239 SDValue OrigShiftAmt = N->getOperand(1);
4240 SDValue ShiftAmt = OrigShiftAmt;
4241 SDLoc DL(N);
4242
4243 // Skip over a truncate of the shift amount.
4244 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4245 ShiftAmt = ShiftAmt->getOperand(0);
4246
4247 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4248 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4249
4250 SDValue NewShiftAmt;
4251 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4252 ShiftAmt->getOpcode() == ISD::XOR) {
4253 SDValue Add0 = ShiftAmt->getOperand(0);
4254 SDValue Add1 = ShiftAmt->getOperand(1);
4255 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4256 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4257 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4258 // to avoid the ADD/SUB/XOR.
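    // For example, an i32 shift by (amt + 32) is equivalent to a shift by amt,
    // since the hardware only looks at the low 5 bits of the amount.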
4259 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4260 NewShiftAmt = Add0;
4261
4262 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4263 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4264 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4265 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4266 // we can replace it with a NOT. In the XOR case it may save some code
4267 // size, in the SUB case it also may save a move.
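      // For example, for an i32 shift, an amount of (31 - amt) or (31 ^ amt)
      // can be replaced by (~amt): only the low 5 bits of the amount matter,
      // and they agree.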
4268 assert(Add0C == nullptr || Add1C == nullptr);
4269
4270 // We can only do N-X, not X-N
4271 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4272 return false;
4273
4274 EVT OpVT = ShiftAmt.getValueType();
4275
4276 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4277 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4278 Add0C == nullptr ? Add0 : Add1, AllOnes);
4279 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4280 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4281 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4282 // -X to generate a NEG instead of a SUB of a constant.
4283 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4284 Add0C->getZExtValue() != 0) {
4285 EVT SubVT = ShiftAmt.getValueType();
4286 SDValue X;
4287 if (Add0C->getZExtValue() % Size == 0)
4288 X = Add1;
4289 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4290 Add0C->getZExtValue() % 32 == 0) {
4291 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4292 // This is mainly beneficial if we already compute (x+n*32).
4293 if (Add1.getOpcode() == ISD::TRUNCATE) {
4294 Add1 = Add1.getOperand(0);
4295 SubVT = Add1.getValueType();
4296 }
4297 if (Add0.getValueType() != SubVT) {
4298 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4299 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4300 }
4301
4302 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4303 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4304 } else
4305 return false;
4306 // Insert a negate op.
4307 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4308 // that uses it that's not a shift.
4309 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4310 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4311 NewShiftAmt = Neg;
4312
4313 // Insert these operands into a valid topological order so they can
4314 // get selected independently.
4315 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4316 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4317 } else
4318 return false;
4319 } else
4320 return false;
4321
4322 if (NewShiftAmt.getValueType() != MVT::i8) {
4323 // Need to truncate the shift amount.
4324 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4325 // Add to a correct topological ordering.
4326 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4327 }
4328
4329 // Insert a new mask to keep the shift amount legal. This should be removed
4330 // by isel patterns.
4331 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4332 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4333 // Place in a correct topological ordering.
4334 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4335
4336 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4337 NewShiftAmt);
4338 if (UpdatedNode != N) {
4339 // If we found an existing node, we should replace ourselves with that node
4340 // and wait for it to be selected after its other users.
4341 ReplaceNode(N, UpdatedNode);
4342 return true;
4343 }
4344
4345 // If the original shift amount is now dead, delete it so that we don't run
4346 // it through isel.
4347 if (OrigShiftAmt.getNode()->use_empty())
4348 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4349
4350 // Now that we've optimized the shift amount, defer to normal isel to get
4351 // load folding and legacy vs BMI2 selection without repeating it here.
4352 SelectCode(N);
4353 return true;
4354}
4355
4356bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4357 MVT NVT = N->getSimpleValueType(0);
4358 unsigned Opcode = N->getOpcode();
4359 SDLoc dl(N);
4360
4361 // For operations of the form (x << C1) op C2, check if we can use a smaller
4362 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
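  // For example, ((x << 8) | 0x1200) can become ((x | 0x12) << 8), letting the
  // OR use an 8-bit immediate instead of a 32-bit one.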
4363 SDValue Shift = N->getOperand(0);
4364 SDValue N1 = N->getOperand(1);
4365
4366 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4367 if (!Cst)
4368 return false;
4369
4370 int64_t Val = Cst->getSExtValue();
4371
4372 // If we have an any_extend feeding the AND, look through it to see if there
4373 // is a shift behind it. But only if the AND doesn't use the extended bits.
4374 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4375 bool FoundAnyExtend = false;
4376 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4377 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4378 isUInt<32>(Val)) {
4379 FoundAnyExtend = true;
4380 Shift = Shift.getOperand(0);
4381 }
4382
4383 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4384 return false;
4385
4386 // i8 is unshrinkable, i16 should be promoted to i32.
4387 if (NVT != MVT::i32 && NVT != MVT::i64)
4388 return false;
4389
4390 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4391 if (!ShlCst)
4392 return false;
4393
4394 uint64_t ShAmt = ShlCst->getZExtValue();
4395
4396 // Make sure that we don't change the operation by removing bits.
4397 // This only matters for OR and XOR, AND is unaffected.
4398 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4399 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4400 return false;
4401
4402 // Check the minimum bitwidth for the new constant.
4403 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4404 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4405 if (Opcode == ISD::AND) {
4406 // AND32ri is the same as AND64ri32 with zext imm.
4407 // Try this before sign extended immediates below.
4408 ShiftedVal = (uint64_t)Val >> ShAmt;
4409 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4410 return true;
4411 // Also swap order when the AND can become MOVZX.
4412 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4413 return true;
4414 }
4415 ShiftedVal = Val >> ShAmt;
4416 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4417 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4418 return true;
4419 if (Opcode != ISD::AND) {
4420 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4421 ShiftedVal = (uint64_t)Val >> ShAmt;
4422 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4423 return true;
4424 }
4425 return false;
4426 };
4427
4428 int64_t ShiftedVal;
4429 if (!CanShrinkImmediate(ShiftedVal))
4430 return false;
4431
4432 // Ok, we can reorder to get a smaller immediate.
4433
4434 // But, it's possible the original immediate allowed an AND to become MOVZX.
4435 // Doing this late to defer the MaskedValueIsZero call as long as
4436 // possible.
4437 if (Opcode == ISD::AND) {
4438 // Find the smallest zext this could possibly be.
4439 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4440 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4441
4442 // Figure out which bits need to be zero to achieve that mask.
4443 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4444 ZExtWidth);
4445 NeededMask &= ~Cst->getAPIntValue();
4446
4447 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4448 return false;
4449 }
4450
4451 SDValue X = Shift.getOperand(0);
4452 if (FoundAnyExtend) {
4453 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4454 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4455 X = NewX;
4456 }
4457
4458 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
4459 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4460 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4461 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4462 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4463 Shift.getOperand(1));
4464 ReplaceNode(N, NewSHL.getNode());
4465 SelectCode(NewSHL.getNode());
4466 return true;
4467}
4468
4469bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4470 SDNode *ParentB, SDNode *ParentC,
4471 SDValue A, SDValue B, SDValue C,
4472 uint8_t Imm) {
4473 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4474 C.isOperandOf(ParentC) && "Incorrect parent node");
4475
4476 auto tryFoldLoadOrBCast =
4477 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4478 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4479 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4480 return true;
4481
4482 // Not a load, check for broadcast which may be behind a bitcast.
4483 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4484 P = L.getNode();
4485 L = L.getOperand(0);
4486 }
4487
4488 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4489 return false;
4490
4491 // Only 32 and 64 bit broadcasts are supported.
4492 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4493 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4494 if (Size != 32 && Size != 64)
4495 return false;
4496
4497 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4498 };
4499
4500 bool FoldedLoad = false;
4501 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4502 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4503 FoldedLoad = true;
4504 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4505 Tmp4)) {
4506 FoldedLoad = true;
4507 std::swap(A, C);
4508 // Swap bits 1/4 and 3/6.
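    // Swapping A and C corresponds to swapping truth-table inputs A and C;
    // e.g. an Imm of 0xf0 (select A) becomes 0xaa (select C).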
4509 uint8_t OldImm = Imm;
4510 Imm = OldImm & 0xa5;
4511 if (OldImm & 0x02) Imm |= 0x10;
4512 if (OldImm & 0x10) Imm |= 0x02;
4513 if (OldImm & 0x08) Imm |= 0x40;
4514 if (OldImm & 0x40) Imm |= 0x08;
4515 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4516 Tmp4)) {
4517 FoldedLoad = true;
4518 std::swap(B, C);
4519 // Swap bits 1/2 and 5/6.
4520 uint8_t OldImm = Imm;
4521 Imm = OldImm & 0x99;
4522 if (OldImm & 0x02) Imm |= 0x04;
4523 if (OldImm & 0x04) Imm |= 0x02;
4524 if (OldImm & 0x20) Imm |= 0x40;
4525 if (OldImm & 0x40) Imm |= 0x20;
4526 }
4527
4528 SDLoc DL(Root);
4529
4530 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4531
4532 MVT NVT = Root->getSimpleValueType(0);
4533
4534 MachineSDNode *MNode;
4535 if (FoldedLoad) {
4536 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4537
4538 unsigned Opc;
4539 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4540 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4541 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4542 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4543
4544 bool UseD = EltSize == 32;
4545 if (NVT.is128BitVector())
4546 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4547 else if (NVT.is256BitVector())
4548 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4549 else if (NVT.is512BitVector())
4550 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4551 else
4552 llvm_unreachable("Unexpected vector size!");
4553 } else {
4554 bool UseD = NVT.getVectorElementType() == MVT::i32;
4555 if (NVT.is128BitVector())
4556 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4557 else if (NVT.is256BitVector())
4558 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4559 else if (NVT.is512BitVector())
4560 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4561 else
4562 llvm_unreachable("Unexpected vector size!");
4563 }
4564
4565 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4566 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4567
4568 // Update the chain.
4569 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4570 // Record the mem-refs
4571 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4572 } else {
4573 bool UseD = NVT.getVectorElementType() == MVT::i32;
4574 unsigned Opc;
4575 if (NVT.is128BitVector())
4576 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4577 else if (NVT.is256BitVector())
4578 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4579 else if (NVT.is512BitVector())
4580 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4581 else
4582 llvm_unreachable("Unexpected vector size!");
4583
4584 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4585 }
4586
4587 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4588 CurDAG->RemoveDeadNode(Root);
4589 return true;
4590}
4591
4592// Try to match two logic ops to a VPTERNLOG.
4593// FIXME: Handle more complex patterns that use an operand more than once?
4594bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4595 MVT NVT = N->getSimpleValueType(0);
4596
4597 // Make sure we support VPTERNLOG.
4598 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4599 NVT.getVectorElementType() == MVT::i1)
4600 return false;
4601
4602 // We need VLX for 128/256-bit.
4603 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4604 return false;
4605
4606 SDValue N0 = N->getOperand(0);
4607 SDValue N1 = N->getOperand(1);
4608
4609 auto getFoldableLogicOp = [](SDValue Op) {
4610 // Peek through single use bitcast.
4611 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4612 Op = Op.getOperand(0);
4613
4614 if (!Op.hasOneUse())
4615 return SDValue();
4616
4617 unsigned Opc = Op.getOpcode();
4618 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4619 Opc == X86ISD::ANDNP)
4620 return Op;
4621
4622 return SDValue();
4623 };
4624
4625 SDValue A, FoldableOp;
4626 if ((FoldableOp = getFoldableLogicOp(N1))) {
4627 A = N0;
4628 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4629 A = N1;
4630 } else
4631 return false;
4632
4633 SDValue B = FoldableOp.getOperand(0);
4634 SDValue C = FoldableOp.getOperand(1);
4635 SDNode *ParentA = N;
4636 SDNode *ParentB = FoldableOp.getNode();
4637 SDNode *ParentC = FoldableOp.getNode();
4638
4639 // We can build the appropriate control immediate by performing the logic
4640 // operation we're matching using these constants for A, B, and C.
4641 uint8_t TernlogMagicA = 0xf0;
4642 uint8_t TernlogMagicB = 0xcc;
4643 uint8_t TernlogMagicC = 0xaa;
4644
4645 // Some of the inputs may be inverted, peek through them and invert the
4646 // magic values accordingly.
4647 // TODO: There may be a bitcast before the xor that we should peek through.
4648 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4649 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4650 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4651 Magic = ~Magic;
4652 Parent = Op.getNode();
4653 Op = Op.getOperand(0);
4654 }
4655 };
4656
4657 PeekThroughNot(A, ParentA, TernlogMagicA);
4658 PeekThroughNot(B, ParentB, TernlogMagicB);
4659 PeekThroughNot(C, ParentC, TernlogMagicC);
4660
4661 uint8_t Imm;
4662 switch (FoldableOp.getOpcode()) {
4663 default: llvm_unreachable("Unexpected opcode!");
4664 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4665 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4666 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4667 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4668 }
4669
4670 switch (N->getOpcode()) {
4671 default: llvm_unreachable("Unexpected opcode!");
4672 case X86ISD::ANDNP:
4673 if (A == N0)
4674 Imm &= ~TernlogMagicA;
4675 else
4676 Imm = ~(Imm) & TernlogMagicA;
4677 break;
4678 case ISD::AND: Imm &= TernlogMagicA; break;
4679 case ISD::OR: Imm |= TernlogMagicA; break;
4680 case ISD::XOR: Imm ^= TernlogMagicA; break;
4681 }
4682
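  // For illustration: matching (xor A, (and B, C)) first sets
  // Imm = TernlogMagicB & TernlogMagicC = 0xcc & 0xaa = 0x88 for the inner
  // AND, and the outer XOR then applies Imm ^= 0xf0, giving 0x78. Had an
  // operand been wrapped in a NOT, PeekThroughNot above would have flipped
  // its magic value first (e.g. ~0xcc = 0x33 for B).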
4683 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4684}
4685
4686/// If the high bits of an 'and' operand are known zero, try setting the
4687/// high bits of an 'and' constant operand to produce a smaller encoding by
4688/// creating a small, sign-extended negative immediate rather than a large
4689/// positive one. This reverses a transform in SimplifyDemandedBits that
4690/// shrinks mask constants by clearing bits. There is also a possibility that
4691/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4692/// case, just replace the 'and'. Return 'true' if the node is replaced.
4693bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4694 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4695 // have immediate operands.
4696 MVT VT = And->getSimpleValueType(0);
4697 if (VT != MVT::i32 && VT != MVT::i64)
4698 return false;
4699
4700 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4701 if (!And1C)
4702 return false;
4703
4704  // Bail out if the mask constant is already negative. It can't shrink any more.
4705 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4706 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4707 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4708 // are negative too.
4709 APInt MaskVal = And1C->getAPIntValue();
4710 unsigned MaskLZ = MaskVal.countl_zero();
4711 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4712 return false;
4713
4714 // Don't extend into the upper 32 bits of a 64 bit mask.
4715 if (VT == MVT::i64 && MaskLZ >= 32) {
4716 MaskLZ -= 32;
4717 MaskVal = MaskVal.trunc(32);
4718 }
4719
4720 SDValue And0 = And->getOperand(0);
4721 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4722 APInt NegMaskVal = MaskVal | HighZeros;
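  // For illustration: a 32-bit mask of 0x0FFFFFF0 has MaskLZ == 4, so
  // HighZeros is 0xF0000000 and NegMaskVal becomes 0xFFFFFFF0 (-16), which
  // sign-extends from 8 bits and can use an imm8 encoding instead of imm32,
  // provided the other operand is known zero in those top four bits.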
4723
4724 // If a negative constant would not allow a smaller encoding, there's no need
4725 // to continue. Only change the constant when we know it's a win.
4726 unsigned MinWidth = NegMaskVal.getSignificantBits();
4727 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4728 return false;
4729
4730 // Extend masks if we truncated above.
4731 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4732 NegMaskVal = NegMaskVal.zext(64);
4733 HighZeros = HighZeros.zext(64);
4734 }
4735
4736 // The variable operand must be all zeros in the top bits to allow using the
4737 // new, negative constant as the mask.
4738 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
4739 return false;
4740
4741 // Check if the mask is -1. In that case, this is an unnecessary instruction
4742 // that escaped earlier analysis.
4743 if (NegMaskVal.isAllOnes()) {
4744 ReplaceNode(And, And0.getNode());
4745 return true;
4746 }
4747
4748 // A negative mask allows a smaller encoding. Create a new 'and' node.
4749 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4750 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4751 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4752 ReplaceNode(And, NewAnd.getNode());
4753 SelectCode(NewAnd.getNode());
4754 return true;
4755}
4756
4757static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4758 bool FoldedBCast, bool Masked) {
4759#define VPTESTM_CASE(VT, SUFFIX) \
4760case MVT::VT: \
4761 if (Masked) \
4762 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4763 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4764
4765
4766#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4767default: llvm_unreachable("Unexpected VT!"); \
4768VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4769VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4770VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4771VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4772VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4773VPTESTM_CASE(v8i64, QZ##SUFFIX)
4774
4775#define VPTESTM_FULL_CASES(SUFFIX) \
4776VPTESTM_BROADCAST_CASES(SUFFIX) \
4777VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4778VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4779VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4780VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4781VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4782VPTESTM_CASE(v32i16, WZ##SUFFIX)
4783
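// For illustration of the macro expansion: with Masked == false and
// IsTestN == false, the v4i32 entry produced by VPTESTM_FULL_CASES(rr)
// returns X86::VPTESTMDZ128rr, while the broadcast table built with suffix
// "rmb" returns X86::VPTESTMDZ128rmb for the same type.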
4784 if (FoldedBCast) {
4785 switch (TestVT.SimpleTy) {
4786    VPTESTM_BROADCAST_CASES(rmb)
4787    }
4788 }
4789
4790 if (FoldedLoad) {
4791 switch (TestVT.SimpleTy) {
4792    VPTESTM_FULL_CASES(rm)
4793    }
4794 }
4795
4796 switch (TestVT.SimpleTy) {
4797  VPTESTM_FULL_CASES(rr)
4798  }
4799
4800#undef VPTESTM_FULL_CASES
4801#undef VPTESTM_BROADCAST_CASES
4802#undef VPTESTM_CASE
4803}
4804
4805// Try to create VPTESTM instruction. If InMask is not null, it will be used
4806// to form a masked operation.
4807bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4808 SDValue InMask) {
4809 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4810 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4811 "Unexpected VT!");
4812
4813 // Look for equal and not equal compares.
4814 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4815 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4816 return false;
4817
4818 SDValue SetccOp0 = Setcc.getOperand(0);
4819 SDValue SetccOp1 = Setcc.getOperand(1);
4820
4821 // Canonicalize the all zero vector to the RHS.
4822 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4823 std::swap(SetccOp0, SetccOp1);
4824
4825 // See if we're comparing against zero.
4826 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4827 return false;
4828
4829 SDValue N0 = SetccOp0;
4830
4831 MVT CmpVT = N0.getSimpleValueType();
4832 MVT CmpSVT = CmpVT.getVectorElementType();
4833
4834 // Start with both operands the same. We'll try to refine this.
4835 SDValue Src0 = N0;
4836 SDValue Src1 = N0;
4837
4838 {
4839 // Look through single use bitcasts.
4840 SDValue N0Temp = N0;
4841 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4842 N0Temp = N0.getOperand(0);
4843
4844 // Look for single use AND.
4845 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4846 Src0 = N0Temp.getOperand(0);
4847 Src1 = N0Temp.getOperand(1);
4848 }
4849 }
4850
4851 // Without VLX we need to widen the operation.
4852 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4853
4854 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4855 SDValue &Base, SDValue &Scale, SDValue &Index,
4856 SDValue &Disp, SDValue &Segment) {
4857 // If we need to widen, we can't fold the load.
4858 if (!Widen)
4859 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4860 return true;
4861
4862    // If we didn't fold a load, try to match a broadcast. There is no widening
4863    // limitation for this, but only 32- and 64-bit element types are supported.
4864 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4865 return false;
4866
4867 // Look through single use bitcasts.
4868 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4869 P = L.getNode();
4870 L = L.getOperand(0);
4871 }
4872
4873 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4874 return false;
4875
4876 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4877 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4878 return false;
4879
4880 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4881 };
4882
4883 // We can only fold loads if the sources are unique.
4884 bool CanFoldLoads = Src0 != Src1;
4885
4886 bool FoldedLoad = false;
4887 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4888 if (CanFoldLoads) {
4889 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4890 Tmp3, Tmp4);
4891 if (!FoldedLoad) {
4892 // And is commutative.
4893 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4894 Tmp2, Tmp3, Tmp4);
4895 if (FoldedLoad)
4896 std::swap(Src0, Src1);
4897 }
4898 }
4899
4900 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4901
4902 bool IsMasked = InMask.getNode() != nullptr;
4903
4904 SDLoc dl(Root);
4905
4906 MVT ResVT = Setcc.getSimpleValueType();
4907 MVT MaskVT = ResVT;
4908 if (Widen) {
4909 // Widen the inputs using insert_subreg or copy_to_regclass.
4910 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4911 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4912 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4913 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4914 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4915 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4916 CmpVT), 0);
4917 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4918
4919 if (!FoldedBCast)
4920 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4921
4922 if (IsMasked) {
4923 // Widen the mask.
4924 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
4925 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4926 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4927 dl, MaskVT, InMask, RC), 0);
4928 }
4929 }
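  // For illustration: without VLX, a v4i32 compare is widened here to v16i32
  // (Src0, and Src1 unless a broadcast was folded, are inserted via sub_xmm
  // into an IMPLICIT_DEF register) and MaskVT becomes v16i1; the
  // COPY_TO_REGCLASS further below shrinks the produced mask back to the
  // original v4i1 result type.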
4930
4931 bool IsTestN = CC == ISD::SETEQ;
4932 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4933 IsMasked);
4934
4935 MachineSDNode *CNode;
4936 if (FoldedLoad) {
4937 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4938
4939 if (IsMasked) {
4940 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4941 Src1.getOperand(0) };
4942 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4943 } else {
4944 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4945 Src1.getOperand(0) };
4946 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4947 }
4948
4949 // Update the chain.
4950 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
4951 // Record the mem-refs
4952 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
4953 } else {
4954 if (IsMasked)
4955 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4956 else
4957 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4958 }
4959
4960 // If we widened, we need to shrink the mask VT.
4961 if (Widen) {
4962 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
4963 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4964 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4965 dl, ResVT, SDValue(CNode, 0), RC);
4966 }
4967
4968 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4969 CurDAG->RemoveDeadNode(Root);
4970 return true;
4971}
4972
4973// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4974// into vpternlog.
4975bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4976 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4977
4978 MVT NVT = N->getSimpleValueType(0);
4979
4980 // Make sure we support VPTERNLOG.
4981 if (!NVT.isVector() || !Subtarget->hasAVX512())
4982 return false;
4983
4984 // We need VLX for 128/256-bit.
4985 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4986 return false;
4987
4988 SDValue N0 = N->getOperand(0);
4989 SDValue N1 = N->getOperand(1);
4990
4991 // Canonicalize AND to LHS.
4992 if (N1.getOpcode() == ISD::AND)
4993 std::swap(N0, N1);
4994
4995 if (N0.getOpcode() != ISD::AND ||
4996 N1.getOpcode() != X86ISD::ANDNP ||
4997 !N0.hasOneUse() || !N1.hasOneUse())
4998 return false;
4999
5000  // ANDN is not commutable, so use it to pin down A and C.
5001 SDValue A = N1.getOperand(0);
5002 SDValue C = N1.getOperand(1);
5003
5004 // AND is commutable, if one operand matches A, the other operand is B.
5005 // Otherwise this isn't a match.
5006 SDValue B;
5007 if (N0.getOperand(0) == A)
5008 B = N0.getOperand(1);
5009 else if (N0.getOperand(1) == A)
5010 B = N0.getOperand(0);
5011 else
5012 return false;
5013
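  // A worked derivation of the 0xCA immediate used below: with the usual
  // ternlog constants A = 0xf0, B = 0xcc, C = 0xaa, the bitselect
  // (A & B) | (~A & C) evaluates to (0xf0 & 0xcc) | (0x0f & 0xaa)
  // = 0xc0 | 0x0a = 0xca.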
5014 SDLoc dl(N);
5015 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5016 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5017 ReplaceNode(N, Ternlog.getNode());
5018
5019 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5020 Ternlog.getNode(), A, B, C, 0xCA);
5021}
5022
5023void X86DAGToDAGISel::Select(SDNode *Node) {
5024 MVT NVT = Node->getSimpleValueType(0);
5025 unsigned Opcode = Node->getOpcode();
5026 SDLoc dl(Node);
5027
5028 if (Node->isMachineOpcode()) {
5029 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5030 Node->setNodeId(-1);
5031 return; // Already selected.
5032 }
5033
5034 switch (Opcode) {
5035 default: break;
5036  case ISD::INTRINSIC_W_CHAIN: {
5037    unsigned IntNo = Node->getConstantOperandVal(1);
5038 switch (IntNo) {
5039 default: break;
5040 case Intrinsic::x86_encodekey128:
5041 case Intrinsic::x86_encodekey256: {
5042 if (!Subtarget->hasKL())
5043 break;
5044
5045 unsigned Opcode;
5046 switch (IntNo) {
5047 default: llvm_unreachable("Impossible intrinsic");
5048 case Intrinsic::x86_encodekey128:
5049 Opcode = GET_EGPR_IF_ENABLED(X86::ENCODEKEY128);
5050 break;
5051 case Intrinsic::x86_encodekey256:
5052 Opcode = GET_EGPR_IF_ENABLED(X86::ENCODEKEY256);
5053 break;
5054 }
5055
5056 SDValue Chain = Node->getOperand(0);
5057 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5058 SDValue());
5059 if (Opcode == X86::ENCODEKEY256 || Opcode == X86::ENCODEKEY256_EVEX)
5060 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5061 Chain.getValue(1));
5062
5063 MachineSDNode *Res = CurDAG->getMachineNode(
5064 Opcode, dl, Node->getVTList(),
5065 {Node->getOperand(2), Chain, Chain.getValue(1)});
5066 ReplaceNode(Node, Res);
5067 return;
5068 }
5069 case Intrinsic::x86_tileloadd64_internal:
5070 case Intrinsic::x86_tileloaddt164_internal: {
5071 if (!Subtarget->hasAMXTILE())
5072 break;
5073 unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
5074 ? X86::PTILELOADDV
5075 : X86::PTILELOADDT1V;
5076 // _tile_loadd_internal(row, col, buf, STRIDE)
5077 SDValue Base = Node->getOperand(4);
5078 SDValue Scale = getI8Imm(1, dl);
5079 SDValue Index = Node->getOperand(5);
5080 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5081 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5082 SDValue Chain = Node->getOperand(0);
5083 MachineSDNode *CNode;
5084 SDValue Ops[] = {Node->getOperand(2),
5085 Node->getOperand(3),
5086 Base,
5087 Scale,
5088 Index,
5089 Disp,
5090 Segment,
5091 Chain};
5092 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5093 ReplaceNode(Node, CNode);
5094 return;
5095 }
5096 }
5097 break;
5098 }
5099 case ISD::INTRINSIC_VOID: {
5100 unsigned IntNo = Node->getConstantOperandVal(1);
5101 switch (IntNo) {
5102 default: break;
5103 case Intrinsic::x86_sse3_monitor:
5104 case Intrinsic::x86_monitorx:
5105 case Intrinsic::x86_clzero: {
5106 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5107
5108 unsigned Opc = 0;
5109 switch (IntNo) {
5110 default: llvm_unreachable("Unexpected intrinsic!");
5111 case Intrinsic::x86_sse3_monitor:
5112 if (!Subtarget->hasSSE3())
5113 break;
5114 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5115 break;
5116 case Intrinsic::x86_monitorx:
5117 if (!Subtarget->hasMWAITX())
5118 break;
5119 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5120 break;
5121 case Intrinsic::x86_clzero:
5122 if (!Subtarget->hasCLZERO())
5123 break;
5124 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5125 break;
5126 }
5127
5128 if (Opc) {
5129 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5130 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5131 Node->getOperand(2), SDValue());
5132 SDValue InGlue = Chain.getValue(1);
5133
5134 if (IntNo == Intrinsic::x86_sse3_monitor ||
5135 IntNo == Intrinsic::x86_monitorx) {
5136 // Copy the other two operands to ECX and EDX.
5137 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5138 InGlue);
5139 InGlue = Chain.getValue(1);
5140 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5141 InGlue);
5142 InGlue = Chain.getValue(1);
5143 }
5144
5145 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5146 { Chain, InGlue});
5147 ReplaceNode(Node, CNode);
5148 return;
5149 }
5150
5151 break;
5152 }
5153 case Intrinsic::x86_tilestored64_internal: {
5154 unsigned Opc = X86::PTILESTOREDV;
5155 // _tile_stored_internal(row, col, buf, STRIDE, c)
5156 SDValue Base = Node->getOperand(4);
5157 SDValue Scale = getI8Imm(1, dl);
5158 SDValue Index = Node->getOperand(5);
5159 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5160 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5161 SDValue Chain = Node->getOperand(0);
5162 MachineSDNode *CNode;
5163 SDValue Ops[] = {Node->getOperand(2),
5164 Node->getOperand(3),
5165 Base,
5166 Scale,
5167 Index,
5168 Disp,
5169 Segment,
5170 Node->getOperand(6),
5171 Chain};
5172 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5173 ReplaceNode(Node, CNode);
5174 return;
5175 }
5176 case Intrinsic::x86_tileloadd64:
5177 case Intrinsic::x86_tileloaddt164:
5178 case Intrinsic::x86_tilestored64: {
5179 if (!Subtarget->hasAMXTILE())
5180 break;
5181 unsigned Opc;
5182 switch (IntNo) {
5183 default: llvm_unreachable("Unexpected intrinsic!");
5184 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5185 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5186 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5187 }
5188 // FIXME: Match displacement and scale.
5189 unsigned TIndex = Node->getConstantOperandVal(2);
5190 SDValue TReg = getI8Imm(TIndex, dl);
5191 SDValue Base = Node->getOperand(3);
5192 SDValue Scale = getI8Imm(1, dl);
5193 SDValue Index = Node->getOperand(4);
5194 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5195 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5196 SDValue Chain = Node->getOperand(0);
5197 MachineSDNode *CNode;
5198 if (Opc == X86::PTILESTORED) {
5199 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5200 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5201 } else {
5202 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5203 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5204 }
5205 ReplaceNode(Node, CNode);
5206 return;
5207 }
5208 }
5209 break;
5210 }
5211 case ISD::BRIND:
5212 case X86ISD::NT_BRIND: {
5213 if (Subtarget->isTargetNaCl())
5214      // NaCl has its own pass where jmp %r32 instructions are converted to
5215      // jmp %r64, so we leave the instruction alone.
5216 break;
5217 if (Subtarget->isTarget64BitILP32()) {
5218 // Converts a 32-bit register to a 64-bit, zero-extended version of
5219 // it. This is needed because x86-64 can do many things, but jmp %r32
5220 // ain't one of them.
5221 SDValue Target = Node->getOperand(1);
5222 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5223 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5224 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5225 Node->getOperand(0), ZextTarget);
5226 ReplaceNode(Node, Brind.getNode());
5227 SelectCode(ZextTarget.getNode());
5228 SelectCode(Brind.getNode());
5229 return;
5230 }
5231 break;
5232 }
5233  case X86ISD::GlobalBaseReg:
5234    ReplaceNode(Node, getGlobalBaseReg());
5235 return;
5236
5237 case ISD::BITCAST:
5238 // Just drop all 128/256/512-bit bitcasts.
5239 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5240 NVT == MVT::f128) {
5241 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5242 CurDAG->RemoveDeadNode(Node);
5243 return;
5244 }
5245 break;
5246
5247 case ISD::SRL:
5248 if (matchBitExtract(Node))
5249 return;
5250 [[fallthrough]];
5251 case ISD::SRA:
5252 case ISD::SHL:
5253 if (tryShiftAmountMod(Node))
5254 return;
5255 break;
5256
5257 case X86ISD::VPTERNLOG: {
5258 uint8_t Imm = Node->getConstantOperandVal(3);
5259 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5260 Node->getOperand(1), Node->getOperand(2), Imm))
5261 return;
5262 break;
5263 }
5264
5265 case X86ISD::ANDNP:
5266 if (tryVPTERNLOG(Node))
5267 return;
5268 break;
5269
5270 case ISD::AND:
5271 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5272 // Try to form a masked VPTESTM. Operands can be in either order.
5273 SDValue N0 = Node->getOperand(0);
5274 SDValue N1 = Node->getOperand(1);
5275 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5276 tryVPTESTM(Node, N0, N1))
5277 return;
5278 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5279 tryVPTESTM(Node, N1, N0))
5280 return;
5281 }
5282
5283 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5284 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5285 CurDAG->RemoveDeadNode(Node);
5286 return;
5287 }
5288 if (matchBitExtract(Node))
5289 return;
5290 if (AndImmShrink && shrinkAndImmediate(Node))
5291 return;
5292
5293 [[fallthrough]];
5294 case ISD::OR:
5295 case ISD::XOR:
5296 if (tryShrinkShlLogicImm(Node))
5297 return;
5298 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5299 return;
5300 if (tryVPTERNLOG(Node))
5301 return;
5302
5303 [[fallthrough]];
5304 case ISD::ADD:
5305 if (Opcode == ISD::ADD && matchBitExtract(Node))
5306 return;
5307 [[fallthrough]];
5308 case ISD::SUB: {
5309 // Try to avoid folding immediates with multiple uses for optsize.
5310 // This code tries to select to register form directly to avoid going
5311 // through the isel table which might fold the immediate. We can't change
5312    // the add/sub/and/or/xor-with-immediate patterns in the
5313 // tablegen files to check immediate use count without making the patterns
5314 // unavailable to the fast-isel table.
5315 if (!CurDAG->shouldOptForSize())
5316 break;
5317
5318 // Only handle i8/i16/i32/i64.
5319 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5320 break;
5321
5322 SDValue N0 = Node->getOperand(0);
5323 SDValue N1 = Node->getOperand(1);
5324
5325 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5326 if (!Cst)
5327 break;
5328
5329 int64_t Val = Cst->getSExtValue();
5330
5331    // Make sure it's an immediate that is considered foldable.
5332 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5333 if (!isInt<8>(Val) && !isInt<32>(Val))
5334 break;
5335
5336 // If this can match to INC/DEC, let it go.
5337 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5338 break;
5339
5340 // Check if we should avoid folding this immediate.
5341 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5342 break;
5343
5344 // We should not fold the immediate. So we need a register form instead.
5345 unsigned ROpc, MOpc;
5346 switch (NVT.SimpleTy) {
5347 default: llvm_unreachable("Unexpected VT!");
5348 case MVT::i8:
5349 switch (Opcode) {
5350 default: llvm_unreachable("Unexpected opcode!");
5351 case ISD::ADD:
5352 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5353 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5354 break;
5355 case ISD::SUB:
5356 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5357 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5358 break;
5359 case ISD::AND:
5360 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5361 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5362 break;
5363 case ISD::OR:
5364 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5365 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5366 break;
5367 case ISD::XOR:
5368 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5369 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5370 break;
5371 }
5372 break;
5373 case MVT::i16:
5374 switch (Opcode) {
5375 default: llvm_unreachable("Unexpected opcode!");
5376 case ISD::ADD:
5377 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5378 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5379 break;
5380 case ISD::SUB:
5381 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5382 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5383 break;
5384 case ISD::AND:
5385 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5386 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5387 break;
5388 case ISD::OR:
5389 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5390 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5391 break;
5392 case ISD::XOR:
5393 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5394 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5395 break;
5396 }
5397 break;
5398 case MVT::i32:
5399 switch (Opcode) {
5400 default: llvm_unreachable("Unexpected opcode!");
5401 case ISD::ADD:
5402 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5403 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5404 break;
5405 case ISD::SUB:
5406 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5407 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5408 break;
5409 case ISD::AND:
5410 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5411 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5412 break;
5413 case ISD::OR:
5414 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5415 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5416 break;
5417 case ISD::XOR:
5418 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5419 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5420 break;
5421 }
5422 break;
5423 case MVT::i64:
5424 switch (Opcode) {
5425 default: llvm_unreachable("Unexpected opcode!");
5426 case ISD::ADD:
5427 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5428 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5429 break;
5430 case ISD::SUB:
5431 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5432 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5433 break;
5434 case ISD::AND:
5435 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5436 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5437 break;
5438 case ISD::OR:
5439 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5440 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5441 break;
5442 case ISD::XOR:
5443 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5444 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5445 break;
5446 }
5447 break;
5448 }
5449
5450    // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5451
5452    // If this is not a subtract, we can still try to fold a load.
5453 if (Opcode != ISD::SUB) {
5454 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5455 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5456 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5457 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5458 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5459 // Update the chain.
5460 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5461 // Record the mem-refs
5462 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5463 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5464 CurDAG->RemoveDeadNode(Node);
5465 return;
5466 }
5467 }
5468
5469 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5470 return;
5471 }
5472
5473 case X86ISD::SMUL:
5474 // i16/i32/i64 are handled with isel patterns.
5475 if (NVT != MVT::i8)
5476 break;
5477 [[fallthrough]];
5478 case X86ISD::UMUL: {
5479 SDValue N0 = Node->getOperand(0);
5480 SDValue N1 = Node->getOperand(1);
5481
5482 unsigned LoReg, ROpc, MOpc;
5483 switch (NVT.SimpleTy) {
5484 default: llvm_unreachable("Unsupported VT!");
5485 case MVT::i8:
5486 LoReg = X86::AL;
5487 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5488 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5489 break;
5490 case MVT::i16:
5491 LoReg = X86::AX;
5492 ROpc = X86::MUL16r;
5493 MOpc = X86::MUL16m;
5494 break;
5495 case MVT::i32:
5496 LoReg = X86::EAX;
5497 ROpc = X86::MUL32r;
5498 MOpc = X86::MUL32m;
5499 break;
5500 case MVT::i64:
5501 LoReg = X86::RAX;
5502 ROpc = X86::MUL64r;
5503 MOpc = X86::MUL64m;
5504 break;
5505 }
5506
5507 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5508 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5509 // Multiply is commutative.
5510 if (!FoldedLoad) {
5511 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5512 if (FoldedLoad)
5513 std::swap(N0, N1);
5514 }
5515
5516 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5517 N0, SDValue()).getValue(1);
5518
5519 MachineSDNode *CNode;
5520 if (FoldedLoad) {
5521 // i16/i32/i64 use an instruction that produces a low and high result even
5522 // though only the low result is used.
5523 SDVTList VTs;
5524 if (NVT == MVT::i8)
5525 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5526 else
5527 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5528
5529 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5530 InGlue };
5531 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5532
5533 // Update the chain.
5534 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5535 // Record the mem-refs
5536 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5537 } else {
5538 // i16/i32/i64 use an instruction that produces a low and high result even
5539 // though only the low result is used.
5540 SDVTList VTs;
5541 if (NVT == MVT::i8)
5542 VTs = CurDAG->getVTList(NVT, MVT::i32);
5543 else
5544 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5545
5546 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5547 }
5548
5549 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5550 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5551 CurDAG->RemoveDeadNode(Node);
5552 return;
5553 }
5554
5555 case ISD::SMUL_LOHI:
5556 case ISD::UMUL_LOHI: {
5557 SDValue N0 = Node->getOperand(0);
5558 SDValue N1 = Node->getOperand(1);
5559
5560 unsigned Opc, MOpc;
5561 unsigned LoReg, HiReg;
5562 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5563 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5564 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5565 switch (NVT.SimpleTy) {
5566 default: llvm_unreachable("Unsupported VT!");
5567 case MVT::i32:
5568 Opc = UseMULXHi ? X86::MULX32Hrr
5569 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5570 : IsSigned ? X86::IMUL32r
5571 : X86::MUL32r;
5572 MOpc = UseMULXHi ? X86::MULX32Hrm
5573 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5574 : IsSigned ? X86::IMUL32m
5575 : X86::MUL32m;
5576 LoReg = UseMULX ? X86::EDX : X86::EAX;
5577 HiReg = X86::EDX;
5578 break;
5579 case MVT::i64:
5580 Opc = UseMULXHi ? X86::MULX64Hrr
5581 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5582 : IsSigned ? X86::IMUL64r
5583 : X86::MUL64r;
5584 MOpc = UseMULXHi ? X86::MULX64Hrm
5585 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5586 : IsSigned ? X86::IMUL64m
5587 : X86::MUL64m;
5588 LoReg = UseMULX ? X86::RDX : X86::RAX;
5589 HiReg = X86::RDX;
5590 break;
5591 }
5592
5593 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5594 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5595 // Multiply is commutative.
5596 if (!foldedLoad) {
5597 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5598 if (foldedLoad)
5599 std::swap(N0, N1);
5600 }
5601
5602 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5603 N0, SDValue()).getValue(1);
5604 SDValue ResHi, ResLo;
5605 if (foldedLoad) {
5606 SDValue Chain;
5607 MachineSDNode *CNode = nullptr;
5608 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5609 InGlue };
5610 if (UseMULXHi) {
5611 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5612 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5613 ResHi = SDValue(CNode, 0);
5614 Chain = SDValue(CNode, 1);
5615 } else if (UseMULX) {
5616 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5617 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5618 ResHi = SDValue(CNode, 0);
5619 ResLo = SDValue(CNode, 1);
5620 Chain = SDValue(CNode, 2);
5621 } else {
5622 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5623 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5624 Chain = SDValue(CNode, 0);
5625 InGlue = SDValue(CNode, 1);
5626 }
5627
5628 // Update the chain.
5629 ReplaceUses(N1.getValue(1), Chain);
5630 // Record the mem-refs
5631 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5632 } else {
5633 SDValue Ops[] = { N1, InGlue };
5634 if (UseMULXHi) {
5635 SDVTList VTs = CurDAG->getVTList(NVT);
5636 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5637 ResHi = SDValue(CNode, 0);
5638 } else if (UseMULX) {
5639 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5640 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5641 ResHi = SDValue(CNode, 0);
5642 ResLo = SDValue(CNode, 1);
5643 } else {
5644 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5645 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5646 InGlue = SDValue(CNode, 0);
5647 }
5648 }
5649
5650 // Copy the low half of the result, if it is needed.
5651 if (!SDValue(Node, 0).use_empty()) {
5652 if (!ResLo) {
5653 assert(LoReg && "Register for low half is not defined!");
5654 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5655 NVT, InGlue);
5656 InGlue = ResLo.getValue(2);
5657 }
5658 ReplaceUses(SDValue(Node, 0), ResLo);
5659 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5660 dbgs() << '\n');
5661 }
5662 // Copy the high half of the result, if it is needed.
5663 if (!SDValue(Node, 1).use_empty()) {
5664 if (!ResHi) {
5665 assert(HiReg && "Register for high half is not defined!");
5666 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5667 NVT, InGlue);
5668 InGlue = ResHi.getValue(2);
5669 }
5670 ReplaceUses(SDValue(Node, 1), ResHi);
5671 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5672 dbgs() << '\n');
5673 }
5674
5675 CurDAG->RemoveDeadNode(Node);
5676 return;
5677 }
5678
5679 case ISD::SDIVREM:
5680 case ISD::UDIVREM: {
5681 SDValue N0 = Node->getOperand(0);
5682 SDValue N1 = Node->getOperand(1);
5683
5684 unsigned ROpc, MOpc;
5685 bool isSigned = Opcode == ISD::SDIVREM;
5686 if (!isSigned) {
5687 switch (NVT.SimpleTy) {
5688 default: llvm_unreachable("Unsupported VT!");
5689 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5690 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5691 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5692 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5693 }
5694 } else {
5695 switch (NVT.SimpleTy) {
5696 default: llvm_unreachable("Unsupported VT!");
5697 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5698 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5699 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5700 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5701 }
5702 }
5703
5704 unsigned LoReg, HiReg, ClrReg;
5705 unsigned SExtOpcode;
5706 switch (NVT.SimpleTy) {
5707 default: llvm_unreachable("Unsupported VT!");
5708 case MVT::i8:
5709 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5710 SExtOpcode = 0; // Not used.
5711 break;
5712 case MVT::i16:
5713 LoReg = X86::AX; HiReg = X86::DX;
5714 ClrReg = X86::DX;
5715 SExtOpcode = X86::CWD;
5716 break;
5717 case MVT::i32:
5718 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5719 SExtOpcode = X86::CDQ;
5720 break;
5721 case MVT::i64:
5722 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5723 SExtOpcode = X86::CQO;
5724 break;
5725 }
5726
5727 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5728 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5729 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5730
5731 SDValue InGlue;
5732 if (NVT == MVT::i8) {
5733 // Special case for div8, just use a move with zero extension to AX to
5734 // clear the upper 8 bits (AH).
5735 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5736 MachineSDNode *Move;
5737 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5738 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5739 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5740 : X86::MOVZX16rm8;
5741 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5742 Chain = SDValue(Move, 1);
5743 ReplaceUses(N0.getValue(1), Chain);
5744 // Record the mem-refs
5745 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5746 } else {
5747 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5748 : X86::MOVZX16rr8;
5749 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5750 Chain = CurDAG->getEntryNode();
5751 }
5752 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5753 SDValue());
5754 InGlue = Chain.getValue(1);
5755 } else {
5756 InGlue =
5757 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5758 LoReg, N0, SDValue()).getValue(1);
5759 if (isSigned && !signBitIsZero) {
5760 // Sign extend the low part into the high part.
5761 InGlue =
5762 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5763 } else {
5764 // Zero out the high part, effectively zero extending the input.
5765 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5766 SDValue ClrNode = SDValue(
5767 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
5768 switch (NVT.SimpleTy) {
5769 case MVT::i16:
5770 ClrNode =
5771 SDValue(CurDAG->getMachineNode(
5772 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5773 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5774 MVT::i32)),
5775 0);
5776 break;
5777 case MVT::i32:
5778 break;
5779 case MVT::i64:
5780 ClrNode =
5781 SDValue(CurDAG->getMachineNode(
5782 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5783 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5784 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5785 MVT::i32)),
5786 0);
5787 break;
5788 default:
5789 llvm_unreachable("Unexpected division source");
5790 }
5791
5792 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5793 ClrNode, InGlue).getValue(1);
5794 }
5795 }
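    // Note: in the i8 path above, the dividend ends up zero- or sign-extended
    // into AX, and DIV8r/IDIV8r leave the quotient in AL and the remainder in
    // AH, which is why LoReg/HiReg are AL/AH for MVT::i8.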
5796
5797 if (foldedLoad) {
5798 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5799 InGlue };
5800 MachineSDNode *CNode =
5801 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5802 InGlue = SDValue(CNode, 1);
5803 // Update the chain.
5804 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5805 // Record the mem-refs
5806 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5807 } else {
5808 InGlue =
5809 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5810 }
5811
5812 // Prevent use of AH in a REX instruction by explicitly copying it to
5813 // an ABCD_L register.
5814 //
5815 // The current assumption of the register allocator is that isel
5816 // won't generate explicit references to the GR8_ABCD_H registers. If
5817 // the allocator and/or the backend get enhanced to be more robust in
5818 // that regard, this can be, and should be, removed.
5819 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5820 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5821 unsigned AHExtOpcode =
5822 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5823
5824 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5825 MVT::Glue, AHCopy, InGlue);
5826 SDValue Result(RNode, 0);
5827 InGlue = SDValue(RNode, 1);
5828
5829 Result =
5830 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5831
5832 ReplaceUses(SDValue(Node, 1), Result);
5833 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5834 dbgs() << '\n');
5835 }
5836 // Copy the division (low) result, if it is needed.
5837 if (!SDValue(Node, 0).use_empty()) {
5838 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5839 LoReg, NVT, InGlue);
5840 InGlue = Result.getValue(2);
5841 ReplaceUses(SDValue(Node, 0), Result);
5842 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5843 dbgs() << '\n');
5844 }
5845 // Copy the remainder (high) result, if it is needed.
5846 if (!SDValue(Node, 1).use_empty()) {
5847 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5848 HiReg, NVT, InGlue);
5849 InGlue = Result.getValue(2);
5850 ReplaceUses(SDValue(Node, 1), Result);
5851 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5852 dbgs() << '\n');
5853 }
5854 CurDAG->RemoveDeadNode(Node);
5855 return;
5856 }
5857
5858 case X86ISD::FCMP:
5859  case X86ISD::STRICT_FCMP:
5860  case X86ISD::STRICT_FCMPS: {
5861 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5862 Node->getOpcode() == X86ISD::STRICT_FCMPS;
5863 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
5864 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
5865
5866 // Save the original VT of the compare.
5867 MVT CmpVT = N0.getSimpleValueType();
5868
5869 // Floating point needs special handling if we don't have FCOMI.
5870 if (Subtarget->canUseCMOV())
5871 break;
5872
5873 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5874
5875 unsigned Opc;
5876 switch (CmpVT.SimpleTy) {
5877 default: llvm_unreachable("Unexpected type!");
5878 case MVT::f32:
5879 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5880 break;
5881 case MVT::f64:
5882 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5883 break;
5884 case MVT::f80:
5885 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5886 break;
5887 }
5888
5889 SDValue Chain =
5890 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
5891 SDValue Glue;
5892 if (IsStrictCmp) {
5893 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5894 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
5895 Glue = Chain.getValue(1);
5896 } else {
5897 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
5898 }
5899
5900 // Move FPSW to AX.
5901 SDValue FNSTSW =
5902 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
5903
5904 // Extract upper 8-bits of AX.
5905 SDValue Extract =
5906 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
5907
5908 // Move AH into flags.
5909 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
5910 assert(Subtarget->canUseLAHFSAHF() &&
5911 "Target doesn't support SAHF or FCOMI?");
5912 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
5913 Chain = AH;
5914 SDValue SAHF = SDValue(
5915 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
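    // Taken together, the nodes built above correspond roughly to the classic
    // "fcom/fucom; fnstsw %ax; sahf" sequence: the x87 status word is stored
    // to AX and its AH byte is transferred into EFLAGS.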
5916
5917 if (IsStrictCmp)
5918 ReplaceUses(SDValue(Node, 1), Chain);
5919
5920 ReplaceUses(SDValue(Node, 0), SAHF);
5921 CurDAG->RemoveDeadNode(Node);
5922 return;
5923 }
5924
5925 case X86ISD::CMP: {
5926 SDValue N0 = Node->getOperand(0);
5927 SDValue N1 = Node->getOperand(1);
5928
5929 // Optimizations for TEST compares.
5930 if (!isNullConstant(N1))
5931 break;
5932
5933 // Save the original VT of the compare.
5934 MVT CmpVT = N0.getSimpleValueType();
5935
5936    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5937 // by a test instruction. The test should be removed later by
5938 // analyzeCompare if we are using only the zero flag.
5939 // TODO: Should we check the users and use the BEXTR flags directly?
5940 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
5941 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
5942 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
5943 : X86::TEST32rr;
5944 SDValue BEXTR = SDValue(NewNode, 0);
5945 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
5946 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5947 CurDAG->RemoveDeadNode(Node);
5948 return;
5949 }
5950 }
5951
5952 // We can peek through truncates, but we need to be careful below.
5953 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5954 N0 = N0.getOperand(0);
5955
5956 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5957 // use a smaller encoding.
5958 // Look past the truncate if CMP is the only use of it.
5959 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
5960 N0.getValueType() != MVT::i8) {
5961 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5962 if (!MaskC)
5963 break;
5964
5965 // We may have looked through a truncate so mask off any bits that
5966 // shouldn't be part of the compare.
5967 uint64_t Mask = MaskC->getZExtValue();
5968 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
5969
5970 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
5971 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
5972 // zero flag.
5973 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
5974 onlyUsesZeroFlag(SDValue(Node, 0))) {
5975 unsigned ShiftOpcode = ISD::DELETED_NODE;
5976 unsigned ShiftAmt;
5977 unsigned SubRegIdx;
5978 MVT SubRegVT;
5979 unsigned TestOpcode;
5980 unsigned LeadingZeros = llvm::countl_zero(Mask);
5981 unsigned TrailingZeros = llvm::countr_zero(Mask);
5982
5983 // With leading/trailing zeros, the transform is profitable if we can
5984 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
5985 // incurring any extra register moves.
5986 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
5987 if (LeadingZeros == 0 && SavesBytes) {
5988 // If the mask covers the most significant bit, then we can replace
5989 // TEST+AND with a SHR and check eflags.
5990 // This emits a redundant TEST which is subsequently eliminated.
5991 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
5992 ShiftAmt = TrailingZeros;
5993 SubRegIdx = 0;
5994 TestOpcode = X86::TEST64rr;
5995 } else if (TrailingZeros == 0 && SavesBytes) {
5996 // If the mask covers the least significant bit, then we can replace
5997 // TEST+AND with a SHL and check eflags.
5998 // This emits a redundant TEST which is subsequently eliminated.
5999 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6000 ShiftAmt = LeadingZeros;
6001 SubRegIdx = 0;
6002 TestOpcode = X86::TEST64rr;
6003 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6004 // If the shifted mask extends into the high half and is 8/16/32 bits
6005 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6006 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6007 if (PopCount == 8) {
6008 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6009 ShiftAmt = TrailingZeros;
6010 SubRegIdx = X86::sub_8bit;
6011 SubRegVT = MVT::i8;
6012 TestOpcode = X86::TEST8rr;
6013 } else if (PopCount == 16) {
6014 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6015 ShiftAmt = TrailingZeros;
6016 SubRegIdx = X86::sub_16bit;
6017 SubRegVT = MVT::i16;
6018 TestOpcode = X86::TEST16rr;
6019 } else if (PopCount == 32) {
6020 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6021 ShiftAmt = TrailingZeros;
6022 SubRegIdx = X86::sub_32bit;
6023 SubRegVT = MVT::i32;
6024 TestOpcode = X86::TEST32rr;
6025 }
6026 }
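        // For illustration: a single-use mask of 0x000000FF00000000 has 24
        // leading and 32 trailing zero bits, so PopCount is 8 and the code
        // above selects a SHR64ri by 32, a sub_8bit extract, and TEST8rr
        // instead of materializing the 64-bit mask with a movabsq.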
6027 if (ShiftOpcode != ISD::DELETED_NODE) {
6028 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6029 SDValue Shift = SDValue(
6030 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6031 N0.getOperand(0), ShiftC),
6032 0);
6033 if (SubRegIdx != 0) {
6034 Shift =
6035 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6036 }
6037          MachineSDNode *Test =
6038              CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6039 ReplaceNode(Node, Test);
6040 return;
6041 }
6042 }
6043
6044 MVT VT;
6045 int SubRegOp;
6046 unsigned ROpc, MOpc;
6047
6048 // For each of these checks we need to be careful if the sign flag is
6049 // being used. It is only safe to use the sign flag in two conditions,
6050 // either the sign bit in the shrunken mask is zero or the final test
6051 // size is equal to the original compare size.
6052
6053 if (isUInt<8>(Mask) &&
6054 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6055 hasNoSignFlagUses(SDValue(Node, 0)))) {
6056 // For example, convert "testl %eax, $8" to "testb %al, $8"
6057 VT = MVT::i8;
6058 SubRegOp = X86::sub_8bit;
6059 ROpc = X86::TEST8ri;
6060 MOpc = X86::TEST8mi;
6061 } else if (OptForMinSize && isUInt<16>(Mask) &&
6062 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6063 hasNoSignFlagUses(SDValue(Node, 0)))) {
6064 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6065 // NOTE: We only want to form TESTW instructions if optimizing for
6066 // min size. Otherwise we only save one byte and possibly get a length
6067 // changing prefix penalty in the decoders.
6068 VT = MVT::i16;
6069 SubRegOp = X86::sub_16bit;
6070 ROpc = X86::TEST16ri;
6071 MOpc = X86::TEST16mi;
6072 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6073 ((!(Mask & 0x80000000) &&
6074 // Without minsize 16-bit Cmps can get here so we need to
6075 // be sure we calculate the correct sign flag if needed.
6076 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6077 CmpVT == MVT::i32 ||
6078 hasNoSignFlagUses(SDValue(Node, 0)))) {
6079 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6080 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6081        // Otherwise, we find ourselves in a position where we have to do
6082 // promotion. If previous passes did not promote the and, we assume
6083 // they had a good reason not to and do not promote here.
6084 VT = MVT::i32;
6085 SubRegOp = X86::sub_32bit;
6086 ROpc = X86::TEST32ri;
6087 MOpc = X86::TEST32mi;
6088 } else {
6089 // No eligible transformation was found.
6090 break;
6091 }
6092
6093 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6094 SDValue Reg = N0.getOperand(0);
6095
6096 // Emit a testl or testw.
6097 MachineSDNode *NewNode;
6098 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6099 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6100 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6101 if (!LoadN->isSimple()) {
6102 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6103 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6104 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6105 (MOpc == X86::TEST32mi && NumVolBits != 32))
6106 break;
6107 }
6108 }
6109 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6110 Reg.getOperand(0) };
6111 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6112 // Update the chain.
6113 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6114 // Record the mem-refs
6115 CurDAG->setNodeMemRefs(NewNode,
6116 {cast<LoadSDNode>(Reg)->getMemOperand()});
6117 } else {
6118 // Extract the subregister if necessary.
6119 if (N0.getValueType() != VT)
6120 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6121
6122 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6123 }
6124 // Replace CMP with TEST.
6125 ReplaceNode(Node, NewNode);
6126 return;
6127 }
6128 break;
6129 }
6130 case X86ISD::PCMPISTR: {
6131 if (!Subtarget->hasSSE42())
6132 break;
6133
6134 bool NeedIndex = !SDValue(Node, 0).use_empty();
6135 bool NeedMask = !SDValue(Node, 1).use_empty();
6136 // We can't fold a load if we are going to make two instructions.
6137 bool MayFoldLoad = !NeedIndex || !NeedMask;
6138
6139 MachineSDNode *CNode;
6140 if (NeedMask) {
6141 unsigned ROpc =
6142 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6143 unsigned MOpc =
6144 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6145 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6146 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6147 }
6148 if (NeedIndex || !NeedMask) {
6149 unsigned ROpc =
6150 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6151 unsigned MOpc =
6152 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6153 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6154 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6155 }
6156
6157 // Connect the flag usage to the last instruction created.
6158 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6159 CurDAG->RemoveDeadNode(Node);
6160 return;
6161 }
6162 case X86ISD::PCMPESTR: {
6163 if (!Subtarget->hasSSE42())
6164 break;
6165
6166 // Copy the two implicit register inputs.
6167 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6168 Node->getOperand(1),
6169 SDValue()).getValue(1);
6170 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6171 Node->getOperand(3), InGlue).getValue(1);
6172
6173 bool NeedIndex = !SDValue(Node, 0).use_empty();
6174 bool NeedMask = !SDValue(Node, 1).use_empty();
6175 // We can't fold a load if we are going to make two instructions.
6176 bool MayFoldLoad = !NeedIndex || !NeedMask;
6177
6178 MachineSDNode *CNode;
6179 if (NeedMask) {
6180 unsigned ROpc =
6181 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6182 unsigned MOpc =
6183 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6184 CNode =
6185 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6186 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6187 }
6188 if (NeedIndex || !NeedMask) {
6189 unsigned ROpc =
6190 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6191 unsigned MOpc =
6192 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6193 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6194 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6195 }
6196 // Connect the flag usage to the last instruction created.
6197 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6198 CurDAG->RemoveDeadNode(Node);
6199 return;
6200 }
6201
6202 case ISD::SETCC: {
6203 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6204 return;
6205
6206 break;
6207 }
6208
6209 case ISD::STORE:
6210 if (foldLoadStoreIntoMemOperand(Node))
6211 return;
6212 break;
6213
6214 case X86ISD::SETCC_CARRY: {
6215 MVT VT = Node->getSimpleValueType(0);
6216    SDValue Result;
6217    if (Subtarget->hasSBBDepBreaking()) {
6218 // We have to do this manually because tblgen will put the eflags copy in
6219 // the wrong place if we use an extract_subreg in the pattern.
6220 // Copy flags to the EFLAGS register and glue it to next node.
6221 SDValue EFLAGS =
6222 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6223 Node->getOperand(1), SDValue());
6224
6225 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6226 // 32-bit version.
6227 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6228 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6229 Result = SDValue(
6230 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6231 0);
6232 } else {
6233 // The target does not recognize sbb with the same reg operand as a
6234 // no-source idiom, so we explicitly zero the input values.
6235 Result = getSBBZero(Node);
6236 }
6237
6238 // For less than 32-bits we need to extract from the 32-bit node.
6239 if (VT == MVT::i8 || VT == MVT::i16) {
6240 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6241 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6242 }
6243
6244 ReplaceUses(SDValue(Node, 0), Result);
6245 CurDAG->RemoveDeadNode(Node);
6246 return;
6247 }
6248 case X86ISD::SBB: {
6249 if (isNullConstant(Node->getOperand(0)) &&
6250 isNullConstant(Node->getOperand(1))) {
6251 SDValue Result = getSBBZero(Node);
6252
6253 // Replace the flag use.
6254 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6255
6256 // Replace the result use.
6257 if (!SDValue(Node, 0).use_empty()) {
6258 // For less than 32-bits we need to extract from the 32-bit node.
6259 MVT VT = Node->getSimpleValueType(0);
6260 if (VT == MVT::i8 || VT == MVT::i16) {
6261 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6262 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6263 }
6264 ReplaceUses(SDValue(Node, 0), Result);
6265 }
6266
6267 CurDAG->RemoveDeadNode(Node);
6268 return;
6269 }
6270 break;
6271 }
6272 case X86ISD::MGATHER: {
6273 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6274 SDValue IndexOp = Mgt->getIndex();
6275 SDValue Mask = Mgt->getMask();
6276 MVT IndexVT = IndexOp.getSimpleValueType();
6277 MVT ValueVT = Node->getSimpleValueType(0);
6278 MVT MaskVT = Mask.getSimpleValueType();
6279
6280    // This is just to prevent crashes if the nodes are malformed somehow. We're
6281    // otherwise only doing loose type checking in here based on what a type
6282    // constraint would say, just like table-based isel.
6283 if (!ValueVT.isVector() || !MaskVT.isVector())
6284 break;
6285
6286 unsigned NumElts = ValueVT.getVectorNumElements();
6287 MVT ValueSVT = ValueVT.getVectorElementType();
6288
6289 bool IsFP = ValueSVT.isFloatingPoint();
6290 unsigned EltSize = ValueSVT.getSizeInBits();
6291
6292 unsigned Opc = 0;
6293 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6294 if (AVX512Gather) {
6295 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6296 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6297 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6298 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6299 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6300 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6301 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6302 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6303 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6304 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6305 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6306 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6307 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6308 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6309 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6310 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6311 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6312 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6313 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6314 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6315 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6316 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6317 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6318 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6319 } else {
6320 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6321 "Unexpected mask VT!");
6322 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6323 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6324 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6325 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6326 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6327 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6328 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6329 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6330 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6331 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6332 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6333 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6334 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6335 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6336 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6337 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6338 }
6339
6340 if (!Opc)
6341 break;
6342
6343 SDValue Base, Scale, Index, Disp, Segment;
6344 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6345 Base, Scale, Index, Disp, Segment))
6346 break;
6347
6348 SDValue PassThru = Mgt->getPassThru();
6349 SDValue Chain = Mgt->getChain();
6350 // Gather machine instructions have a mask output that is not present in the ISD node.
6351 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6352
6353 MachineSDNode *NewNode;
6354 if (AVX512Gather) {
6355 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6356 Index, Disp, Segment, Chain};
6357 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6358 } else {
6359 SDValue Ops[] = {PassThru, Base, Scale, Index,
6360 Disp, Segment, Mask, Chain};
6361 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6362 }
6363 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6364 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6365 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6366 CurDAG->RemoveDeadNode(Node);
6367 return;
6368 }
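 // Opcode naming key (it applies to the scatter selection below as well): the
 // letter following GATHER in the opcode name gives the index element width
 // (D = 32-bit, Q = 64-bit), the next pair gives the data type (PS/PD for FP,
 // DD/DQ/QD/QQ for integer), and the suffix gives the vector width (Z128/Z256/Z
 // for the EVEX forms, none/Y for the VEX AVX2 forms). The AVX2 forms take the
 // mask as a plain vector operand, while the AVX-512 forms use an i1 mask.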
6369 case X86ISD::MSCATTER: {
6370 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6371 SDValue Value = Sc->getValue();
6372 SDValue IndexOp = Sc->getIndex();
6373 MVT IndexVT = IndexOp.getSimpleValueType();
6374 MVT ValueVT = Value.getSimpleValueType();
6375
6376 // This is just to prevent crashes if the nodes are malformed somehow. We
6377 // otherwise only do loose type checking here, based on what a type
6378 // constraint would require, just like table-based isel.
6379 if (!ValueVT.isVector())
6380 break;
6381
6382 unsigned NumElts = ValueVT.getVectorNumElements();
6383 MVT ValueSVT = ValueVT.getVectorElementType();
6384
6385 bool IsFP = ValueSVT.isFloatingPoint();
6386 unsigned EltSize = ValueSVT.getSizeInBits();
6387
6388 unsigned Opc;
6389 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6390 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6391 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6392 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6393 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6394 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6395 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6396 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6397 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6398 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6399 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6400 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6401 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6402 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6403 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6404 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6405 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6406 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6407 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6408 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6409 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6410 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6411 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6412 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6413 else
6414 break;
6415
6416 SDValue Base, Scale, Index, Disp, Segment;
6417 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6418 Base, Scale, Index, Disp, Segment))
6419 break;
6420
6421 SDValue Mask = Sc->getMask();
6422 SDValue Chain = Sc->getChain();
6423 // Scatter machine instructions have a mask output that is not present in the ISD node.
6424 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6425 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6426
6427 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6428 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6429 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6430 CurDAG->RemoveDeadNode(Node);
6431 return;
6432 }
6433 case ISD::PREALLOCATED_SETUP: {
6434 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6435 auto CallId = MFI->getPreallocatedIdForCallSite(
6436 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6437 SDValue Chain = Node->getOperand(0);
6438 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6439 MachineSDNode *New = CurDAG->getMachineNode(
6440 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6441 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6442 CurDAG->RemoveDeadNode(Node);
6443 return;
6444 }
6445 case ISD::PREALLOCATED_ARG: {
6446 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6447 auto CallId = MFI->getPreallocatedIdForCallSite(
6448 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6449 SDValue Chain = Node->getOperand(0);
6450 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6451 SDValue ArgIndex = Node->getOperand(2);
6452 SDValue Ops[3];
6453 Ops[0] = CallIdValue;
6454 Ops[1] = ArgIndex;
6455 Ops[2] = Chain;
6456 MachineSDNode *New = CurDAG->getMachineNode(
6457 TargetOpcode::PREALLOCATED_ARG, dl,
6458 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6459 MVT::Other),
6460 Ops);
6461 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6462 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6463 CurDAG->RemoveDeadNode(Node);
6464 return;
6465 }
6466 case X86ISD::AESENCWIDE128KL:
6467 case X86ISD::AESDECWIDE128KL:
6468 case X86ISD::AESENCWIDE256KL:
6469 case X86ISD::AESDECWIDE256KL: {
6470 if (!Subtarget->hasWIDEKL())
6471 break;
6472
6473 unsigned Opcode;
6474 switch (Node->getOpcode()) {
6475 default:
6476 llvm_unreachable("Unexpected opcode!");
6477 case X86ISD::AESENCWIDE128KL:
6478 Opcode = GET_EGPR_IF_ENABLED(X86::AESENCWIDE128KL);
6479 break;
6480 case X86ISD::AESDECWIDE128KL:
6481 Opcode = GET_EGPR_IF_ENABLED(X86::AESDECWIDE128KL);
6482 break;
6483 case X86ISD::AESENCWIDE256KL:
6484 Opcode = GET_EGPR_IF_ENABLED(X86::AESENCWIDE256KL);
6485 break;
6486 case X86ISD::AESDECWIDE256KL:
6487 Opcode = GET_EGPR_IF_ENABLED(X86::AESDECWIDE256KL);
6488 break;
6489#undef GET_EGPR_IF_ENABLED
6490 }
6491
6492 SDValue Chain = Node->getOperand(0);
6493 SDValue Addr = Node->getOperand(1);
6494
6495 SDValue Base, Scale, Index, Disp, Segment;
6496 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6497 break;
6498
6499 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6500 SDValue());
6501 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6502 Chain.getValue(1));
6503 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6504 Chain.getValue(1));
6505 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6506 Chain.getValue(1));
6507 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6508 Chain.getValue(1));
6509 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6510 Chain.getValue(1));
6511 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6512 Chain.getValue(1));
6513 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6514 Chain.getValue(1));
6515
6516 MachineSDNode *Res = CurDAG->getMachineNode(
6517 Opcode, dl, Node->getVTList(),
6518 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6519 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6520 ReplaceNode(Node, Res);
6521 return;
6522 }
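 // The AES*WIDE*KL instructions implicitly consume and produce their eight
 // data blocks in XMM0-XMM7, so the operands are pinned to those registers
 // through the glued chain of CopyToReg nodes above before the machine node
 // is built.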
6523 }
6524
6525 SelectCode(Node);
6526}
6527
6528bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6529 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6530 std::vector<SDValue> &OutOps) {
6531 SDValue Op0, Op1, Op2, Op3, Op4;
6532 switch (ConstraintID) {
6533 default:
6534 llvm_unreachable("Unexpected asm memory constraint");
6535 case InlineAsm::ConstraintCode::o: // offsetable ??
6536 case InlineAsm::ConstraintCode::v: // not offsetable ??
6537 case InlineAsm::ConstraintCode::m: // memory
6538 case InlineAsm::ConstraintCode::X:
6539 case InlineAsm::ConstraintCode::p: // address
6540 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6541 return true;
6542 break;
6543 }
6544
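 // Op0..Op4 form the standard five-operand x86 memory reference produced by
 // selectAddr: base register, scale, index register, displacement and segment,
 // in the order the rest of the backend expects for a memory operand.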
6545 OutOps.push_back(Op0);
6546 OutOps.push_back(Op1);
6547 OutOps.push_back(Op2);
6548 OutOps.push_back(Op3);
6549 OutOps.push_back(Op4);
6550 return false;
6551}
6552
6553/// This pass converts a legalized DAG into an X86-specific DAG,
6554/// ready for instruction scheduling.
6555FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6556 CodeGenOptLevel OptLevel) {
6557 return new X86DAGToDAGISel(TM, OptLevel);
6558}
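// Illustrative usage sketch (assumed, not part of this file): the X86 target's
// pass configuration installs this selector roughly as follows, per the usual
// TargetPassConfig pattern in X86TargetMachine.cpp.
//
//   bool X86PassConfig::addInstSelector() {
//     // Install the DAG-based instruction selector for X86.
//     addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
//     return false;
//   }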