LLVM 20.0.0git
X86ISelDAGToDAG.cpp
Go to the documentation of this file.
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
22#include "llvm/Config/llvm-config.h"
24#include "llvm/IR/Function.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
41STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42
43static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
46
48 "x86-promote-anyext-load", cl::init(true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
58 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
59 /// numbers for the leaves of the matched tree.
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
65
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
69
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
83
84 X86ISelAddressMode() = default;
85
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
89 }
90
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
94 }
95
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
103 }
104
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
108 }
109
110#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
151 }
152#endif
153 };
154}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
165
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
168
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
171
172 public:
173 X86DAGToDAGISel() = delete;
174
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 "indirect-tls-seg-refs");
184
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
188 "OptForMinSize implies OptForSize");
190 }
191
192 void emitFunctionEntryCode() override;
193
194 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
195
196 void PreprocessISelDAG() override;
197 void PostprocessISelDAG() override;
198
199// Include the pieces autogenerated from the target description.
200#include "X86GenDAGISel.inc"
201
202 private:
203 void Select(SDNode *N) override;
204
205 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
206 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
207 bool AllowSegmentRegForX32 = false);
208 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
209 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
210 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
211 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
212 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
217 unsigned Depth);
218 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
219 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
220 SDValue &Scale, SDValue &Index, SDValue &Disp,
221 SDValue &Segment);
222 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
223 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
224 SDValue &Index, SDValue &Disp, SDValue &Segment);
225 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
226 bool selectLEAAddr(SDValue N, SDValue &Base,
227 SDValue &Scale, SDValue &Index, SDValue &Disp,
228 SDValue &Segment);
229 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectRelocImm(SDValue N, SDValue &Op);
236
237 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
238 SDValue &Base, SDValue &Scale,
239 SDValue &Index, SDValue &Disp,
240 SDValue &Segment);
241
242 // Convenience method where P is also root.
243 bool tryFoldLoad(SDNode *P, SDValue N,
244 SDValue &Base, SDValue &Scale,
245 SDValue &Index, SDValue &Disp,
246 SDValue &Segment) {
247 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
248 }
249
250 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
251 SDValue &Base, SDValue &Scale,
252 SDValue &Index, SDValue &Disp,
253 SDValue &Segment);
254
255 bool isProfitableToFormMaskedOp(SDNode *N) const;
256
257 /// Implement addressing mode selection for inline asm expressions.
259 InlineAsm::ConstraintCode ConstraintID,
260 std::vector<SDValue> &OutOps) override;
261
262 void emitSpecialCodeForMain();
263
264 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
265 MVT VT, SDValue &Base, SDValue &Scale,
266 SDValue &Index, SDValue &Disp,
267 SDValue &Segment) {
268 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
269 Base = CurDAG->getTargetFrameIndex(
270 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
271 else if (AM.Base_Reg.getNode())
272 Base = AM.Base_Reg;
273 else
274 Base = CurDAG->getRegister(0, VT);
275
276 Scale = getI8Imm(AM.Scale, DL);
277
278#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
279 // Negate the index if needed.
280 if (AM.NegateIndex) {
281 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
282 : GET_ND_IF_ENABLED(X86::NEG32r);
283 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
284 AM.IndexReg), 0);
285 AM.IndexReg = Neg;
286 }
287
288 if (AM.IndexReg.getNode())
289 Index = AM.IndexReg;
290 else
291 Index = CurDAG->getRegister(0, VT);
292
293 // These are 32-bit even in 64-bit mode since RIP-relative offset
294 // is 32-bit.
295 if (AM.GV)
296 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
297 MVT::i32, AM.Disp,
298 AM.SymbolFlags);
299 else if (AM.CP)
300 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
301 AM.Disp, AM.SymbolFlags);
302 else if (AM.ES) {
303 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
304 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
305 } else if (AM.MCSym) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
307 assert(AM.SymbolFlags == 0 && "oo");
308 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
309 } else if (AM.JT != -1) {
310 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
311 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
312 } else if (AM.BlockAddr)
313 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
314 AM.SymbolFlags);
315 else
316 Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
317
318 if (AM.Segment.getNode())
319 Segment = AM.Segment;
320 else
321 Segment = CurDAG->getRegister(0, MVT::i16);
322 }
323
324 // Utility function to determine whether it is AMX SDNode right after
325 // lowering but before ISEL.
326 bool isAMXSDNode(SDNode *N) const {
327 // Check if N is AMX SDNode:
328 // 1. check specific opcode since these carry MVT::Untyped instead of
329 // x86amx_type;
330 // 2. check result type;
331 // 3. check operand type;
332 switch (N->getOpcode()) {
333 default:
334 break;
335 case X86::PT2RPNTLVWZ0V:
336 case X86::PT2RPNTLVWZ0T1V:
337 case X86::PT2RPNTLVWZ1V:
338 case X86::PT2RPNTLVWZ1T1V:
339 case X86::PT2RPNTLVWZ0RSV:
340 case X86::PT2RPNTLVWZ0RST1V:
341 case X86::PT2RPNTLVWZ1RSV:
342 case X86::PT2RPNTLVWZ1RST1V:
343 return true;
344 }
345 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
346 if (N->getValueType(Idx) == MVT::x86amx)
347 return true;
348 }
349 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
350 SDValue Op = N->getOperand(Idx);
351 if (Op.getValueType() == MVT::x86amx)
352 return true;
353 }
354 return false;
355 }
356
357 // Utility function to determine whether we should avoid selecting
358 // immediate forms of instructions for better code size or not.
359 // At a high level, we'd like to avoid such instructions when
360 // we have similar constants used within the same basic block
361 // that can be kept in a register.
362 //
363 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
364 uint32_t UseCount = 0;
365
366 // Do not want to hoist if we're not optimizing for size.
367 // TODO: We'd like to remove this restriction.
368 // See the comment in X86InstrInfo.td for more info.
369 if (!CurDAG->shouldOptForSize())
370 return false;
371
372 // Walk all the users of the immediate.
373 for (const SDNode *User : N->users()) {
374 if (UseCount >= 2)
375 break;
376
377 // This user is already selected. Count it as a legitimate use and
378 // move on.
379 if (User->isMachineOpcode()) {
380 UseCount++;
381 continue;
382 }
383
384 // We want to count stores of immediates as real uses.
385 if (User->getOpcode() == ISD::STORE &&
386 User->getOperand(1).getNode() == N) {
387 UseCount++;
388 continue;
389 }
390
391 // We don't currently match users that have > 2 operands (except
392 // for stores, which are handled above)
393 // Those instruction won't match in ISEL, for now, and would
394 // be counted incorrectly.
395 // This may change in the future as we add additional instruction
396 // types.
397 if (User->getNumOperands() != 2)
398 continue;
399
400 // If this is a sign-extended 8-bit integer immediate used in an ALU
401 // instruction, there is probably an opcode encoding to save space.
402 auto *C = dyn_cast<ConstantSDNode>(N);
403 if (C && isInt<8>(C->getSExtValue()))
404 continue;
405
406 // Immediates that are used for offsets as part of stack
407 // manipulation should be left alone. These are typically
408 // used to indicate SP offsets for argument passing and
409 // will get pulled into stores/pushes (implicitly).
410 if (User->getOpcode() == X86ISD::ADD ||
411 User->getOpcode() == ISD::ADD ||
412 User->getOpcode() == X86ISD::SUB ||
413 User->getOpcode() == ISD::SUB) {
414
415 // Find the other operand of the add/sub.
416 SDValue OtherOp = User->getOperand(0);
417 if (OtherOp.getNode() == N)
418 OtherOp = User->getOperand(1);
419
420 // Don't count if the other operand is SP.
421 RegisterSDNode *RegNode;
422 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
423 (RegNode = dyn_cast_or_null<RegisterSDNode>(
424 OtherOp->getOperand(1).getNode())))
425 if ((RegNode->getReg() == X86::ESP) ||
426 (RegNode->getReg() == X86::RSP))
427 continue;
428 }
429
430 // ... otherwise, count this and move on.
431 UseCount++;
432 }
433
434 // If we have more than 1 use, then recommend for hoisting.
435 return (UseCount > 1);
436 }
437
438 /// Return a target constant with the specified value of type i8.
439 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
440 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
441 }
442
443 /// Return a target constant with the specified value, of type i32.
444 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
445 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
446 }
447
448 /// Return a target constant with the specified value, of type i64.
449 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
450 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
451 }
452
453 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
454 const SDLoc &DL) {
455 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
456 uint64_t Index = N->getConstantOperandVal(1);
457 MVT VecVT = N->getOperand(0).getSimpleValueType();
458 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
459 }
460
461 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
462 const SDLoc &DL) {
463 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
464 uint64_t Index = N->getConstantOperandVal(2);
465 MVT VecVT = N->getSimpleValueType(0);
466 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
467 }
468
469 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
470 const SDLoc &DL) {
471 assert(VecWidth == 128 && "Unexpected vector width");
472 uint64_t Index = N->getConstantOperandVal(2);
473 MVT VecVT = N->getSimpleValueType(0);
474 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
475 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
476 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
477 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
478 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
479 }
480
481 SDValue getSBBZero(SDNode *N) {
482 SDLoc dl(N);
483 MVT VT = N->getSimpleValueType(0);
484
485 // Create zero.
486 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
487 SDValue Zero =
488 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
489 if (VT == MVT::i64) {
490 Zero = SDValue(
491 CurDAG->getMachineNode(
492 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
493 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
494 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
495 0);
496 }
497
498 // Copy flags to the EFLAGS register and glue it to next node.
499 unsigned Opcode = N->getOpcode();
500 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
501 "Unexpected opcode for SBB materialization");
502 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
503 SDValue EFLAGS =
504 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
505 N->getOperand(FlagOpIndex), SDValue());
506
507 // Create a 64-bit instruction if the result is 64-bits otherwise use the
508 // 32-bit version.
509 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
510 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
511 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
512 return SDValue(
513 CurDAG->getMachineNode(Opc, dl, VTs,
514 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
515 0);
516 }
517
518 // Helper to detect unneeded and instructions on shift amounts. Called
519 // from PatFrags in tablegen.
520 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
521 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
522 const APInt &Val = N->getConstantOperandAPInt(1);
523
524 if (Val.countr_one() >= Width)
525 return true;
526
527 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
528 return Mask.countr_one() >= Width;
529 }
530
531 /// Return an SDNode that returns the value of the global base register.
532 /// Output instructions required to initialize the global base register,
533 /// if necessary.
534 SDNode *getGlobalBaseReg();
535
536 /// Return a reference to the TargetMachine, casted to the target-specific
537 /// type.
538 const X86TargetMachine &getTargetMachine() const {
539 return static_cast<const X86TargetMachine &>(TM);
540 }
541
542 /// Return a reference to the TargetInstrInfo, casted to the target-specific
543 /// type.
544 const X86InstrInfo *getInstrInfo() const {
545 return Subtarget->getInstrInfo();
546 }
547
548 /// Return a condition code of the given SDNode
549 X86::CondCode getCondFromNode(SDNode *N) const;
550
551 /// Address-mode matching performs shift-of-and to and-of-shift
552 /// reassociation in order to expose more scaled addressing
553 /// opportunities.
554 bool ComplexPatternFuncMutatesDAG() const override {
555 return true;
556 }
557
558 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
559
560 // Indicates we should prefer to use a non-temporal load for this load.
561 bool useNonTemporalLoad(LoadSDNode *N) const {
562 if (!N->isNonTemporal())
563 return false;
564
565 unsigned StoreSize = N->getMemoryVT().getStoreSize();
566
567 if (N->getAlign().value() < StoreSize)
568 return false;
569
570 switch (StoreSize) {
571 default: llvm_unreachable("Unsupported store size");
572 case 4:
573 case 8:
574 return false;
575 case 16:
576 return Subtarget->hasSSE41();
577 case 32:
578 return Subtarget->hasAVX2();
579 case 64:
580 return Subtarget->hasAVX512();
581 }
582 }
583
584 bool foldLoadStoreIntoMemOperand(SDNode *Node);
585 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
586 bool matchBitExtract(SDNode *Node);
587 bool shrinkAndImmediate(SDNode *N);
588 bool isMaskZeroExtended(SDNode *N) const;
589 bool tryShiftAmountMod(SDNode *N);
590 bool tryShrinkShlLogicImm(SDNode *N);
591 bool tryVPTERNLOG(SDNode *N);
592 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
593 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
594 uint8_t Imm);
595 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
596 bool tryMatchBitSelect(SDNode *N);
597
598 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
599 const SDLoc &dl, MVT VT, SDNode *Node);
600 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
601 const SDLoc &dl, MVT VT, SDNode *Node,
602 SDValue &InGlue);
603
604 bool tryOptimizeRem8Extend(SDNode *N);
605
606 bool onlyUsesZeroFlag(SDValue Flags) const;
607 bool hasNoSignFlagUses(SDValue Flags) const;
608 bool hasNoCarryFlagUses(SDValue Flags) const;
609 };
610
611 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
612 public:
613 static char ID;
614 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
615 CodeGenOptLevel OptLevel)
617 ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
618 };
619}
620
621char X86DAGToDAGISelLegacy::ID = 0;
622
623INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
624
625// Returns true if this masked compare can be implemented legally with this
626// type.
627static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
628 unsigned Opcode = N->getOpcode();
629 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
630 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
631 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
632 // We can get 256-bit 8 element types here without VLX being enabled. When
633 // this happens we will use 512-bit operations and the mask will not be
634 // zero extended.
635 EVT OpVT = N->getOperand(0).getValueType();
636 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
637 // second operand.
638 if (Opcode == X86ISD::STRICT_CMPM)
639 OpVT = N->getOperand(1).getValueType();
640 if (OpVT.is256BitVector() || OpVT.is128BitVector())
641 return Subtarget->hasVLX();
642
643 return true;
644 }
645 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
646 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
647 Opcode == X86ISD::FSETCCM_SAE)
648 return true;
649
650 return false;
651}
652
653// Returns true if we can assume the writer of the mask has zero extended it
654// for us.
655bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
656 // If this is an AND, check if we have a compare on either side. As long as
657 // one side guarantees the mask is zero extended, the AND will preserve those
658 // zeros.
659 if (N->getOpcode() == ISD::AND)
660 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
661 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
662
663 return isLegalMaskCompare(N, Subtarget);
664}
665
666bool
667X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
668 if (OptLevel == CodeGenOptLevel::None)
669 return false;
670
671 if (!N.hasOneUse())
672 return false;
673
674 if (N.getOpcode() != ISD::LOAD)
675 return true;
676
677 // Don't fold non-temporal loads if we have an instruction for them.
678 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
679 return false;
680
681 // If N is a load, do additional profitability checks.
682 if (U == Root) {
683 switch (U->getOpcode()) {
684 default: break;
685 case X86ISD::ADD:
686 case X86ISD::ADC:
687 case X86ISD::SUB:
688 case X86ISD::SBB:
689 case X86ISD::AND:
690 case X86ISD::XOR:
691 case X86ISD::OR:
692 case ISD::ADD:
693 case ISD::UADDO_CARRY:
694 case ISD::AND:
695 case ISD::OR:
696 case ISD::XOR: {
697 SDValue Op1 = U->getOperand(1);
698
699 // If the other operand is a 8-bit immediate we should fold the immediate
700 // instead. This reduces code size.
701 // e.g.
702 // movl 4(%esp), %eax
703 // addl $4, %eax
704 // vs.
705 // movl $4, %eax
706 // addl 4(%esp), %eax
707 // The former is 2 bytes shorter. In case where the increment is 1, then
708 // the saving can be 4 bytes (by using incl %eax).
709 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
710 if (Imm->getAPIntValue().isSignedIntN(8))
711 return false;
712
713 // If this is a 64-bit AND with an immediate that fits in 32-bits,
714 // prefer using the smaller and over folding the load. This is needed to
715 // make sure immediates created by shrinkAndImmediate are always folded.
716 // Ideally we would narrow the load during DAG combine and get the
717 // best of both worlds.
718 if (U->getOpcode() == ISD::AND &&
719 Imm->getAPIntValue().getBitWidth() == 64 &&
720 Imm->getAPIntValue().isIntN(32))
721 return false;
722
723 // If this really a zext_inreg that can be represented with a movzx
724 // instruction, prefer that.
725 // TODO: We could shrink the load and fold if it is non-volatile.
726 if (U->getOpcode() == ISD::AND &&
727 (Imm->getAPIntValue() == UINT8_MAX ||
728 Imm->getAPIntValue() == UINT16_MAX ||
729 Imm->getAPIntValue() == UINT32_MAX))
730 return false;
731
732 // ADD/SUB with can negate the immediate and use the opposite operation
733 // to fit 128 into a sign extended 8 bit immediate.
734 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
735 (-Imm->getAPIntValue()).isSignedIntN(8))
736 return false;
737
738 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
739 (-Imm->getAPIntValue()).isSignedIntN(8) &&
740 hasNoCarryFlagUses(SDValue(U, 1)))
741 return false;
742 }
743
744 // If the other operand is a TLS address, we should fold it instead.
745 // This produces
746 // movl %gs:0, %eax
747 // leal i@NTPOFF(%eax), %eax
748 // instead of
749 // movl $i@NTPOFF, %eax
750 // addl %gs:0, %eax
751 // if the block also has an access to a second TLS address this will save
752 // a load.
753 // FIXME: This is probably also true for non-TLS addresses.
754 if (Op1.getOpcode() == X86ISD::Wrapper) {
755 SDValue Val = Op1.getOperand(0);
757 return false;
758 }
759
760 // Don't fold load if this matches the BTS/BTR/BTC patterns.
761 // BTS: (or X, (shl 1, n))
762 // BTR: (and X, (rotl -2, n))
763 // BTC: (xor X, (shl 1, n))
764 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
765 if (U->getOperand(0).getOpcode() == ISD::SHL &&
766 isOneConstant(U->getOperand(0).getOperand(0)))
767 return false;
768
769 if (U->getOperand(1).getOpcode() == ISD::SHL &&
770 isOneConstant(U->getOperand(1).getOperand(0)))
771 return false;
772 }
773 if (U->getOpcode() == ISD::AND) {
774 SDValue U0 = U->getOperand(0);
775 SDValue U1 = U->getOperand(1);
776 if (U0.getOpcode() == ISD::ROTL) {
777 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
778 if (C && C->getSExtValue() == -2)
779 return false;
780 }
781
782 if (U1.getOpcode() == ISD::ROTL) {
783 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
784 if (C && C->getSExtValue() == -2)
785 return false;
786 }
787 }
788
789 break;
790 }
791 case ISD::SHL:
792 case ISD::SRA:
793 case ISD::SRL:
794 // Don't fold a load into a shift by immediate. The BMI2 instructions
795 // support folding a load, but not an immediate. The legacy instructions
796 // support folding an immediate, but can't fold a load. Folding an
797 // immediate is preferable to folding a load.
798 if (isa<ConstantSDNode>(U->getOperand(1)))
799 return false;
800
801 break;
802 }
803 }
804
805 // Prevent folding a load if this can implemented with an insert_subreg or
806 // a move that implicitly zeroes.
807 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
808 isNullConstant(Root->getOperand(2)) &&
809 (Root->getOperand(0).isUndef() ||
811 return false;
812
813 return true;
814}
815
816// Indicates it is profitable to form an AVX512 masked operation. Returning
817// false will favor a masked register-register masked move or vblendm and the
818// operation will be selected separately.
819bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
820 assert(
821 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
822 "Unexpected opcode!");
823
824 // If the operation has additional users, the operation will be duplicated.
825 // Check the use count to prevent that.
826 // FIXME: Are there cheap opcodes we might want to duplicate?
827 return N->getOperand(1).hasOneUse();
828}
829
830/// Replace the original chain operand of the call with
831/// load's chain operand and move load below the call's chain operand.
832static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
833 SDValue Call, SDValue OrigChain) {
835 SDValue Chain = OrigChain.getOperand(0);
836 if (Chain.getNode() == Load.getNode())
837 Ops.push_back(Load.getOperand(0));
838 else {
839 assert(Chain.getOpcode() == ISD::TokenFactor &&
840 "Unexpected chain operand");
841 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
842 if (Chain.getOperand(i).getNode() == Load.getNode())
843 Ops.push_back(Load.getOperand(0));
844 else
845 Ops.push_back(Chain.getOperand(i));
846 SDValue NewChain =
847 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
848 Ops.clear();
849 Ops.push_back(NewChain);
850 }
851 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
852 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
853 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
854 Load.getOperand(1), Load.getOperand(2));
855
856 Ops.clear();
857 Ops.push_back(SDValue(Load.getNode(), 1));
858 Ops.append(Call->op_begin() + 1, Call->op_end());
859 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
860}
861
862/// Return true if call address is a load and it can be
863/// moved below CALLSEQ_START and the chains leading up to the call.
864/// Return the CALLSEQ_START by reference as a second output.
865/// In the case of a tail call, there isn't a callseq node between the call
866/// chain and the load.
867static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
868 // The transformation is somewhat dangerous if the call's chain was glued to
869 // the call. After MoveBelowOrigChain the load is moved between the call and
870 // the chain, this can create a cycle if the load is not folded. So it is
871 // *really* important that we are sure the load will be folded.
872 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
873 return false;
874 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
875 if (!LD ||
876 !LD->isSimple() ||
877 LD->getAddressingMode() != ISD::UNINDEXED ||
878 LD->getExtensionType() != ISD::NON_EXTLOAD)
879 return false;
880
881 // Now let's find the callseq_start.
882 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
883 if (!Chain.hasOneUse())
884 return false;
885 Chain = Chain.getOperand(0);
886 }
887
888 if (!Chain.getNumOperands())
889 return false;
890 // Since we are not checking for AA here, conservatively abort if the chain
891 // writes to memory. It's not safe to move the callee (a load) across a store.
892 if (isa<MemSDNode>(Chain.getNode()) &&
893 cast<MemSDNode>(Chain.getNode())->writeMem())
894 return false;
895 if (Chain.getOperand(0).getNode() == Callee.getNode())
896 return true;
897 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
898 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
899 Callee.getValue(1).hasOneUse())
900 return true;
901 return false;
902}
903
/// Returns true when \p Imm ends in the bytes 0F 1E FA and, scanning the
/// more significant bytes upwards, an F3 byte is reached after nothing but
/// optional prefix bytes, e.g. 0xF3660F1EFA or 0xF3670F1EFA.
static bool isEndbrImm64(uint64_t Imm) {
  // The low 24 bits must be exactly 0x0F1EFA.
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  // Walk the remaining five bytes from least to most significant.
  for (unsigned Shift = 24; Shift < 64; Shift += 8) {
    switch (static_cast<uint8_t>(Imm >> Shift)) {
    case 0xF3:
      return true;
    // Prefix bytes that may sit between 0xF3 and 0x0F1EFA: keep scanning.
    case 0x26:
    case 0x2e:
    case 0x36:
    case 0x3e:
    case 0x64:
    case 0x65:
    case 0x66:
    case 0x67:
    case 0xf0:
    case 0xf2:
      continue;
    default:
      return false;
    }
  }

  // Ran out of bytes without meeting 0xF3.
  return false;
}
924
925static bool needBWI(MVT VT) {
926 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
927}
928
929void X86DAGToDAGISel::PreprocessISelDAG() {
930 bool MadeChange = false;
931 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
932 E = CurDAG->allnodes_end(); I != E; ) {
933 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
934
935 // This is for CET enhancement.
936 //
937 // ENDBR32 and ENDBR64 have specific opcodes:
938 // ENDBR32: F3 0F 1E FB
939 // ENDBR64: F3 0F 1E FA
940 // And we want that attackers won’t find unintended ENDBR32/64
941 // opcode matches in the binary
942 // Here’s an example:
943 // If the compiler had to generate asm for the following code:
944 // a = 0xF30F1EFA
945 // it could, for example, generate:
946 // mov 0xF30F1EFA, dword ptr[a]
947 // In such a case, the binary would include a gadget that starts
948 // with a fake ENDBR64 opcode. Therefore, we split such generation
949 // into multiple operations, let it not shows in the binary
950 if (N->getOpcode() == ISD::Constant) {
951 MVT VT = N->getSimpleValueType(0);
952 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
953 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
954 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
955 // Check that the cf-protection-branch is enabled.
956 Metadata *CFProtectionBranch =
958 "cf-protection-branch");
959 if (CFProtectionBranch || IndirectBranchTracking) {
960 SDLoc dl(N);
961 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
962 Complement = CurDAG->getNOT(dl, Complement, VT);
963 --I;
964 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
965 ++I;
966 MadeChange = true;
967 continue;
968 }
969 }
970 }
971
972 // If this is a target specific AND node with no flag usages, turn it back
973 // into ISD::AND to enable test instruction matching.
974 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
975 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
976 N->getOperand(0), N->getOperand(1));
977 --I;
978 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
979 ++I;
980 MadeChange = true;
981 continue;
982 }
983
984 // Convert vector increment or decrement to sub/add with an all-ones
985 // constant:
986 // add X, <1, 1...> --> sub X, <-1, -1...>
987 // sub X, <1, 1...> --> add X, <-1, -1...>
988 // The all-ones vector constant can be materialized using a pcmpeq
989 // instruction that is commonly recognized as an idiom (has no register
990 // dependency), so that's better/smaller than loading a splat 1 constant.
991 //
992 // But don't do this if it would inhibit a potentially profitable load
993 // folding opportunity for the other operand. That only occurs with the
994 // intersection of:
995 // (1) The other operand (op0) is load foldable.
996 // (2) The op is an add (otherwise, we are *creating* an add and can still
997 // load fold the other op).
998 // (3) The target has AVX (otherwise, we have a destructive add and can't
999 // load fold the other op without killing the constant op).
1000 // (4) The constant 1 vector has multiple uses (so it is profitable to load
1001 // into a register anyway).
1002 auto mayPreventLoadFold = [&]() {
1003 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1004 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1005 !N->getOperand(1).hasOneUse();
1006 };
1007 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1008 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1009 APInt SplatVal;
1010 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1011 SplatVal.isOne()) {
1012 SDLoc DL(N);
1013
1014 MVT VT = N->getSimpleValueType(0);
1015 unsigned NumElts = VT.getSizeInBits() / 32;
1017 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1018 AllOnes = CurDAG->getBitcast(VT, AllOnes);
1019
1020 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1021 SDValue Res =
1022 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1023 --I;
1024 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1025 ++I;
1026 MadeChange = true;
1027 continue;
1028 }
1029 }
1030
1031 switch (N->getOpcode()) {
1032 case X86ISD::VBROADCAST: {
1033 MVT VT = N->getSimpleValueType(0);
1034 // Emulate v32i16/v64i8 broadcast without BWI.
1035 if (!Subtarget->hasBWI() && needBWI(VT)) {
1036 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1037 SDLoc dl(N);
1038 SDValue NarrowBCast =
1039 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1040 SDValue Res =
1041 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1042 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1043 unsigned Index = NarrowVT.getVectorMinNumElements();
1044 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1045 CurDAG->getIntPtrConstant(Index, dl));
1046
1047 --I;
1048 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1049 ++I;
1050 MadeChange = true;
1051 continue;
1052 }
1053
1054 break;
1055 }
1057 MVT VT = N->getSimpleValueType(0);
1058 // Emulate v32i16/v64i8 broadcast without BWI.
1059 if (!Subtarget->hasBWI() && needBWI(VT)) {
1060 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1061 auto *MemNode = cast<MemSDNode>(N);
1062 SDLoc dl(N);
1063 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1064 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1065 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1066 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1067 MemNode->getMemOperand());
1068 SDValue Res =
1069 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1070 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1071 unsigned Index = NarrowVT.getVectorMinNumElements();
1072 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1073 CurDAG->getIntPtrConstant(Index, dl));
1074
1075 --I;
1076 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1077 CurDAG->ReplaceAllUsesWith(N, To);
1078 ++I;
1079 MadeChange = true;
1080 continue;
1081 }
1082
1083 break;
1084 }
1085 case ISD::LOAD: {
1086 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1087 // load, then just extract the lower subvector and avoid the second load.
1088 auto *Ld = cast<LoadSDNode>(N);
1089 MVT VT = N->getSimpleValueType(0);
1090 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1091 !(VT.is128BitVector() || VT.is256BitVector()))
1092 break;
1093
1094 MVT MaxVT = VT;
1095 SDNode *MaxLd = nullptr;
1096 SDValue Ptr = Ld->getBasePtr();
1097 SDValue Chain = Ld->getChain();
1098 for (SDNode *User : Ptr->users()) {
1099 auto *UserLd = dyn_cast<LoadSDNode>(User);
1100 MVT UserVT = User->getSimpleValueType(0);
1101 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1102 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1103 !User->hasAnyUseOfValue(1) &&
1104 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1105 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1106 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1107 MaxLd = User;
1108 MaxVT = UserVT;
1109 }
1110 }
1111 if (MaxLd) {
1112 SDLoc dl(N);
1113 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1114 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1115 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1116 SDValue(MaxLd, 0),
1117 CurDAG->getIntPtrConstant(0, dl));
1118 SDValue Res = CurDAG->getBitcast(VT, Extract);
1119
1120 --I;
1121 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1122 CurDAG->ReplaceAllUsesWith(N, To);
1123 ++I;
1124 MadeChange = true;
1125 continue;
1126 }
1127 break;
1128 }
1129 case ISD::VSELECT: {
1130 // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
1131 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1132 if (EleVT == MVT::i1)
1133 break;
1134
1135 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1136 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1137 "We can't replace VSELECT with BLENDV in vXi16!");
1138 SDValue R;
1139 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1140 EleVT.getSizeInBits()) {
1141 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1142 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1143 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1144 } else {
1145 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1146 N->getOperand(0), N->getOperand(1),
1147 N->getOperand(2));
1148 }
1149 --I;
1150 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1151 ++I;
1152 MadeChange = true;
1153 continue;
1154 }
1155 case ISD::FP_ROUND:
1157 case ISD::FP_TO_SINT:
1158 case ISD::FP_TO_UINT:
1161 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1162 // don't need 2 sets of patterns.
1163 if (!N->getSimpleValueType(0).isVector())
1164 break;
1165
1166 unsigned NewOpc;
1167 switch (N->getOpcode()) {
1168 default: llvm_unreachable("Unexpected opcode!");
1169 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1170 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1171 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1172 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1173 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1174 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1175 }
1176 SDValue Res;
1177 if (N->isStrictFPOpcode())
1178 Res =
1179 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1180 {N->getOperand(0), N->getOperand(1)});
1181 else
1182 Res =
1183 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1184 N->getOperand(0));
1185 --I;
1186 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1187 ++I;
1188 MadeChange = true;
1189 continue;
1190 }
1191 case ISD::SHL:
1192 case ISD::SRA:
1193 case ISD::SRL: {
1194 // Replace vector shifts with their X86 specific equivalent so we don't
1195 // need 2 sets of patterns.
1196 if (!N->getValueType(0).isVector())
1197 break;
1198
1199 unsigned NewOpc;
1200 switch (N->getOpcode()) {
1201 default: llvm_unreachable("Unexpected opcode!");
1202 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1203 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1204 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1205 }
1206 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1207 N->getOperand(0), N->getOperand(1));
1208 --I;
1209 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1210 ++I;
1211 MadeChange = true;
1212 continue;
1213 }
1214 case ISD::ANY_EXTEND:
1216 // Replace vector any extend with the zero extend equivalents so we don't
1217 // need 2 sets of patterns. Ignore vXi1 extensions.
1218 if (!N->getValueType(0).isVector())
1219 break;
1220
1221 unsigned NewOpc;
1222 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1223 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1224 "Unexpected opcode for mask vector!");
1225 NewOpc = ISD::SIGN_EXTEND;
1226 } else {
1227 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1230 }
1231
1232 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1233 N->getOperand(0));
1234 --I;
1235 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1236 ++I;
1237 MadeChange = true;
1238 continue;
1239 }
1240 case ISD::FCEIL:
1241 case ISD::STRICT_FCEIL:
1242 case ISD::FFLOOR:
1243 case ISD::STRICT_FFLOOR:
1244 case ISD::FTRUNC:
1245 case ISD::STRICT_FTRUNC:
1246 case ISD::FROUNDEVEN:
1248 case ISD::FNEARBYINT:
1250 case ISD::FRINT:
1251 case ISD::STRICT_FRINT: {
1252 // Replace fp rounding with their X86 specific equivalent so we don't
1253 // need 2 sets of patterns.
1254 unsigned Imm;
1255 switch (N->getOpcode()) {
1256 default: llvm_unreachable("Unexpected opcode!");
1257 case ISD::STRICT_FCEIL:
1258 case ISD::FCEIL: Imm = 0xA; break;
1259 case ISD::STRICT_FFLOOR:
1260 case ISD::FFLOOR: Imm = 0x9; break;
1261 case ISD::STRICT_FTRUNC:
1262 case ISD::FTRUNC: Imm = 0xB; break;
1264 case ISD::FROUNDEVEN: Imm = 0x8; break;
1266 case ISD::FNEARBYINT: Imm = 0xC; break;
1267 case ISD::STRICT_FRINT:
1268 case ISD::FRINT: Imm = 0x4; break;
1269 }
1270 SDLoc dl(N);
1271 bool IsStrict = N->isStrictFPOpcode();
1272 SDValue Res;
1273 if (IsStrict)
1274 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1275 {N->getValueType(0), MVT::Other},
1276 {N->getOperand(0), N->getOperand(1),
1277 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1278 else
1279 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1280 N->getOperand(0),
1281 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1282 --I;
1283 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1284 ++I;
1285 MadeChange = true;
1286 continue;
1287 }
1288 case X86ISD::FANDN:
1289 case X86ISD::FAND:
1290 case X86ISD::FOR:
1291 case X86ISD::FXOR: {
1292 // Widen scalar fp logic ops to vector to reduce isel patterns.
1293 // FIXME: Can we do this during lowering/combine.
1294 MVT VT = N->getSimpleValueType(0);
1295 if (VT.isVector() || VT == MVT::f128)
1296 break;
1297
1298 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1299 : VT == MVT::f32 ? MVT::v4f32
1300 : MVT::v8f16;
1301
1302 SDLoc dl(N);
1303 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1304 N->getOperand(0));
1305 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1306 N->getOperand(1));
1307
1308 SDValue Res;
1309 if (Subtarget->hasSSE2()) {
1310 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1311 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1312 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1313 unsigned Opc;
1314 switch (N->getOpcode()) {
1315 default: llvm_unreachable("Unexpected opcode!");
1316 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1317 case X86ISD::FAND: Opc = ISD::AND; break;
1318 case X86ISD::FOR: Opc = ISD::OR; break;
1319 case X86ISD::FXOR: Opc = ISD::XOR; break;
1320 }
1321 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1322 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1323 } else {
1324 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1325 }
1326 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1327 CurDAG->getIntPtrConstant(0, dl));
1328 --I;
1329 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1330 ++I;
1331 MadeChange = true;
1332 continue;
1333 }
1334 }
1335
1336 if (OptLevel != CodeGenOptLevel::None &&
1337 // Only do this when the target can fold the load into the call or
1338 // jmp.
1339 !Subtarget->useIndirectThunkCalls() &&
1340 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1341 (N->getOpcode() == X86ISD::TC_RETURN &&
1342 (Subtarget->is64Bit() ||
1343 !getTargetMachine().isPositionIndependent())))) {
1344 /// Also try moving call address load from outside callseq_start to just
1345 /// before the call to allow it to be folded.
1346 ///
1347 /// [Load chain]
1348 /// ^
1349 /// |
1350 /// [Load]
1351 /// ^ ^
1352 /// | |
1353 /// / \--
1354 /// / |
1355 ///[CALLSEQ_START] |
1356 /// ^ |
1357 /// | |
1358 /// [LOAD/C2Reg] |
1359 /// | |
1360 /// \ /
1361 /// \ /
1362 /// [CALL]
1363 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1364 SDValue Chain = N->getOperand(0);
1365 SDValue Load = N->getOperand(1);
1366 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1367 continue;
1368 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1369 ++NumLoadMoved;
1370 MadeChange = true;
1371 continue;
1372 }
1373
1374 // Lower fpround and fpextend nodes that target the FP stack to be store and
1375 // load to the stack. This is a gross hack. We would like to simply mark
1376 // these as being illegal, but when we do that, legalize produces these when
1377 // it expands calls, then expands these in the same legalize pass. We would
1378 // like dag combine to be able to hack on these between the call expansion
1379 // and the node legalization. As such this pass basically does "really
1380 // late" legalization of these inline with the X86 isel pass.
1381 // FIXME: This should only happen when not compiled with -O0.
1382 switch (N->getOpcode()) {
1383 default: continue;
1384 case ISD::FP_ROUND:
1385 case ISD::FP_EXTEND:
1386 {
1387 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1388 MVT DstVT = N->getSimpleValueType(0);
1389
1390 // If any of the sources are vectors, no fp stack involved.
1391 if (SrcVT.isVector() || DstVT.isVector())
1392 continue;
1393
1394 // If the source and destination are SSE registers, then this is a legal
1395 // conversion that should not be lowered.
1396 const X86TargetLowering *X86Lowering =
1397 static_cast<const X86TargetLowering *>(TLI);
1398 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1399 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1400 if (SrcIsSSE && DstIsSSE)
1401 continue;
1402
1403 if (!SrcIsSSE && !DstIsSSE) {
1404 // If this is an FPStack extension, it is a noop.
1405 if (N->getOpcode() == ISD::FP_EXTEND)
1406 continue;
1407 // If this is a value-preserving FPStack truncation, it is a noop.
1408 if (N->getConstantOperandVal(1))
1409 continue;
1410 }
1411
1412 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1413 // FPStack has extload and truncstore. SSE can fold direct loads into other
1414 // operations. Based on this, decide what we want to do.
1415 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1416 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1417 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1418 MachinePointerInfo MPI =
1419 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1420 SDLoc dl(N);
1421
1422 // FIXME: optimize the case where the src/dest is a load or store?
1423
1424 SDValue Store = CurDAG->getTruncStore(
1425 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1426 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1427 MemTmp, MPI, MemVT);
1428
1429 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1430 // extload we created. This will cause general havok on the dag because
1431 // anything below the conversion could be folded into other existing nodes.
1432 // To avoid invalidating 'I', back it up to the convert node.
1433 --I;
1434 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1435 break;
1436 }
1437
1438 //The sequence of events for lowering STRICT_FP versions of these nodes requires
1439 //dealing with the chain differently, as there is already a preexisting chain.
1442 {
1443 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1444 MVT DstVT = N->getSimpleValueType(0);
1445
1446 // If any of the sources are vectors, no fp stack involved.
1447 if (SrcVT.isVector() || DstVT.isVector())
1448 continue;
1449
1450 // If the source and destination are SSE registers, then this is a legal
1451 // conversion that should not be lowered.
1452 const X86TargetLowering *X86Lowering =
1453 static_cast<const X86TargetLowering *>(TLI);
1454 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1455 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1456 if (SrcIsSSE && DstIsSSE)
1457 continue;
1458
1459 if (!SrcIsSSE && !DstIsSSE) {
1460 // If this is an FPStack extension, it is a noop.
1461 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1462 continue;
1463 // If this is a value-preserving FPStack truncation, it is a noop.
1464 if (N->getConstantOperandVal(2))
1465 continue;
1466 }
1467
1468 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1469 // FPStack has extload and truncstore. SSE can fold direct loads into other
1470 // operations. Based on this, decide what we want to do.
1471 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1472 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1473 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1474 MachinePointerInfo MPI =
1475 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1476 SDLoc dl(N);
1477
1478 // FIXME: optimize the case where the src/dest is a load or store?
1479
1480 //Since the operation is StrictFP, use the preexisting chain.
1482 if (!SrcIsSSE) {
1483 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1484 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1485 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1486 MPI, /*Align*/ std::nullopt,
1488 if (N->getFlags().hasNoFPExcept()) {
1489 SDNodeFlags Flags = Store->getFlags();
1490 Flags.setNoFPExcept(true);
1491 Store->setFlags(Flags);
1492 }
1493 } else {
1494 assert(SrcVT == MemVT && "Unexpected VT!");
1495 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1496 MPI);
1497 }
1498
1499 if (!DstIsSSE) {
1500 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1501 SDValue Ops[] = {Store, MemTmp};
1502 Result = CurDAG->getMemIntrinsicNode(
1503 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1504 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1505 if (N->getFlags().hasNoFPExcept()) {
1506 SDNodeFlags Flags = Result->getFlags();
1507 Flags.setNoFPExcept(true);
1508 Result->setFlags(Flags);
1509 }
1510 } else {
1511 assert(DstVT == MemVT && "Unexpected VT!");
1512 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1513 }
1514
1515 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1516 // extload we created. This will cause general havok on the dag because
1517 // anything below the conversion could be folded into other existing nodes.
1518 // To avoid invalidating 'I', back it up to the convert node.
1519 --I;
1520 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1521 break;
1522 }
1523 }
1524
1525
1526 // Now that we did that, the node is dead. Increment the iterator to the
1527 // next node to process, then delete N.
1528 ++I;
1529 MadeChange = true;
1530 }
1531
1532 // Remove any dead nodes that may have been left behind.
1533 if (MadeChange)
1534 CurDAG->RemoveDeadNodes();
1535}
1536
1537// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1538bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1539 unsigned Opc = N->getMachineOpcode();
1540 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1541 Opc != X86::MOVSX64rr8)
1542 return false;
1543
1544 SDValue N0 = N->getOperand(0);
1545
1546 // We need to be extracting the lower bit of an extend.
1547 if (!N0.isMachineOpcode() ||
1548 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1549 N0.getConstantOperandVal(1) != X86::sub_8bit)
1550 return false;
1551
1552 // We're looking for either a movsx or movzx to match the original opcode.
1553 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1554 : X86::MOVSX32rr8_NOREX;
1555 SDValue N00 = N0.getOperand(0);
1556 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1557 return false;
1558
1559 if (Opc == X86::MOVSX64rr8) {
1560 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1561 // to 64.
1562 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1563 MVT::i64, N00);
1564 ReplaceUses(N, Extend);
1565 } else {
1566 // Ok we can drop this extend and just use the original extend.
1567 ReplaceUses(N, N00.getNode());
1568 }
1569
1570 return true;
1571}
1572
1573void X86DAGToDAGISel::PostprocessISelDAG() {
1574 // Skip peepholes at -O0.
1575 if (TM.getOptLevel() == CodeGenOptLevel::None)
1576 return;
1577
1578 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1579
1580 bool MadeChange = false;
1581 while (Position != CurDAG->allnodes_begin()) {
1582 SDNode *N = &*--Position;
1583 // Skip dead nodes and any non-machine opcodes.
1584 if (N->use_empty() || !N->isMachineOpcode())
1585 continue;
1586
1587 if (tryOptimizeRem8Extend(N)) {
1588 MadeChange = true;
1589 continue;
1590 }
1591
1592 unsigned Opc = N->getMachineOpcode();
1593 switch (Opc) {
1594 default:
1595 continue;
1596 // ANDrr/rm + TESTrr+ -> TESTrr/TESTmr
1597 case X86::TEST8rr:
1598 case X86::TEST16rr:
1599 case X86::TEST32rr:
1600 case X86::TEST64rr:
1601 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1602 case X86::CTEST8rr:
1603 case X86::CTEST16rr:
1604 case X86::CTEST32rr:
1605 case X86::CTEST64rr: {
1606 auto &Op0 = N->getOperand(0);
1607 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1608 !Op0.isMachineOpcode())
1609 continue;
1610 SDValue And = N->getOperand(0);
1611#define CASE_ND(OP) \
1612 case X86::OP: \
1613 case X86::OP##_ND:
1614 switch (And.getMachineOpcode()) {
1615 default:
1616 continue;
1617 CASE_ND(AND8rr)
1618 CASE_ND(AND16rr)
1619 CASE_ND(AND32rr)
1620 CASE_ND(AND64rr) {
1621 if (And->hasAnyUseOfValue(1))
1622 continue;
1623 SmallVector<SDValue> Ops(N->op_values());
1624 Ops[0] = And.getOperand(0);
1625 Ops[1] = And.getOperand(1);
1627 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1628 ReplaceUses(N, Test);
1629 MadeChange = true;
1630 continue;
1631 }
1632 CASE_ND(AND8rm)
1633 CASE_ND(AND16rm)
1634 CASE_ND(AND32rm)
1635 CASE_ND(AND64rm) {
1636 if (And->hasAnyUseOfValue(1))
1637 continue;
1638 unsigned NewOpc;
1639 bool IsCTESTCC = X86::isCTESTCC(Opc);
1640#define FROM_TO(A, B) \
1641 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1642 break;
1643 switch (And.getMachineOpcode()) {
1644 FROM_TO(AND8rm, TEST8mr);
1645 FROM_TO(AND16rm, TEST16mr);
1646 FROM_TO(AND32rm, TEST32mr);
1647 FROM_TO(AND64rm, TEST64mr);
1648 }
1649#undef FROM_TO
1650#undef CASE_ND
1651 // Need to swap the memory and register operand.
1652 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1653 And.getOperand(3), And.getOperand(4),
1654 And.getOperand(5), And.getOperand(0)};
1655 // CC, Cflags.
1656 if (IsCTESTCC) {
1657 Ops.push_back(N->getOperand(2));
1658 Ops.push_back(N->getOperand(3));
1659 }
1660 // Chain of memory load
1661 Ops.push_back(And.getOperand(6));
1662 // Glue
1663 if (IsCTESTCC)
1664 Ops.push_back(N->getOperand(4));
1665
1666 MachineSDNode *Test = CurDAG->getMachineNode(
1667 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1668 CurDAG->setNodeMemRefs(
1669 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1670 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1671 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1672 MadeChange = true;
1673 continue;
1674 }
1675 }
1676 }
1677 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1678 // used. We're doing this late so we can prefer to fold the AND into masked
1679 // comparisons. Doing that can be better for the live range of the mask
1680 // register.
1681 case X86::KORTESTBkk:
1682 case X86::KORTESTWkk:
1683 case X86::KORTESTDkk:
1684 case X86::KORTESTQkk: {
1685 SDValue Op0 = N->getOperand(0);
1686 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1687 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1688 continue;
1689#define CASE(A) \
1690 case X86::A: \
1691 break;
1692 switch (Op0.getMachineOpcode()) {
1693 default:
1694 continue;
1695 CASE(KANDBkk)
1696 CASE(KANDWkk)
1697 CASE(KANDDkk)
1698 CASE(KANDQkk)
1699 }
1700 unsigned NewOpc;
1701#define FROM_TO(A, B) \
1702 case X86::A: \
1703 NewOpc = X86::B; \
1704 break;
1705 switch (Opc) {
1706 FROM_TO(KORTESTBkk, KTESTBkk)
1707 FROM_TO(KORTESTWkk, KTESTWkk)
1708 FROM_TO(KORTESTDkk, KTESTDkk)
1709 FROM_TO(KORTESTQkk, KTESTQkk)
1710 }
1711 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1712 // KAND instructions and KTEST use the same ISA feature.
1713 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1714 continue;
1715#undef FROM_TO
1716 MachineSDNode *KTest = CurDAG->getMachineNode(
1717 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1718 ReplaceUses(N, KTest);
1719 MadeChange = true;
1720 continue;
1721 }
1722 // Attempt to remove vectors moves that were inserted to zero upper bits.
1723 case TargetOpcode::SUBREG_TO_REG: {
1724 unsigned SubRegIdx = N->getConstantOperandVal(2);
1725 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1726 continue;
1727
1728 SDValue Move = N->getOperand(1);
1729 if (!Move.isMachineOpcode())
1730 continue;
1731
1732 // Make sure its one of the move opcodes we recognize.
1733 switch (Move.getMachineOpcode()) {
1734 default:
1735 continue;
1736 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1737 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1738 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1739 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1740 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1741 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1742 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1743 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1744 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1745 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1746 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1747 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1748 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1749 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1750 }
1751#undef CASE
1752
1753 SDValue In = Move.getOperand(0);
1754 if (!In.isMachineOpcode() ||
1755 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1756 continue;
1757
1758 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1759 // the SHA instructions which use a legacy encoding.
1760 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1761 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1762 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1763 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1764 continue;
1765
1766 // Producing instruction is another vector instruction. We can drop the
1767 // move.
1768 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1769 MadeChange = true;
1770 }
1771 }
1772 }
1773
1774 if (MadeChange)
1775 CurDAG->RemoveDeadNodes();
1776}
1777
1778
1779/// Emit any code that needs to be executed only in the main function.
1780void X86DAGToDAGISel::emitSpecialCodeForMain() {
1781 if (Subtarget->isTargetCygMing()) {
1783 auto &DL = CurDAG->getDataLayout();
1784
1786 CLI.setChain(CurDAG->getRoot())
1787 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1788 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1789 std::move(Args));
1790 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1791 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1792 CurDAG->setRoot(Result.second);
1793 }
1794}
1795
1796void X86DAGToDAGISel::emitFunctionEntryCode() {
1797 // If this is main, emit special code for main.
1798 const Function &F = MF->getFunction();
1799 if (F.hasExternalLinkage() && F.getName() == "main")
1800 emitSpecialCodeForMain();
1801}
1802
1803static bool isDispSafeForFrameIndex(int64_t Val) {
1804 // On 64-bit platforms, we can run into an issue where a frame index
1805 // includes a displacement that, when added to the explicit displacement,
1806 // will overflow the displacement field. Assuming that the frame index
1807 // displacement fits into a 31-bit integer (which is only slightly more
1808 // aggressive than the current fundamental assumption that it fits into
1809 // a 32-bit integer), a 31-bit disp should always be safe.
1810 return isInt<31>(Val);
1811}
1812
// Tries to add Offset to the displacement already recorded in AM.
// Returns true, leaving AM untouched, if the combined displacement is
// rejected by one of the checks below; otherwise stores the sum in AM.Disp
// and returns false.
// NOTE(review): listing lines 1828 and 1834 are absent from this extract, so
// two of the conditions below are shown incomplete -- confirm against the
// upstream file before relying on them.
1813bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1814 X86ISelAddressMode &AM) {
1815 // We may have already matched a displacement and the caller just added the
1816 // symbolic displacement. So we still need to do the checks even if Offset
1817 // is zero.
1818
 // Offset is unsigned, so the sum is formed in uint64_t and then converted
 // to the signed Val.
1819 int64_t Val = AM.Disp + Offset;
1820
1821 // Cannot combine ExternalSymbol displacements with integer offsets.
1822 if (Val != 0 && (AM.ES || AM.MCSym))
1823 return true;
1824
1825 CodeModel::Model M = TM.getCodeModel();
1826 if (Subtarget->is64Bit()) {
 // NOTE(review): the middle of this condition (listing line 1828) is
 // missing from the extract.
1827 if (Val != 0 &&
1829 AM.hasSymbolicDisplacement()))
1830 return true;
1831 // In addition to the checks required for a register base, check that
1832 // we do not try to use an unsafe Disp with a frame index.
 // NOTE(review): the second half of this condition (listing line 1834) is
 // missing from the extract.
1833 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1835 return true;
1836 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1837 // 64 bits. Instructions with 32-bit register addresses perform this zero
1838 // extension for us and we can safely ignore the high bits of Offset.
1839 // Instructions with only a 32-bit immediate address do not, though: they
1840 // sign extend instead. This means only the low 2GB of address space is
1841 // directly addressable; we need indirect addressing for the high 2GB of
1842 // address space.
1843 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1844 // implicit zero extension of instructions would cover up any problem.
1845 // However, we have asserts elsewhere that get triggered if we do, so keep
1846 // the checks for now.
1847 // TODO: We would actually be able to accept these, as well as the same
1848 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1849 // to get an address size override to be emitted. However, this
1850 // pseudo-register is not part of any register class and therefore causes
1851 // MIR verification to fail.
1852 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1853 !AM.hasBaseOrIndexReg())
1854 return true;
1855 }
 // All checks passed: commit the combined displacement.
1856 AM.Disp = Val;
1857 return false;
1858}
1859
// Tries to fold a load from address 0 in the GS or FS address space into AM
// as a segment register. On success sets AM.Segment and returns false; in
// every other case returns true and leaves AM unchanged.
// AllowSegmentRegForX32 permits the fold on 64-bit ILP32 targets, where it is
// otherwise refused (see the body comment for why).
1860bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1861 bool AllowSegmentRegForX32) {
 // Operand 1 of the load node is used as the address being loaded from.
1862 SDValue Address = N->getOperand(1);
1863
1864 // load gs:0 -> GS segment register.
1865 // load fs:0 -> FS segment register.
1866 //
1867 // This optimization is generally valid because the GNU TLS model defines that
1868 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1869 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1870 // zero-extended to 64 bits and then added to the base address, which gives
1871 // unwanted results when the register holds a negative value.
1872 // For more information see http://people.redhat.com/drepper/tls.pdf
1873 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1874 !IndirectTlsSegRefs &&
1875 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1876 Subtarget->isTargetFuchsia())) {
1877 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1878 return true;
1879 switch (N->getPointerInfo().getAddrSpace()) {
1880 case X86AS::GS:
1881 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1882 return false;
1883 case X86AS::FS:
1884 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1885 return false;
1886 // Address space X86AS::SS is not handled here, because it is not used to
1887 // address TLS areas.
1888 }
1889 }
1890
 // Not a foldable segment-relative load.
1891 return true;
1892}
1893
1894/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1895/// mode. These wrap things that will resolve down into a symbol reference.
1896/// If no match is possible, this returns true, otherwise it returns false.
/// When the match fails after AM has been modified, AM is restored from a
/// local backup before returning.
1897bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1898 // If the addressing mode already has a symbol as the displacement, we can
1899 // never match another symbol.
1900 if (AM.hasSymbolicDisplacement())
1901 return true;
1902
1903 bool IsRIPRelTLS = false;
1904 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1905 if (IsRIPRel) {
1906 SDValue Val = N.getOperand(0);
 // NOTE(review): listing line 1907 is missing from this extract; presumably
 // it is a condition on Val guarding the assignment below -- confirm
 // against the upstream file.
1908 IsRIPRelTLS = true;
1909 }
1910
1911 // We can't use an addressing mode in the 64-bit large code model.
1912 // Global TLS addressing is an exception. In the medium code model,
1913 // we can use a mode when RIP wrappers are present.
1914 // That signifies access to globals that are known to be "near",
1915 // such as the GOT itself.
1916 CodeModel::Model M = TM.getCodeModel();
1917 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1918 return true;
1919
1920 // Base and index reg must be 0 in order to use %rip as base.
1921 if (IsRIPRel && AM.hasBaseOrIndexReg())
1922 return true;
1923
1924 // Make a local copy in case we can't do this fold.
1925 X86ISelAddressMode Backup = AM;
1926
 // Record the wrapped symbol in AM according to its node kind, and pick up
 // the constant offset for the node kinds that expose one via getOffset().
1927 int64_t Offset = 0;
1928 SDValue N0 = N.getOperand(0);
1929 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1930 AM.GV = G->getGlobal();
1931 AM.SymbolFlags = G->getTargetFlags();
1932 Offset = G->getOffset();
1933 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1934 AM.CP = CP->getConstVal();
1935 AM.Alignment = CP->getAlign();
1936 AM.SymbolFlags = CP->getTargetFlags();
1937 Offset = CP->getOffset();
1938 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1939 AM.ES = S->getSymbol();
1940 AM.SymbolFlags = S->getTargetFlags();
1941 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1942 AM.MCSym = S->getMCSymbol();
1943 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1944 AM.JT = J->getIndex();
1945 AM.SymbolFlags = J->getTargetFlags();
1946 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1947 AM.BlockAddr = BA->getBlockAddress();
1948 AM.SymbolFlags = BA->getTargetFlags();
1949 Offset = BA->getOffset();
1950 } else
1951 llvm_unreachable("Unhandled symbol reference node.");
1952
1953 // Can't use an addressing mode with large globals.
1954 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1955 TM.isLargeGlobalValue(AM.GV)) {
1956 AM = Backup;
1957 return true;
1958 }
1959
 // foldOffsetIntoAddress returns true on failure; undo the symbol fields
 // set above in that case.
1960 if (foldOffsetIntoAddress(Offset, AM)) {
1961 AM = Backup;
1962 return true;
1963 }
1964
1965 if (IsRIPRel)
1966 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1967
1968 // Commit the changes now that we know this fold is safe.
1969 return false;
1970}
1971
1972/// Add the specified node to the specified addressing mode, returning true if
1973/// it cannot be done. This just pattern matches for the addressing mode.
/// After the recursive match succeeds, three post-processing steps adjust AM;
/// the last of them can still make the function return true.
1974bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1975 if (matchAddressRecursively(N, AM, 0))
1976 return true;
1977
1978 // Post-processing: Make a second attempt to fold a load, if we now know
1979 // that there will not be any other register. This is only performed for
1980 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1981 // any foldable load the first time.
1982 if (Subtarget->isTarget64BitILP32() &&
1983 AM.BaseType == X86ISelAddressMode::RegBase &&
1984 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1985 SDValue Save_Base_Reg = AM.Base_Reg;
1986 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
 // Clear the base, then put it back if the load could not be folded
 // (matchLoadInAddress returns true on failure).
1987 AM.Base_Reg = SDValue();
1988 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1989 AM.Base_Reg = Save_Base_Reg;
1990 }
1991 }
1992
1993 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1994 // a smaller encoding and avoids a scaled-index.
1995 if (AM.Scale == 2 &&
1996 AM.BaseType == X86ISelAddressMode::RegBase &&
1997 AM.Base_Reg.getNode() == nullptr) {
1998 AM.Base_Reg = AM.IndexReg;
1999 AM.Scale = 1;
2000 }
2001
2002 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2003 // because it has a smaller encoding.
2004 if (TM.getCodeModel() != CodeModel::Large &&
2005 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2006 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2007 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2008 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2009 // However, when GV is a local function symbol and in the same section as
2010 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2011 // referencing GV+Disp generates a relocation referencing the section symbol
2012 // with an even smaller offset, which might underflow. We should bail out if
2013 // the negative offset is too close to INT32_MIN. Actually, we are more
2014 // conservative here, using a smaller magic number also used by
2015 // isOffsetSuitableForCodeModel.
 // The threshold below is -16 MiB (-16 * 1024 * 1024).
2016 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2017 return true;
2018
2019 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2020 }
2021
2022 return false;
2023}
2024
// Tries to fold the two-operand node N into AM. Returns false on success and
// true if nothing could be folded (AM is then restored to its entry state).
// Three strategies are tried in order: fold operand 0 then operand 1; fold
// operand 1 then operand 0; finally, if AM has neither base nor index
// register yet, use the two operands directly as base and index with scale 1.
// N is taken by reference and is refreshed from the handle on the last two
// exit paths, since the node may have been replaced during matching.
2025bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2026 unsigned Depth) {
2027 // Add an artificial use to this node so that we can keep track of
2028 // it if it gets CSE'd with a different node.
2029 HandleSDNode Handle(N);
2030
2031 X86ISelAddressMode Backup = AM;
2032 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2033 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2034 return false;
2035 AM = Backup;
2036
2037 // Try again after commuting the operands.
2038 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2039 Depth + 1) &&
2040 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2041 return false;
2042 AM = Backup;
2043
2044 // If we couldn't fold both operands into the address at the same time,
2045 // see if we can just put each operand into a register and fold at least
2046 // the add.
2047 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2048 !AM.Base_Reg.getNode() &&
2049 !AM.IndexReg.getNode()) {
2050 N = Handle.getValue();
2051 AM.Base_Reg = N.getOperand(0);
2052 AM.IndexReg = N.getOperand(1);
2053 AM.Scale = 1;
2054 return false;
2055 }
2056 N = Handle.getValue();
2057 return true;
2058}
2059
2060// Insert a node into the DAG at least before the Pos node's position. This
2061// will reposition the node as needed, and will assign it a node ID that is <=
2062// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2063// IDs! The selection DAG must no longer depend on their uniqueness when this
2064// is used.
// NOTE(review): listing lines 2067-2068 (the rest of the `if` condition) and
// 2075 (a statement after setNodeId) are missing from this extract -- confirm
// against the upstream file.
2065static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
 // A node ID of -1 is one of the cases that triggers repositioning.
2066 if (N->getNodeId() == -1 ||
2069 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2070 // Mark Node as invalid for pruning as after this it may be a successor to a
2071 // selected node but otherwise be in the same position of Pos.
2072 // Conservatively mark it with the same -abs(Id) to assure node id
2073 // invariant is preserved.
2074 N->setNodeId(Pos->getNodeId());
2076 }
2077}
2078
2079// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2080// safe. This allows us to convert the shift and and into an h-register
2081// extract and a scaled index. Returns false if the simplification is
2082// performed.
// On success N is replaced in the DAG by the new shift-left node, AM.IndexReg
// is set to the (zero-extended or truncated) masked value and AM.Scale to
// 1 << C1.
// NOTE(review): the first line of the signature (listing line 2083, which
// carries the function name and the DAG / N parameters used below) is missing
// from this extract.
2084 uint64_t Mask,
2085 SDValue Shift, SDValue X,
2086 X86ISelAddressMode &AM) {
 // Only a single-use logical right shift by a constant qualifies.
2087 if (Shift.getOpcode() != ISD::SRL ||
2088 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2089 !Shift.hasOneUse())
2090 return true;
2091
 // ScaleLog is C1 in the pattern above; it must be 1, 2 or 3, and the mask
 // must be exactly 0xff shifted left by that amount.
2092 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2093 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2094 Mask != (0xffu << ScaleLog))
2095 return true;
2096
2097 MVT XVT = X.getSimpleValueType();
2098 MVT VT = N.getSimpleValueType();
2099 SDLoc DL(N);
2100 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2101 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2102 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2103 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2104 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2105 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2106 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2107
2108 // Insert the new nodes into the topological ordering. We must do this in
2109 // a valid topological ordering as nothing is going to go back and re-sort
2110 // these nodes. We continually insert before 'N' in sequence as this is
2111 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2112 // hierarchy left to express.
2113 insertDAGNode(DAG, N, Eight);
2114 insertDAGNode(DAG, N, NewMask);
2115 insertDAGNode(DAG, N, Srl);
2116 insertDAGNode(DAG, N, And);
2117 insertDAGNode(DAG, N, Ext);
2118 insertDAGNode(DAG, N, ShlCount);
2119 insertDAGNode(DAG, N, Shl);
2120 DAG.ReplaceAllUsesWith(N, Shl);
2121 DAG.RemoveDeadNode(N.getNode());
 // The final shift-left is absorbed by the addressing-mode scale.
2122 AM.IndexReg = Ext;
2123 AM.Scale = (1 << ScaleLog);
2124 return false;
2125}
2126
2127// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2128// allows us to fold the shift into this addressing mode. Returns false if the
2129// transform succeeded.
// On success N is replaced in the DAG by the new shift node, AM.IndexReg is
// set to the new AND node and AM.Scale to 1 << C1.
// NOTE(review): the first line of the signature (listing line 2130, which
// carries the function name and the DAG / N parameters used below) is missing
// from this extract.
2131 X86ISelAddressMode &AM) {
2132 SDValue Shift = N.getOperand(0);
2133
2134 // Use a signed mask so that shifting right will insert sign bits. These
2135 // bits will be removed when we shift the result left so it doesn't matter
2136 // what we use. This might allow a smaller immediate encoding.
2137 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2138
2139 // If we have an any_extend feeding the AND, look through it to see if there
2140 // is a shift behind it. But only if the AND doesn't use the extended bits.
2141 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2142 bool FoundAnyExtend = false;
2143 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2144 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2145 isUInt<32>(Mask)) {
2146 FoundAnyExtend = true;
2147 Shift = Shift.getOperand(0);
2148 }
2149
2150 if (Shift.getOpcode() != ISD::SHL ||
2151 !isa<ConstantSDNode>(Shift.getOperand(1)))
2152 return true;
2153
2154 SDValue X = Shift.getOperand(0);
2155
2156 // Not likely to be profitable if either the AND or SHIFT node has more
2157 // than one use (unless all uses are for address computation). Besides,
2158 // isel mechanism requires their node ids to be reused.
2159 if (!N.hasOneUse() || !Shift.hasOneUse())
2160 return true;
2161
2162 // Verify that the shift amount is something we can fold.
2163 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2164 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2165 return true;
2166
2167 MVT VT = N.getSimpleValueType();
2168 SDLoc DL(N);
 // If an any_extend was looked through above, re-create it directly on X so
 // that the new AND is built in N's value type.
2169 if (FoundAnyExtend) {
2170 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2171 insertDAGNode(DAG, N, NewX);
2172 X = NewX;
2173 }
2174
2175 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2176 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2177 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2178
2179 // Insert the new nodes into the topological ordering. We must do this in
2180 // a valid topological ordering as nothing is going to go back and re-sort
2181 // these nodes. We continually insert before 'N' in sequence as this is
2182 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2183 // hierarchy left to express.
2184 insertDAGNode(DAG, N, NewMask);
2185 insertDAGNode(DAG, N, NewAnd);
2186 insertDAGNode(DAG, N, NewShift);
2187 DAG.ReplaceAllUsesWith(N, NewShift);
2188 DAG.RemoveDeadNode(N.getNode());
2189
2190 AM.Scale = 1 << ShiftAmt;
2191 AM.IndexReg = NewAnd;
2192 return false;
2193}
2194
2195// Implement some heroics to detect shifts of masked values where the mask can
2196// be replaced by extending the shift and undoing that in the addressing mode
2197// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2198// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2199// the addressing mode. This results in code such as:
2200//
2201// int f(short *y, int *lookup_table) {
2202// ...
2203// return *y + lookup_table[*y >> 11];
2204// }
2205//
2206// Turning into:
2207// movzwl (%rdi), %eax
2208// movl %eax, %ecx
2209// shrl $11, %ecx
2210// addl (%rsi,%rcx,4), %eax
2211//
2212// Instead of:
2213// movzwl (%rdi), %eax
2214// movl %eax, %ecx
2215// shrl $9, %ecx
2216// andl $124, %rcx
2217// addl (%rsi,%rcx), %eax
2218//
2219// Note that this function assumes the mask is provided as a mask *after* the
2220// value is shifted. The input chain may or may not match that, but computing
2221// such a mask is trivial.
//
// Returns false if the transform was performed (N is then replaced in the DAG
// and AM.IndexReg / AM.Scale are set), and true otherwise.
// NOTE(review): the first line of the signature (listing line 2222, which
// carries the function name and the DAG / N parameters used below) is missing
// from this extract.
2223 uint64_t Mask,
2224 SDValue Shift, SDValue X,
2225 X86ISelAddressMode &AM) {
2226 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2227 !isa<ConstantSDNode>(Shift.getOperand(1)))
2228 return true;
2229
2230 // We need to ensure that mask is a continuous run of bits.
2231 unsigned MaskIdx, MaskLen;
2232 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2233 return true;
 // Number of zero bits above the run, counted in a 64-bit word.
2234 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2235
2236 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2237
2238 // The amount of shift we're trying to fit into the addressing mode is taken
2239 // from the shifted mask index (number of trailing zeros of the mask).
2240 unsigned AMShiftAmt = MaskIdx;
2241
2242 // There is nothing we can do here unless the mask is removing some bits.
2243 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2244 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2245
2246 // Scale the leading zero count down based on the actual size of the value.
2247 // Also scale it down based on the size of the shift.
2248 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2249 if (MaskLZ < ScaleDown)
2250 return true;
2251 MaskLZ -= ScaleDown;
2252
2253 // The final check is to ensure that any masked out high bits of X are
2254 // already known to be zero. Otherwise, the mask has a semantic impact
2255 // other than masking out a couple of low bits. Unfortunately, because of
2256 // the mask, zero extensions will be removed from operands in some cases.
2257 // This code works extra hard to look through extensions because we can
2258 // replace them with zero extensions cheaply if necessary.
2259 bool ReplacingAnyExtend = false;
2260 if (X.getOpcode() == ISD::ANY_EXTEND) {
2261 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2262 X.getOperand(0).getSimpleValueType().getSizeInBits();
2263 // Assume that we'll replace the any-extend with a zero-extend, and
2264 // narrow the search to the extended value.
2265 X = X.getOperand(0);
 // Clamp at zero so the unsigned subtraction cannot wrap.
2266 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2267 ReplacingAnyExtend = true;
2268 }
2269 APInt MaskedHighBits =
2270 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2271 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2272 return true;
2273
2274 // We've identified a pattern that can be transformed into a single shift
2275 // and an addressing mode. Make it so.
2276 MVT VT = N.getSimpleValueType();
2277 if (ReplacingAnyExtend) {
2278 assert(X.getValueType() != VT);
2279 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2280 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2281 insertDAGNode(DAG, N, NewX);
2282 X = NewX;
2283 }
2284
2285 MVT XVT = X.getSimpleValueType();
2286 SDLoc DL(N);
 // Shift right by the original amount plus the scale shift, then shift left
 // by the scale shift; the left shift is what the addressing mode absorbs.
2287 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2288 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2289 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2290 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2291 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2292
2293 // Insert the new nodes into the topological ordering. We must do this in
2294 // a valid topological ordering as nothing is going to go back and re-sort
2295 // these nodes. We continually insert before 'N' in sequence as this is
2296 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2297 // hierarchy left to express.
2298 insertDAGNode(DAG, N, NewSRLAmt);
2299 insertDAGNode(DAG, N, NewSRL);
2300 insertDAGNode(DAG, N, NewExt);
2301 insertDAGNode(DAG, N, NewSHLAmt);
2302 insertDAGNode(DAG, N, NewSHL);
2303 DAG.ReplaceAllUsesWith(N, NewSHL);
2304 DAG.RemoveDeadNode(N.getNode());
2305
2306 AM.Scale = 1 << AMShiftAmt;
2307 AM.IndexReg = NewExt;
2308 return false;
2309}
2310
2311// Transform "(X >> SHIFT) & (MASK << C1)" to
2312// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2313// matched to a BEXTR later. Returns false if the simplification is performed.
// On success N is replaced in the DAG by the new shift-left node, AM.IndexReg
// is set to the (zero-extended or truncated) masked value and AM.Scale to
// 1 << C1.
// NOTE(review): the first line of the signature (listing line 2314, which
// carries the function name and the DAG / N parameters used below) is missing
// from this extract.
2315 uint64_t Mask,
2316 SDValue Shift, SDValue X,
2317 X86ISelAddressMode &AM,
2318 const X86Subtarget &Subtarget) {
 // Both the shift and N itself must be single-use, and the shift must be a
 // logical right shift by a constant.
2319 if (Shift.getOpcode() != ISD::SRL ||
2320 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2321 !Shift.hasOneUse() || !N.hasOneUse())
2322 return true;
2323
2324 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2325 if (!Subtarget.hasTBM() &&
2326 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2327 return true;
2328
2329 // We need to ensure that mask is a continuous run of bits.
2330 unsigned MaskIdx, MaskLen;
2331 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2332 return true;
2333
2334 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2335
2336 // The amount of shift we're trying to fit into the addressing mode is taken
2337 // from the shifted mask index (number of trailing zeros of the mask).
2338 unsigned AMShiftAmt = MaskIdx;
2339
2340 // There is nothing we can do here unless the mask is removing some bits.
2341 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2342 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2343
2344 MVT XVT = X.getSimpleValueType();
2345 MVT VT = N.getSimpleValueType();
2346 SDLoc DL(N);
2347 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2348 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
 // The mask is moved down by the same amount that is added to the shift.
2349 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2350 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2351 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2352 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2353 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2354
2355 // Insert the new nodes into the topological ordering. We must do this in
2356 // a valid topological ordering as nothing is going to go back and re-sort
2357 // these nodes. We continually insert before 'N' in sequence as this is
2358 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2359 // hierarchy left to express.
2360 insertDAGNode(DAG, N, NewSRLAmt);
2361 insertDAGNode(DAG, N, NewSRL);
2362 insertDAGNode(DAG, N, NewMask);
2363 insertDAGNode(DAG, N, NewAnd);
2364 insertDAGNode(DAG, N, NewExt);
2365 insertDAGNode(DAG, N, NewSHLAmt);
2366 insertDAGNode(DAG, N, NewSHL);
2367 DAG.ReplaceAllUsesWith(N, NewSHL);
2368 DAG.RemoveDeadNode(N.getNode());
2369
2370 AM.Scale = 1 << AMShiftAmt;
2371 AM.IndexReg = NewExt;
2372 return false;
2373}
2374
2375// Attempt to peek further into a scaled index register, collecting additional
2376// extensions / offsets / etc. Returns \p N if we can't peek any further.
// Otherwise returns the value to use as the index register; AM.Disp and
// AM.Scale may have been updated along the way. AM.IndexReg itself is not
// written here (it must still be empty on entry, see the first assert).
// NOTE(review): listing lines 2385 (the recursion-limit condition) and 2464
// (the initializer of HiBits) are missing from this extract.
2377SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2378 X86ISelAddressMode &AM,
2379 unsigned Depth) {
2380 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2381 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2382 "Illegal index scale");
2383
2384 // Limit recursion.
2386 return N;
2387
2388 EVT VT = N.getValueType();
2389 unsigned Opc = N.getOpcode();
2390
2391 // index: add(x,c) -> index: x, disp + c
 // The constant is multiplied by the current scale before it is folded into
 // the displacement, because the index is scaled but the displacement is not.
2392 if (CurDAG->isBaseWithConstantOffset(N)) {
2393 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2394 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2395 if (!foldOffsetIntoAddress(Offset, AM))
2396 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2397 }
2398
2399 // index: add(x,x) -> index: x, scale * 2
2400 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
 // Doubling is only possible while the result stays <= 8.
2401 if (AM.Scale <= 4) {
2402 AM.Scale *= 2;
2403 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2404 }
2405 }
2406
2407 // index: shl(x,i) -> index: x, scale * (1 << i)
 // Note that the opcode tested here is X86ISD::VSHLI.
2408 if (Opc == X86ISD::VSHLI) {
2409 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2410 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2411 if ((AM.Scale * ScaleAmt) <= 8) {
2412 AM.Scale *= ScaleAmt;
2413 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2414 }
2415 }
2416
2417 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2418 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2419 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2420 SDValue Src = N.getOperand(0);
2421 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2422 Src.hasOneUse()) {
2423 if (CurDAG->isBaseWithConstantOffset(Src)) {
2424 SDValue AddSrc = Src.getOperand(0);
2425 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2426 int64_t Offset = AddVal->getSExtValue();
2427 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
 // Rebuild N as add(sext(x), c) in the wide type and hand back
 // sext(x) as the index; the constant already lives in AM.Disp.
2428 SDLoc DL(N);
2429 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2430 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2431 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2432 insertDAGNode(*CurDAG, N, ExtSrc);
2433 insertDAGNode(*CurDAG, N, ExtVal);
2434 insertDAGNode(*CurDAG, N, ExtAdd);
2435 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2436 CurDAG->RemoveDeadNode(N.getNode());
2437 return ExtSrc;
2438 }
2439 }
2440 }
2441 }
2442
2443 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2444 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2445 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
 // NOTE(review): the TODO above says "sext" although this is the zext case;
 // presumably "zext" is meant -- confirm.
2446 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2447 SDValue Src = N.getOperand(0);
2448 unsigned SrcOpc = Src.getOpcode();
2449 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2450 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2451 Src.hasOneUse()) {
2452 if (CurDAG->isBaseWithConstantOffset(Src)) {
2453 SDValue AddSrc = Src.getOperand(0);
2454 uint64_t Offset = Src.getConstantOperandVal(1);
2455 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2456 SDLoc DL(N);
 // Res stays empty unless the inner shift can also be absorbed into
 // the scale; it then names the value to return as the index.
2457 SDValue Res;
2458 // If we're also scaling, see if we can use that as well.
2459 if (AddSrc.getOpcode() == ISD::SHL &&
2460 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2461 SDValue ShVal = AddSrc.getOperand(0);
2462 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2463 APInt HiBits =
2465 uint64_t ScaleAmt = 1ULL << ShAmt;
2466 if ((AM.Scale * ScaleAmt) <= 8 &&
2467 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2468 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2469 AM.Scale *= ScaleAmt;
2470 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2471 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2472 AddSrc.getOperand(1));
2473 insertDAGNode(*CurDAG, N, ExtShVal);
2474 insertDAGNode(*CurDAG, N, ExtShift);
2475 AddSrc = ExtShift;
2476 Res = ExtShVal;
2477 }
2478 }
2479 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2480 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2481 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2482 insertDAGNode(*CurDAG, N, ExtSrc);
2483 insertDAGNode(*CurDAG, N, ExtVal);
2484 insertDAGNode(*CurDAG, N, ExtAdd);
2485 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2486 CurDAG->RemoveDeadNode(N.getNode());
2487 return Res ? Res : ExtSrc;
2488 }
2489 }
2490 }
2491 }
2492
2493 // TODO: Handle extensions, shifted masks etc.
2494 return N;
2495}
2496
2497bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2498 unsigned Depth) {
2499 SDLoc dl(N);
2500 LLVM_DEBUG({
2501 dbgs() << "MatchAddress: ";
2502 AM.dump(CurDAG);
2503 });
2504 // Limit recursion.
2506 return matchAddressBase(N, AM);
2507
2508 // If this is already a %rip relative address, we can only merge immediates
2509 // into it. Instead of handling this in every case, we handle it here.
2510 // RIP relative addressing: %rip + 32-bit displacement!
2511 if (AM.isRIPRelative()) {
2512 // FIXME: JumpTable and ExternalSymbol address currently don't like
2513 // displacements. It isn't very important, but this should be fixed for
2514 // consistency.
2515 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2516 return true;
2517
2518 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2519 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2520 return false;
2521 return true;
2522 }
2523
2524 switch (N.getOpcode()) {
2525 default: break;
2526 case ISD::LOCAL_RECOVER: {
2527 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2528 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2529 // Use the symbol and don't prefix it.
2530 AM.MCSym = ESNode->getMCSymbol();
2531 return false;
2532 }
2533 break;
2534 }
2535 case ISD::Constant: {
2536 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2537 if (!foldOffsetIntoAddress(Val, AM))
2538 return false;
2539 break;
2540 }
2541
2542 case X86ISD::Wrapper:
2543 case X86ISD::WrapperRIP:
2544 if (!matchWrapper(N, AM))
2545 return false;
2546 break;
2547
2548 case ISD::LOAD:
2549 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2550 return false;
2551 break;
2552
2553 case ISD::FrameIndex:
2554 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2555 AM.Base_Reg.getNode() == nullptr &&
2556 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2557 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2558 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2559 return false;
2560 }
2561 break;
2562
2563 case ISD::SHL:
2564 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2565 break;
2566
2567 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2568 unsigned Val = CN->getZExtValue();
2569 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2570 // that the base operand remains free for further matching. If
2571 // the base doesn't end up getting used, a post-processing step
2572 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2573 if (Val == 1 || Val == 2 || Val == 3) {
2574 SDValue ShVal = N.getOperand(0);
2575 AM.Scale = 1 << Val;
2576 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2577 return false;
2578 }
2579 }
2580 break;
2581
2582 case ISD::SRL: {
2583 // Scale must not be used already.
2584 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2585
2586 // We only handle up to 64-bit values here as those are what matter for
2587 // addressing mode optimizations.
2588 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2589 "Unexpected value size!");
2590
2591 SDValue And = N.getOperand(0);
2592 if (And.getOpcode() != ISD::AND) break;
2593 SDValue X = And.getOperand(0);
2594
2595 // The mask used for the transform is expected to be post-shift, but we
2596 // found the shift first so just apply the shift to the mask before passing
2597 // it down.
2598 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2599 !isa<ConstantSDNode>(And.getOperand(1)))
2600 break;
2601 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2602
2603 // Try to fold the mask and shift into the scale, and return false if we
2604 // succeed.
2605 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2606 return false;
2607 break;
2608 }
2609
2610 case ISD::SMUL_LOHI:
2611 case ISD::UMUL_LOHI:
2612 // A mul_lohi where we need the low part can be folded as a plain multiply.
2613 if (N.getResNo() != 0) break;
2614 [[fallthrough]];
2615 case ISD::MUL:
2616 case X86ISD::MUL_IMM:
2617 // X*[3,5,9] -> X+X*[2,4,8]
2618 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2619 AM.Base_Reg.getNode() == nullptr &&
2620 AM.IndexReg.getNode() == nullptr) {
2621 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2622 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2623 CN->getZExtValue() == 9) {
2624 AM.Scale = unsigned(CN->getZExtValue())-1;
2625
2626 SDValue MulVal = N.getOperand(0);
2627 SDValue Reg;
2628
2629 // Okay, we know that we have a scale by now. However, if the scaled
2630 // value is an add of something and a constant, we can fold the
2631 // constant into the disp field here.
2632 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2633 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2634 Reg = MulVal.getOperand(0);
2635 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2636 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2637 if (foldOffsetIntoAddress(Disp, AM))
2638 Reg = N.getOperand(0);
2639 } else {
2640 Reg = N.getOperand(0);
2641 }
2642
2643 AM.IndexReg = AM.Base_Reg = Reg;
2644 return false;
2645 }
2646 }
2647 break;
2648
2649 case ISD::SUB: {
2650 // Given A-B, if A can be completely folded into the address and
2651 // the index field with the index field unused, use -B as the index.
2652 // This is a win if a has multiple parts that can be folded into
2653 // the address. Also, this saves a mov if the base register has
2654 // other uses, since it avoids a two-address sub instruction, however
2655 // it costs an additional mov if the index register has other uses.
2656
2657 // Add an artificial use to this node so that we can keep track of
2658 // it if it gets CSE'd with a different node.
2659 HandleSDNode Handle(N);
2660
2661 // Test if the LHS of the sub can be folded.
2662 X86ISelAddressMode Backup = AM;
2663 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2664 N = Handle.getValue();
2665 AM = Backup;
2666 break;
2667 }
2668 N = Handle.getValue();
2669 // Test if the index field is free for use.
2670 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2671 AM = Backup;
2672 break;
2673 }
2674
2675 int Cost = 0;
2676 SDValue RHS = N.getOperand(1);
2677 // If the RHS involves a register with multiple uses, this
2678 // transformation incurs an extra mov, due to the neg instruction
2679 // clobbering its operand.
2680 if (!RHS.getNode()->hasOneUse() ||
2681 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2682 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2683 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2684 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2685 RHS.getOperand(0).getValueType() == MVT::i32))
2686 ++Cost;
2687 // If the base is a register with multiple uses, this
2688 // transformation may save a mov.
2689 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2690 !AM.Base_Reg.getNode()->hasOneUse()) ||
2691 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2692 --Cost;
2693 // If the folded LHS was interesting, this transformation saves
2694 // address arithmetic.
2695 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2696 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2697 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2698 --Cost;
2699 // If it doesn't look like it may be an overall win, don't do it.
2700 if (Cost >= 0) {
2701 AM = Backup;
2702 break;
2703 }
2704
2705 // Ok, the transformation is legal and appears profitable. Go for it.
2706 // Negation will be emitted later to avoid creating dangling nodes if this
2707 // was an unprofitable LEA.
2708 AM.IndexReg = RHS;
2709 AM.NegateIndex = true;
2710 AM.Scale = 1;
2711 return false;
2712 }
2713
2714 case ISD::OR:
2715 case ISD::XOR:
2716 // See if we can treat the OR/XOR node as an ADD node.
2717 if (!CurDAG->isADDLike(N))
2718 break;
2719 [[fallthrough]];
2720 case ISD::ADD:
2721 if (!matchAdd(N, AM, Depth))
2722 return false;
2723 break;
2724
2725 case ISD::AND: {
2726 // Perform some heroic transforms on an and of a constant-count shift
2727 // with a constant to enable use of the scaled offset field.
2728
2729 // Scale must not be used already.
2730 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2731
2732 // We only handle up to 64-bit values here as those are what matter for
2733 // addressing mode optimizations.
2734 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2735 "Unexpected value size!");
2736
2737 if (!isa<ConstantSDNode>(N.getOperand(1)))
2738 break;
2739
2740 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2741 SDValue Shift = N.getOperand(0);
2742 SDValue X = Shift.getOperand(0);
2743
2744 uint64_t Mask = N.getConstantOperandVal(1);
2745
2746 // Try to fold the mask and shift into an extract and scale.
2747 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2748 return false;
2749
2750 // Try to fold the mask and shift directly into the scale.
2751 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2752 return false;
2753
2754 // Try to fold the mask and shift into BEXTR and scale.
2755 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2756 return false;
2757 }
2758
2759 // Try to swap the mask and shift to place shifts which can be done as
2760 // a scale on the outside of the mask.
2761 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2762 return false;
2763
2764 break;
2765 }
2766 case ISD::ZERO_EXTEND: {
2767 // Try to widen a zexted shift left to the same size as its use, so we can
2768 // match the shift as a scale factor.
2769 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2770 break;
2771
2772 SDValue Src = N.getOperand(0);
2773
2774 // See if we can match a zext(addlike(x,c)).
2775 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2776 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2777 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2778 if (Index != N) {
2779 AM.IndexReg = Index;
2780 return false;
2781 }
2782
2783 // Peek through mask: zext(and(shl(x,c1),c2))
2784 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2785 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2786 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2787 Mask = MaskC->getAPIntValue();
2788 Src = Src.getOperand(0);
2789 }
2790
2791 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2792 // Give up if the shift is not a valid scale factor [1,2,3].
2793 SDValue ShlSrc = Src.getOperand(0);
2794 SDValue ShlAmt = Src.getOperand(1);
2795 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2796 if (!ShAmtC)
2797 break;
2798 unsigned ShAmtV = ShAmtC->getZExtValue();
2799 if (ShAmtV > 3)
2800 break;
2801
2802 // The narrow shift must only shift out zero bits (it must be 'nuw').
2803 // That makes it safe to widen to the destination type.
2804 APInt HighZeros =
2805 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2806 if (!Src->getFlags().hasNoUnsignedWrap() &&
2807 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2808 break;
2809
2810 // zext (shl nuw i8 %x, C1) to i32
2811 // --> shl (zext i8 %x to i32), (zext C1)
2812 // zext (and (shl nuw i8 %x, C1), C2) to i32
2813 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2814 MVT SrcVT = ShlSrc.getSimpleValueType();
2815 MVT VT = N.getSimpleValueType();
2816 SDLoc DL(N);
2817
2818 SDValue Res = ShlSrc;
2819 if (!Mask.isAllOnes()) {
2820 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2821 insertDAGNode(*CurDAG, N, Res);
2822 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2823 insertDAGNode(*CurDAG, N, Res);
2824 }
2825 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2826 insertDAGNode(*CurDAG, N, Zext);
2827 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2828 insertDAGNode(*CurDAG, N, NewShl);
2829 CurDAG->ReplaceAllUsesWith(N, NewShl);
2830 CurDAG->RemoveDeadNode(N.getNode());
2831
2832 // Convert the shift to scale factor.
2833 AM.Scale = 1 << ShAmtV;
2834 // If matchIndexRecursively is not called here,
2835 // Zext may be replaced by other nodes but later used to call a builder
2836 // method
2837 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2838 return false;
2839 }
2840
2841 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2842 // Try to fold the mask and shift into an extract and scale.
2843 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2844 Src.getOperand(0), AM))
2845 return false;
2846
2847 // Try to fold the mask and shift directly into the scale.
2848 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2849 Src.getOperand(0), AM))
2850 return false;
2851
2852 // Try to fold the mask and shift into BEXTR and scale.
2853 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2854 Src.getOperand(0), AM, *Subtarget))
2855 return false;
2856 }
2857
2858 break;
2859 }
2860 }
2861
2862 return matchAddressBase(N, AM);
2863}
2864
2865/// Helper for MatchAddress. Add the specified node to the
2866/// specified addressing mode without any further recursion.
2867bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2868 // Is the base register already occupied?
2869 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2870 // If so, check to see if the scale index register is set.
2871 if (!AM.IndexReg.getNode()) {
2872 AM.IndexReg = N;
2873 AM.Scale = 1;
2874 return false;
2875 }
2876
2877 // Otherwise, we cannot select it.
2878 return true;
2879 }
2880
2881 // Default, generate it as a register.
2882 AM.BaseType = X86ISelAddressMode::RegBase;
2883 AM.Base_Reg = N;
2884 return false;
2885}
2886
2887bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2888 X86ISelAddressMode &AM,
2889 unsigned Depth) {
2890 SDLoc dl(N);
2891 LLVM_DEBUG({
2892 dbgs() << "MatchVectorAddress: ";
2893 AM.dump(CurDAG);
2894 });
2895 // Limit recursion.
2897 return matchAddressBase(N, AM);
2898
2899 // TODO: Support other operations.
2900 switch (N.getOpcode()) {
2901 case ISD::Constant: {
2902 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2903 if (!foldOffsetIntoAddress(Val, AM))
2904 return false;
2905 break;
2906 }
2907 case X86ISD::Wrapper:
2908 if (!matchWrapper(N, AM))
2909 return false;
2910 break;
2911 case ISD::ADD: {
2912 // Add an artificial use to this node so that we can keep track of
2913 // it if it gets CSE'd with a different node.
2914 HandleSDNode Handle(N);
2915
2916 X86ISelAddressMode Backup = AM;
2917 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2918 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2919 Depth + 1))
2920 return false;
2921 AM = Backup;
2922
2923 // Try again after commuting the operands.
2924 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2925 Depth + 1) &&
2926 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2927 Depth + 1))
2928 return false;
2929 AM = Backup;
2930
2931 N = Handle.getValue();
2932 break;
2933 }
2934 }
2935
2936 return matchAddressBase(N, AM);
2937}
2938
2939/// Helper for selectVectorAddr. Handles things that can be folded into a
2940/// gather/scatter address. The index register and scale should have already
2941/// been handled.
2942bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2943 return matchVectorAddressRecursively(N, AM, 0);
2944}
2945
2946bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2947 SDValue IndexOp, SDValue ScaleOp,
2948 SDValue &Base, SDValue &Scale,
2949 SDValue &Index, SDValue &Disp,
2950 SDValue &Segment) {
2951 X86ISelAddressMode AM;
2952 AM.Scale = ScaleOp->getAsZExtVal();
2953
2954 // Attempt to match index patterns, as long as we're not relying on implicit
2955 // sign-extension, which is performed BEFORE scale.
2956 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2957 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2958 else
2959 AM.IndexReg = IndexOp;
2960
2961 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2962 if (AddrSpace == X86AS::GS)
2963 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2964 if (AddrSpace == X86AS::FS)
2965 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2966 if (AddrSpace == X86AS::SS)
2967 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2968
2969 SDLoc DL(BasePtr);
2970 MVT VT = BasePtr.getSimpleValueType();
2971
2972 // Try to match into the base and displacement fields.
2973 if (matchVectorAddress(BasePtr, AM))
2974 return false;
2975
2976 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2977 return true;
2978}
2979
2980/// Returns true if it is able to pattern match an addressing mode.
2981/// It returns the operands which make up the maximal addressing mode it can
2982/// match by reference.
2983///
2984/// Parent is the parent node of the addr operand that is being matched. It
2985/// is always a load, store, atomic node, or null. It is only null when
2986/// checking memory operands for inline asm nodes.
2987bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2988 SDValue &Scale, SDValue &Index,
2989 SDValue &Disp, SDValue &Segment) {
2990 X86ISelAddressMode AM;
2991
2992 if (Parent &&
2993 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2994 // that are not a MemSDNode, and thus don't have proper addrspace info.
2995 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2996 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2997 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2998 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2999 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3000 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3001 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3002 unsigned AddrSpace =
3003 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3004 if (AddrSpace == X86AS::GS)
3005 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3006 if (AddrSpace == X86AS::FS)
3007 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3008 if (AddrSpace == X86AS::SS)
3009 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3010 }
3011
3012 // Save the DL and VT before calling matchAddress, it can invalidate N.
3013 SDLoc DL(N);
3014 MVT VT = N.getSimpleValueType();
3015
3016 if (matchAddress(N, AM))
3017 return false;
3018
3019 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3020 return true;
3021}
3022
3023bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3024 // Cannot use 32 bit constants to reference objects in kernel/large code
3025 // model.
3026 if (TM.getCodeModel() == CodeModel::Kernel ||
3027 TM.getCodeModel() == CodeModel::Large)
3028 return false;
3029
3030 // In static codegen with small code model, we can get the address of a label
3031 // into a register with 'movl'
3032 if (N->getOpcode() != X86ISD::Wrapper)
3033 return false;
3034
3035 N = N.getOperand(0);
3036
3037 // At least GNU as does not accept 'movl' for TPOFF relocations.
3038 // FIXME: We could use 'movl' when we know we are targeting MC.
3039 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3040 return false;
3041
3042 Imm = N;
3043 // Small/medium code model can reference non-TargetGlobalAddress objects with
3044 // 32 bit constants.
3045 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3046 return TM.getCodeModel() == CodeModel::Small ||
3047 TM.getCodeModel() == CodeModel::Medium;
3048 }
3049
3050 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3051 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3052 return CR->getUnsignedMax().ult(1ull << 32);
3053
3054 return !TM.isLargeGlobalValue(GV);
3055}
3056
3057bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3058 SDValue &Scale, SDValue &Index,
3059 SDValue &Disp, SDValue &Segment) {
3060 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3061 SDLoc DL(N);
3062
3063 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3064 return false;
3065
3066 auto *RN = dyn_cast<RegisterSDNode>(Base);
3067 if (RN && RN->getReg() == 0)
3068 Base = CurDAG->getRegister(0, MVT::i64);
3069 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3070 // Base could already be %rip, particularly in the x32 ABI.
3071 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3072 MVT::i64), 0);
3073 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3074 Base);
3075 }
3076
3077 RN = dyn_cast<RegisterSDNode>(Index);
3078 if (RN && RN->getReg() == 0)
3079 Index = CurDAG->getRegister(0, MVT::i64);
3080 else {
3081 assert(Index.getValueType() == MVT::i32 &&
3082 "Expect to be extending 32-bit registers for use in LEA");
3083 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3084 MVT::i64), 0);
3085 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3086 Index);
3087 }
3088
3089 return true;
3090}
3091
3092/// Calls SelectAddr and determines if the maximal addressing
3093/// mode it matches can be cost effectively emitted as an LEA instruction.
3094bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3095 SDValue &Base, SDValue &Scale,
3096 SDValue &Index, SDValue &Disp,
3097 SDValue &Segment) {
3098 X86ISelAddressMode AM;
3099
3100 // Save the DL and VT before calling matchAddress, it can invalidate N.
3101 SDLoc DL(N);
3102 MVT VT = N.getSimpleValueType();
3103
3104 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3105 // segments.
3106 SDValue Copy = AM.Segment;
3107 SDValue T = CurDAG->getRegister(0, MVT::i32);
3108 AM.Segment = T;
3109 if (matchAddress(N, AM))
3110 return false;
3111 assert (T == AM.Segment);
3112 AM.Segment = Copy;
3113
3114 unsigned Complexity = 0;
3115 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3116 Complexity = 1;
3117 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3118 Complexity = 4;
3119
3120 if (AM.IndexReg.getNode())
3121 Complexity++;
3122
3123 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3124 // a simple shift.
3125 if (AM.Scale > 1)
3126 Complexity++;
3127
3128 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3129 // to a LEA. This is determined with some experimentation but is by no means
3130 // optimal (especially for code size consideration). LEA is nice because of
3131 // its three-address nature. Tweak the cost function again when we can run
3132 // convertToThreeAddress() at register allocation time.
3133 if (AM.hasSymbolicDisplacement()) {
3134 // For X86-64, always use LEA to materialize RIP-relative addresses.
3135 if (Subtarget->is64Bit())
3136 Complexity = 4;
3137 else
3138 Complexity += 2;
3139 }
3140
3141 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3142 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3143 // duplicating flag-producing instructions later in the pipeline.
3144 if (N.getOpcode() == ISD::ADD) {
3145 auto isMathWithFlags = [](SDValue V) {
3146 switch (V.getOpcode()) {
3147 case X86ISD::ADD:
3148 case X86ISD::SUB:
3149 case X86ISD::ADC:
3150 case X86ISD::SBB:
3151 case X86ISD::SMUL:
3152 case X86ISD::UMUL:
3153 /* TODO: These opcodes can be added safely, but we may want to justify
3154 their inclusion for different reasons (better for reg-alloc).
3155 case X86ISD::OR:
3156 case X86ISD::XOR:
3157 case X86ISD::AND:
3158 */
3159 // Value 1 is the flag output of the node - verify it's not dead.
3160 return !SDValue(V.getNode(), 1).use_empty();
3161 default:
3162 return false;
3163 }
3164 };
3165 // TODO: We might want to factor in whether there's a load folding
3166 // opportunity for the math op that disappears with LEA.
3167 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3168 Complexity++;
3169 }
3170
3171 if (AM.Disp)
3172 Complexity++;
3173
3174 // If it isn't worth using an LEA, reject it.
3175 if (Complexity <= 2)
3176 return false;
3177
3178 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3179 return true;
3180}
3181
3182/// This is only run on TargetGlobalTLSAddress nodes.
3183bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3184 SDValue &Scale, SDValue &Index,
3185 SDValue &Disp, SDValue &Segment) {
3186 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3187 N.getOpcode() == ISD::TargetExternalSymbol);
3188
3189 X86ISelAddressMode AM;
3190 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3191 AM.GV = GA->getGlobal();
3192 AM.Disp += GA->getOffset();
3193 AM.SymbolFlags = GA->getTargetFlags();
3194 } else {
3195 auto *SA = cast<ExternalSymbolSDNode>(N);
3196 AM.ES = SA->getSymbol();
3197 AM.SymbolFlags = SA->getTargetFlags();
3198 }
3199
3200 if (Subtarget->is32Bit()) {
3201 AM.Scale = 1;
3202 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3203 }
3204
3205 MVT VT = N.getSimpleValueType();
3206 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3207 return true;
3208}
3209
3210bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3211 // Keep track of the original value type and whether this value was
3212 // truncated. If we see a truncation from pointer type to VT that truncates
3213 // bits that are known to be zero, we can use a narrow reference.
3214 EVT VT = N.getValueType();
3215 bool WasTruncated = false;
3216 if (N.getOpcode() == ISD::TRUNCATE) {
3217 WasTruncated = true;
3218 N = N.getOperand(0);
3219 }
3220
3221 if (N.getOpcode() != X86ISD::Wrapper)
3222 return false;
3223
3224 // We can only use non-GlobalValues as immediates if they were not truncated,
3225 // as we do not have any range information. If we have a GlobalValue and the
3226 // address was not truncated, we can select it as an operand directly.
3227 unsigned Opc = N.getOperand(0)->getOpcode();
3228 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3229 Op = N.getOperand(0);
3230 // We can only select the operand directly if we didn't have to look past a
3231 // truncate.
3232 return !WasTruncated;
3233 }
3234
3235 // Check that the global's range fits into VT.
3236 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3237 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3238 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3239 return false;
3240
3241 // Okay, we can use a narrow reference.
3242 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3243 GA->getOffset(), GA->getTargetFlags());
3244 return true;
3245}
3246
3247bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3248 SDValue &Base, SDValue &Scale,
3249 SDValue &Index, SDValue &Disp,
3250 SDValue &Segment) {
3251 assert(Root && P && "Unknown root/parent nodes");
3252 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3253 !IsProfitableToFold(N, P, Root) ||
3254 !IsLegalToFold(N, P, Root, OptLevel))
3255 return false;
3256
3257 return selectAddr(N.getNode(),
3258 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3259}
3260
3261bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3262 SDValue &Base, SDValue &Scale,
3263 SDValue &Index, SDValue &Disp,
3264 SDValue &Segment) {
3265 assert(Root && P && "Unknown root/parent nodes");
3266 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3267 !IsProfitableToFold(N, P, Root) ||
3268 !IsLegalToFold(N, P, Root, OptLevel))
3269 return false;
3270
3271 return selectAddr(N.getNode(),
3272 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3273}
3274
3275/// Return an SDNode that returns the value of the global base register.
3276/// Output instructions required to initialize the global base register,
3277/// if necessary.
3278SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3279 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3280 auto &DL = MF->getDataLayout();
3281 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3282}
3283
3284bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3285 if (N->getOpcode() == ISD::TRUNCATE)
3286 N = N->getOperand(0).getNode();
3287 if (N->getOpcode() != X86ISD::Wrapper)
3288 return false;
3289
3290 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3291 if (!GA)
3292 return false;
3293
3294 auto *GV = GA->getGlobal();
3295 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3296 if (CR)
3297 return CR->getSignedMin().sge(-1ull << Width) &&
3298 CR->getSignedMax().slt(1ull << Width);
3299 // In the kernel code model, globals are in the negative 2GB of the address
3300 // space, so globals can be a sign extended 32-bit immediate.
3301 // In other code models, small globals are in the low 2GB of the address
3302 // space, so sign extending them is equivalent to zero extending them.
3303 return Width == 32 && !TM.isLargeGlobalValue(GV);
3304}
3305
3306X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3307 assert(N->isMachineOpcode() && "Unexpected node");
3308 unsigned Opc = N->getMachineOpcode();
3309 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3310 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3311 if (CondNo < 0)
3312 return X86::COND_INVALID;
3313
3314 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3315}
3316
3317/// Test whether the given X86ISD::CMP node has any users that use a flag
3318/// other than ZF.
3319bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3320 // Examine each user of the node.
3321 for (SDUse &Use : Flags->uses()) {
3322 // Only check things that use the flags.
3323 if (Use.getResNo() != Flags.getResNo())
3324 continue;
3325 SDNode *User = Use.getUser();
3326 // Only examine CopyToReg uses that copy to EFLAGS.
3327 if (User->getOpcode() != ISD::CopyToReg ||
3328 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3329 return false;
3330 // Examine each user of the CopyToReg use.
3331 for (SDUse &FlagUse : User->uses()) {
3332 // Only examine the Flag result.
3333 if (FlagUse.getResNo() != 1)
3334 continue;
3335 // Anything unusual: assume conservatively.
3336 if (!FlagUse.getUser()->isMachineOpcode())
3337 return false;
3338 // Examine the condition code of the user.
3339 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3340
3341 switch (CC) {
3342 // Comparisons which only use the zero flag.
3343 case X86::COND_E: case X86::COND_NE:
3344 continue;
3345 // Anything else: assume conservatively.
3346 default:
3347 return false;
3348 }
3349 }
3350 }
3351 return true;
3352}
3353
3354/// Test whether the given X86ISD::CMP node has any uses which require the SF
3355/// flag to be accurate.
3356bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3357 // Examine each user of the node.
3358 for (SDUse &Use : Flags->uses()) {
3359 // Only check things that use the flags.
3360 if (Use.getResNo() != Flags.getResNo())
3361 continue;
3362 SDNode *User = Use.getUser();
3363 // Only examine CopyToReg uses that copy to EFLAGS.
3364 if (User->getOpcode() != ISD::CopyToReg ||
3365 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3366 return false;
3367 // Examine each user of the CopyToReg use.
3368 for (SDUse &FlagUse : User->uses()) {
3369 // Only examine the Flag result.
3370 if (FlagUse.getResNo() != 1)
3371 continue;
3372 // Anything unusual: assume conservatively.
3373 if (!FlagUse.getUser()->isMachineOpcode())
3374 return false;
3375 // Examine the condition code of the user.
3376 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3377
3378 switch (CC) {
3379 // Comparisons which don't examine the SF flag.
3380 case X86::COND_A: case X86::COND_AE:
3381 case X86::COND_B: case X86::COND_BE:
3382 case X86::COND_E: case X86::COND_NE:
3383 case X86::COND_O: case X86::COND_NO:
3384 case X86::COND_P: case X86::COND_NP:
3385 continue;
3386 // Anything else: assume conservatively.
3387 default:
3388 return false;
3389 }
3390 }
3391 }
3392 return true;
3393}
3394
3396 switch (CC) {
3397 // Comparisons which don't examine the CF flag.
3398 case X86::COND_O: case X86::COND_NO:
3399 case X86::COND_E: case X86::COND_NE:
3400 case X86::COND_S: case X86::COND_NS:
3401 case X86::COND_P: case X86::COND_NP:
3402 case X86::COND_L: case X86::COND_GE:
3403 case X86::COND_G: case X86::COND_LE:
3404 return false;
3405 // Anything else: assume conservatively.
3406 default:
3407 return true;
3408 }
3409}
3410
3411/// Test whether the given node which sets flags has any uses which require the
3412/// CF flag to be accurate.
3413 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3414 // Examine each user of the node.
3415 for (SDUse &Use : Flags->uses()) {
3416 // Only check things that use the flags.
3417 if (Use.getResNo() != Flags.getResNo())
3418 continue;
3419
3420 SDNode *User = Use.getUser();
3421 unsigned UserOpc = User->getOpcode();
3422
3423 if (UserOpc == ISD::CopyToReg) {
3424 // Only examine CopyToReg uses that copy to EFLAGS.
3425 if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3426 return false;
3427 // Examine each user of the CopyToReg use.
3428 for (SDUse &FlagUse : User->uses()) {
3429 // Only examine the Flag result.
3430 if (FlagUse.getResNo() != 1)
3431 continue;
3432 // Anything unusual: assume conservatively.
3433 if (!FlagUse.getUser()->isMachineOpcode())
3434 return false;
3435 // Examine the condition code of the user.
3436 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3437
3438 if (mayUseCarryFlag(CC))
3439 return false;
3440 }
3441
3442 // This CopyToReg is ok. Move on to the next user.
3443 continue;
3444 }
3445
3446 // This might be an unselected node. So look for the pre-isel opcodes that
3447 // use flags.
3448 unsigned CCOpNo;
3449 switch (UserOpc) {
3450 default:
3451 // Something unusual. Be conservative.
3452 return false;
3453 case X86ISD::SETCC: CCOpNo = 0; break;
3454 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3455 case X86ISD::CMOV: CCOpNo = 2; break;
3456 case X86ISD::BRCOND: CCOpNo = 2; break;
3457 }
3458
3459 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3460 if (mayUseCarryFlag(CC))
3461 return false;
3462 }
3463 return true;
3464}
3465
3466/// Check whether or not the chain ending in StoreNode is suitable for doing
3467/// the {load; op; store} to modify transformation.
3469 SDValue StoredVal, SelectionDAG *CurDAG,
3470 unsigned LoadOpNo,
3471 LoadSDNode *&LoadNode,
3472 SDValue &InputChain) {
3473 // Is the stored value result 0 of the operation?
3474 if (StoredVal.getResNo() != 0) return false;
3475
3476 // Are there other uses of the operation other than the store?
3477 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3478
3479 // Is the store non-extending and non-indexed?
3480 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3481 return false;
3482
3483 SDValue Load = StoredVal->getOperand(LoadOpNo);
3484 // Is the stored value a non-extending and non-indexed load?
3485 if (!ISD::isNormalLoad(Load.getNode())) return false;
3486
3487 // Return LoadNode by reference.
3488 LoadNode = cast<LoadSDNode>(Load);
3489
3490 // Is store the only read of the loaded value?
3491 if (!Load.hasOneUse())
3492 return false;
3493
3494 // Is the address of the store the same as the load?
3495 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3496 LoadNode->getOffset() != StoreNode->getOffset())
3497 return false;
3498
3499 bool FoundLoad = false;
3500 SmallVector<SDValue, 4> ChainOps;
3501 SmallVector<const SDNode *, 4> LoopWorklist;
3503 const unsigned int Max = 1024;
3504
3505 // Visualization of Load-Op-Store fusion:
3506 // -------------------------
3507 // Legend:
3508 // *-lines = Chain operand dependencies.
3509 // |-lines = Normal operand dependencies.
3510 // Dependencies flow down and right. n-suffix references multiple nodes.
3511 //
3512 // C Xn C
3513 // * * *
3514 // * * *
3515 // Xn A-LD Yn TF Yn
3516 // * * \ | * |
3517 // * * \ | * |
3518 // * * \ | => A--LD_OP_ST
3519 // * * \| \
3520 // TF OP \
3521 // * | \ Zn
3522 // * | \
3523 // A-ST Zn
3524 //
3525
3526 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3527 // #2: Yn -> LD
3528 // #3: ST -> Zn
3529
3530 // Ensure the transform is safe by checking for the dual
3531 // dependencies to make sure we do not induce a loop.
3532
3533 // As LD is a predecessor to both OP and ST we can do this by checking:
3534 // a). if LD is a predecessor to a member of Xn or Yn.
3535 // b). if a Zn is a predecessor to ST.
3536
3537 // However, (b) can only occur through being a chain predecessor to
3538 // ST, which is the same as Zn being a member or predecessor of Xn,
3539 // which is a subset of LD being a predecessor of Xn. So it's
3540 // subsumed by check (a).
3541
3542 SDValue Chain = StoreNode->getChain();
3543
3544 // Gather X elements in ChainOps.
3545 if (Chain == Load.getValue(1)) {
3546 FoundLoad = true;
3547 ChainOps.push_back(Load.getOperand(0));
3548 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3549 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3550 SDValue Op = Chain.getOperand(i);
3551 if (Op == Load.getValue(1)) {
3552 FoundLoad = true;
3553 // Drop Load, but keep its chain. No cycle check necessary.
3554 ChainOps.push_back(Load.getOperand(0));
3555 continue;
3556 }
3557 LoopWorklist.push_back(Op.getNode());
3558 ChainOps.push_back(Op);
3559 }
3560 }
3561
3562 if (!FoundLoad)
3563 return false;
3564
3565 // Worklist is currently Xn. Add Yn to worklist.
3566 for (SDValue Op : StoredVal->ops())
3567 if (Op.getNode() != LoadNode)
3568 LoopWorklist.push_back(Op.getNode());
3569
3570 // Check (a) if Load is a predecessor to Xn + Yn
3571 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3572 true))
3573 return false;
3574
3575 InputChain =
3576 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3577 return true;
3578}
3579
3580// Change a chain of {load; op; store} of the same value into a simple op
3581// through memory of that value, if the uses of the modified value and its
3582// address are suitable.
3583//
3584// The tablegen pattern memory operand pattern is currently not able to match
3585// the case where the EFLAGS on the original operation are used.
3586//
3587// To move this to tablegen, we'll need to improve tablegen to allow flags to
3588// be transferred from a node in the pattern to the result node, probably with
3589// a new keyword. For example, we have this
3590// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3591// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3592// but maybe need something like this
3593// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3594// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3595// (transferrable EFLAGS)]>;
3596//
3597// Until then, we manually fold these and instruction select the operation
3598// here.
3599bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3600 auto *StoreNode = cast<StoreSDNode>(Node);
3601 SDValue StoredVal = StoreNode->getOperand(1);
3602 unsigned Opc = StoredVal->getOpcode();
3603
3604 // Before we try to select anything, make sure this is memory operand size
3605 // and opcode we can handle. Note that this must match the code below that
3606 // actually lowers the opcodes.
3607 EVT MemVT = StoreNode->getMemoryVT();
3608 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3609 MemVT != MVT::i8)
3610 return false;
3611
3612 bool IsCommutable = false;
3613 bool IsNegate = false;
3614 switch (Opc) {
3615 default:
3616 return false;
3617 case X86ISD::SUB:
3618 IsNegate = isNullConstant(StoredVal.getOperand(0));
3619 break;
3620 case X86ISD::SBB:
3621 break;
3622 case X86ISD::ADD:
3623 case X86ISD::ADC:
3624 case X86ISD::AND:
3625 case X86ISD::OR:
3626 case X86ISD::XOR:
3627 IsCommutable = true;
3628 break;
3629 }
3630
3631 unsigned LoadOpNo = IsNegate ? 1 : 0;
3632 LoadSDNode *LoadNode = nullptr;
3633 SDValue InputChain;
3634 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3635 LoadNode, InputChain)) {
3636 if (!IsCommutable)
3637 return false;
3638
3639 // This operation is commutable, try the other operand.
3640 LoadOpNo = 1;
3641 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3642 LoadNode, InputChain))
3643 return false;
3644 }
3645
3646 SDValue Base, Scale, Index, Disp, Segment;
3647 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3648 Segment))
3649 return false;
3650
3651 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3652 unsigned Opc8) {
3653 switch (MemVT.getSimpleVT().SimpleTy) {
3654 case MVT::i64:
3655 return Opc64;
3656 case MVT::i32:
3657 return Opc32;
3658 case MVT::i16:
3659 return Opc16;
3660 case MVT::i8:
3661 return Opc8;
3662 default:
3663 llvm_unreachable("Invalid size!");
3664 }
3665 };
3666
3668 switch (Opc) {
3669 case X86ISD::SUB:
3670 // Handle negate.
3671 if (IsNegate) {
3672 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3673 X86::NEG8m);
3674 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3675 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3676 MVT::Other, Ops);
3677 break;
3678 }
3679 [[fallthrough]];
3680 case X86ISD::ADD:
3681 // Try to match inc/dec.
3682 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3683 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3684 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3685 // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
3686 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3687 unsigned NewOpc =
3688 ((Opc == X86ISD::ADD) == IsOne)
3689 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3690 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3691 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3692 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3693 MVT::Other, Ops);
3694 break;
3695 }
3696 }
3697 [[fallthrough]];
3698 case X86ISD::ADC:
3699 case X86ISD::SBB:
3700 case X86ISD::AND:
3701 case X86ISD::OR:
3702 case X86ISD::XOR: {
3703 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3704 switch (Opc) {
3705 case X86ISD::ADD:
3706 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3707 X86::ADD8mr);
3708 case X86ISD::ADC:
3709 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3710 X86::ADC8mr);
3711 case X86ISD::SUB:
3712 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3713 X86::SUB8mr);
3714 case X86ISD::SBB:
3715 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3716 X86::SBB8mr);
3717 case X86ISD::AND:
3718 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3719 X86::AND8mr);
3720 case X86ISD::OR:
3721 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3722 case X86ISD::XOR:
3723 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3724 X86::XOR8mr);
3725 default:
3726 llvm_unreachable("Invalid opcode!");
3727 }
3728 };
3729 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3730 switch (Opc) {
3731 case X86ISD::ADD:
3732 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3733 X86::ADD8mi);
3734 case X86ISD::ADC:
3735 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3736 X86::ADC8mi);
3737 case X86ISD::SUB:
3738 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3739 X86::SUB8mi);
3740 case X86ISD::SBB:
3741 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3742 X86::SBB8mi);
3743 case X86ISD::AND:
3744 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3745 X86::AND8mi);
3746 case X86ISD::OR:
3747 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3748 X86::OR8mi);
3749 case X86ISD::XOR:
3750 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3751 X86::XOR8mi);
3752 default:
3753 llvm_unreachable("Invalid opcode!");
3754 }
3755 };
3756
3757 unsigned NewOpc = SelectRegOpcode(Opc);
3758 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3759
3760 // See if the operand is a constant that we can fold into an immediate
3761 // operand.
3762 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3763 int64_t OperandV = OperandC->getSExtValue();
3764
3765 // Check if we can shrink the operand enough to fit in an immediate (or
3766 // fit into a smaller immediate) by negating it and switching the
3767 // operation.
3768 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3769 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3770 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3771 isInt<32>(-OperandV))) &&
3772 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3773 OperandV = -OperandV;
3774 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3775 }
3776
3777 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3778 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3779 NewOpc = SelectImmOpcode(Opc);
3780 }
3781 }
3782
3783 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3784 SDValue CopyTo =
3785 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3786 StoredVal.getOperand(2), SDValue());
3787
3788 const SDValue Ops[] = {Base, Scale, Index, Disp,
3789 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3790 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3791 Ops);
3792 } else {
3793 const SDValue Ops[] = {Base, Scale, Index, Disp,
3794 Segment, Operand, InputChain};
3795 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3796 Ops);
3797 }
3798 break;
3799 }
3800 default:
3801 llvm_unreachable("Invalid opcode!");
3802 }
3803
3804 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3805 LoadNode->getMemOperand()};
3806 CurDAG->setNodeMemRefs(Result, MemOps);
3807
3808 // Update Load Chain uses as well.
3809 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3810 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3811 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3812 CurDAG->RemoveDeadNode(Node);
3813 return true;
3814}
3815
3816// See if this is an X & Mask that we can match to BEXTR/BZHI.
3817// Where Mask is one of the following patterns:
3818// a) x & (1 << nbits) - 1
3819// b) x & ~(-1 << nbits)
3820// c) x & (-1 >> (32 - y))
3821// d) x << (32 - y) >> (32 - y)
3822// e) (1 << nbits) - 1
3823bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3824 assert(
3825 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3826 Node->getOpcode() == ISD::SRL) &&
3827 "Should be either an and-mask, or right-shift after clearing high bits.");
3828
3829 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3830 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3831 return false;
3832
3833 MVT NVT = Node->getSimpleValueType(0);
3834
3835 // Only supported for 32 and 64 bits.
3836 if (NVT != MVT::i32 && NVT != MVT::i64)
3837 return false;
3838
3839 SDValue NBits;
3840 bool NegateNBits;
3841
3842 // If we have BMI2's BZHI, we are ok with muti-use patterns.
3843 // Else, if we only have BMI1's BEXTR, we require one-use.
3844 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3845 auto checkUses = [AllowExtraUsesByDefault](
3846 SDValue Op, unsigned NUses,
3847 std::optional<bool> AllowExtraUses) {
3848 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3849 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3850 };
3851 auto checkOneUse = [checkUses](SDValue Op,
3852 std::optional<bool> AllowExtraUses =
3853 std::nullopt) {
3854 return checkUses(Op, 1, AllowExtraUses);
3855 };
3856 auto checkTwoUse = [checkUses](SDValue Op,
3857 std::optional<bool> AllowExtraUses =
3858 std::nullopt) {
3859 return checkUses(Op, 2, AllowExtraUses);
3860 };
3861
3862 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3863 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3864 assert(V.getSimpleValueType() == MVT::i32 &&
3865 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3866 "Expected i64 -> i32 truncation");
3867 V = V.getOperand(0);
3868 }
3869 return V;
3870 };
3871
3872 // a) x & ((1 << nbits) + (-1))
3873 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3874 &NegateNBits](SDValue Mask) -> bool {
3875 // Match `add`. Must only have one use!
3876 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3877 return false;
3878 // We should be adding all-ones constant (i.e. subtracting one.)
3879 if (!isAllOnesConstant(Mask->getOperand(1)))
3880 return false;
3881 // Match `1 << nbits`. Might be truncated. Must only have one use!
3882 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3883 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3884 return false;
3885 if (!isOneConstant(M0->getOperand(0)))
3886 return false;
3887 NBits = M0->getOperand(1);
3888 NegateNBits = false;
3889 return true;
3890 };
3891
3892 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3893 V = peekThroughOneUseTruncation(V);
3894 return CurDAG->MaskedValueIsAllOnes(
3895 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3896 NVT.getSizeInBits()));
3897 };
3898
3899 // b) x & ~(-1 << nbits)
3900 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3901 &NBits, &NegateNBits](SDValue Mask) -> bool {
3902 // Match `~()`. Must only have one use!
3903 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3904 return false;
3905 // The -1 only has to be all-ones for the final Node's NVT.
3906 if (!isAllOnes(Mask->getOperand(1)))
3907 return false;
3908 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3909 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3910 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3911 return false;
3912 // The -1 only has to be all-ones for the final Node's NVT.
3913 if (!isAllOnes(M0->getOperand(0)))
3914 return false;
3915 NBits = M0->getOperand(1);
3916 NegateNBits = false;
3917 return true;
3918 };
3919
3920 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3921 // or leave the shift amount as-is, but then we'll have to negate it.
3922 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3923 unsigned Bitwidth) {
3924 NBits = ShiftAmt;
3925 NegateNBits = true;
3926 // Skip over a truncate of the shift amount, if any.
3927 if (NBits.getOpcode() == ISD::TRUNCATE)
3928 NBits = NBits.getOperand(0);
3929 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3930 // If it doesn't match, that's fine, we'll just negate it ourselves.
3931 if (NBits.getOpcode() != ISD::SUB)
3932 return;
3933 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3934 if (!V0 || V0->getZExtValue() != Bitwidth)
3935 return;
3936 NBits = NBits.getOperand(1);
3937 NegateNBits = false;
3938 };
3939
3940 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3941 // or
3942 // c) x & (-1 >> (32 - y))
3943 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3944 canonicalizeShiftAmt](SDValue Mask) -> bool {
3945 // The mask itself may be truncated.
3946 Mask = peekThroughOneUseTruncation(Mask);
3947 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3948 // Match `l>>`. Must only have one use!
3949 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3950 return false;
3951 // We should be shifting truly all-ones constant.
3952 if (!isAllOnesConstant(Mask.getOperand(0)))
3953 return false;
3954 SDValue M1 = Mask.getOperand(1);
3955 // The shift amount should not be used externally.
3956 if (!checkOneUse(M1))
3957 return false;
3958 canonicalizeShiftAmt(M1, Bitwidth);
3959 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3960 // is no extra use of the mask. Clearly, there was one since we are here.
3961 // But at the same time, if we need to negate the shift amount,
3962 // then we don't want the mask to stick around, else it's unprofitable.
3963 return !NegateNBits;
3964 };
3965
3966 SDValue X;
3967
3968 // d) x << z >> z but then we'll have to subtract z from bitwidth
3969 // or
3970 // d) x << (32 - y) >> (32 - y)
3971 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3972 AllowExtraUsesByDefault, &NegateNBits,
3973 &X](SDNode *Node) -> bool {
3974 if (Node->getOpcode() != ISD::SRL)
3975 return false;
3976 SDValue N0 = Node->getOperand(0);
3977 if (N0->getOpcode() != ISD::SHL)
3978 return false;
3979 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3980 SDValue N1 = Node->getOperand(1);
3981 SDValue N01 = N0->getOperand(1);
3982 // Both of the shifts must be by the exact same value.
3983 if (N1 != N01)
3984 return false;
3985 canonicalizeShiftAmt(N1, Bitwidth);
3986 // There should not be any external uses of the inner shift / shift amount.
3987 // Note that while we are generally okay with external uses given BMI2,
3988 // iff we need to negate the shift amount, we are not okay with extra uses.
3989 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3990 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3991 return false;
3992 X = N0->getOperand(0);
3993 return true;
3994 };
3995
3996 auto matchLowBitMask = [matchPatternA, matchPatternB,
3997 matchPatternC](SDValue Mask) -> bool {
3998 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3999 };
4000
4001 if (Node->getOpcode() == ISD::AND) {
4002 X = Node->getOperand(0);
4003 SDValue Mask = Node->getOperand(1);
4004
4005 if (matchLowBitMask(Mask)) {
4006 // Great.
4007 } else {
4008 std::swap(X, Mask);
4009 if (!matchLowBitMask(Mask))
4010 return false;
4011 }
4012 } else if (matchLowBitMask(SDValue(Node, 0))) {
4013 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4014 } else if (!matchPatternD(Node))
4015 return false;
4016
4017 // If we need to negate the shift amount, require BMI2 BZHI support.
4018 // It's just too unprofitable for BMI1 BEXTR.
4019 if (NegateNBits && !Subtarget->hasBMI2())
4020 return false;
4021
4022 SDLoc DL(Node);
4023
4024 // Truncate the shift amount.
4025 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4026 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4027
4028 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4029 // All the other bits are undefined, we do not care about them.
4030 SDValue ImplDef = SDValue(
4031 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4032 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4033
4034 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4035 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4036 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4037 MVT::i32, ImplDef, NBits, SRIdxVal),
4038 0);
4039 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4040
4041 // We might have matched the amount of high bits to be cleared,
4042 // but we want the amount of low bits to be kept, so negate it then.
4043 if (NegateNBits) {
4044 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4045 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4046
4047 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4048 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4049 }
4050
4051 if (Subtarget->hasBMI2()) {
4052 // Great, just emit the BZHI..
4053 if (NVT != MVT::i32) {
4054 // But have to place the bit count into the wide-enough register first.
4055 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4056 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4057 }
4058
4059 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4060 ReplaceNode(Node, Extract.getNode());
4061 SelectCode(Extract.getNode());
4062 return true;
4063 }
4064
4065 // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is
4066 // *logically* shifted (potentially with one-use trunc inbetween),
4067 // and the truncation was the only use of the shift,
4068 // and if so look past one-use truncation.
4069 {
4070 SDValue RealX = peekThroughOneUseTruncation(X);
4071 // FIXME: only if the shift is one-use?
4072 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4073 X = RealX;
4074 }
4075
4076 MVT XVT = X.getSimpleValueType();
4077
4078 // Else, emitting BEXTR requires one more step.
4079 // The 'control' of BEXTR has the pattern of:
4080 // [15...8 bit][ 7...0 bit] location
4081 // [ bit count][ shift] name
4082 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4083
4084 // Shift NBits left by 8 bits, thus producing 'control'.
4085 // This makes the low 8 bits to be zero.
4086 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4087 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4088 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4089 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4090
4091 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4092 // FIXME: only if the shift is one-use?
4093 if (X.getOpcode() == ISD::SRL) {
4094 SDValue ShiftAmt = X.getOperand(1);
4095 X = X.getOperand(0);
4096
4097 assert(ShiftAmt.getValueType() == MVT::i8 &&
4098 "Expected shift amount to be i8");
4099
4100 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4101 // We could zext to i16 in some form, but we intentionally don't do that.
4102 SDValue OrigShiftAmt = ShiftAmt;
4103 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4104 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4105
4106 // And now 'or' these low 8 bits of shift amount into the 'control'.
4107 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4108 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4109 }
4110
4111 // But have to place the 'control' into the wide-enough register first.
4112 if (XVT != MVT::i32) {
4113 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4114 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4115 }
4116
4117 // And finally, form the BEXTR itself.
4118 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4119
4120 // The 'X' was originally truncated. Do that now.
4121 if (XVT != NVT) {
4122 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4123 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4124 }
4125
4126 ReplaceNode(Node, Extract.getNode());
4127 SelectCode(Extract.getNode());
4128
4129 return true;
4130}
4131
4132// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4133MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4134 MVT NVT = Node->getSimpleValueType(0);
4135 SDLoc dl(Node);
4136
4137 SDValue N0 = Node->getOperand(0);
4138 SDValue N1 = Node->getOperand(1);
4139
4140 // If we have TBM we can use an immediate for the control. If we have BMI
4141 // we should only do this if the BEXTR instruction is implemented well.
4142 // Otherwise moving the control into a register makes this more costly.
4143 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4144 // hoisting the move immediate would make it worthwhile with a less optimal
4145 // BEXTR?
4146 bool PreferBEXTR =
4147 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4148 if (!PreferBEXTR && !Subtarget->hasBMI2())
4149 return nullptr;
4150
4151 // Must have a shift right.
4152 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4153 return nullptr;
4154
4155 // Shift can't have additional users.
4156 if (!N0->hasOneUse())
4157 return nullptr;
4158
4159 // Only supported for 32 and 64 bits.
4160 if (NVT != MVT::i32 && NVT != MVT::i64)
4161 return nullptr;
4162
4163 // Shift amount and RHS of and must be constant.
4164 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4165 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4166 if (!MaskCst || !ShiftCst)
4167 return nullptr;
4168
4169 // And RHS must be a mask.
4170 uint64_t Mask = MaskCst->getZExtValue();
4171 if (!isMask_64(Mask))
4172 return nullptr;
4173
4174 uint64_t Shift = ShiftCst->getZExtValue();
4175 uint64_t MaskSize = llvm::popcount(Mask);
4176
4177 // Don't interfere with something that can be handled by extracting AH.
4178 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4179 if (Shift == 8 && MaskSize == 8)
4180 return nullptr;
4181
4182 // Make sure we are only using bits that were in the original value, not
4183 // shifted in.
4184 if (Shift + MaskSize > NVT.getSizeInBits())
4185 return nullptr;
4186
4187 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4188 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4189 // does not fit into 32 bits. Load folding is not a sufficient reason.
4190 if (!PreferBEXTR && MaskSize <= 32)
4191 return nullptr;
4192
4193 SDValue Control;
4194 unsigned ROpc, MOpc;
4195
4196#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4197 if (!PreferBEXTR) {
4198 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4199 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4200 // Let's perform the mask first, and apply shift later. Note that we need to
4201 // widen the mask to account for the fact that we'll apply shift afterwards!
4202 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4203 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4204 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4205 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4206 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4207 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4208 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4209 } else {
4210 // The 'control' of BEXTR has the pattern of:
4211 // [15...8 bit][ 7...0 bit] location
4212 // [ bit count][ shift] name
4213 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4214 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4215 if (Subtarget->hasTBM()) {
4216 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4217 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4218 } else {
4219 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4220 // BMI requires the immediate to placed in a register.
4221 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4222 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4223 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4224 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4225 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4226 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4227 }
4228 }
4229
4230 MachineSDNode *NewNode;
4231 SDValue Input = N0->getOperand(0);
4232 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4233 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4234 SDValue Ops[] = {
4235 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4236 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4237 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4238 // Update the chain.
4239 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4240 // Record the mem-refs
4241 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4242 } else {
4243 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4244 }
4245
4246 if (!PreferBEXTR) {
4247 // We still need to apply the shift.
4248 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4249 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4250 : GET_ND_IF_ENABLED(X86::SHR32ri);
4251 NewNode =
4252 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4253 }
4254
4255 return NewNode;
4256}
4257
4258// Emit a PCMISTR(I/M) instruction.
4259MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4260 bool MayFoldLoad, const SDLoc &dl,
4261 MVT VT, SDNode *Node) {
4262 SDValue N0 = Node->getOperand(0);
4263 SDValue N1 = Node->getOperand(1);
4264 SDValue Imm = Node->getOperand(2);
4265 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4266 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4267
4268 // Try to fold a load. No need to check alignment.
4269 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4270 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4271 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4272 N1.getOperand(0) };
4273 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4274 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4275 // Update the chain.
4276 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4277 // Record the mem-refs
4278 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4279 return CNode;
4280 }
4281
4282 SDValue Ops[] = { N0, N1, Imm };
4283 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4284 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4285 return CNode;
4286}
4287
4288// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
4289// to emit a second instruction after this one. This is needed since we have two
4290// copyToReg nodes glued before this and we need to continue that glue through.
4291MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4292 bool MayFoldLoad, const SDLoc &dl,
4293 MVT VT, SDNode *Node,
4294 SDValue &InGlue) {
4295 SDValue N0 = Node->getOperand(0);
4296 SDValue N2 = Node->getOperand(2);
4297 SDValue Imm = Node->getOperand(4);
4298 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4299 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4300
4301 // Try to fold a load. No need to check alignment.
4302 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4303 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4304 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4305 N2.getOperand(0), InGlue };
4306 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4307 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4308 InGlue = SDValue(CNode, 3);
4309 // Update the chain.
4310 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4311 // Record the mem-refs
4312 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4313 return CNode;
4314 }
4315
4316 SDValue Ops[] = { N0, N2, Imm, InGlue };
4317 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4318 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4319 InGlue = SDValue(CNode, 2);
4320 return CNode;
4321}
4322
4323bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4324 EVT VT = N->getValueType(0);
4325
4326 // Only handle scalar shifts.
4327 if (VT.isVector())
4328 return false;
4329
4330 // Narrower shifts only mask to 5 bits in hardware.
4331 unsigned Size = VT == MVT::i64 ? 64 : 32;
4332
4333 SDValue OrigShiftAmt = N->getOperand(1);
4334 SDValue ShiftAmt = OrigShiftAmt;
4335 SDLoc DL(N);
4336
4337 // Skip over a truncate of the shift amount.
4338 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4339 ShiftAmt = ShiftAmt->getOperand(0);
4340
4341 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4342 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4343
4344 SDValue NewShiftAmt;
4345 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4346 ShiftAmt->getOpcode() == ISD::XOR) {
4347 SDValue Add0 = ShiftAmt->getOperand(0);
4348 SDValue Add1 = ShiftAmt->getOperand(1);
4349 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4350 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4351 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4352 // to avoid the ADD/SUB/XOR.
4353 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4354 NewShiftAmt = Add0;
4355
4356 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4357 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4358 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4359 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4360 // we can replace it with a NOT. In the XOR case it may save some code
4361 // size, in the SUB case it also may save a move.
4362 assert(Add0C == nullptr || Add1C == nullptr);
4363
4364 // We can only do N-X, not X-N
4365 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4366 return false;
4367
4368 EVT OpVT = ShiftAmt.getValueType();
4369
4370 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4371 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4372 Add0C == nullptr ? Add0 : Add1, AllOnes);
4373 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4374 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4375 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4376 // -X to generate a NEG instead of a SUB of a constant.
4377 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4378 Add0C->getZExtValue() != 0) {
4379 EVT SubVT = ShiftAmt.getValueType();
4380 SDValue X;
4381 if (Add0C->getZExtValue() % Size == 0)
4382 X = Add1;
4383 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4384 Add0C->getZExtValue() % 32 == 0) {
4385 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4386 // This is mainly beneficial if we already compute (x+n*32).
4387 if (Add1.getOpcode() == ISD::TRUNCATE) {
4388 Add1 = Add1.getOperand(0);
4389 SubVT = Add1.getValueType();
4390 }
4391 if (Add0.getValueType() != SubVT) {
4392 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4393 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4394 }
4395
4396 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4397 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4398 } else
4399 return false;
4400 // Insert a negate op.
4401 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4402 // that uses it that's not a shift.
4403 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4404 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4405 NewShiftAmt = Neg;
4406
4407 // Insert these operands into a valid topological order so they can
4408 // get selected independently.
4409 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4410 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4411 } else
4412 return false;
4413 } else
4414 return false;
4415
4416 if (NewShiftAmt.getValueType() != MVT::i8) {
4417 // Need to truncate the shift amount.
4418 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4419 // Add to a correct topological ordering.
4420 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4421 }
4422
4423 // Insert a new mask to keep the shift amount legal. This should be removed
4424 // by isel patterns.
4425 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4426 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4427 // Place in a correct topological ordering.
4428 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4429
4430 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4431 NewShiftAmt);
4432 if (UpdatedNode != N) {
4433 // If we found an existing node, we should replace ourselves with that node
4434 // and wait for it to be selected after its other users.
4435 ReplaceNode(N, UpdatedNode);
4436 return true;
4437 }
4438
4439 // If the original shift amount is now dead, delete it so that we don't run
4440 // it through isel.
4441 if (OrigShiftAmt.getNode()->use_empty())
4442 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4443
4444 // Now that we've optimized the shift amount, defer to normal isel to get
4445 // load folding and legacy vs BMI2 selection without repeating it here.
4446 SelectCode(N);
4447 return true;
4448}
4449
4450bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4451 MVT NVT = N->getSimpleValueType(0);
4452 unsigned Opcode = N->getOpcode();
4453 SDLoc dl(N);
4454
4455 // For operations of the form (x << C1) op C2, check if we can use a smaller
4456 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
4457 SDValue Shift = N->getOperand(0);
4458 SDValue N1 = N->getOperand(1);
4459
4460 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4461 if (!Cst)
4462 return false;
4463
4464 int64_t Val = Cst->getSExtValue();
4465
4466 // If we have an any_extend feeding the AND, look through it to see if there
4467 // is a shift behind it. But only if the AND doesn't use the extended bits.
4468 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4469 bool FoundAnyExtend = false;
4470 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4471 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4472 isUInt<32>(Val)) {
4473 FoundAnyExtend = true;
4474 Shift = Shift.getOperand(0);
4475 }
4476
4477 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4478 return false;
4479
4480 // i8 is unshrinkable, i16 should be promoted to i32.
4481 if (NVT != MVT::i32 && NVT != MVT::i64)
4482 return false;
4483
4484 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4485 if (!ShlCst)
4486 return false;
4487
4488 uint64_t ShAmt = ShlCst->getZExtValue();
4489
4490 // Make sure that we don't change the operation by removing bits.
4491 // This only matters for OR and XOR, AND is unaffected.
4492 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4493 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4494 return false;
4495
4496 // Check the minimum bitwidth for the new constant.
4497 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4498 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4499 if (Opcode == ISD::AND) {
4500 // AND32ri is the same as AND64ri32 with zext imm.
4501 // Try this before sign extended immediates below.
4502 ShiftedVal = (uint64_t)Val >> ShAmt;
4503 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4504 return true;
4505 // Also swap order when the AND can become MOVZX.
4506 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4507 return true;
4508 }
4509 ShiftedVal = Val >> ShAmt;
4510 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4511 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4512 return true;
4513 if (Opcode != ISD::AND) {
4514 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4515 ShiftedVal = (uint64_t)Val >> ShAmt;
4516 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4517 return true;
4518 }
4519 return false;
4520 };
4521
4522 int64_t ShiftedVal;
4523 if (!CanShrinkImmediate(ShiftedVal))
4524 return false;
4525
4526 // Ok, we can reorder to get a smaller immediate.
4527
4528 // But, its possible the original immediate allowed an AND to become MOVZX.
4529 // Doing this late due to avoid the MakedValueIsZero call as late as
4530 // possible.
4531 if (Opcode == ISD::AND) {
4532 // Find the smallest zext this could possibly be.
4533 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4534 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4535
4536 // Figure out which bits need to be zero to achieve that mask.
4537 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4538 ZExtWidth);
4539 NeededMask &= ~Cst->getAPIntValue();
4540
4541 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4542 return false;
4543 }
4544
4545 SDValue X = Shift.getOperand(0);
4546 if (FoundAnyExtend) {
4547 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4548 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4549 X = NewX;
4550 }
4551
4552 SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4553 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4554 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4555 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4556 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4557 Shift.getOperand(1));
4558 ReplaceNode(N, NewSHL.getNode());
4559 SelectCode(NewSHL.getNode());
4560 return true;
4561}
4562
4563bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4564 SDNode *ParentB, SDNode *ParentC,
4566 uint8_t Imm) {
4567 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4568 C.isOperandOf(ParentC) && "Incorrect parent node");
4569
4570 auto tryFoldLoadOrBCast =
4571 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4572 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4573 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4574 return true;
4575
4576 // Not a load, check for broadcast which may be behind a bitcast.
4577 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4578 P = L.getNode();
4579 L = L.getOperand(0);
4580 }
4581
4582 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4583 return false;
4584
4585 // Only 32 and 64 bit broadcasts are supported.
4586 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4587 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4588 if (Size != 32 && Size != 64)
4589 return false;
4590
4591 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4592 };
4593
4594 bool FoldedLoad = false;
4595 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4596 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4597 FoldedLoad = true;
4598 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4599 Tmp4)) {
4600 FoldedLoad = true;
4601 std::swap(A, C);
4602 // Swap bits 1/4 and 3/6.
4603 uint8_t OldImm = Imm;
4604 Imm = OldImm & 0xa5;
4605 if (OldImm & 0x02) Imm |= 0x10;
4606 if (OldImm & 0x10) Imm |= 0x02;
4607 if (OldImm & 0x08) Imm |= 0x40;
4608 if (OldImm & 0x40) Imm |= 0x08;
4609 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4610 Tmp4)) {
4611 FoldedLoad = true;
4612 std::swap(B, C);
4613 // Swap bits 1/2 and 5/6.
4614 uint8_t OldImm = Imm;
4615 Imm = OldImm & 0x99;
4616 if (OldImm & 0x02) Imm |= 0x04;
4617 if (OldImm & 0x04) Imm |= 0x02;
4618 if (OldImm & 0x20) Imm |= 0x40;
4619 if (OldImm & 0x40) Imm |= 0x20;
4620 }
4621
4622 SDLoc DL(Root);
4623
4624 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4625
4626 MVT NVT = Root->getSimpleValueType(0);
4627
4628 MachineSDNode *MNode;
4629 if (FoldedLoad) {
4630 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4631
4632 unsigned Opc;
4633 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4634 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4635 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4636 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4637
4638 bool UseD = EltSize == 32;
4639 if (NVT.is128BitVector())
4640 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4641 else if (NVT.is256BitVector())
4642 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4643 else if (NVT.is512BitVector())
4644 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4645 else
4646 llvm_unreachable("Unexpected vector size!");
4647 } else {
4648 bool UseD = NVT.getVectorElementType() == MVT::i32;
4649 if (NVT.is128BitVector())
4650 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4651 else if (NVT.is256BitVector())
4652 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4653 else if (NVT.is512BitVector())
4654 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4655 else
4656 llvm_unreachable("Unexpected vector size!");
4657 }
4658
4659 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4660 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4661
4662 // Update the chain.
4663 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4664 // Record the mem-refs
4665 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4666 } else {
4667 bool UseD = NVT.getVectorElementType() == MVT::i32;
4668 unsigned Opc;
4669 if (NVT.is128BitVector())
4670 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4671 else if (NVT.is256BitVector())
4672 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4673 else if (NVT.is512BitVector())
4674 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4675 else
4676 llvm_unreachable("Unexpected vector size!");
4677
4678 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4679 }
4680
4681 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4682 CurDAG->RemoveDeadNode(Root);
4683 return true;
4684}
4685
4686// Try to match two logic ops to a VPTERNLOG.
4687// FIXME: Handle more complex patterns that use an operand more than once?
4688bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4689 MVT NVT = N->getSimpleValueType(0);
4690
4691 // Make sure we support VPTERNLOG.
4692 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4693 NVT.getVectorElementType() == MVT::i1)
4694 return false;
4695
4696 // We need VLX for 128/256-bit.
4697 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4698 return false;
4699
4700 SDValue N0 = N->getOperand(0);
4701 SDValue N1 = N->getOperand(1);
4702
4703 auto getFoldableLogicOp = [](SDValue Op) {
4704 // Peek through single use bitcast.
4705 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4706 Op = Op.getOperand(0);
4707
4708 if (!Op.hasOneUse())
4709 return SDValue();
4710
4711 unsigned Opc = Op.getOpcode();
4712 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4713 Opc == X86ISD::ANDNP)
4714 return Op;
4715
4716 return SDValue();
4717 };
4718
4719 SDValue A, FoldableOp;
4720 if ((FoldableOp = getFoldableLogicOp(N1))) {
4721 A = N0;
4722 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4723 A = N1;
4724 } else
4725 return false;
4726
4727 SDValue B = FoldableOp.getOperand(0);
4728 SDValue C = FoldableOp.getOperand(1);
4729 SDNode *ParentA = N;
4730 SDNode *ParentB = FoldableOp.getNode();
4731 SDNode *ParentC = FoldableOp.getNode();
4732
4733 // We can build the appropriate control immediate by performing the logic
4734 // operation we're matching using these constants for A, B, and C.
4735 uint8_t TernlogMagicA = 0xf0;
4736 uint8_t TernlogMagicB = 0xcc;
4737 uint8_t TernlogMagicC = 0xaa;
4738
4739 // Some of the inputs may be inverted, peek through them and invert the
4740 // magic values accordingly.
4741 // TODO: There may be a bitcast before the xor that we should peek through.
4742 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4743 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4744 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4745 Magic = ~Magic;
4746 Parent = Op.getNode();
4747 Op = Op.getOperand(0);
4748 }
4749 };
4750
4751 PeekThroughNot(A, ParentA, TernlogMagicA);
4752 PeekThroughNot(B, ParentB, TernlogMagicB);
4753 PeekThroughNot(C, ParentC, TernlogMagicC);
4754
4755 uint8_t Imm;
4756 switch (FoldableOp.getOpcode()) {
4757 default: llvm_unreachable("Unexpected opcode!");
4758 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4759 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4760 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4761 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4762 }
4763
4764 switch (N->getOpcode()) {
4765 default: llvm_unreachable("Unexpected opcode!");
4766 case X86ISD::ANDNP:
4767 if (A == N0)
4768 Imm &= ~TernlogMagicA;
4769 else
4770 Imm = ~(Imm) & TernlogMagicA;
4771 break;
4772 case ISD::AND: Imm &= TernlogMagicA; break;
4773 case ISD::OR: Imm |= TernlogMagicA; break;
4774 case ISD::XOR: Imm ^= TernlogMagicA; break;
4775 }
4776
4777 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4778}
4779
4780/// If the high bits of an 'and' operand are known zero, try setting the
4781/// high bits of an 'and' constant operand to produce a smaller encoding by
4782/// creating a small, sign-extended negative immediate rather than a large
4783/// positive one. This reverses a transform in SimplifyDemandedBits that
4784/// shrinks mask constants by clearing bits. There is also a possibility that
4785/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4786/// case, just replace the 'and'. Return 'true' if the node is replaced.
4787bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4788 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4789 // have immediate operands.
4790 MVT VT = And->getSimpleValueType(0);
4791 if (VT != MVT::i32 && VT != MVT::i64)
4792 return false;
4793
4794 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4795 if (!And1C)
4796 return false;
4797
4798 // Bail out if the mask constant is already negative. It's can't shrink more.
4799 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4800 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4801 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4802 // are negative too.
4803 APInt MaskVal = And1C->getAPIntValue();
4804 unsigned MaskLZ = MaskVal.countl_zero();
4805 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4806 return false;
4807
4808 // Don't extend into the upper 32 bits of a 64 bit mask.
4809 if (VT == MVT::i64 && MaskLZ >= 32) {
4810 MaskLZ -= 32;
4811 MaskVal = MaskVal.trunc(32);
4812 }
4813
4814 SDValue And0 = And->getOperand(0);
4815 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4816 APInt NegMaskVal = MaskVal | HighZeros;
4817
4818 // If a negative constant would not allow a smaller encoding, there's no need
4819 // to continue. Only change the constant when we know it's a win.
4820 unsigned MinWidth = NegMaskVal.getSignificantBits();
4821 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4822 return false;
4823
4824 // Extend masks if we truncated above.
4825 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4826 NegMaskVal = NegMaskVal.zext(64);
4827 HighZeros = HighZeros.zext(64);
4828 }
4829
4830 // The variable operand must be all zeros in the top bits to allow using the
4831 // new, negative constant as the mask.
4832 // TODO: Handle constant folding?
4833 KnownBits Known0 = CurDAG->computeKnownBits(And0);
4834 if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
4835 return false;
4836
4837 // Check if the mask is -1. In that case, this is an unnecessary instruction
4838 // that escaped earlier analysis.
4839 if (NegMaskVal.isAllOnes()) {
4840 ReplaceNode(And, And0.getNode());
4841 return true;
4842 }
4843
4844 // A negative mask allows a smaller encoding. Create a new 'and' node.
4845 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4846 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4847 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4848 ReplaceNode(And, NewAnd.getNode());
4849 SelectCode(NewAnd.getNode());
4850 return true;
4851}
4852
4853static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4854 bool FoldedBCast, bool Masked) {
4855#define VPTESTM_CASE(VT, SUFFIX) \
4856case MVT::VT: \
4857 if (Masked) \
4858 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4859 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4860
4861
4862#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4863default: llvm_unreachable("Unexpected VT!"); \
4864VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4865VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4866VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4867VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4868VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4869VPTESTM_CASE(v8i64, QZ##SUFFIX)
4870
4871#define VPTESTM_FULL_CASES(SUFFIX) \
4872VPTESTM_BROADCAST_CASES(SUFFIX) \
4873VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4874VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4875VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4876VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4877VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4878VPTESTM_CASE(v32i16, WZ##SUFFIX)
4879
4880 if (FoldedBCast) {
4881 switch (TestVT.SimpleTy) {
4883 }
4884 }
4885
4886 if (FoldedLoad) {
4887 switch (TestVT.SimpleTy) {
4889 }
4890 }
4891
4892 switch (TestVT.SimpleTy) {
4894 }
4895
4896#undef VPTESTM_FULL_CASES
4897#undef VPTESTM_BROADCAST_CASES
4898#undef VPTESTM_CASE
4899}
4900
4901// Try to create VPTESTM instruction. If InMask is not null, it will be used
4902// to form a masked operation.
4903bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4904 SDValue InMask) {
4905 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4906 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4907 "Unexpected VT!");
4908
4909 // Look for equal and not equal compares.
4910 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4911 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4912 return false;
4913
4914 SDValue SetccOp0 = Setcc.getOperand(0);
4915 SDValue SetccOp1 = Setcc.getOperand(1);
4916
4917 // Canonicalize the all zero vector to the RHS.
4918 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4919 std::swap(SetccOp0, SetccOp1);
4920
4921 // See if we're comparing against zero.
4922 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4923 return false;
4924
4925 SDValue N0 = SetccOp0;
4926
4927 MVT CmpVT = N0.getSimpleValueType();
4928 MVT CmpSVT = CmpVT.getVectorElementType();
4929
4930 // Start with both operands the same. We'll try to refine this.
4931 SDValue Src0 = N0;
4932 SDValue Src1 = N0;
4933
4934 {
4935 // Look through single use bitcasts.
4936 SDValue N0Temp = N0;
4937 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4938 N0Temp = N0.getOperand(0);
4939
4940 // Look for single use AND.
4941 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4942 Src0 = N0Temp.getOperand(0);
4943 Src1 = N0Temp.getOperand(1);
4944 }
4945 }
4946
4947 // Without VLX we need to widen the operation.
4948 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4949
4950 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4951 SDValue &Base, SDValue &Scale, SDValue &Index,
4952 SDValue &Disp, SDValue &Segment) {
4953 // If we need to widen, we can't fold the load.
4954 if (!Widen)
4955 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4956 return true;
4957
4958 // If we didn't fold a load, try to match broadcast. No widening limitation
4959 // for this. But only 32 and 64 bit types are supported.
4960 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4961 return false;
4962
4963 // Look through single use bitcasts.
4964 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4965 P = L.getNode();
4966 L = L.getOperand(0);
4967 }
4968
4969 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4970 return false;
4971
4972 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4973 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4974 return false;
4975
4976 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4977 };
4978
4979 // We can only fold loads if the sources are unique.
4980 bool CanFoldLoads = Src0 != Src1;
4981
4982 bool FoldedLoad = false;
4983 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4984 if (CanFoldLoads) {
4985 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4986 Tmp3, Tmp4);
4987 if (!FoldedLoad) {
4988 // And is commutative.
4989 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4990 Tmp2, Tmp3, Tmp4);
4991 if (FoldedLoad)
4992 std::swap(Src0, Src1);
4993 }
4994 }
4995
4996 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4997
4998 bool IsMasked = InMask.getNode() != nullptr;
4999
5000 SDLoc dl(Root);
5001
5002 MVT ResVT = Setcc.getSimpleValueType();
5003 MVT MaskVT = ResVT;
5004 if (Widen) {
5005 // Widen the inputs using insert_subreg or copy_to_regclass.
5006 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5007 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5008 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5009 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5010 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5011 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5012 CmpVT), 0);
5013 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5014
5015 if (!FoldedBCast)
5016 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5017
5018 if (IsMasked) {
5019 // Widen the mask.
5020 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5021 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5022 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5023 dl, MaskVT, InMask, RC), 0);
5024 }
5025 }
5026
5027 bool IsTestN = CC == ISD::SETEQ;
5028 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5029 IsMasked);
5030
5031 MachineSDNode *CNode;
5032 if (FoldedLoad) {
5033 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5034
5035 if (IsMasked) {
5036 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5037 Src1.getOperand(0) };
5038 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5039 } else {
5040 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5041 Src1.getOperand(0) };
5042 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5043 }
5044
5045 // Update the chain.
5046 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5047 // Record the mem-refs
5048 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5049 } else {
5050 if (IsMasked)
5051 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5052 else
5053 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5054 }
5055
5056 // If we widened, we need to shrink the mask VT.
5057 if (Widen) {
5058 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5059 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5060 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5061 dl, ResVT, SDValue(CNode, 0), RC);
5062 }
5063
5064 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5065 CurDAG->RemoveDeadNode(Root);
5066 return true;
5067}
5068
5069// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5070// into vpternlog.
5071bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5072 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5073
5074 MVT NVT = N->getSimpleValueType(0);
5075
5076 // Make sure we support VPTERNLOG.
5077 if (!NVT.isVector() || !Subtarget->hasAVX512())
5078 return false;
5079
5080 // We need VLX for 128/256-bit.
5081 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5082 return false;
5083
5084 SDValue N0 = N->getOperand(0);
5085 SDValue N1 = N->getOperand(1);
5086
5087 // Canonicalize AND to LHS.
5088 if (N1.getOpcode() == ISD::AND)
5089 std::swap(N0, N1);
5090
5091 if (N0.getOpcode() != ISD::AND ||
5092 N1.getOpcode() != X86ISD::ANDNP ||
5093 !N0.hasOneUse() || !N1.hasOneUse())
5094 return false;
5095
5096 // ANDN is not commutable, use it to pick down A and C.
5097 SDValue A = N1.getOperand(0);
5098 SDValue C = N1.getOperand(1);
5099
5100 // AND is commutable, if one operand matches A, the other operand is B.
5101 // Otherwise this isn't a match.
5102 SDValue B;
5103 if (N0.getOperand(0) == A)
5104 B = N0.getOperand(1);
5105 else if (N0.getOperand(1) == A)
5106 B = N0.getOperand(0);
5107 else
5108 return false;
5109
5110 SDLoc dl(N);
5111 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5112 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5113 ReplaceNode(N, Ternlog.getNode());
5114
5115 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5116 Ternlog.getNode(), A, B, C, 0xCA);
5117}
5118
5119void X86DAGToDAGISel::Select(SDNode *Node) {
5120 MVT NVT = Node->getSimpleValueType(0);
5121 unsigned Opcode = Node->getOpcode();
5122 SDLoc dl(Node);
5123
5124 if (Node->isMachineOpcode()) {
5125 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5126 Node->setNodeId(-1);
5127 return; // Already selected.
5128 }
5129
5130 switch (Opcode) {
5131 default: break;
5133 unsigned IntNo = Node->getConstantOperandVal(1);
5134 switch (IntNo) {
5135 default: break;
5136 case Intrinsic::x86_encodekey128:
5137 case Intrinsic::x86_encodekey256: {
5138 if (!Subtarget->hasKL())
5139 break;
5140
5141 unsigned Opcode;
5142 switch (IntNo) {
5143 default: llvm_unreachable("Impossible intrinsic");
5144 case Intrinsic::x86_encodekey128:
5145 Opcode = X86::ENCODEKEY128;
5146 break;
5147 case Intrinsic::x86_encodekey256:
5148 Opcode = X86::ENCODEKEY256;
5149 break;
5150 }
5151
5152 SDValue Chain = Node->getOperand(0);
5153 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5154 SDValue());
5155 if (Opcode == X86::ENCODEKEY256)
5156 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5157 Chain.getValue(1));
5158
5159 MachineSDNode *Res = CurDAG->getMachineNode(
5160 Opcode, dl, Node->getVTList(),
5161 {Node->getOperand(2), Chain, Chain.getValue(1)});
5162 ReplaceNode(Node, Res);
5163 return;
5164 }
5165 case Intrinsic::x86_tileloaddrs64_internal:
5166 case Intrinsic::x86_tileloaddrst164_internal:
5167 if (!Subtarget->hasAMXMOVRS())
5168 break;
5169 [[fallthrough]];
5170 case Intrinsic::x86_tileloadd64_internal:
5171 case Intrinsic::x86_tileloaddt164_internal: {
5172 if (!Subtarget->hasAMXTILE())
5173 break;
5174 auto *MFI =
5175 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5176 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5177 unsigned Opc;
5178 switch (IntNo) {
5179 default:
5180 llvm_unreachable("Unexpected intrinsic!");
5181 case Intrinsic::x86_tileloaddrs64_internal:
5182 Opc = X86::PTILELOADDRSV;
5183 break;
5184 case Intrinsic::x86_tileloaddrst164_internal:
5185 Opc = X86::PTILELOADDRST1V;
5186 break;
5187 case Intrinsic::x86_tileloadd64_internal:
5188 Opc = X86::PTILELOADDV;
5189 break;
5190 case Intrinsic::x86_tileloaddt164_internal:
5191 Opc = X86::PTILELOADDT1V;
5192 break;
5193 }
5194 // _tile_loadd_internal(row, col, buf, STRIDE)
5195 SDValue Base = Node->getOperand(4);
5196 SDValue Scale = getI8Imm(1, dl);
5197 SDValue Index = Node->getOperand(5);
5198 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5199 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5200 SDValue Chain = Node->getOperand(0);
5201 MachineSDNode *CNode;
5202 SDValue Ops[] = {Node->getOperand(2),
5203 Node->getOperand(3),
5204 Base,
5205 Scale,
5206 Index,
5207 Disp,
5208 Segment,
5209 Chain};
5210 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5211 ReplaceNode(Node, CNode);
5212 return;
5213 }
5214 }
5215 break;
5216 }
5217 case ISD::INTRINSIC_VOID: {
5218 unsigned IntNo = Node->getConstantOperandVal(1);
5219 switch (IntNo) {
5220 default: break;
5221 case Intrinsic::x86_sse3_monitor:
5222 case Intrinsic::x86_monitorx:
5223 case Intrinsic::x86_clzero: {
5224 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5225
5226 unsigned Opc = 0;
5227 switch (IntNo) {
5228 default: llvm_unreachable("Unexpected intrinsic!");
5229 case Intrinsic::x86_sse3_monitor:
5230 if (!Subtarget->hasSSE3())
5231 break;
5232 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5233 break;
5234 case Intrinsic::x86_monitorx:
5235 if (!Subtarget->hasMWAITX())
5236 break;
5237 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5238 break;
5239 case Intrinsic::x86_clzero:
5240 if (!Subtarget->hasCLZERO())
5241 break;
5242 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5243 break;
5244 }
5245
5246 if (Opc) {
5247 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5248 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5249 Node->getOperand(2), SDValue());
5250 SDValue InGlue = Chain.getValue(1);
5251
5252 if (IntNo == Intrinsic::x86_sse3_monitor ||
5253 IntNo == Intrinsic::x86_monitorx) {
5254 // Copy the other two operands to ECX and EDX.
5255 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5256 InGlue);
5257 InGlue = Chain.getValue(1);
5258 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5259 InGlue);
5260 InGlue = Chain.getValue(1);
5261 }
5262
5263 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5264 { Chain, InGlue});
5265 ReplaceNode(Node, CNode);
5266 return;
5267 }
5268
5269 break;
5270 }
5271 case Intrinsic::x86_tilestored64_internal: {
5272 auto *MFI =
5273 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5274 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5275 unsigned Opc = X86::PTILESTOREDV;
5276 // _tile_stored_internal(row, col, buf, STRIDE, c)
5277 SDValue Base = Node->getOperand(4);
5278 SDValue Scale = getI8Imm(1, dl);
5279 SDValue Index = Node->getOperand(5);
5280 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5281 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5282 SDValue Chain = Node->getOperand(0);
5283 MachineSDNode *CNode;
5284 SDValue Ops[] = {Node->getOperand(2),
5285 Node->getOperand(3),
5286 Base,
5287 Scale,
5288 Index,
5289 Disp,
5290 Segment,
5291 Node->getOperand(6),
5292 Chain};
5293 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5294 ReplaceNode(Node, CNode);
5295 return;
5296 }
5297 case Intrinsic::x86_tileloaddrs64:
5298 case Intrinsic::x86_tileloaddrst164:
5299 if (!Subtarget->hasAMXMOVRS())
5300 break;
5301 [[fallthrough]];
5302 case Intrinsic::x86_tileloadd64:
5303 case Intrinsic::x86_tileloaddt164:
5304 case Intrinsic::x86_tilestored64: {
5305 if (!Subtarget->hasAMXTILE())
5306 break;
5307 auto *MFI =
5308 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5309 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5310 unsigned Opc;
5311 switch (IntNo) {
5312 default: llvm_unreachable("Unexpected intrinsic!");
5313 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5314 case Intrinsic::x86_tileloaddrs64:
5315 Opc = X86::PTILELOADDRS;
5316 break;
5317 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5318 case Intrinsic::x86_tileloaddrst164:
5319 Opc = X86::PTILELOADDRST1;
5320 break;
5321 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5322 }
5323 // FIXME: Match displacement and scale.
5324 unsigned TIndex = Node->getConstantOperandVal(2);
5325 SDValue TReg = getI8Imm(TIndex, dl);
5326 SDValue Base = Node->getOperand(3);
5327 SDValue Scale = getI8Imm(1, dl);
5328 SDValue Index = Node->getOperand(4);
5329 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5330 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5331 SDValue Chain = Node->getOperand(0);
5332 MachineSDNode *CNode;
5333 if (Opc == X86::PTILESTORED) {
5334 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5335 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5336 } else {
5337 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5338 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5339 }
5340 ReplaceNode(Node, CNode);
5341 return;
5342 }
5343 case Intrinsic::x86_t2rpntlvwz0rs:
5344 case Intrinsic::x86_t2rpntlvwz0rst1:
5345 case Intrinsic::x86_t2rpntlvwz1rs:
5346 case Intrinsic::x86_t2rpntlvwz1rst1:
5347 if (!Subtarget->hasAMXMOVRS())
5348 break;
5349 [[fallthrough]];
5350 case Intrinsic::x86_t2rpntlvwz0:
5351 case Intrinsic::x86_t2rpntlvwz0t1:
5352 case Intrinsic::x86_t2rpntlvwz1:
5353 case Intrinsic::x86_t2rpntlvwz1t1: {
5354 if (!Subtarget->hasAMXTRANSPOSE())
5355 break;
5356 auto *MFI =
5357 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5358 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5359 unsigned Opc;
5360 switch (IntNo) {
5361 default:
5362 llvm_unreachable("Unexpected intrinsic!");
5363 case Intrinsic::x86_t2rpntlvwz0:
5364 Opc = X86::PT2RPNTLVWZ0;
5365 break;
5366 case Intrinsic::x86_t2rpntlvwz0t1:
5367 Opc = X86::PT2RPNTLVWZ0T1;
5368 break;
5369 case Intrinsic::x86_t2rpntlvwz1:
5370 Opc = X86::PT2RPNTLVWZ1;
5371 break;
5372 case Intrinsic::x86_t2rpntlvwz1t1:
5373 Opc = X86::PT2RPNTLVWZ1T1;
5374 break;
5375 case Intrinsic::x86_t2rpntlvwz0rs:
5376 Opc = X86::PT2RPNTLVWZ0RS;
5377 break;
5378 case Intrinsic::x86_t2rpntlvwz0rst1:
5379 Opc = X86::PT2RPNTLVWZ0RST1;
5380 break;
5381 case Intrinsic::x86_t2rpntlvwz1rs:
5382 Opc = X86::PT2RPNTLVWZ1RS;
5383 break;
5384 case Intrinsic::x86_t2rpntlvwz1rst1:
5385 Opc = X86::PT2RPNTLVWZ1RST1;
5386 break;
5387 }
5388 // FIXME: Match displacement and scale.
5389 unsigned TIndex = Node->getConstantOperandVal(2);
5390 SDValue TReg = getI8Imm(TIndex, dl);
5391 SDValue Base = Node->getOperand(3);
5392 SDValue Scale = getI8Imm(1, dl);
5393 SDValue Index = Node->getOperand(4);
5394 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5395 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5396 SDValue Chain = Node->getOperand(0);
5397 SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
5398 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5399 ReplaceNode(Node, CNode);
5400 return;
5401 }
5402 }
5403 break;
5404 }
5405 case ISD::BRIND:
5406 case X86ISD::NT_BRIND: {
5407 if (Subtarget->isTargetNaCl())
5408 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
5409 // leave the instruction alone.
5410 break;
5411 if (Subtarget->isTarget64BitILP32()) {
5412 // Converts a 32-bit register to a 64-bit, zero-extended version of
5413 // it. This is needed because x86-64 can do many things, but jmp %r32
5414 // ain't one of them.
5415 SDValue Target = Node->getOperand(1);
5416 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5417 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5418 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5419 Node->getOperand(0), ZextTarget);
5420 ReplaceNode(Node, Brind.getNode());
5421 SelectCode(ZextTarget.getNode());
5422 SelectCode(Brind.getNode());
5423 return;
5424 }
5425 break;
5426 }
5428 ReplaceNode(Node, getGlobalBaseReg());
5429 return;
5430
5431 case ISD::BITCAST:
5432 // Just drop all 128/256/512-bit bitcasts.
5433 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5434 NVT == MVT::f128) {
5435 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5436 CurDAG->RemoveDeadNode(Node);
5437 return;
5438 }
5439 break;
5440
5441 case ISD::SRL:
5442 if (matchBitExtract(Node))
5443 return;
5444 [[fallthrough]];
5445 case ISD::SRA:
5446 case ISD::SHL:
5447 if (tryShiftAmountMod(Node))
5448 return;
5449 break;
5450
5451 case X86ISD::VPTERNLOG: {
5452 uint8_t Imm = Node->getConstantOperandVal(3);
5453 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5454 Node->getOperand(1), Node->getOperand(2), Imm))
5455 return;
5456 break;
5457 }
5458
5459 case X86ISD::ANDNP:
5460 if (tryVPTERNLOG(Node))
5461 return;
5462 break;
5463
5464 case ISD::AND:
5465 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5466 // Try to form a masked VPTESTM. Operands can be in either order.
5467 SDValue N0 = Node->getOperand(0);
5468 SDValue N1 = Node->getOperand(1);
5469 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5470 tryVPTESTM(Node, N0, N1))
5471 return;
5472 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5473 tryVPTESTM(Node, N1, N0))
5474 return;
5475 }
5476
5477 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5478 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5479 CurDAG->RemoveDeadNode(Node);
5480 return;
5481 }
5482 if (matchBitExtract(Node))
5483 return;
5484 if (AndImmShrink && shrinkAndImmediate(Node))
5485 return;
5486
5487 [[fallthrough]];
5488 case ISD::OR:
5489 case ISD::XOR:
5490 if (tryShrinkShlLogicImm(Node))
5491 return;
5492 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5493 return;
5494 if (tryVPTERNLOG(Node))
5495 return;
5496
5497 [[fallthrough]];
5498 case ISD::ADD:
5499 if (Opcode == ISD::ADD && matchBitExtract(Node))
5500 return;
5501 [[fallthrough]];
5502 case ISD::SUB: {
5503 // Try to avoid folding immediates with multiple uses for optsize.
5504 // This code tries to select to register form directly to avoid going
5505 // through the isel table which might fold the immediate. We can't change
5506 // the patterns on the add/sub/and/or/xor with immediate patterns in the
5507 // tablegen files to check immediate use count without making the patterns
5508 // unavailable to the fast-isel table.
5509 if (!CurDAG->shouldOptForSize())
5510 break;
5511
5512 // Only handle i8/i16/i32/i64.
5513 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5514 break;
5515
5516 SDValue N0 = Node->getOperand(0);
5517 SDValue N1 = Node->getOperand(1);
5518
5519 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5520 if (!Cst)
5521 break;
5522
5523 int64_t Val = Cst->getSExtValue();
5524
5525 // Make sure it's an immediate that is considered foldable.
5526 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5527 if (!isInt<8>(Val) && !isInt<32>(Val))
5528 break;
5529
5530 // If this can match to INC/DEC, let it go.
5531 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5532 break;
5533
5534 // Check if we should avoid folding this immediate.
5535 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5536 break;
5537
5538 // We should not fold the immediate. So we need a register form instead.
5539 unsigned ROpc, MOpc;
5540 switch (NVT.SimpleTy) {
5541 default: llvm_unreachable("Unexpected VT!");
5542 case MVT::i8:
5543 switch (Opcode) {
5544 default: llvm_unreachable("Unexpected opcode!");
5545 case ISD::ADD:
5546 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5547 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5548 break;
5549 case ISD::SUB:
5550 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5551 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5552 break;
5553 case ISD::AND:
5554 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5555 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5556 break;
5557 case ISD::OR:
5558 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5559 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5560 break;
5561 case ISD::XOR:
5562 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5563 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5564 break;
5565 }
5566 break;
5567 case MVT::i16:
5568 switch (Opcode) {
5569 default: llvm_unreachable("Unexpected opcode!");
5570 case ISD::ADD:
5571 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5572 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5573 break;
5574 case ISD::SUB:
5575 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5576 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5577 break;
5578 case ISD::AND:
5579 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5580 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5581 break;
5582 case ISD::OR:
5583 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5584 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5585 break;
5586 case ISD::XOR:
5587 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5588 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5589 break;
5590 }
5591 break;
5592 case MVT::i32:
5593 switch (Opcode) {
5594 default: llvm_unreachable("Unexpected opcode!");
5595 case ISD::ADD:
5596 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5597 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5598 break;
5599 case ISD::SUB:
5600 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5601 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5602 break;
5603 case ISD::AND:
5604 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5605 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5606 break;
5607 case ISD::OR:
5608 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5609 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5610 break;
5611 case ISD::XOR:
5612 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5613 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5614 break;
5615 }
5616 break;
5617 case MVT::i64:
5618 switch (Opcode) {
5619 default: llvm_unreachable("Unexpected opcode!");
5620 case ISD::ADD:
5621 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5622 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5623 break;
5624 case ISD::SUB:
5625 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5626 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5627 break;
5628 case ISD::AND:
5629 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5630 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5631 break;
5632 case ISD::OR:
5633 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5634 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5635 break;
5636 case ISD::XOR:
5637 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5638 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5639 break;
5640 }
5641 break;
5642 }
5643
5644 // Ok this is an AND/OR/XOR/ADD/SUB with constant.
5645
5646 // If this is not a subtract, we can still try to fold a load.
5647 if (Opcode != ISD::SUB) {
5648 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5649 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5650 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5651 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5652 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5653 // Update the chain.
5654 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5655 // Record the mem-refs
5656 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5657 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5658 CurDAG->RemoveDeadNode(Node);
5659 return;
5660 }
5661 }
5662
5663 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5664 return;
5665 }
5666
5667 case X86ISD::SMUL:
5668 // i16/i32/i64 are handled with isel patterns.
5669 if (NVT != MVT::i8)
5670 break;
5671 [[fallthrough]];
5672 case X86ISD::UMUL: {
5673 SDValue N0 = Node->getOperand(0);
5674 SDValue N1 = Node->getOperand(1);
5675
5676 unsigned LoReg, ROpc, MOpc;
5677 switch (NVT.SimpleTy) {
5678 default: llvm_unreachable("Unsupported VT!");
5679 case MVT::i8:
5680 LoReg = X86::AL;
5681 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5682 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5683 break;
5684 case MVT::i16:
5685 LoReg = X86::AX;
5686 ROpc = X86::MUL16r;
5687 MOpc = X86::MUL16m;
5688 break;
5689 case MVT::i32:
5690 LoReg = X86::EAX;
5691 ROpc = X86::MUL32r;
5692 MOpc = X86::MUL32m;
5693 break;
5694 case MVT::i64:
5695 LoReg = X86::RAX;
5696 ROpc = X86::MUL64r;
5697 MOpc = X86::MUL64m;
5698 break;
5699 }
5700
5701 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5702 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5703 // Multiply is commutative.
5704 if (!FoldedLoad) {
5705 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5706 if (FoldedLoad)
5707 std::swap(N0, N1);
5708 }
5709
5710 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5711 N0, SDValue()).getValue(1);
5712
5713 MachineSDNode *CNode;
5714 if (FoldedLoad) {
5715 // i16/i32/i64 use an instruction that produces a low and high result even
5716 // though only the low result is used.
5717 SDVTList VTs;
5718 if (NVT == MVT::i8)
5719 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5720 else
5721 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5722
5723 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5724 InGlue };
5725 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5726
5727 // Update the chain.
5728 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5729 // Record the mem-refs
5730 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5731 } else {
5732 // i16/i32/i64 use an instruction that produces a low and high result even
5733 // though only the low result is used.
5734 SDVTList VTs;
5735 if (NVT == MVT::i8)
5736 VTs = CurDAG->getVTList(NVT, MVT::i32);
5737 else
5738 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5739
5740 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5741 }
5742
5743 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5744 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5745 CurDAG->RemoveDeadNode(Node);
5746 return;
5747 }
5748
5749 case ISD::SMUL_LOHI:
5750 case ISD::UMUL_LOHI: {
5751 SDValue N0 = Node->getOperand(0);
5752 SDValue N1 = Node->getOperand(1);
5753
5754 unsigned Opc, MOpc;
5755 unsigned LoReg, HiReg;
5756 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5757 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5758 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5759 switch (NVT.SimpleTy) {
5760 default: llvm_unreachable("Unsupported VT!");
5761 case MVT::i32:
5762 Opc = UseMULXHi ? X86::MULX32Hrr
5763 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5764 : IsSigned ? X86::IMUL32r
5765 : X86::MUL32r;
5766 MOpc = UseMULXHi ? X86::MULX32Hrm
5767 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5768 : IsSigned ? X86::IMUL32m
5769 : X86::MUL32m;
5770 LoReg = UseMULX ? X86::EDX : X86::EAX;
5771 HiReg = X86::EDX;
5772 break;
5773 case MVT::i64:
5774 Opc = UseMULXHi ? X86::MULX64Hrr
5775 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5776 : IsSigned ? X86::IMUL64r
5777 : X86::MUL64r;
5778 MOpc = UseMULXHi ? X86::MULX64Hrm
5779 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5780 : IsSigned ? X86::IMUL64m
5781 : X86::MUL64m;
5782 LoReg = UseMULX ? X86::RDX : X86::RAX;
5783 HiReg = X86::RDX;
5784 break;
5785 }
5786
5787 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5788 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5789 // Multiply is commutative.
5790 if (!foldedLoad) {
5791 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5792 if (foldedLoad)
5793 std::swap(N0, N1);
5794 }
5795
5796 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5797 N0, SDValue()).getValue(1);
5798 SDValue ResHi, ResLo;
5799 if (foldedLoad) {
5800 SDValue Chain;
5801 MachineSDNode *CNode = nullptr;
5802 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5803 InGlue };
5804 if (UseMULXHi) {
5805 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5806 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5807 ResHi = SDValue(CNode, 0);
5808 Chain = SDValue(CNode, 1);
5809 } else if (UseMULX) {
5810 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5811 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5812 ResHi = SDValue(CNode, 0);
5813 ResLo = SDValue(CNode, 1);
5814 Chain = SDValue(CNode, 2);
5815 } else {
5816 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5817 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5818 Chain = SDValue(CNode, 0);
5819 InGlue = SDValue(CNode, 1);
5820 }
5821
5822 // Update the chain.
5823 ReplaceUses(N1.getValue(1), Chain);
5824 // Record the mem-refs
5825 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5826 } else {
5827 SDValue Ops[] = { N1, InGlue };
5828 if (UseMULXHi) {
5829 SDVTList VTs = CurDAG->getVTList(NVT);
5830 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5831 ResHi = SDValue(CNode, 0);
5832 } else if (UseMULX) {
5833 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5834 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5835 ResHi = SDValue(CNode, 0);
5836 ResLo = SDValue(CNode, 1);
5837 } else {
5838 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5839 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5840 InGlue = SDValue(CNode, 0);
5841 }
5842 }
5843
5844 // Copy the low half of the result, if it is needed.
5845 if (!SDValue(Node, 0).use_empty()) {
5846 if (!ResLo) {
5847 assert(LoReg && "Register for low half is not defined!");
5848 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5849 NVT, InGlue);
5850 InGlue = ResLo.getValue(2);
5851 }
5852 ReplaceUses(SDValue(Node, 0), ResLo);
5853 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5854 dbgs() << '\n');
5855 }
5856 // Copy the high half of the result, if it is needed.
5857 if (!SDValue(Node, 1).use_empty()) {
5858 if (!ResHi) {
5859 assert(HiReg && "Register for high half is not defined!");
5860 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5861 NVT, InGlue);
5862 InGlue = ResHi.getValue(2);
5863 }
5864 ReplaceUses(SDValue(Node, 1), ResHi);
5865 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5866 dbgs() << '\n');
5867 }
5868
5869 CurDAG->RemoveDeadNode(Node);
5870 return;
5871 }
5872
5873 case ISD::SDIVREM:
5874 case ISD::UDIVREM: {
5875 SDValue N0 = Node->getOperand(0);
5876 SDValue N1 = Node->getOperand(1);
5877
5878 unsigned ROpc, MOpc;
5879 bool isSigned = Opcode == ISD::SDIVREM;
5880 if (!isSigned) {
5881 switch (NVT.SimpleTy) {
5882 default: llvm_unreachable("Unsupported VT!");
5883 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5884 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5885 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5886 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5887 }
5888 } else {
5889 switch (NVT.SimpleTy) {
5890 default: llvm_unreachable("Unsupported VT!");
5891 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5892 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5893 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5894 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5895 }
5896 }
5897
5898 unsigned LoReg, HiReg, ClrReg;
5899 unsigned SExtOpcode;
5900 switch (NVT.SimpleTy) {
5901 default: llvm_unreachable("Unsupported VT!");
5902 case MVT::i8:
5903 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5904 SExtOpcode = 0; // Not used.
5905 break;
5906 case MVT::i16:
5907 LoReg = X86::AX; HiReg = X86::DX;
5908 ClrReg = X86::DX;
5909 SExtOpcode = X86::CWD;
5910 break;
5911 case MVT::i32:
5912 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5913 SExtOpcode = X86::CDQ;
5914 break;
5915 case MVT::i64:
5916 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5917 SExtOpcode = X86::CQO;
5918 break;
5919 }
5920
5921 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5922 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5923 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5924
5925 SDValue InGlue;
5926 if (NVT == MVT::i8) {
5927 // Special case for div8, just use a move with zero extension to AX to
5928 // clear the upper 8 bits (AH).
5929 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5930 MachineSDNode *Move;
5931 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5932 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5933 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5934 : X86::MOVZX16rm8;
5935 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5936 Chain = SDValue(Move, 1);
5937 ReplaceUses(N0.getValue(1), Chain);
5938 // Record the mem-refs
5939 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5940 } else {
5941 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5942 : X86::MOVZX16rr8;
5943 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5944 Chain = CurDAG->getEntryNode();
5945 }
5946 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5947 SDValue());
5948 InGlue = Chain.getValue(1);
5949 } else {
5950 InGlue =
5951 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5952 LoReg, N0, SDValue()).getValue(1);
5953 if (isSigned && !signBitIsZero) {
5954 // Sign extend the low part into the high part.
5955 InGlue =
5956 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5957 } else {
5958 // Zero out the high part, effectively zero extending the input.
5959 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5960 SDValue ClrNode =
5961 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
5962 switch (NVT.SimpleTy) {
5963 case MVT::i16:
5964 ClrNode =
5965 SDValue(CurDAG->getMachineNode(
5966 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5967 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5968 MVT::i32)),
5969 0);
5970 break;
5971 case MVT::i32:
5972 break;
5973 case MVT::i64:
5974 ClrNode =
5975 SDValue(CurDAG->getMachineNode(
5976 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5977 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5978 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5979 MVT::i32)),
5980 0);
5981 break;
5982 default:
5983 llvm_unreachable("Unexpected division source");
5984 }
5985
5986 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5987 ClrNode, InGlue).getValue(1);
5988 }
5989 }
5990
5991 if (foldedLoad) {
5992 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5993 InGlue };
5994 MachineSDNode *CNode =
5995 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5996 InGlue = SDValue(CNode, 1);
5997 // Update the chain.
5998 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5999 // Record the mem-refs
6000 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
6001 } else {
6002 InGlue =
6003 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
6004 }
6005
6006 // Prevent use of AH in a REX instruction by explicitly copying it to
6007 // an ABCD_L register.
6008 //
6009 // The current assumption of the register allocator is that isel
6010 // won't generate explicit references to the GR8_ABCD_H registers. If
6011 // the allocator and/or the backend get enhanced to be more robust in
6012 // that regard, this can be, and should be, removed.
6013 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6014 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
6015 unsigned AHExtOpcode =
6016 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6017
6018 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6019 MVT::Glue, AHCopy, InGlue);
6020 SDValue Result(RNode, 0);
6021 InGlue = SDValue(RNode, 1);
6022
6023 Result =
6024 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6025
6026 ReplaceUses(SDValue(Node, 1), Result);
6027 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6028 dbgs() << '\n');
6029 }
6030 // Copy the division (low) result, if it is needed.
6031 if (!SDValue(Node, 0).use_empty()) {
6032 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6033 LoReg, NVT, InGlue);
6034 InGlue = Result.getValue(2);
6035 ReplaceUses(SDValue(Node, 0), Result);
6036 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6037 dbgs() << '\n');
6038 }
6039 // Copy the remainder (high) result, if it is needed.
6040 if (!SDValue(Node, 1).use_empty()) {
6041 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6042 HiReg, NVT, InGlue);
6043 InGlue = Result.getValue(2);
6044 ReplaceUses(SDValue(Node, 1), Result);
6045 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6046 dbgs() << '\n');
6047 }
6048 CurDAG->RemoveDeadNode(Node);
6049 return;
6050 }
6051
6052 case X86ISD::FCMP:
6054 case X86ISD::STRICT_FCMPS: {
6055 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6056 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6057 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6058 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6059
6060 // Save the original VT of the compare.
6061 MVT CmpVT = N0.getSimpleValueType();
6062
6063 // Floating point needs special handling if we don't have FCOMI.
6064 if (Subtarget->canUseCMOV())
6065 break;
6066
6067 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6068
6069 unsigned Opc;
6070 switch (CmpVT.SimpleTy) {
6071 default: llvm_unreachable("Unexpected type!");
6072 case MVT::f32:
6073 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6074 break;
6075 case MVT::f64:
6076 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6077 break;
6078 case MVT::f80:
6079 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6080 break;
6081 }
6082
6083 SDValue Chain =
6084 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6085 SDValue Glue;
6086 if (IsStrictCmp) {
6087 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6088 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6089 Glue = Chain.getValue(1);
6090 } else {
6091 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6092 }
6093
6094 // Move FPSW to AX.
6095 SDValue FNSTSW =
6096 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6097
6098 // Extract upper 8-bits of AX.
6099 SDValue Extract =
6100 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6101
6102 // Move AH into flags.
6103 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6104 assert(Subtarget->canUseLAHFSAHF() &&
6105 "Target doesn't support SAHF or FCOMI?");
6106 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6107 Chain = AH;
6108 SDValue SAHF = SDValue(
6109 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
6110
6111 if (IsStrictCmp)
6112 ReplaceUses(SDValue(Node, 1), Chain);
6113
6114 ReplaceUses(SDValue(Node, 0), SAHF);
6115 CurDAG->RemoveDeadNode(Node);
6116 return;
6117 }
6118
6119 case X86ISD::CMP: {
6120 SDValue N0 = Node->getOperand(0);
6121 SDValue N1 = Node->getOperand(1);
6122
6123 // Optimizations for TEST compares.
6124 if (!isNullConstant(N1))
6125 break;
6126
6127 // Save the original VT of the compare.
6128 MVT CmpVT = N0.getSimpleValueType();
6129
6130 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6131 // by a test instruction. The test should be removed later by
6132 // analyzeCompare if we are using only the zero flag.
6133 // TODO: Should we check the users and use the BEXTR flags directly?
6134 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6135 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6136 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6137 : X86::TEST32rr;
6138 SDValue BEXTR = SDValue(NewNode, 0);
6139 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6140 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6141 CurDAG->RemoveDeadNode(Node);
6142 return;
6143 }
6144 }
6145
6146 // We can peek through truncates, but we need to be careful below.
6147 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6148 N0 = N0.getOperand(0);
6149
6150 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6151 // use a smaller encoding.
6152 // Look past the truncate if CMP is the only use of it.
6153 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6154 N0.getValueType() != MVT::i8) {
6155 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6156 if (!MaskC)
6157 break;
6158
6159 // We may have looked through a truncate so mask off any bits that
6160 // shouldn't be part of the compare.
6161 uint64_t Mask = MaskC->getZExtValue();
6162 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6163
6164 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6165 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6166 // zero flag.
6167 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6168 onlyUsesZeroFlag(SDValue(Node, 0))) {
6169 unsigned ShiftOpcode = ISD::DELETED_NODE;
6170 unsigned ShiftAmt;
6171 unsigned SubRegIdx;
6172 MVT SubRegVT;
6173 unsigned TestOpcode;
6174 unsigned LeadingZeros = llvm::countl_zero(Mask);
6175 unsigned TrailingZeros = llvm::countr_zero(Mask);
6176
6177 // With leading/trailing zeros, the transform is profitable if we can
6178 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6179 // incurring any extra register moves.
6180 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6181 if (LeadingZeros == 0 && SavesBytes) {
6182 // If the mask covers the most significant bit, then we can replace
6183 // TEST+AND with a SHR and check eflags.
6184 // This emits a redundant TEST which is subsequently eliminated.
6185 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6186 ShiftAmt = TrailingZeros;
6187 SubRegIdx = 0;
6188 TestOpcode = X86::TEST64rr;
6189 } else if (TrailingZeros == 0 && SavesBytes) {
6190 // If the mask covers the least significant bit, then we can replace
6191 // TEST+AND with a SHL and check eflags.
6192 // This emits a redundant TEST which is subsequently eliminated.
6193 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6194 ShiftAmt = LeadingZeros;
6195 SubRegIdx = 0;
6196 TestOpcode = X86::TEST64rr;
6197 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6198 // If the shifted mask extends into the high half and is 8/16/32 bits
6199 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6200 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6201 if (PopCount == 8) {
6202 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6203 ShiftAmt = TrailingZeros;
6204 SubRegIdx = X86::sub_8bit;
6205 SubRegVT = MVT::i8;
6206 TestOpcode = X86::TEST8rr;
6207 } else if (PopCount == 16) {
6208 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6209 ShiftAmt = TrailingZeros;
6210 SubRegIdx = X86::sub_16bit;
6211 SubRegVT = MVT::i16;
6212 TestOpcode = X86::TEST16rr;
6213 } else if (PopCount == 32) {
6214 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6215 ShiftAmt = TrailingZeros;
6216 SubRegIdx = X86::sub_32bit;
6217 SubRegVT = MVT::i32;
6218 TestOpcode = X86::TEST32rr;
6219 }
6220 }
6221 if (ShiftOpcode != ISD::DELETED_NODE) {
6222 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6223 SDValue Shift = SDValue(
6224 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6225 N0.getOperand(0), ShiftC),
6226 0);
6227 if (SubRegIdx != 0) {
6228 Shift =
6229 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6230 }
6232 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6233 ReplaceNode(Node, Test);
6234 return;
6235 }
6236 }
6237
6238 MVT VT;
6239 int SubRegOp;
6240 unsigned ROpc, MOpc;
6241
6242 // For each of these checks we need to be careful if the sign flag is
6243 // being used. It is only safe to use the sign flag in two conditions,
6244 // either the sign bit in the shrunken mask is zero or the final test
6245 // size is equal to the original compare size.
6246
6247 if (isUInt<8>(Mask) &&
6248 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6249 hasNoSignFlagUses(SDValue(Node, 0)))) {
6250 // For example, convert "testl %eax, $8" to "testb %al, $8"
6251 VT = MVT::i8;
6252 SubRegOp = X86::sub_8bit;
6253 ROpc = X86::TEST8ri;
6254 MOpc = X86::TEST8mi;
6255 } else if (OptForMinSize && isUInt<16>(Mask) &&
6256 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6257 hasNoSignFlagUses(SDValue(Node, 0)))) {
6258 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6259 // NOTE: We only want to form TESTW instructions if optimizing for
6260 // min size. Otherwise we only save one byte and possibly get a length
6261 // changing prefix penalty in the decoders.
6262 VT = MVT::i16;
6263 SubRegOp = X86::sub_16bit;
6264 ROpc = X86::TEST16ri;
6265 MOpc = X86::TEST16mi;
6266 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6267 ((!(Mask & 0x80000000) &&
6268 // Without minsize 16-bit Cmps can get here so we need to
6269 // be sure we calculate the correct sign flag if needed.
6270 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6271 CmpVT == MVT::i32 ||
6272 hasNoSignFlagUses(SDValue(Node, 0)))) {
6273 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6274 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6275 // Otherwise, we find ourselves in a position where we have to do
6276 // promotion. If previous passes did not promote the and, we assume
6277 // they had a good reason not to and do not promote here.
6278 VT = MVT::i32;
6279 SubRegOp = X86::sub_32bit;
6280 ROpc = X86::TEST32ri;
6281 MOpc = X86::TEST32mi;
6282 } else {
6283 // No eligible transformation was found.
6284 break;
6285 }
6286
6287 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6288 SDValue Reg = N0.getOperand(0);
6289
6290 // Emit a testb, testw or testl, depending on the width chosen above.
6291 MachineSDNode *NewNode;
6292 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6293 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6294 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6295 if (!LoadN->isSimple()) {
6296 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6297 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6298 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6299 (MOpc == X86::TEST32mi && NumVolBits != 32))
6300 break;
6301 }
6302 }
6303 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6304 Reg.getOperand(0) };
6305 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6306 // Update the chain.
6307 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6308 // Record the mem-refs
6309 CurDAG->setNodeMemRefs(NewNode,
6310 {cast<LoadSDNode>(Reg)->getMemOperand()});
6311 } else {
6312 // Extract the subregister if necessary.
6313 if (N0.getValueType() != VT)
6314 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6315
6316 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6317 }
6318 // Replace CMP with TEST.
6319 ReplaceNode(Node, NewNode);
6320 return;
6321 }
6322 break;
6323 }
6324 case X86ISD::PCMPISTR: {
6325 if (!Subtarget->hasSSE42())
6326 break;
6327
6328 bool NeedIndex = !SDValue(Node, 0).use_empty();
6329 bool NeedMask = !SDValue(Node, 1).use_empty();
6330 // We can't fold a load if we are going to make two instructions.
6331 bool MayFoldLoad = !NeedIndex || !NeedMask;
6332
6333 MachineSDNode *CNode;
6334 if (NeedMask) {
6335 unsigned ROpc =
6336 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6337 unsigned MOpc =
6338 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6339 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6340 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6341 }
6342 if (NeedIndex || !NeedMask) {
6343 unsigned ROpc =
6344 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6345 unsigned MOpc =
6346 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6347 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6348 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6349 }
6350
6351 // Connect the flag usage to the last instruction created.
6352 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6353 CurDAG->RemoveDeadNode(Node);
6354 return;
6355 }
6356 case X86ISD::PCMPESTR: {
6357 if (!Subtarget->hasSSE42())
6358 break;
6359
6360 // Copy the two implicit register inputs.
6361 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6362 Node->getOperand(1),
6363 SDValue()).getValue(1);
6364 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6365 Node->getOperand(3), InGlue).getValue(1);
6366
6367 bool NeedIndex = !SDValue(Node, 0).use_empty();
6368 bool NeedMask = !SDValue(Node, 1).use_empty();
6369 // We can't fold a load if we are going to make two instructions.
6370 bool MayFoldLoad = !NeedIndex || !NeedMask;
6371
6372 MachineSDNode *CNode;
6373 if (NeedMask) {
6374 unsigned ROpc =
6375 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6376 unsigned MOpc =
6377 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6378 CNode =
6379 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6380 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6381 }
6382 if (NeedIndex || !NeedMask) {
6383 unsigned ROpc =
6384 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6385 unsigned MOpc =
6386 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6387 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6388 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6389 }
6390 // Connect the flag usage to the last instruction created.
6391 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6392 CurDAG->RemoveDeadNode(Node);
6393 return;
6394 }
6395
6396 case ISD::SETCC: {
6397 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6398 return;
6399
6400 break;
6401 }
6402
6403 case ISD::STORE:
6404 if (foldLoadStoreIntoMemOperand(Node))
6405 return;
6406 break;
6407
6408 case X86ISD::SETCC_CARRY: {
6409 MVT VT = Node->getSimpleValueType(0);
6411 if (Subtarget->hasSBBDepBreaking()) {
6412 // We have to do this manually because tblgen will put the eflags copy in
6413 // the wrong place if we use an extract_subreg in the pattern.
6414 // Copy flags to the EFLAGS register and glue it to next node.
6415 SDValue EFLAGS =
6416 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6417 Node->getOperand(1), SDValue());
6418
6419 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6420 // 32-bit version.
6421 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6422 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6423 Result = SDValue(
6424 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6425 0);
6426 } else {
6427 // The target does not recognize sbb with the same reg operand as a
6428 // no-source idiom, so we explicitly zero the input values.
6429 Result = getSBBZero(Node);
6430 }
6431
6432 // For less than 32-bits we need to extract from the 32-bit node.
6433 if (VT == MVT::i8 || VT == MVT::i16) {
6434 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6435 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6436 }
6437
6438 ReplaceUses(SDValue(Node, 0), Result);
6439 CurDAG->RemoveDeadNode(Node);
6440 return;
6441 }
6442 case X86ISD::SBB: {
6443 if (isNullConstant(Node->getOperand(0)) &&
6444 isNullConstant(Node->getOperand(1))) {
6445 SDValue Result = getSBBZero(Node);
6446
6447 // Replace the flag use.
6448 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6449
6450 // Replace the result use.
6451 if (!SDValue(Node, 0).use_empty()) {
6452 // For less than 32-bits we need to extract from the 32-bit node.
6453 MVT VT = Node->getSimpleValueType(0);
6454 if (VT == MVT::i8 || VT == MVT::i16) {
6455 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6456 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6457 }
6458 ReplaceUses(SDValue(Node, 0), Result);
6459 }
6460
6461 CurDAG->RemoveDeadNode(Node);
6462 return;
6463 }
6464 break;
6465 }
6466 case X86ISD::MGATHER: {
6467 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6468 SDValue IndexOp = Mgt->getIndex();
6469 SDValue Mask = Mgt->getMask();
6470 MVT IndexVT = IndexOp.getSimpleValueType();
6471 MVT ValueVT = Node->getSimpleValueType(0);
6472 MVT MaskVT = Mask.getSimpleValueType();
6473
6474 // This is just to prevent crashes if the nodes are malformed somehow. We're
6475 // otherwise only doing loose type checking in here, based on what a
6476 // type constraint would say, just like table-based isel.
6477 if (!ValueVT.isVector() || !MaskVT.isVector())
6478 break;
6479
6480 unsigned NumElts = ValueVT.getVectorNumElements();
6481 MVT ValueSVT = ValueVT.getVectorElementType();
6482
6483 bool IsFP = ValueSVT.isFloatingPoint();
6484 unsigned EltSize = ValueSVT.getSizeInBits();
6485
6486 unsigned Opc = 0;
6487 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6488 if (AVX512Gather) {
6489 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6490 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6491 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6492 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6493 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6494 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6495 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6496 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6497 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6498 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6499 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6500 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6501 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6502 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6503 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6504 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6505 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6506 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6507 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6508 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6509 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6510 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6511 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6512 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6513 } else {
6514 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6515 "Unexpected mask VT!");
6516 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6517 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6518 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6519 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6520 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6521 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6522 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6523 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6524 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6525 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6526 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6527 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6528 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6529 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6530 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6531 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6532 }
6533
6534 if (!Opc)
6535 break;
6536
6537 SDValue Base, Scale, Index, Disp, Segment;
6538 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6539 Base, Scale, Index, Disp, Segment))
6540 break;
6541
6542 SDValue PassThru = Mgt->getPassThru();
6543 SDValue Chain = Mgt->getChain();
6544 // Gather instructions have a mask output not in the ISD node.
6545 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6546
6547 MachineSDNode *NewNode;
6548 if (AVX512Gather) {
6549 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6550 Index, Disp, Segment, Chain};
6551 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6552 } else {
6553 SDValue Ops[] = {PassThru, Base, Scale, Index,
6554 Disp, Segment, Mask, Chain};
6555 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6556 }
6557 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6558 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6559 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6560 CurDAG->RemoveDeadNode(Node);
6561 return;
6562 }
6563 case X86ISD::MSCATTER: {
6564 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6565 SDValue Value = Sc->getValue();
6566 SDValue IndexOp = Sc->getIndex();
6567 MVT IndexVT = IndexOp.getSimpleValueType();
6568 MVT ValueVT = Value.getSimpleValueType();
6569
6570 // This is just to prevent crashes if the nodes are malformed somehow. We're
6571 // otherwise only doing loose type checking in here, based on what a
6572 // type constraint would say, just like table-based isel.
6573 if (!ValueVT.isVector())
6574 break;
6575
6576 unsigned NumElts = ValueVT.getVectorNumElements();
6577 MVT ValueSVT = ValueVT.getVectorElementType();
6578
6579 bool IsFP = ValueSVT.isFloatingPoint();
6580 unsigned EltSize = ValueSVT.getSizeInBits();
6581
6582 unsigned Opc;
6583 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6584 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6585 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6586 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6587 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6588 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6589 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6590 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6591 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6592 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6593 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6594 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6595 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6596 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6597 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6598 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6599 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6600 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6601 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6602 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6603 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6604 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6605 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6606 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6607 else
6608 break;
6609
6610 SDValue Base, Scale, Index, Disp, Segment;
6611 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6612 Base, Scale, Index, Disp, Segment))
6613 break;
6614
6615 SDValue Mask = Sc->getMask();
6616 SDValue Chain = Sc->getChain();
6617 // Scatter instructions have a mask output not in the ISD node.
6618 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6619 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6620
6621 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6622 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6623 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6624 CurDAG->RemoveDeadNode(Node);
6625 return;
6626 }
6628 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6629 auto CallId = MFI->getPreallocatedIdForCallSite(
6630 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6631 SDValue Chain = Node->getOperand(0);
6632 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6633 MachineSDNode *New = CurDAG->getMachineNode(
6634 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6635 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6636 CurDAG->RemoveDeadNode(Node);
6637 return;
6638 }
6639 case ISD::PREALLOCATED_ARG: {
6640 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6641 auto CallId = MFI->getPreallocatedIdForCallSite(
6642 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6643 SDValue Chain = Node->getOperand(0);
6644 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6645 SDValue ArgIndex = Node->getOperand(2);
6646 SDValue Ops[3];
6647 Ops[0] = CallIdValue;
6648 Ops[1] = ArgIndex;
6649 Ops[2] = Chain;
6650 MachineSDNode *New = CurDAG->getMachineNode(
6651 TargetOpcode::PREALLOCATED_ARG, dl,
6652 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6653 MVT::Other),
6654 Ops);
6655 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6656 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6657 CurDAG->RemoveDeadNode(Node);
6658 return;
6659 }
6664 if (!Subtarget->hasWIDEKL())
6665 break;
6666
6667 unsigned Opcode;
6668 switch (Node->getOpcode()) {
6669 default:
6670 llvm_unreachable("Unexpected opcode!");
6672 Opcode = X86::AESENCWIDE128KL;
6673 break;
6675 Opcode = X86::AESDECWIDE128KL;
6676 break;
6678 Opcode = X86::AESENCWIDE256KL;
6679 break;
6681 Opcode = X86::AESDECWIDE256KL;
6682 break;
6683 }
6684
6685 SDValue Chain = Node->getOperand(0);
6686 SDValue Addr = Node->getOperand(1);
6687
6688 SDValue Base, Scale, Index, Disp, Segment;
6689 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6690 break;
6691
6692 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6693 SDValue());
6694 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6695 Chain.getValue(1));
6696 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6697 Chain.getValue(1));
6698 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6699 Chain.getValue(1));
6700 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6701 Chain.getValue(1));
6702 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6703 Chain.getValue(1));
6704 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6705 Chain.getValue(1));
6706 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6707 Chain.getValue(1));
6708
6709 MachineSDNode *Res = CurDAG->getMachineNode(
6710 Opcode, dl, Node->getVTList(),
6711 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6712 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6713 ReplaceNode(Node, Res);
6714 return;
6715 }
6716 }
6717
6718 SelectCode(Node);
6719}
6720
/// Select the address \p Op of an inline-asm memory operand as an X86 memory
/// reference.
///
/// Every constraint code handled here (o, v, m, X, p) is treated the same way:
/// \p Op is passed to selectAddr, and on success the five resulting operands
/// are appended to \p OutOps in the order selectAddr produced them. Other
/// calls to selectAddr in this file name those outputs Base, Scale, Index,
/// Disp and Segment. Any other constraint code reaches llvm_unreachable.
///
/// \returns true if selectAddr could not match the address (in which case
/// nothing is appended to \p OutOps), false on success.
6721bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6722 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6723 std::vector<SDValue> &OutOps) {
6724 SDValue Op0, Op1, Op2, Op3, Op4; // filled in by selectAddr below
6725 switch (ConstraintID) {
6726 default:
6727 llvm_unreachable("Unexpected asm memory constraint");
6728 case InlineAsm::ConstraintCode::o: // offsetable ??
6729 case InlineAsm::ConstraintCode::v: // not offsetable ??
6730 case InlineAsm::ConstraintCode::m: // memory
6731 case InlineAsm::ConstraintCode::X:
6732 case InlineAsm::ConstraintCode::p: // address
  // All accepted constraints share one path. nullptr is passed in the slot
  // where other selectAddr calls in this file pass the node being selected.
6733 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6734 return true; // address did not match: report failure, OutOps untouched
6735 break;
6736 }
6737
  // Success: hand back the five address operands in selectAddr's output order.
6738 OutOps.push_back(Op0);
6739 OutOps.push_back(Op1);
6740 OutOps.push_back(Op2);
6741 OutOps.push_back(Op3);
6742 OutOps.push_back(Op4);
6743 return false; // false means the operand was selected
6744}
6745
6748 std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6749
6750/// This pass converts a legalized DAG into a X86-specific DAG,
6751/// ready for instruction scheduling.
6753 CodeGenOptLevel OptLevel) {
6754 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6755}
unsigned SubReg
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
aarch64 promote const
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Module.h This file contains the declarations for the Module class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
#define P(N)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII)
Check if the instruction uses RIP relative addressing.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget)
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM)
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM)
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, SDValue StoredVal, SelectionDAG *CurDAG, unsigned LoadOpNo, LoadSDNode *&LoadNode, SDValue &InputChain)
Check whether or not the chain ending in StoreNode is suitable for doing the {load; op; store} to mod...
#define GET_EGPR_IF_ENABLED(OPC)
static bool needBWI(MVT VT)
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, bool FoldedBCast, bool Masked)
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM, const X86Subtarget &Subtarget)
static bool mayUseCarryFlag(X86::CondCode CC)
static cl::opt< bool > EnablePromoteAnyextLoad("x86-promote-anyext-load", cl::init(true), cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden)
cl::opt< bool > IndirectBranchTracking
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, SDValue Call, SDValue OrigChain)
Replace the original chain operand of the call with load's chain operand and move load below the call...
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N)
#define GET_ND_IF_ENABLED(OPC)
#define VPTESTM_BROADCAST_CASES(SUFFIX)
#define FROM_TO(A, B)
static cl::opt< bool > AndImmShrink("x86-and-imm-shrink", cl::init(true), cl::desc("Enable setting constant bits to reduce size of mask immediates"), cl::Hidden)
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, X86ISelAddressMode &AM)
#define VPTESTM_FULL_CASES(SUFFIX)
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq)
Return true if call address is a load and it can be moved below CALLSEQ_START and the chains leading ...
static bool isDispSafeForFrameIndex(int64_t Val)
#define PASS_NAME
#define CASE(A)
#define CASE_ND(OP)
#define DEBUG_TYPE
static bool isEndbrImm64(uint64_t Imm)
#define GET_ND_IF_ENABLED(OPC)
Value * RHS
DEMANGLE_DUMP_METHOD void dump() const
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:986
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:910
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1577
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1511
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1257
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:389
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1635
The address of a basic block.
Definition: Constants.h:893
This is an important base class in LLVM.
Definition: Constant.h:42
This class represents an Operation in the Expression.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:707
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:704
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:731
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
std::optional< ConstantRange > getAbsoluteSymbolRange() const
If this is an absolute symbol reference, returns the range of the symbol, otherwise returns std::null...
Definition: Globals.cpp:413
This class is used to form a handle around another node that is persistent and is updated across invo...
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
Machine Value Type.
bool is128BitVector() const
Return true if this is a 128-bit vector type.
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
Root of the metadata hierarchy.
Definition: Metadata.h:62
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:354
Register getReg() const
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
int getNodeId() const
Return the unique node id.
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
const SDValue & getOperand(unsigned Num) const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
bool use_empty() const
Return true if there are no nodes using value ResNo of Node.
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
unsigned getNumOperands() const
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector< SDValue > &OutOps)
SelectInlineAsmMemoryOperand - Select the specified address as a target addressing mode,...
virtual void PostprocessISelDAG()
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
static int getUninvalidatedNodeId(SDNode *N)
virtual void emitFunctionEntryCode()
virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const
IsProfitableToFold - Returns true if it's profitable to fold the specific operand node N of U during ...
virtual bool ComplexPatternFuncMutatesDAG() const
Return true if complex patterns for this target can mutate the DAG.
virtual void PreprocessISelDAG()
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
virtual bool runOnMachineFunction(MachineFunction &mf)
static void InvalidateNodeId(SDNode *N)
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:456
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
void RepositionNode(allnodes_iterator Position, SDNode *N)
Move node N in the AllNodes list to be immediately before the given iterator Position.
ilist< SDNode >::iterator allnodes_iterator
Definition: SelectionDAG.h:558
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:519
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
std::vector< ArgListEntry > ArgListTy
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
unsigned getID() const
Return the register class ID number.
Target - Wrapper for Target specific information.
static Type * getVoidTy(LLVMContext &C)
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:72
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
LLVM Value Representation.
Definition: Value.h:74
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
iterator_range< use_iterator > uses()
Definition: Value.h:376
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:5304
X86ISelDAGToDAGPass(X86TargetMachine &TM)
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
size_t getPreallocatedIdForCallSite(const Value *CS)
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:780
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:574
@ STRICT_FCEIL
Definition: ISDOpcodes.h:441
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:814
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:954
@ FrameIndex
Definition: ISDOpcodes.h:80
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:805
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:635
@ PREALLOCATED_SETUP
PREALLOCATED_SETUP - This has 2 operands: an input chain and a SRCVALUE with the preallocated call Va...
Definition: ISDOpcodes.h:1231
@ TargetExternalSymbol
Definition: ISDOpcodes.h:175
@ PREALLOCATED_ARG
PREALLOCATED_ARG - This has 3 operands: an input chain, a SRCVALUE with the preallocated call Value,...
Definition: ISDOpcodes.h:1234
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ TargetGlobalAddress
TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or anything else with this node...
Definition: ISDOpcodes.h:170
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:445
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:588
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:811
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:120
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:860
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:939
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:442
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:444
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:458
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:480
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:457
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:887
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:485
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:920
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:882
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:438
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:817
@ CALLSEQ_START
CALLSEQ_START/CALLSEQ_END - These operators mark the beginning and end of a call sequence,...
Definition: ISDOpcodes.h:1211
@ STRICT_FRINT
Definition: ISDOpcodes.h:437
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ TargetGlobalTLSAddress
Definition: ISDOpcodes.h:171
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1602
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
const uint64_t Magic
Definition: CodeGenData.h:276
SymbolFlags
Symbol flags.
Definition: Symbol.h:24
@ GlobalBaseReg
The result of the mflr at function entry, used for PIC code.
@ X86
Windows x64, Windows Itanium (IA-64)
@ SS
Definition: X86.h:212
@ FS
Definition: X86.h:211
@ GS
Definition: X86.h:210
Reg
All possible values of the reg field in the ModR/M byte.
@ EVEX
EVEX - Specifies that this instruction use EVEX form which provides syntax support up to 32 512-bit r...
Definition: X86BaseInfo.h:825
@ VEX
VEX - encoding using 0xC4/0xC5.
Definition: X86BaseInfo.h:818
@ XOP
XOP - Opcode prefix used by XOP instructions.
Definition: X86BaseInfo.h:820
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:363
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ CALL
These operations represent an abstract X86 call instruction, which includes a bunch of information.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FLD
This instruction implements an extending load to FP stack slots.
@ TC_RETURN
Tail call return.
@ FOR
Bitwise logical OR of floating point values.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
int getCondSrcNoFromDesc(const MCInstrDesc &MCID)
Return the source operand # for condition code by MCID.
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:480
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
unsigned M1(unsigned Val)
Definition: VE.h:376
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:273
FunctionPass * createX86ISelDag(X86TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into a X86-specific DAG, ready for instruction scheduling.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
@ And
Bitwise or logical AND of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:94
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:207
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:212
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:53
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This structure contains all information that is necessary for lowering calls.