LLVM 19.0.0git
X86ISelDAGToDAG.cpp
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86.h"
15#include "X86MachineFunctionInfo.h"
16#include "X86RegisterInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/Debug.h"
30#include "llvm/Support/ErrorHandling.h"
31#include "llvm/Support/KnownBits.h"
32#include "llvm/Support/MathExtras.h"
33#include <cstdint>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "x86-isel"
38#define PASS_NAME "X86 DAG->DAG Instruction Selection"
39
40STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
41
42 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
43 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
44 cl::Hidden);
45
46 static cl::opt<bool> EnablePromoteAnyextLoad(
47 "x86-promote-anyext-load", cl::init(true),
48 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
49
50 extern cl::opt<bool> IndirectBranchTracking;
51
52//===----------------------------------------------------------------------===//
53// Pattern Matcher Implementation
54//===----------------------------------------------------------------------===//
55
56namespace {
57 /// This corresponds to X86AddressMode, but uses SDValues instead of register
58 /// numbers for the leaves of the matched tree.
59 struct X86ISelAddressMode {
60 enum {
61 RegBase,
62 FrameIndexBase
63 } BaseType = RegBase;
64
65 // This is really a union, discriminated by BaseType!
66 SDValue Base_Reg;
67 int Base_FrameIndex = 0;
68
69 unsigned Scale = 1;
70 SDValue IndexReg;
71 int32_t Disp = 0;
72 SDValue Segment;
73 const GlobalValue *GV = nullptr;
74 const Constant *CP = nullptr;
75 const BlockAddress *BlockAddr = nullptr;
76 const char *ES = nullptr;
77 MCSymbol *MCSym = nullptr;
78 int JT = -1;
79 Align Alignment; // CP alignment.
80 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
81 bool NegateIndex = false;
82
83 X86ISelAddressMode() = default;
84
85 bool hasSymbolicDisplacement() const {
86 return GV != nullptr || CP != nullptr || ES != nullptr ||
87 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
88 }
89
90 bool hasBaseOrIndexReg() const {
91 return BaseType == FrameIndexBase ||
92 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
93 }
94
95 /// Return true if this addressing mode is already RIP-relative.
96 bool isRIPRelative() const {
97 if (BaseType != RegBase) return false;
98 if (RegisterSDNode *RegNode =
99 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
100 return RegNode->getReg() == X86::RIP;
101 return false;
102 }
103
104 void setBaseReg(SDValue Reg) {
105 BaseType = RegBase;
106 Base_Reg = Reg;
107 }
108
109#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
110 void dump(SelectionDAG *DAG = nullptr) {
111 dbgs() << "X86ISelAddressMode " << this << '\n';
112 dbgs() << "Base_Reg ";
113 if (Base_Reg.getNode())
114 Base_Reg.getNode()->dump(DAG);
115 else
116 dbgs() << "nul\n";
117 if (BaseType == FrameIndexBase)
118 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
119 dbgs() << " Scale " << Scale << '\n'
120 << "IndexReg ";
121 if (NegateIndex)
122 dbgs() << "negate ";
123 if (IndexReg.getNode())
124 IndexReg.getNode()->dump(DAG);
125 else
126 dbgs() << "nul\n";
127 dbgs() << " Disp " << Disp << '\n'
128 << "GV ";
129 if (GV)
130 GV->dump();
131 else
132 dbgs() << "nul";
133 dbgs() << " CP ";
134 if (CP)
135 CP->dump();
136 else
137 dbgs() << "nul";
138 dbgs() << '\n'
139 << "ES ";
140 if (ES)
141 dbgs() << ES;
142 else
143 dbgs() << "nul";
144 dbgs() << " MCSym ";
145 if (MCSym)
146 dbgs() << MCSym;
147 else
148 dbgs() << "nul";
149 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
150 }
151#endif
152 };
153}
154
155namespace {
156 //===--------------------------------------------------------------------===//
157 /// ISel - X86-specific code to select X86 machine instructions for
158 /// SelectionDAG operations.
159 ///
160 class X86DAGToDAGISel final : public SelectionDAGISel {
161 /// Keep a pointer to the X86Subtarget around so that we can
162 /// make the right decision when generating code for different targets.
163 const X86Subtarget *Subtarget;
164
165 /// If true, selector should try to optimize for minimum code size.
166 bool OptForMinSize;
167
168 /// Disable direct TLS access through segment registers.
169 bool IndirectTlsSegRefs;
170
171 public:
172 static char ID;
173
174 X86DAGToDAGISel() = delete;
175
176 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
177 : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr),
178 OptForMinSize(false), IndirectTlsSegRefs(false) {}
179
180 bool runOnMachineFunction(MachineFunction &MF) override {
181 // Reset the subtarget each time through.
182 Subtarget = &MF.getSubtarget<X86Subtarget>();
183 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
184 "indirect-tls-seg-refs");
185
186 // OptFor[Min]Size are used in pattern predicates that isel is matching.
187 OptForMinSize = MF.getFunction().hasMinSize();
188 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
189 "OptForMinSize implies OptForSize");
190
191 SelectionDAGISel::runOnMachineFunction(MF);
192 return true;
193 }
194
195 void emitFunctionEntryCode() override;
196
197 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
198
199 void PreprocessISelDAG() override;
200 void PostprocessISelDAG() override;
201
202// Include the pieces autogenerated from the target description.
203#include "X86GenDAGISel.inc"
204
205 private:
206 void Select(SDNode *N) override;
207
208 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
209 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
210 bool AllowSegmentRegForX32 = false);
211 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
212 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
213 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
214 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
215 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
216 unsigned Depth);
217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
220 unsigned Depth);
221 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
222 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
223 SDValue &Scale, SDValue &Index, SDValue &Disp,
224 SDValue &Segment);
225 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
226 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
227 SDValue &Index, SDValue &Disp, SDValue &Segment);
228 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
229 bool selectLEAAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
236 SDValue &Scale, SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238 bool selectRelocImm(SDValue N, SDValue &Op);
239
240 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment);
244
245 // Convenience method where P is also root.
246 bool tryFoldLoad(SDNode *P, SDValue N,
247 SDValue &Base, SDValue &Scale,
248 SDValue &Index, SDValue &Disp,
249 SDValue &Segment) {
250 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
251 }
252
253 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
254 SDValue &Base, SDValue &Scale,
255 SDValue &Index, SDValue &Disp,
256 SDValue &Segment);
257
258 bool isProfitableToFormMaskedOp(SDNode *N) const;
259
260 /// Implement addressing mode selection for inline asm expressions.
261 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
262 InlineAsm::ConstraintCode ConstraintID,
263 std::vector<SDValue> &OutOps) override;
264
265 void emitSpecialCodeForMain();
266
267 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
268 MVT VT, SDValue &Base, SDValue &Scale,
269 SDValue &Index, SDValue &Disp,
270 SDValue &Segment) {
271 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
272 Base = CurDAG->getTargetFrameIndex(
273 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
274 else if (AM.Base_Reg.getNode())
275 Base = AM.Base_Reg;
276 else
277 Base = CurDAG->getRegister(0, VT);
278
279 Scale = getI8Imm(AM.Scale, DL);
280
281#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
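// If APX new-data-destination (NDD) forms are available, pick the _ND
// variant of the opcode; those forms write a separate destination rather
// than overwriting their source operand.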
282 // Negate the index if needed.
283 if (AM.NegateIndex) {
284 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
285 : GET_ND_IF_ENABLED(X86::NEG32r);
286 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
287 AM.IndexReg), 0);
288 AM.IndexReg = Neg;
289 }
290
291 if (AM.IndexReg.getNode())
292 Index = AM.IndexReg;
293 else
294 Index = CurDAG->getRegister(0, VT);
295
296 // These are 32-bit even in 64-bit mode since RIP-relative offset
297 // is 32-bit.
298 if (AM.GV)
299 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
300 MVT::i32, AM.Disp,
301 AM.SymbolFlags);
302 else if (AM.CP)
303 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
304 AM.Disp, AM.SymbolFlags);
305 else if (AM.ES) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
307 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
308 } else if (AM.MCSym) {
309 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
310 assert(AM.SymbolFlags == 0 && "oo");
311 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
312 } else if (AM.JT != -1) {
313 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
314 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
315 } else if (AM.BlockAddr)
316 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
317 AM.SymbolFlags);
318 else
319 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
320
321 if (AM.Segment.getNode())
322 Segment = AM.Segment;
323 else
324 Segment = CurDAG->getRegister(0, MVT::i16);
325 }
326
327 // Utility function to determine whether we should avoid selecting
328 // immediate forms of instructions for better code size or not.
329 // At a high level, we'd like to avoid such instructions when
330 // we have similar constants used within the same basic block
331 // that can be kept in a register.
332 //
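// For example, under minsize a 32-bit immediate encoded inline costs four
// bytes in every instruction that uses it; once the same constant feeds two
// or more instructions in the block, hoisting it into a register and using
// register-register forms is usually smaller overall.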
333 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
334 uint32_t UseCount = 0;
335
336 // Do not want to hoist if we're not optimizing for size.
337 // TODO: We'd like to remove this restriction.
338 // See the comment in X86InstrInfo.td for more info.
339 if (!CurDAG->shouldOptForSize())
340 return false;
341
342 // Walk all the users of the immediate.
343 for (const SDNode *User : N->uses()) {
344 if (UseCount >= 2)
345 break;
346
347 // This user is already selected. Count it as a legitimate use and
348 // move on.
349 if (User->isMachineOpcode()) {
350 UseCount++;
351 continue;
352 }
353
354 // We want to count stores of immediates as real uses.
355 if (User->getOpcode() == ISD::STORE &&
356 User->getOperand(1).getNode() == N) {
357 UseCount++;
358 continue;
359 }
360
361 // We don't currently match users that have > 2 operands (except
362 // for stores, which are handled above)
363 // Those instructions won't match in isel, for now, and would
364 // be counted incorrectly.
365 // This may change in the future as we add additional instruction
366 // types.
367 if (User->getNumOperands() != 2)
368 continue;
369
370 // If this is a sign-extended 8-bit integer immediate used in an ALU
371 // instruction, there is probably an opcode encoding to save space.
372 auto *C = dyn_cast<ConstantSDNode>(N);
373 if (C && isInt<8>(C->getSExtValue()))
374 continue;
375
376 // Immediates that are used for offsets as part of stack
377 // manipulation should be left alone. These are typically
378 // used to indicate SP offsets for argument passing and
379 // will get pulled into stores/pushes (implicitly).
380 if (User->getOpcode() == X86ISD::ADD ||
381 User->getOpcode() == ISD::ADD ||
382 User->getOpcode() == X86ISD::SUB ||
383 User->getOpcode() == ISD::SUB) {
384
385 // Find the other operand of the add/sub.
386 SDValue OtherOp = User->getOperand(0);
387 if (OtherOp.getNode() == N)
388 OtherOp = User->getOperand(1);
389
390 // Don't count if the other operand is SP.
391 RegisterSDNode *RegNode;
392 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
393 (RegNode = dyn_cast_or_null<RegisterSDNode>(
394 OtherOp->getOperand(1).getNode())))
395 if ((RegNode->getReg() == X86::ESP) ||
396 (RegNode->getReg() == X86::RSP))
397 continue;
398 }
399
400 // ... otherwise, count this and move on.
401 UseCount++;
402 }
403
404 // If we have more than 1 use, then recommend for hoisting.
405 return (UseCount > 1);
406 }
407
408 /// Return a target constant with the specified value of type i8.
409 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
410 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
411 }
412
413 /// Return a target constant with the specified value, of type i32.
414 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
415 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
416 }
417
418 /// Return a target constant with the specified value, of type i64.
419 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
420 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
421 }
422
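// The immediate for a VEXTRACTF128/VEXTRACTI128-style node is the subvector
// start position converted from elements to VecWidth-bit lanes. E.g.
// extracting the 128-bit subvector that starts at element 4 of a v8i32
// source gives (4 * 32) / 128 == 1, i.e. the upper lane.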
423 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
424 const SDLoc &DL) {
425 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
426 uint64_t Index = N->getConstantOperandVal(1);
427 MVT VecVT = N->getOperand(0).getSimpleValueType();
428 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
429 }
430
431 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
432 const SDLoc &DL) {
433 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
434 uint64_t Index = N->getConstantOperandVal(2);
435 MVT VecVT = N->getSimpleValueType(0);
436 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
437 }
438
439 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
440 const SDLoc &DL) {
441 assert(VecWidth == 128 && "Unexpected vector width");
442 uint64_t Index = N->getConstantOperandVal(2);
443 MVT VecVT = N->getSimpleValueType(0);
444 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
445 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
446 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
447 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
448 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
449 }
450
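// Materialize the carry flag as a value: with a zero register glued to the
// incoming flags, SBB computes 0 - 0 - CF, producing 0 or all-ones depending
// on the carry. Used when selecting X86ISD::SBB / SETCC_CARRY results.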
451 SDValue getSBBZero(SDNode *N) {
452 SDLoc dl(N);
453 MVT VT = N->getSimpleValueType(0);
454
455 // Create zero.
456 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
457 SDValue Zero = SDValue(
458 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
459 if (VT == MVT::i64) {
460 Zero = SDValue(
461 CurDAG->getMachineNode(
462 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
463 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
464 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
465 0);
466 }
467
468 // Copy flags to the EFLAGS register and glue it to next node.
469 unsigned Opcode = N->getOpcode();
470 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
471 "Unexpected opcode for SBB materialization");
472 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
473 SDValue EFLAGS =
474 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
475 N->getOperand(FlagOpIndex), SDValue());
476
477 // Create a 64-bit instruction if the result is 64-bits otherwise use the
478 // 32-bit version.
479 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
480 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
481 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
482 return SDValue(
483 CurDAG->getMachineNode(Opc, dl, VTs,
484 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
485 0);
486 }
487
488 // Helper to detect unneeded and instructions on shift amounts. Called
489 // from PatFrags in tablegen.
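// For example, (and X, 31) used as a 32-bit shift amount (Width == 5) is
// redundant because the shift instruction only reads the low 5 bits of the
// count; known-zero bits of the shift amount are taken into account as well.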
490 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
491 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
492 const APInt &Val = N->getConstantOperandAPInt(1);
493
494 if (Val.countr_one() >= Width)
495 return true;
496
497 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
498 return Mask.countr_one() >= Width;
499 }
500
501 /// Return an SDNode that returns the value of the global base register.
502 /// Output instructions required to initialize the global base register,
503 /// if necessary.
504 SDNode *getGlobalBaseReg();
505
506 /// Return a reference to the TargetMachine, casted to the target-specific
507 /// type.
508 const X86TargetMachine &getTargetMachine() const {
509 return static_cast<const X86TargetMachine &>(TM);
510 }
511
512 /// Return a reference to the TargetInstrInfo, casted to the target-specific
513 /// type.
514 const X86InstrInfo *getInstrInfo() const {
515 return Subtarget->getInstrInfo();
516 }
517
518 /// Return a condition code of the given SDNode
519 X86::CondCode getCondFromNode(SDNode *N) const;
520
521 /// Address-mode matching performs shift-of-and to and-of-shift
522 /// reassociation in order to expose more scaled addressing
523 /// opportunities.
524 bool ComplexPatternFuncMutatesDAG() const override {
525 return true;
526 }
527
528 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
529
530 // Indicates we should prefer to use a non-temporal load for this load.
531 bool useNonTemporalLoad(LoadSDNode *N) const {
532 if (!N->isNonTemporal())
533 return false;
534
535 unsigned StoreSize = N->getMemoryVT().getStoreSize();
536
537 if (N->getAlign().value() < StoreSize)
538 return false;
539
540 switch (StoreSize) {
541 default: llvm_unreachable("Unsupported store size");
542 case 4:
543 case 8:
544 return false;
545 case 16:
546 return Subtarget->hasSSE41();
547 case 32:
548 return Subtarget->hasAVX2();
549 case 64:
550 return Subtarget->hasAVX512();
551 }
552 }
553
554 bool foldLoadStoreIntoMemOperand(SDNode *Node);
555 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
556 bool matchBitExtract(SDNode *Node);
557 bool shrinkAndImmediate(SDNode *N);
558 bool isMaskZeroExtended(SDNode *N) const;
559 bool tryShiftAmountMod(SDNode *N);
560 bool tryShrinkShlLogicImm(SDNode *N);
561 bool tryVPTERNLOG(SDNode *N);
562 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
563 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
564 uint8_t Imm);
565 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
566 bool tryMatchBitSelect(SDNode *N);
567
568 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
569 const SDLoc &dl, MVT VT, SDNode *Node);
570 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
571 const SDLoc &dl, MVT VT, SDNode *Node,
572 SDValue &InGlue);
573
574 bool tryOptimizeRem8Extend(SDNode *N);
575
576 bool onlyUsesZeroFlag(SDValue Flags) const;
577 bool hasNoSignFlagUses(SDValue Flags) const;
578 bool hasNoCarryFlagUses(SDValue Flags) const;
579 };
580}
581
582char X86DAGToDAGISel::ID = 0;
583
584INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
585
586// Returns true if this masked compare can be implemented legally with this
587// type.
588static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
589 unsigned Opcode = N->getOpcode();
590 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
591 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
592 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
593 // We can get 256-bit 8 element types here without VLX being enabled. When
594 // this happens we will use 512-bit operations and the mask will not be
595 // zero extended.
596 EVT OpVT = N->getOperand(0).getValueType();
597 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
598 // second operand.
599 if (Opcode == X86ISD::STRICT_CMPM)
600 OpVT = N->getOperand(1).getValueType();
601 if (OpVT.is256BitVector() || OpVT.is128BitVector())
602 return Subtarget->hasVLX();
603
604 return true;
605 }
606 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
607 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
608 Opcode == X86ISD::FSETCCM_SAE)
609 return true;
610
611 return false;
612}
613
614// Returns true if we can assume the writer of the mask has zero extended it
615// for us.
616bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
617 // If this is an AND, check if we have a compare on either side. As long as
618 // one side guarantees the mask is zero extended, the AND will preserve those
619 // zeros.
620 if (N->getOpcode() == ISD::AND)
621 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
622 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
623
624 return isLegalMaskCompare(N, Subtarget);
625}
626
627bool
628X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
629 if (OptLevel == CodeGenOptLevel::None)
630 return false;
631
632 if (!N.hasOneUse())
633 return false;
634
635 if (N.getOpcode() != ISD::LOAD)
636 return true;
637
638 // Don't fold non-temporal loads if we have an instruction for them.
639 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
640 return false;
641
642 // If N is a load, do additional profitability checks.
643 if (U == Root) {
644 switch (U->getOpcode()) {
645 default: break;
646 case X86ISD::ADD:
647 case X86ISD::ADC:
648 case X86ISD::SUB:
649 case X86ISD::SBB:
650 case X86ISD::AND:
651 case X86ISD::XOR:
652 case X86ISD::OR:
653 case ISD::ADD:
654 case ISD::UADDO_CARRY:
655 case ISD::AND:
656 case ISD::OR:
657 case ISD::XOR: {
658 SDValue Op1 = U->getOperand(1);
659
660 // If the other operand is an 8-bit immediate we should fold the immediate
661 // instead. This reduces code size.
662 // e.g.
663 // movl 4(%esp), %eax
664 // addl $4, %eax
665 // vs.
666 // movl $4, %eax
667 // addl 4(%esp), %eax
668 // The former is 2 bytes shorter. In the case where the increment is 1,
669 // the saving can be 4 bytes (by using incl %eax).
670 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
671 if (Imm->getAPIntValue().isSignedIntN(8))
672 return false;
673
674 // If this is a 64-bit AND with an immediate that fits in 32-bits,
675 // prefer using the smaller and over folding the load. This is needed to
676 // make sure immediates created by shrinkAndImmediate are always folded.
677 // Ideally we would narrow the load during DAG combine and get the
678 // best of both worlds.
679 if (U->getOpcode() == ISD::AND &&
680 Imm->getAPIntValue().getBitWidth() == 64 &&
681 Imm->getAPIntValue().isIntN(32))
682 return false;
683
684 // If this is really a zext_inreg that can be represented with a movzx
685 // instruction, prefer that.
686 // TODO: We could shrink the load and fold if it is non-volatile.
687 if (U->getOpcode() == ISD::AND &&
688 (Imm->getAPIntValue() == UINT8_MAX ||
689 Imm->getAPIntValue() == UINT16_MAX ||
690 Imm->getAPIntValue() == UINT32_MAX))
691 return false;
692
693 // For ADD/SUB, we can negate the immediate and use the opposite operation
694 // to fit 128 into a sign-extended 8-bit immediate.
695 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
696 (-Imm->getAPIntValue()).isSignedIntN(8))
697 return false;
698
699 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
700 (-Imm->getAPIntValue()).isSignedIntN(8) &&
701 hasNoCarryFlagUses(SDValue(U, 1)))
702 return false;
703 }
704
705 // If the other operand is a TLS address, we should fold it instead.
706 // This produces
707 // movl %gs:0, %eax
708 // leal i@NTPOFF(%eax), %eax
709 // instead of
710 // movl $i@NTPOFF, %eax
711 // addl %gs:0, %eax
712 // if the block also has an access to a second TLS address this will save
713 // a load.
714 // FIXME: This is probably also true for non-TLS addresses.
715 if (Op1.getOpcode() == X86ISD::Wrapper) {
716 SDValue Val = Op1.getOperand(0);
717 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
718 return false;
719 }
720
721 // Don't fold load if this matches the BTS/BTR/BTC patterns.
722 // BTS: (or X, (shl 1, n))
723 // BTR: (and X, (rotl -2, n))
724 // BTC: (xor X, (shl 1, n))
725 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
726 if (U->getOperand(0).getOpcode() == ISD::SHL &&
727 isOneConstant(U->getOperand(0).getOperand(0)))
728 return false;
729
730 if (U->getOperand(1).getOpcode() == ISD::SHL &&
731 isOneConstant(U->getOperand(1).getOperand(0)))
732 return false;
733 }
734 if (U->getOpcode() == ISD::AND) {
735 SDValue U0 = U->getOperand(0);
736 SDValue U1 = U->getOperand(1);
737 if (U0.getOpcode() == ISD::ROTL) {
738 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
739 if (C && C->getSExtValue() == -2)
740 return false;
741 }
742
743 if (U1.getOpcode() == ISD::ROTL) {
744 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
745 if (C && C->getSExtValue() == -2)
746 return false;
747 }
748 }
749
750 break;
751 }
752 case ISD::SHL:
753 case ISD::SRA:
754 case ISD::SRL:
755 // Don't fold a load into a shift by immediate. The BMI2 instructions
756 // support folding a load, but not an immediate. The legacy instructions
757 // support folding an immediate, but can't fold a load. Folding an
758 // immediate is preferable to folding a load.
759 if (isa<ConstantSDNode>(U->getOperand(1)))
760 return false;
761
762 break;
763 }
764 }
765
766 // Prevent folding a load if this can be implemented with an insert_subreg or
767 // a move that implicitly zeroes.
768 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
769 isNullConstant(Root->getOperand(2)) &&
770 (Root->getOperand(0).isUndef() ||
771 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
772 return false;
773
774 return true;
775}
776
777// Indicates it is profitable to form an AVX512 masked operation. Returning
778// false will favor a register-register masked move or vblendm and the
779// operation will be selected separately.
780bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
781 assert(
782 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
783 "Unexpected opcode!");
784
785 // If the operation has additional users, the operation will be duplicated.
786 // Check the use count to prevent that.
787 // FIXME: Are there cheap opcodes we might want to duplicate?
788 return N->getOperand(1).hasOneUse();
789}
790
791/// Replace the original chain operand of the call with
792/// load's chain operand and move load below the call's chain operand.
793static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
794 SDValue Call, SDValue OrigChain) {
795 SmallVector<SDValue, 8> Ops;
796 SDValue Chain = OrigChain.getOperand(0);
797 if (Chain.getNode() == Load.getNode())
798 Ops.push_back(Load.getOperand(0));
799 else {
800 assert(Chain.getOpcode() == ISD::TokenFactor &&
801 "Unexpected chain operand");
802 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
803 if (Chain.getOperand(i).getNode() == Load.getNode())
804 Ops.push_back(Load.getOperand(0));
805 else
806 Ops.push_back(Chain.getOperand(i));
807 SDValue NewChain =
808 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
809 Ops.clear();
810 Ops.push_back(NewChain);
811 }
812 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
813 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
814 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
815 Load.getOperand(1), Load.getOperand(2));
816
817 Ops.clear();
818 Ops.push_back(SDValue(Load.getNode(), 1));
819 Ops.append(Call->op_begin() + 1, Call->op_end());
820 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
821}
822
823/// Return true if call address is a load and it can be
824/// moved below CALLSEQ_START and the chains leading up to the call.
825/// Return the CALLSEQ_START by reference as a second output.
826/// In the case of a tail call, there isn't a callseq node between the call
827/// chain and the load.
828static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
829 // The transformation is somewhat dangerous if the call's chain was glued to
830 // the call. After MoveBelowOrigChain the load is moved between the call and
831 // the chain, this can create a cycle if the load is not folded. So it is
832 // *really* important that we are sure the load will be folded.
833 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
834 return false;
835 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
836 if (!LD ||
837 !LD->isSimple() ||
838 LD->getAddressingMode() != ISD::UNINDEXED ||
839 LD->getExtensionType() != ISD::NON_EXTLOAD)
840 return false;
841
842 // Now let's find the callseq_start.
843 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
844 if (!Chain.hasOneUse())
845 return false;
846 Chain = Chain.getOperand(0);
847 }
848
849 if (!Chain.getNumOperands())
850 return false;
851 // Since we are not checking for AA here, conservatively abort if the chain
852 // writes to memory. It's not safe to move the callee (a load) across a store.
853 if (isa<MemSDNode>(Chain.getNode()) &&
854 cast<MemSDNode>(Chain.getNode())->writeMem())
855 return false;
856 if (Chain.getOperand(0).getNode() == Callee.getNode())
857 return true;
858 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
859 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
860 Callee.getValue(1).hasOneUse())
861 return true;
862 return false;
863}
864
865static bool isEndbrImm64(uint64_t Imm) {
866// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
867// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
868 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
869 return false;
870
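// Bytes that may legally appear before the mandatory 0xF3 prefix: segment
// overrides (0x26, 0x2e, 0x36, 0x3e, 0x64, 0x65), operand/address-size
// overrides (0x66, 0x67), and LOCK/REPNE (0xf0, 0xf2).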
871 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
872 0x65, 0x66, 0x67, 0xf0, 0xf2};
873 int i = 24; // the low 24 bits (0x0F1EFA) have already matched
874 while (i < 64) {
875 uint8_t Byte = (Imm >> i) & 0xFF;
876 if (Byte == 0xF3)
877 return true;
878 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
879 return false;
880 i += 8;
881 }
882
883 return false;
884}
885
886static bool needBWI(MVT VT) {
887 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
888}
889
890void X86DAGToDAGISel::PreprocessISelDAG() {
891 bool MadeChange = false;
892 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
893 E = CurDAG->allnodes_end(); I != E; ) {
894 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
895
896 // This is for CET enhancement.
897 //
898 // ENDBR32 and ENDBR64 have specific opcodes:
899 // ENDBR32: F3 0F 1E FB
900 // ENDBR64: F3 0F 1E FA
901 // We want to prevent attackers from finding unintended ENDBR32/64
902 // opcode matches in the binary.
903 // Here's an example:
904 // If the compiler had to generate asm for the following code:
905 // a = 0xF30F1EFA
906 // it could, for example, generate:
907 // mov 0xF30F1EFA, dword ptr[a]
908 // In such a case, the binary would include a gadget that starts
909 // with a fake ENDBR64 opcode. Therefore, we split such generation
910 // into multiple operations so the pattern does not show up in the binary.
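// The constant is rebuilt as NOT(constant(~Imm)), so neither emitted
// instruction carries the ENDBR byte pattern as an immediate.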
911 if (N->getOpcode() == ISD::Constant) {
912 MVT VT = N->getSimpleValueType(0);
913 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
914 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
915 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
916 // Check that the cf-protection-branch is enabled.
917 Metadata *CFProtectionBranch =
918 MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
919 if (CFProtectionBranch || IndirectBranchTracking) {
920 SDLoc dl(N);
921 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
922 Complement = CurDAG->getNOT(dl, Complement, VT);
923 --I;
924 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
925 ++I;
926 MadeChange = true;
927 continue;
928 }
929 }
930 }
931
932 // If this is a target specific AND node with no flag usages, turn it back
933 // into ISD::AND to enable test instruction matching.
934 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
935 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
936 N->getOperand(0), N->getOperand(1));
937 --I;
938 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
939 ++I;
940 MadeChange = true;
941 continue;
942 }
943
944 // Convert vector increment or decrement to sub/add with an all-ones
945 // constant:
946 // add X, <1, 1...> --> sub X, <-1, -1...>
947 // sub X, <1, 1...> --> add X, <-1, -1...>
948 // The all-ones vector constant can be materialized using a pcmpeq
949 // instruction that is commonly recognized as an idiom (has no register
950 // dependency), so that's better/smaller than loading a splat 1 constant.
951 //
952 // But don't do this if it would inhibit a potentially profitable load
953 // folding opportunity for the other operand. That only occurs with the
954 // intersection of:
955 // (1) The other operand (op0) is load foldable.
956 // (2) The op is an add (otherwise, we are *creating* an add and can still
957 // load fold the other op).
958 // (3) The target has AVX (otherwise, we have a destructive add and can't
959 // load fold the other op without killing the constant op).
960 // (4) The constant 1 vector has multiple uses (so it is profitable to load
961 // into a register anyway).
962 auto mayPreventLoadFold = [&]() {
963 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
964 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
965 !N->getOperand(1).hasOneUse();
966 };
967 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
968 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
969 APInt SplatVal;
970 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
971 SplatVal.isOne()) {
972 SDLoc DL(N);
973
974 MVT VT = N->getSimpleValueType(0);
975 unsigned NumElts = VT.getSizeInBits() / 32;
976 SDValue AllOnes =
977 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
978 AllOnes = CurDAG->getBitcast(VT, AllOnes);
979
980 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
981 SDValue Res =
982 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
983 --I;
984 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
985 ++I;
986 MadeChange = true;
987 continue;
988 }
989 }
990
991 switch (N->getOpcode()) {
992 case X86ISD::VBROADCAST: {
993 MVT VT = N->getSimpleValueType(0);
994 // Emulate v32i16/v64i8 broadcast without BWI.
995 if (!Subtarget->hasBWI() && needBWI(VT)) {
996 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
997 SDLoc dl(N);
998 SDValue NarrowBCast =
999 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1000 SDValue Res =
1001 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1002 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1003 unsigned Index = NarrowVT.getVectorMinNumElements();
1004 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1005 CurDAG->getIntPtrConstant(Index, dl));
1006
1007 --I;
1008 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1009 ++I;
1010 MadeChange = true;
1011 continue;
1012 }
1013
1014 break;
1015 }
1016 case X86ISD::VBROADCAST_LOAD: {
1017 MVT VT = N->getSimpleValueType(0);
1018 // Emulate v32i16/v64i8 broadcast without BWI.
1019 if (!Subtarget->hasBWI() && needBWI(VT)) {
1020 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1021 auto *MemNode = cast<MemSDNode>(N);
1022 SDLoc dl(N);
1023 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1024 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1025 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1026 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1027 MemNode->getMemOperand());
1028 SDValue Res =
1029 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1030 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1031 unsigned Index = NarrowVT.getVectorMinNumElements();
1032 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1033 CurDAG->getIntPtrConstant(Index, dl));
1034
1035 --I;
1036 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1037 CurDAG->ReplaceAllUsesWith(N, To);
1038 ++I;
1039 MadeChange = true;
1040 continue;
1041 }
1042
1043 break;
1044 }
1045 case ISD::LOAD: {
1046 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1047 // load, then just extract the lower subvector and avoid the second load.
1048 auto *Ld = cast<LoadSDNode>(N);
1049 MVT VT = N->getSimpleValueType(0);
1050 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1051 !(VT.is128BitVector() || VT.is256BitVector()))
1052 break;
1053
1054 MVT MaxVT = VT;
1055 SDNode *MaxLd = nullptr;
1056 SDValue Ptr = Ld->getBasePtr();
1057 SDValue Chain = Ld->getChain();
1058 for (SDNode *User : Ptr->uses()) {
1059 auto *UserLd = dyn_cast<LoadSDNode>(User);
1060 MVT UserVT = User->getSimpleValueType(0);
1061 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1062 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1063 !User->hasAnyUseOfValue(1) &&
1064 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1065 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1066 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1067 MaxLd = User;
1068 MaxVT = UserVT;
1069 }
1070 }
1071 if (MaxLd) {
1072 SDLoc dl(N);
1073 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1074 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1075 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1076 SDValue(MaxLd, 0),
1077 CurDAG->getIntPtrConstant(0, dl));
1078 SDValue Res = CurDAG->getBitcast(VT, Extract);
1079
1080 --I;
1081 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1082 CurDAG->ReplaceAllUsesWith(N, To);
1083 ++I;
1084 MadeChange = true;
1085 continue;
1086 }
1087 break;
1088 }
1089 case ISD::VSELECT: {
1090 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1091 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1092 if (EleVT == MVT::i1)
1093 break;
1094
1095 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1096 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1097 "We can't replace VSELECT with BLENDV in vXi16!");
1098 SDValue R;
1099 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1100 EleVT.getSizeInBits()) {
1101 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1102 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1103 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1104 } else {
1105 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1106 N->getOperand(0), N->getOperand(1),
1107 N->getOperand(2));
1108 }
1109 --I;
1110 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1111 ++I;
1112 MadeChange = true;
1113 continue;
1114 }
1115 case ISD::FP_ROUND:
1116 case ISD::STRICT_FP_ROUND:
1117 case ISD::FP_TO_SINT:
1118 case ISD::FP_TO_UINT:
1119 case ISD::STRICT_FP_TO_SINT:
1120 case ISD::STRICT_FP_TO_UINT: {
1121 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1122 // don't need 2 sets of patterns.
1123 if (!N->getSimpleValueType(0).isVector())
1124 break;
1125
1126 unsigned NewOpc;
1127 switch (N->getOpcode()) {
1128 default: llvm_unreachable("Unexpected opcode!");
1129 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1130 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1131 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1132 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1133 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1134 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1135 }
1136 SDValue Res;
1137 if (N->isStrictFPOpcode())
1138 Res =
1139 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1140 {N->getOperand(0), N->getOperand(1)});
1141 else
1142 Res =
1143 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1144 N->getOperand(0));
1145 --I;
1146 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1147 ++I;
1148 MadeChange = true;
1149 continue;
1150 }
1151 case ISD::SHL:
1152 case ISD::SRA:
1153 case ISD::SRL: {
1154 // Replace vector shifts with their X86 specific equivalent so we don't
1155 // need 2 sets of patterns.
1156 if (!N->getValueType(0).isVector())
1157 break;
1158
1159 unsigned NewOpc;
1160 switch (N->getOpcode()) {
1161 default: llvm_unreachable("Unexpected opcode!");
1162 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1163 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1164 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1165 }
1166 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1167 N->getOperand(0), N->getOperand(1));
1168 --I;
1169 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1170 ++I;
1171 MadeChange = true;
1172 continue;
1173 }
1174 case ISD::ANY_EXTEND:
1175 case ISD::ANY_EXTEND_VECTOR_INREG: {
1176 // Replace vector any extend with the zero extend equivalents so we don't
1177 // need 2 sets of patterns. Ignore vXi1 extensions.
1178 if (!N->getValueType(0).isVector())
1179 break;
1180
1181 unsigned NewOpc;
1182 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1183 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1184 "Unexpected opcode for mask vector!");
1185 NewOpc = ISD::SIGN_EXTEND;
1186 } else {
1187 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1188 ? ISD::ZERO_EXTEND
1189 : ISD::ZERO_EXTEND_VECTOR_INREG;
1190 }
1191
1192 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1193 N->getOperand(0));
1194 --I;
1195 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1196 ++I;
1197 MadeChange = true;
1198 continue;
1199 }
1200 case ISD::FCEIL:
1201 case ISD::STRICT_FCEIL:
1202 case ISD::FFLOOR:
1203 case ISD::STRICT_FFLOOR:
1204 case ISD::FTRUNC:
1205 case ISD::STRICT_FTRUNC:
1206 case ISD::FROUNDEVEN:
1207 case ISD::STRICT_FROUNDEVEN:
1208 case ISD::FNEARBYINT:
1209 case ISD::STRICT_FNEARBYINT:
1210 case ISD::FRINT:
1211 case ISD::STRICT_FRINT: {
1212 // Replace fp rounding with their X86 specific equivalent so we don't
1213 // need 2 sets of patterns.
1214 unsigned Imm;
1215 switch (N->getOpcode()) {
1216 default: llvm_unreachable("Unexpected opcode!");
1217 case ISD::STRICT_FCEIL:
1218 case ISD::FCEIL: Imm = 0xA; break;
1219 case ISD::STRICT_FFLOOR:
1220 case ISD::FFLOOR: Imm = 0x9; break;
1221 case ISD::STRICT_FTRUNC:
1222 case ISD::FTRUNC: Imm = 0xB; break;
1223 case ISD::STRICT_FROUNDEVEN:
1224 case ISD::FROUNDEVEN: Imm = 0x8; break;
1225 case ISD::STRICT_FNEARBYINT:
1226 case ISD::FNEARBYINT: Imm = 0xC; break;
1227 case ISD::STRICT_FRINT:
1228 case ISD::FRINT: Imm = 0x4; break;
1229 }
1230 SDLoc dl(N);
1231 bool IsStrict = N->isStrictFPOpcode();
1232 SDValue Res;
1233 if (IsStrict)
1234 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1235 {N->getValueType(0), MVT::Other},
1236 {N->getOperand(0), N->getOperand(1),
1237 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1238 else
1239 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1240 N->getOperand(0),
1241 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1242 --I;
1243 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1244 ++I;
1245 MadeChange = true;
1246 continue;
1247 }
1248 case X86ISD::FANDN:
1249 case X86ISD::FAND:
1250 case X86ISD::FOR:
1251 case X86ISD::FXOR: {
1252 // Widen scalar fp logic ops to vector to reduce isel patterns.
1253 // FIXME: Can we do this during lowering/combine.
1254 MVT VT = N->getSimpleValueType(0);
1255 if (VT.isVector() || VT == MVT::f128)
1256 break;
1257
1258 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1259 : VT == MVT::f32 ? MVT::v4f32
1260 : MVT::v8f16;
1261
1262 SDLoc dl(N);
1263 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1264 N->getOperand(0));
1265 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1266 N->getOperand(1));
1267
1268 SDValue Res;
1269 if (Subtarget->hasSSE2()) {
1270 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1271 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1272 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1273 unsigned Opc;
1274 switch (N->getOpcode()) {
1275 default: llvm_unreachable("Unexpected opcode!");
1276 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1277 case X86ISD::FAND: Opc = ISD::AND; break;
1278 case X86ISD::FOR: Opc = ISD::OR; break;
1279 case X86ISD::FXOR: Opc = ISD::XOR; break;
1280 }
1281 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1282 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1283 } else {
1284 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1285 }
1286 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1287 CurDAG->getIntPtrConstant(0, dl));
1288 --I;
1289 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1290 ++I;
1291 MadeChange = true;
1292 continue;
1293 }
1294 }
1295
1296 if (OptLevel != CodeGenOptLevel::None &&
1297 // Only do this when the target can fold the load into the call or
1298 // jmp.
1299 !Subtarget->useIndirectThunkCalls() &&
1300 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1301 (N->getOpcode() == X86ISD::TC_RETURN &&
1302 (Subtarget->is64Bit() ||
1303 !getTargetMachine().isPositionIndependent())))) {
1304 /// Also try moving call address load from outside callseq_start to just
1305 /// before the call to allow it to be folded.
1306 ///
1307 /// [Load chain]
1308 /// ^
1309 /// |
1310 /// [Load]
1311 /// ^ ^
1312 /// | |
1313 /// / \--
1314 /// / |
1315 ///[CALLSEQ_START] |
1316 /// ^ |
1317 /// | |
1318 /// [LOAD/C2Reg] |
1319 /// | |
1320 /// \ /
1321 /// \ /
1322 /// [CALL]
1323 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1324 SDValue Chain = N->getOperand(0);
1325 SDValue Load = N->getOperand(1);
1326 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1327 continue;
1328 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1329 ++NumLoadMoved;
1330 MadeChange = true;
1331 continue;
1332 }
1333
1334 // Lower fpround and fpextend nodes that target the FP stack to be a store
1335 // and a load from the stack. This is a gross hack. We would like to simply mark
1336 // these as being illegal, but when we do that, legalize produces these when
1337 // it expands calls, then expands these in the same legalize pass. We would
1338 // like dag combine to be able to hack on these between the call expansion
1339 // and the node legalization. As such this pass basically does "really
1340 // late" legalization of these inline with the X86 isel pass.
1341 // FIXME: This should only happen when not compiled with -O0.
1342 switch (N->getOpcode()) {
1343 default: continue;
1344 case ISD::FP_ROUND:
1345 case ISD::FP_EXTEND:
1346 {
1347 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1348 MVT DstVT = N->getSimpleValueType(0);
1349
1350 // If any of the sources are vectors, no fp stack involved.
1351 if (SrcVT.isVector() || DstVT.isVector())
1352 continue;
1353
1354 // If the source and destination are SSE registers, then this is a legal
1355 // conversion that should not be lowered.
1356 const X86TargetLowering *X86Lowering =
1357 static_cast<const X86TargetLowering *>(TLI);
1358 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1359 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1360 if (SrcIsSSE && DstIsSSE)
1361 continue;
1362
1363 if (!SrcIsSSE && !DstIsSSE) {
1364 // If this is an FPStack extension, it is a noop.
1365 if (N->getOpcode() == ISD::FP_EXTEND)
1366 continue;
1367 // If this is a value-preserving FPStack truncation, it is a noop.
1368 if (N->getConstantOperandVal(1))
1369 continue;
1370 }
1371
1372 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1373 // FPStack has extload and truncstore. SSE can fold direct loads into other
1374 // operations. Based on this, decide what we want to do.
1375 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1376 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1377 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1378 MachinePointerInfo MPI =
1379 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1380 SDLoc dl(N);
1381
1382 // FIXME: optimize the case where the src/dest is a load or store?
1383
1384 SDValue Store = CurDAG->getTruncStore(
1385 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1386 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1387 MemTmp, MPI, MemVT);
1388
1389 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1390 // extload we created. This will cause general havoc on the DAG because
1391 // anything below the conversion could be folded into other existing nodes.
1392 // To avoid invalidating 'I', back it up to the convert node.
1393 --I;
1394 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1395 break;
1396 }
1397
1398 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1399 // dealing with the chain differently, as there is already a preexisting chain.
1400 case ISD::STRICT_FP_ROUND:
1401 case ISD::STRICT_FP_EXTEND:
1402 {
1403 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1404 MVT DstVT = N->getSimpleValueType(0);
1405
1406 // If any of the sources are vectors, no fp stack involved.
1407 if (SrcVT.isVector() || DstVT.isVector())
1408 continue;
1409
1410 // If the source and destination are SSE registers, then this is a legal
1411 // conversion that should not be lowered.
1412 const X86TargetLowering *X86Lowering =
1413 static_cast<const X86TargetLowering *>(TLI);
1414 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1415 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1416 if (SrcIsSSE && DstIsSSE)
1417 continue;
1418
1419 if (!SrcIsSSE && !DstIsSSE) {
1420 // If this is an FPStack extension, it is a noop.
1421 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1422 continue;
1423 // If this is a value-preserving FPStack truncation, it is a noop.
1424 if (N->getConstantOperandVal(2))
1425 continue;
1426 }
1427
1428 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1429 // FPStack has extload and truncstore. SSE can fold direct loads into other
1430 // operations. Based on this, decide what we want to do.
1431 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1432 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1433 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1434 MachinePointerInfo MPI =
1435 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1436 SDLoc dl(N);
1437
1438 // FIXME: optimize the case where the src/dest is a load or store?
1439
1440 // Since the operation is StrictFP, use the preexisting chain.
1441 SDValue Store, Result;
1442 if (!SrcIsSSE) {
1443 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1444 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1445 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1446 MPI, /*Align*/ std::nullopt,
1447 MachineMemOperand::MOStore);
1448 if (N->getFlags().hasNoFPExcept()) {
1449 SDNodeFlags Flags = Store->getFlags();
1450 Flags.setNoFPExcept(true);
1451 Store->setFlags(Flags);
1452 }
1453 } else {
1454 assert(SrcVT == MemVT && "Unexpected VT!");
1455 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1456 MPI);
1457 }
1458
1459 if (!DstIsSSE) {
1460 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1461 SDValue Ops[] = {Store, MemTmp};
1462 Result = CurDAG->getMemIntrinsicNode(
1463 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1464 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1465 if (N->getFlags().hasNoFPExcept()) {
1466 SDNodeFlags Flags = Result->getFlags();
1467 Flags.setNoFPExcept(true);
1468 Result->setFlags(Flags);
1469 }
1470 } else {
1471 assert(DstVT == MemVT && "Unexpected VT!");
1472 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1473 }
1474
1475 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1476 // extload we created. This will cause general havoc on the DAG because
1477 // anything below the conversion could be folded into other existing nodes.
1478 // To avoid invalidating 'I', back it up to the convert node.
1479 --I;
1480 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1481 break;
1482 }
1483 }
1484
1485
1486 // Now that we did that, the node is dead. Increment the iterator to the
1487 // next node to process, then delete N.
1488 ++I;
1489 MadeChange = true;
1490 }
1491
1492 // Remove any dead nodes that may have been left behind.
1493 if (MadeChange)
1494 CurDAG->RemoveDeadNodes();
1495}
1496
1497// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
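// After an 8-bit divrem, the AH result is extracted with a
// MOVZX32rr8_NOREX/MOVSX32rr8_NOREX; a later movzx/movsx of the low 8 bits
// of that value repeats the extension and can simply reuse the first result.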
1498bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1499 unsigned Opc = N->getMachineOpcode();
1500 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1501 Opc != X86::MOVSX64rr8)
1502 return false;
1503
1504 SDValue N0 = N->getOperand(0);
1505
1506 // We need to be extracting the lower bit of an extend.
1507 if (!N0.isMachineOpcode() ||
1508 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1509 N0.getConstantOperandVal(1) != X86::sub_8bit)
1510 return false;
1511
1512 // We're looking for either a movsx or movzx to match the original opcode.
1513 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1514 : X86::MOVSX32rr8_NOREX;
1515 SDValue N00 = N0.getOperand(0);
1516 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1517 return false;
1518
1519 if (Opc == X86::MOVSX64rr8) {
1520 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1521 // to 64.
1522 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1523 MVT::i64, N00);
1524 ReplaceUses(N, Extend);
1525 } else {
1526 // Ok we can drop this extend and just use the original extend.
1527 ReplaceUses(N, N00.getNode());
1528 }
1529
1530 return true;
1531}
1532
1533void X86DAGToDAGISel::PostprocessISelDAG() {
1534 // Skip peepholes at -O0.
1535 if (TM.getOptLevel() == CodeGenOptLevel::None)
1536 return;
1537
1538 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1539
1540 bool MadeChange = false;
1541 while (Position != CurDAG->allnodes_begin()) {
1542 SDNode *N = &*--Position;
1543 // Skip dead nodes and any non-machine opcodes.
1544 if (N->use_empty() || !N->isMachineOpcode())
1545 continue;
1546
1547 if (tryOptimizeRem8Extend(N)) {
1548 MadeChange = true;
1549 continue;
1550 }
1551
1552 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1553 // the same. Rewrite to remove the AND.
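// For example, "andl %esi, %edi; testl %edi, %edi" can become
// "testl %esi, %edi" when the AND result has no other uses.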
1554 unsigned Opc = N->getMachineOpcode();
1555 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1556 Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1557 N->getOperand(0) == N->getOperand(1) &&
1558 N->getOperand(0)->hasNUsesOfValue(2, N->getOperand(0).getResNo()) &&
1559 N->getOperand(0).isMachineOpcode()) {
1560 SDValue And = N->getOperand(0);
1561 unsigned N0Opc = And.getMachineOpcode();
1562 if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1563 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr ||
1564 N0Opc == X86::AND8rr_ND || N0Opc == X86::AND16rr_ND ||
1565 N0Opc == X86::AND32rr_ND || N0Opc == X86::AND64rr_ND) &&
1566 !And->hasAnyUseOfValue(1)) {
1567 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1568 MVT::i32,
1569 And.getOperand(0),
1570 And.getOperand(1));
1571 ReplaceUses(N, Test);
1572 MadeChange = true;
1573 continue;
1574 }
1575 if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1576 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm ||
1577 N0Opc == X86::AND8rm_ND || N0Opc == X86::AND16rm_ND ||
1578 N0Opc == X86::AND32rm_ND || N0Opc == X86::AND64rm_ND) &&
1579 !And->hasAnyUseOfValue(1)) {
1580 unsigned NewOpc;
1581#define CASE_ND(OP) \
1582 case X86::OP: \
1583 case X86::OP##_ND:
1584#define FROM_TO(A, B) \
1585 CASE_ND(A) NewOpc = X86::B; \
1586 break;
1587 switch (N0Opc) {
1588 FROM_TO(AND8rm, TEST8mr);
1589 FROM_TO(AND16rm, TEST16mr);
1590 FROM_TO(AND32rm, TEST32mr);
1591 FROM_TO(AND64rm, TEST64mr);
1592 }
1593#undef FROM_TO
1594#undef CASE_ND
1595
1596 // Need to swap the memory and register operand.
1597 SDValue Ops[] = { And.getOperand(1),
1598 And.getOperand(2),
1599 And.getOperand(3),
1600 And.getOperand(4),
1601 And.getOperand(5),
1602 And.getOperand(0),
1603 And.getOperand(6) /* Chain */ };
1604 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1605 MVT::i32, MVT::Other, Ops);
1606 CurDAG->setNodeMemRefs(
1607 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1608 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1609 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1610 MadeChange = true;
1611 continue;
1612 }
1613 }
1614
1615 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1616 // used. We're doing this late so we can prefer to fold the AND into masked
1617 // comparisons. Doing that can be better for the live range of the mask
1618 // register.
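 // For example (an illustrative sketch), a sequence such as:
 //   kandw    %k2, %k1, %k0
 //   kortestw %k0, %k0
 // can become:
 //   ktestw   %k2, %k1
 // because KTEST sets ZF exactly when the AND of its operands is zero.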
1619 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1620 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1621 N->getOperand(0) == N->getOperand(1) &&
1622 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1623 N->getOperand(0).isMachineOpcode() &&
1624 onlyUsesZeroFlag(SDValue(N, 0))) {
1625 SDValue And = N->getOperand(0);
1626 unsigned N0Opc = And.getMachineOpcode();
1627 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1628 // KAND instructions and KTEST use the same ISA feature.
1629 if (N0Opc == X86::KANDBrr ||
1630 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1631 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1632 unsigned NewOpc;
1633 switch (Opc) {
1634 default: llvm_unreachable("Unexpected opcode!");
1635 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1636 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1637 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1638 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1639 }
1640 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1641 MVT::i32,
1642 And.getOperand(0),
1643 And.getOperand(1));
1644 ReplaceUses(N, KTest);
1645 MadeChange = true;
1646 continue;
1647 }
1648 }
1649
1650 // Attempt to remove vector moves that were inserted to zero upper bits.
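 // For example (an illustrative sketch, in MIR-like pseudo-code):
 //   %v = VADDPSrr %a, %b        ; VEX-encoded, already zeroes the upper bits
 //   %m = VMOVAPSrr %v           ; move inserted only to zero the upper bits
 //   SUBREG_TO_REG 0, %m, sub_xmm
 // can use %v directly and drop the VMOVAPSrr.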
1651 if (Opc != TargetOpcode::SUBREG_TO_REG)
1652 continue;
1653
1654 unsigned SubRegIdx = N->getConstantOperandVal(2);
1655 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1656 continue;
1657
1658 SDValue Move = N->getOperand(1);
1659 if (!Move.isMachineOpcode())
1660 continue;
1661
1662 // Make sure it's one of the move opcodes we recognize.
1663 switch (Move.getMachineOpcode()) {
1664 default:
1665 continue;
1666 case X86::VMOVAPDrr: case X86::VMOVUPDrr:
1667 case X86::VMOVAPSrr: case X86::VMOVUPSrr:
1668 case X86::VMOVDQArr: case X86::VMOVDQUrr:
1669 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
1670 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
1671 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
1672 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
1673 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
1674 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1675 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1676 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
1677 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
1678 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1679 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1680 break;
1681 }
1682
1683 SDValue In = Move.getOperand(0);
1684 if (!In.isMachineOpcode() ||
1685 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1686 continue;
1687
1688 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1689 // the SHA instructions, which use a legacy encoding.
1690 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1691 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1692 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1693 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1694 continue;
1695
1696 // The producing instruction is already a vector instruction, so we can
1697 // drop the move.
1698 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1699 MadeChange = true;
1700 }
1701
1702 if (MadeChange)
1703 CurDAG->RemoveDeadNodes();
1704}
1705
1706
1707/// Emit any code that needs to be executed only in the main function.
1708void X86DAGToDAGISel::emitSpecialCodeForMain() {
1709 if (Subtarget->isTargetCygMing()) {
1710 TargetLowering::ArgListTy Args;
1711 auto &DL = CurDAG->getDataLayout();
1712
1713 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1714 CLI.setChain(CurDAG->getRoot())
1715 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1716 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1717 std::move(Args));
1718 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1719 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1720 CurDAG->setRoot(Result.second);
1721 }
1722}
1723
1724void X86DAGToDAGISel::emitFunctionEntryCode() {
1725 // If this is main, emit special code for main.
1726 const Function &F = MF->getFunction();
1727 if (F.hasExternalLinkage() && F.getName() == "main")
1728 emitSpecialCodeForMain();
1729}
1730
1731static bool isDispSafeForFrameIndex(int64_t Val) {
1732 // On 64-bit platforms, we can run into an issue where a frame index
1733 // includes a displacement that, when added to the explicit displacement,
1734 // will overflow the displacement field. Assuming that the frame index
1735 // displacement fits into a 31-bit integer (which is only slightly more
1736 // aggressive than the current fundamental assumption that it fits into
1737 // a 32-bit integer), a 31-bit disp should always be safe.
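// As a quick sanity check of the bound (illustrative only): isInt<31>(0x3fffffff)
// is true while isInt<31>(0x40000000) is false, so combined displacements of
// 2^30 or more are rejected here.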
1738 return isInt<31>(Val);
1739}
1740
1741bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1742 X86ISelAddressMode &AM) {
1743 // We may have already matched a displacement and the caller just added the
1744 // symbolic displacement. So we still need to do the checks even if Offset
1745 // is zero.
1746
1747 int64_t Val = AM.Disp + Offset;
1748
1749 // Cannot combine ExternalSymbol displacements with integer offsets.
1750 if (Val != 0 && (AM.ES || AM.MCSym))
1751 return true;
1752
1753 CodeModel::Model M = TM.getCodeModel();
1754 if (Subtarget->is64Bit()) {
1755 if (Val != 0 &&
1756 !X86::isOffsetSuitableForCodeModel(Val, M,
1757 AM.hasSymbolicDisplacement()))
1758 return true;
1759 // In addition to the checks required for a register base, check that
1760 // we do not try to use an unsafe Disp with a frame index.
1761 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1762 !isDispSafeForFrameIndex(Val))
1763 return true;
1764 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1765 // 64 bits. Instructions with 32-bit register addresses perform this zero
1766 // extension for us and we can safely ignore the high bits of Offset.
1767 // Instructions with only a 32-bit immediate address do not, though: they
1768 // sign extend instead. This means only the low 2GB of the address space
1769 // is directly addressable; we need indirect addressing for the high 2GB of
1770 // the address space.
1771 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1772 // implicit zero extension of instructions would cover up any problem.
1773 // However, we have asserts elsewhere that get triggered if we do, so keep
1774 // the checks for now.
1775 // TODO: We would actually be able to accept these, as well as the same
1776 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1777 // to get an address size override to be emitted. However, this
1778 // pseudo-register is not part of any register class and therefore causes
1779 // MIR verification to fail.
1780 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1781 !AM.hasBaseOrIndexReg())
1782 return true;
1783 }
1784 AM.Disp = Val;
1785 return false;
1786}
1787
1788bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1789 bool AllowSegmentRegForX32) {
1790 SDValue Address = N->getOperand(1);
1791
1792 // load gs:0 -> GS segment register.
1793 // load fs:0 -> FS segment register.
1794 //
1795 // This optimization is generally valid because the GNU TLS model defines that
1796 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1797 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1798 // zero-extended to 64 bits and then added to the base address, which gives
1799 // unwanted results when the register holds a negative value.
1800 // For more information see http://people.redhat.com/drepper/tls.pdf
1801 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1802 !IndirectTlsSegRefs &&
1803 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1804 Subtarget->isTargetFuchsia())) {
1805 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1806 return true;
1807 switch (N->getPointerInfo().getAddrSpace()) {
1808 case X86AS::GS:
1809 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1810 return false;
1811 case X86AS::FS:
1812 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1813 return false;
1814 // Address space X86AS::SS is not handled here, because it is not used to
1815 // address TLS areas.
1816 }
1817 }
1818
1819 return true;
1820}
1821
1822/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1823/// mode. These wrap things that will resolve down into a symbol reference.
1824/// If no match is possible, this returns true, otherwise it returns false.
1825bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1826 // If the addressing mode already has a symbol as the displacement, we can
1827 // never match another symbol.
1828 if (AM.hasSymbolicDisplacement())
1829 return true;
1830
1831 bool IsRIPRelTLS = false;
1832 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1833 if (IsRIPRel) {
1834 SDValue Val = N.getOperand(0);
1835 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1836 IsRIPRelTLS = true;
1837 }
1838
1839 // We can't use an addressing mode in the 64-bit large code model.
1840 // Global TLS addressing is an exception. In the medium code model,
1841 // we can use an addressing mode when RIP wrappers are present.
1842 // That signifies access to globals that are known to be "near",
1843 // such as the GOT itself.
1844 CodeModel::Model M = TM.getCodeModel();
1845 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1846 return true;
1847
1848 // Base and index reg must be 0 in order to use %rip as base.
1849 if (IsRIPRel && AM.hasBaseOrIndexReg())
1850 return true;
1851
1852 // Make a local copy in case we can't do this fold.
1853 X86ISelAddressMode Backup = AM;
1854
1855 int64_t Offset = 0;
1856 SDValue N0 = N.getOperand(0);
1857 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1858 AM.GV = G->getGlobal();
1859 AM.SymbolFlags = G->getTargetFlags();
1860 Offset = G->getOffset();
1861 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1862 AM.CP = CP->getConstVal();
1863 AM.Alignment = CP->getAlign();
1864 AM.SymbolFlags = CP->getTargetFlags();
1865 Offset = CP->getOffset();
1866 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1867 AM.ES = S->getSymbol();
1868 AM.SymbolFlags = S->getTargetFlags();
1869 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1870 AM.MCSym = S->getMCSymbol();
1871 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1872 AM.JT = J->getIndex();
1873 AM.SymbolFlags = J->getTargetFlags();
1874 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1875 AM.BlockAddr = BA->getBlockAddress();
1876 AM.SymbolFlags = BA->getTargetFlags();
1877 Offset = BA->getOffset();
1878 } else
1879 llvm_unreachable("Unhandled symbol reference node.");
1880
1881 // Can't use an addressing mode with large globals.
1882 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1883 TM.isLargeGlobalValue(AM.GV)) {
1884 AM = Backup;
1885 return true;
1886 }
1887
1888 if (foldOffsetIntoAddress(Offset, AM)) {
1889 AM = Backup;
1890 return true;
1891 }
1892
1893 if (IsRIPRel)
1894 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1895
1896 // Commit the changes now that we know this fold is safe.
1897 return false;
1898}
1899
1900/// Add the specified node to the specified addressing mode, returning true if
1901/// it cannot be done. This just pattern matches for the addressing mode.
1902bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1903 if (matchAddressRecursively(N, AM, 0))
1904 return true;
1905
1906 // Post-processing: Make a second attempt to fold a load, if we now know
1907 // that there will not be any other register. This is only performed for
1908 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1909 // any foldable load the first time.
1910 if (Subtarget->isTarget64BitILP32() &&
1911 AM.BaseType == X86ISelAddressMode::RegBase &&
1912 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1913 SDValue Save_Base_Reg = AM.Base_Reg;
1914 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1915 AM.Base_Reg = SDValue();
1916 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1917 AM.Base_Reg = Save_Base_Reg;
1918 }
1919 }
1920
1921 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1922 // a smaller encoding and avoids a scaled-index.
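 // For example (an illustrative sketch):
 //   leal (,%ecx,2), %eax    ; no base register, needs a zero 32-bit displacement
 // becomes
 //   leal (%ecx,%ecx), %eax  ; base + index with scale 1, shorter encoding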
1923 if (AM.Scale == 2 &&
1924 AM.BaseType == X86ISelAddressMode::RegBase &&
1925 AM.Base_Reg.getNode() == nullptr) {
1926 AM.Base_Reg = AM.IndexReg;
1927 AM.Scale = 1;
1928 }
1929
1930 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1931 // because it has a smaller encoding.
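 // For example (an illustrative sketch):
 //   movl foo, %eax        ; 64-bit absolute addressing needs an extra SIB byte
 // becomes
 //   movl foo(%rip), %eax  ; RIP-relative form with the same 32-bit field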
1932 if (TM.getCodeModel() != CodeModel::Large &&
1933 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
1934 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1935 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1936 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1937 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1938 }
1939
1940 return false;
1941}
1942
1943bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1944 unsigned Depth) {
1945 // Add an artificial use to this node so that we can keep track of
1946 // it if it gets CSE'd with a different node.
1947 HandleSDNode Handle(N);
1948
1949 X86ISelAddressMode Backup = AM;
1950 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1951 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1952 return false;
1953 AM = Backup;
1954
1955 // Try again after commuting the operands.
1956 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1957 Depth + 1) &&
1958 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1959 return false;
1960 AM = Backup;
1961
1962 // If we couldn't fold both operands into the address at the same time,
1963 // see if we can just put each operand into a register and fold at least
1964 // the add.
1965 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1966 !AM.Base_Reg.getNode() &&
1967 !AM.IndexReg.getNode()) {
1968 N = Handle.getValue();
1969 AM.Base_Reg = N.getOperand(0);
1970 AM.IndexReg = N.getOperand(1);
1971 AM.Scale = 1;
1972 return false;
1973 }
1974 N = Handle.getValue();
1975 return true;
1976}
1977
1978// Insert a node into the DAG at least before the Pos node's position. This
1979// will reposition the node as needed, and will assign it a node ID that is <=
1980// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1981// IDs! The selection DAG must no longer depend on their uniqueness when this
1982// is used.
1983static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1984 if (N->getNodeId() == -1 ||
1987 DAG.RepositionNode(Pos->getIterator(), N.getNode());
1988 // Mark Node as invalid for pruning as after this it may be a successor to a
1989 // selected node but otherwise be in the same position of Pos.
1990 // Conservatively mark it with the same -abs(Id) to assure node id
1991 // invariant is preserved.
1992 N->setNodeId(Pos->getNodeId());
1994 }
1995}
1996
1997// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1998// safe. This allows us to convert the shift and and into an h-register
1999// extract and a scaled index. Returns false if the simplification is
2000// performed.
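// For example (an illustrative sketch), with C1 == 2:
//   (X >> 6) & 0x3fc   ==>   ((X >> 8) & 0xff) << 2
// The "(X >> 8) & 0xff" can be selected as an h-register extract and the
// "<< 2" becomes a scale of 4 in the addressing mode.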
2001 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2002 uint64_t Mask,
2003 SDValue Shift, SDValue X,
2004 X86ISelAddressMode &AM) {
2005 if (Shift.getOpcode() != ISD::SRL ||
2006 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2007 !Shift.hasOneUse())
2008 return true;
2009
2010 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2011 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2012 Mask != (0xffu << ScaleLog))
2013 return true;
2014
2015 MVT XVT = X.getSimpleValueType();
2016 MVT VT = N.getSimpleValueType();
2017 SDLoc DL(N);
2018 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2019 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2020 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2021 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2022 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2023 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2024 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2025
2026 // Insert the new nodes into the topological ordering. We must do this in
2027 // a valid topological ordering as nothing is going to go back and re-sort
2028 // these nodes. We continually insert before 'N' in sequence as this is
2029 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2030 // hierarchy left to express.
2031 insertDAGNode(DAG, N, Eight);
2032 insertDAGNode(DAG, N, NewMask);
2033 insertDAGNode(DAG, N, Srl);
2034 insertDAGNode(DAG, N, And);
2035 insertDAGNode(DAG, N, Ext);
2036 insertDAGNode(DAG, N, ShlCount);
2037 insertDAGNode(DAG, N, Shl);
2038 DAG.ReplaceAllUsesWith(N, Shl);
2039 DAG.RemoveDeadNode(N.getNode());
2040 AM.IndexReg = Ext;
2041 AM.Scale = (1 << ScaleLog);
2042 return false;
2043}
2044
2045// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2046// allows us to fold the shift into this addressing mode. Returns false if the
2047// transform succeeded.
2048 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2049 X86ISelAddressMode &AM) {
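 // For example (an illustrative sketch), with C1 == 2 and C2 == 0x3fc:
 //   (X << 2) & 0x3fc   ==>   (X & 0xff) << 2
 // so the shift can be absorbed as a scale of 4 in the addressing mode while
 // the mask is applied to the un-shifted value (often a smaller immediate).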
2050 SDValue Shift = N.getOperand(0);
2051
2052 // Use a signed mask so that shifting right will insert sign bits. These
2053 // bits will be removed when we shift the result left so it doesn't matter
2054 // what we use. This might allow a smaller immediate encoding.
2055 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2056
2057 // If we have an any_extend feeding the AND, look through it to see if there
2058 // is a shift behind it. But only if the AND doesn't use the extended bits.
2059 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2060 bool FoundAnyExtend = false;
2061 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2062 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2063 isUInt<32>(Mask)) {
2064 FoundAnyExtend = true;
2065 Shift = Shift.getOperand(0);
2066 }
2067
2068 if (Shift.getOpcode() != ISD::SHL ||
2069 !isa<ConstantSDNode>(Shift.getOperand(1)))
2070 return true;
2071
2072 SDValue X = Shift.getOperand(0);
2073
2074 // Not likely to be profitable if either the AND or SHIFT node has more
2075 // than one use (unless all uses are for address computation). Besides,
2076 // the isel mechanism requires their node IDs to be reused.
2077 if (!N.hasOneUse() || !Shift.hasOneUse())
2078 return true;
2079
2080 // Verify that the shift amount is something we can fold.
2081 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2082 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2083 return true;
2084
2085 MVT VT = N.getSimpleValueType();
2086 SDLoc DL(N);
2087 if (FoundAnyExtend) {
2088 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2089 insertDAGNode(DAG, N, NewX);
2090 X = NewX;
2091 }
2092
2093 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
2094 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2095 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2096
2097 // Insert the new nodes into the topological ordering. We must do this in
2098 // a valid topological ordering as nothing is going to go back and re-sort
2099 // these nodes. We continually insert before 'N' in sequence as this is
2100 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2101 // hierarchy left to express.
2102 insertDAGNode(DAG, N, NewMask);
2103 insertDAGNode(DAG, N, NewAnd);
2104 insertDAGNode(DAG, N, NewShift);
2105 DAG.ReplaceAllUsesWith(N, NewShift);
2106 DAG.RemoveDeadNode(N.getNode());
2107
2108 AM.Scale = 1 << ShiftAmt;
2109 AM.IndexReg = NewAnd;
2110 return false;
2111}
2112
2113// Implement some heroics to detect shifts of masked values where the mask can
2114// be replaced by extending the shift and undoing that in the addressing mode
2115// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2116// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2117// the addressing mode. This results in code such as:
2118//
2119// int f(short *y, int *lookup_table) {
2120// ...
2121// return *y + lookup_table[*y >> 11];
2122// }
2123//
2124// Turning into:
2125// movzwl (%rdi), %eax
2126// movl %eax, %ecx
2127// shrl $11, %ecx
2128// addl (%rsi,%rcx,4), %eax
2129//
2130// Instead of:
2131// movzwl (%rdi), %eax
2132// movl %eax, %ecx
2133// shrl $9, %ecx
2134// andl $124, %rcx
2135// addl (%rsi,%rcx), %eax
2136//
2137// Note that this function assumes the mask is provided as a mask *after* the
2138// value is shifted. The input chain may or may not match that, but computing
2139// such a mask is trivial.
2140 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2141 uint64_t Mask,
2142 SDValue Shift, SDValue X,
2143 X86ISelAddressMode &AM) {
2144 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2145 !isa<ConstantSDNode>(Shift.getOperand(1)))
2146 return true;
2147
2148 // We need to ensure that mask is a continuous run of bits.
2149 unsigned MaskIdx, MaskLen;
2150 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2151 return true;
2152 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2153
2154 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2155
2156 // The amount of shift we're trying to fit into the addressing mode is taken
2157 // from the shifted mask index (number of trailing zeros of the mask).
2158 unsigned AMShiftAmt = MaskIdx;
2159
2160 // There is nothing we can do here unless the mask is removing some bits.
2161 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2162 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2163
2164 // Scale the leading zero count down based on the actual size of the value.
2165 // Also scale it down based on the size of the shift.
2166 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2167 if (MaskLZ < ScaleDown)
2168 return true;
2169 MaskLZ -= ScaleDown;
2170
2171 // The final check is to ensure that any masked out high bits of X are
2172 // already known to be zero. Otherwise, the mask has a semantic impact
2173 // other than masking out a couple of low bits. Unfortunately, because of
2174 // the mask, zero extensions will be removed from operands in some cases.
2175 // This code works extra hard to look through extensions because we can
2176 // replace them with zero extensions cheaply if necessary.
2177 bool ReplacingAnyExtend = false;
2178 if (X.getOpcode() == ISD::ANY_EXTEND) {
2179 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2180 X.getOperand(0).getSimpleValueType().getSizeInBits();
2181 // Assume that we'll replace the any-extend with a zero-extend, and
2182 // narrow the search to the extended value.
2183 X = X.getOperand(0);
2184 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2185 ReplacingAnyExtend = true;
2186 }
2187 APInt MaskedHighBits =
2188 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2189 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2190 return true;
2191
2192 // We've identified a pattern that can be transformed into a single shift
2193 // and an addressing mode. Make it so.
2194 MVT VT = N.getSimpleValueType();
2195 if (ReplacingAnyExtend) {
2196 assert(X.getValueType() != VT);
2197 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2198 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2199 insertDAGNode(DAG, N, NewX);
2200 X = NewX;
2201 }
2202
2203 MVT XVT = X.getSimpleValueType();
2204 SDLoc DL(N);
2205 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2206 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2207 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2208 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2209 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2210
2211 // Insert the new nodes into the topological ordering. We must do this in
2212 // a valid topological ordering as nothing is going to go back and re-sort
2213 // these nodes. We continually insert before 'N' in sequence as this is
2214 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2215 // hierarchy left to express.
2216 insertDAGNode(DAG, N, NewSRLAmt);
2217 insertDAGNode(DAG, N, NewSRL);
2218 insertDAGNode(DAG, N, NewExt);
2219 insertDAGNode(DAG, N, NewSHLAmt);
2220 insertDAGNode(DAG, N, NewSHL);
2221 DAG.ReplaceAllUsesWith(N, NewSHL);
2222 DAG.RemoveDeadNode(N.getNode());
2223
2224 AM.Scale = 1 << AMShiftAmt;
2225 AM.IndexReg = NewExt;
2226 return false;
2227}
2228
2229// Transform "(X >> SHIFT) & (MASK << C1)" to
2230// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2231// matched to a BEXTR later. Returns false if the simplification is performed.
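// For example (an illustrative sketch), with SHIFT == 4, C1 == 2, MASK == 0xff:
//   (X >> 4) & 0x3fc   ==>   ((X >> 6) & 0xff) << 2
// The inner shift-and-mask can later be matched to a BEXTR, and the "<< 2"
// becomes a scale of 4 in the addressing mode.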
2232 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2233 uint64_t Mask,
2234 SDValue Shift, SDValue X,
2235 X86ISelAddressMode &AM,
2236 const X86Subtarget &Subtarget) {
2237 if (Shift.getOpcode() != ISD::SRL ||
2238 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2239 !Shift.hasOneUse() || !N.hasOneUse())
2240 return true;
2241
2242 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2243 if (!Subtarget.hasTBM() &&
2244 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2245 return true;
2246
2247 // We need to ensure that mask is a continuous run of bits.
2248 unsigned MaskIdx, MaskLen;
2249 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2250 return true;
2251
2252 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2253
2254 // The amount of shift we're trying to fit into the addressing mode is taken
2255 // from the shifted mask index (number of trailing zeros of the mask).
2256 unsigned AMShiftAmt = MaskIdx;
2257
2258 // There is nothing we can do here unless the mask is removing some bits.
2259 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2260 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2261
2262 MVT XVT = X.getSimpleValueType();
2263 MVT VT = N.getSimpleValueType();
2264 SDLoc DL(N);
2265 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2266 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2267 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2268 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2269 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2270 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2271 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2272
2273 // Insert the new nodes into the topological ordering. We must do this in
2274 // a valid topological ordering as nothing is going to go back and re-sort
2275 // these nodes. We continually insert before 'N' in sequence as this is
2276 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2277 // hierarchy left to express.
2278 insertDAGNode(DAG, N, NewSRLAmt);
2279 insertDAGNode(DAG, N, NewSRL);
2280 insertDAGNode(DAG, N, NewMask);
2281 insertDAGNode(DAG, N, NewAnd);
2282 insertDAGNode(DAG, N, NewExt);
2283 insertDAGNode(DAG, N, NewSHLAmt);
2284 insertDAGNode(DAG, N, NewSHL);
2285 DAG.ReplaceAllUsesWith(N, NewSHL);
2286 DAG.RemoveDeadNode(N.getNode());
2287
2288 AM.Scale = 1 << AMShiftAmt;
2289 AM.IndexReg = NewExt;
2290 return false;
2291}
2292
2293// Attempt to peek further into a scaled index register, collecting additional
2294 // extensions / offsets / etc. Returns \p N if we can't peek any further.
2295SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2296 X86ISelAddressMode &AM,
2297 unsigned Depth) {
2298 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2299 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2300 "Illegal index scale");
2301
2302 // Limit recursion.
2303 if (Depth >= SelectionDAG::MaxRecursionDepth)
2304 return N;
2305
2306 EVT VT = N.getValueType();
2307 unsigned Opc = N.getOpcode();
2308
2309 // index: add(x,c) -> index: x, disp + c
2310 if (CurDAG->isBaseWithConstantOffset(N)) {
2311 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2312 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2313 if (!foldOffsetIntoAddress(Offset, AM))
2314 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2315 }
2316
2317 // index: add(x,x) -> index: x, scale * 2
2318 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2319 if (AM.Scale <= 4) {
2320 AM.Scale *= 2;
2321 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2322 }
2323 }
2324
2325 // index: shl(x,i) -> index: x, scale * (1 << i)
2326 if (Opc == X86ISD::VSHLI) {
2327 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2328 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2329 if ((AM.Scale * ScaleAmt) <= 8) {
2330 AM.Scale *= ScaleAmt;
2331 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2332 }
2333 }
2334
2335 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2336 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2337 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2338 SDValue Src = N.getOperand(0);
2339 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2340 Src.hasOneUse()) {
2341 if (CurDAG->isBaseWithConstantOffset(Src)) {
2342 SDValue AddSrc = Src.getOperand(0);
2343 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2344 uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2345 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2346 SDLoc DL(N);
2347 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2348 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2349 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2350 insertDAGNode(*CurDAG, N, ExtSrc);
2351 insertDAGNode(*CurDAG, N, ExtVal);
2352 insertDAGNode(*CurDAG, N, ExtAdd);
2353 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2354 CurDAG->RemoveDeadNode(N.getNode());
2355 return ExtSrc;
2356 }
2357 }
2358 }
2359 }
2360
2361 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2362 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2363 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2364 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2365 SDValue Src = N.getOperand(0);
2366 unsigned SrcOpc = Src.getOpcode();
2367 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2368 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2369 Src.hasOneUse()) {
2370 if (CurDAG->isBaseWithConstantOffset(Src)) {
2371 SDValue AddSrc = Src.getOperand(0);
2372 uint64_t Offset = Src.getConstantOperandVal(1);
2373 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2374 SDLoc DL(N);
2375 SDValue Res;
2376 // If we're also scaling, see if we can use that as well.
2377 if (AddSrc.getOpcode() == ISD::SHL &&
2378 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2379 SDValue ShVal = AddSrc.getOperand(0);
2380 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2381 APInt HiBits =
2382 APInt::getHighBitsSet(AddSrc.getValueSizeInBits(), ShAmt);
2383 uint64_t ScaleAmt = 1ULL << ShAmt;
2384 if ((AM.Scale * ScaleAmt) <= 8 &&
2385 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2386 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2387 AM.Scale *= ScaleAmt;
2388 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2389 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2390 AddSrc.getOperand(1));
2391 insertDAGNode(*CurDAG, N, ExtShVal);
2392 insertDAGNode(*CurDAG, N, ExtShift);
2393 AddSrc = ExtShift;
2394 Res = ExtShVal;
2395 }
2396 }
2397 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2398 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2399 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2400 insertDAGNode(*CurDAG, N, ExtSrc);
2401 insertDAGNode(*CurDAG, N, ExtVal);
2402 insertDAGNode(*CurDAG, N, ExtAdd);
2403 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2404 CurDAG->RemoveDeadNode(N.getNode());
2405 return Res ? Res : ExtSrc;
2406 }
2407 }
2408 }
2409 }
2410
2411 // TODO: Handle extensions, shifted masks etc.
2412 return N;
2413}
2414
2415bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2416 unsigned Depth) {
2417 SDLoc dl(N);
2418 LLVM_DEBUG({
2419 dbgs() << "MatchAddress: ";
2420 AM.dump(CurDAG);
2421 });
2422 // Limit recursion.
2423 if (Depth >= SelectionDAG::MaxRecursionDepth)
2424 return matchAddressBase(N, AM);
2425
2426 // If this is already a %rip relative address, we can only merge immediates
2427 // into it. Instead of handling this in every case, we handle it here.
2428 // RIP relative addressing: %rip + 32-bit displacement!
2429 if (AM.isRIPRelative()) {
2430 // FIXME: JumpTable and ExternalSymbol address currently don't like
2431 // displacements. It isn't very important, but this should be fixed for
2432 // consistency.
2433 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2434 return true;
2435
2436 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2437 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2438 return false;
2439 return true;
2440 }
2441
2442 switch (N.getOpcode()) {
2443 default: break;
2444 case ISD::LOCAL_RECOVER: {
2445 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2446 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2447 // Use the symbol and don't prefix it.
2448 AM.MCSym = ESNode->getMCSymbol();
2449 return false;
2450 }
2451 break;
2452 }
2453 case ISD::Constant: {
2454 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2455 if (!foldOffsetIntoAddress(Val, AM))
2456 return false;
2457 break;
2458 }
2459
2460 case X86ISD::Wrapper:
2461 case X86ISD::WrapperRIP:
2462 if (!matchWrapper(N, AM))
2463 return false;
2464 break;
2465
2466 case ISD::LOAD:
2467 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2468 return false;
2469 break;
2470
2471 case ISD::FrameIndex:
2472 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2473 AM.Base_Reg.getNode() == nullptr &&
2474 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2475 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2476 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2477 return false;
2478 }
2479 break;
2480
2481 case ISD::SHL:
2482 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2483 break;
2484
2485 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2486 unsigned Val = CN->getZExtValue();
2487 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2488 // that the base operand remains free for further matching. If
2489 // the base doesn't end up getting used, a post-processing step
2490 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2491 if (Val == 1 || Val == 2 || Val == 3) {
2492 SDValue ShVal = N.getOperand(0);
2493 AM.Scale = 1 << Val;
2494 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2495 return false;
2496 }
2497 }
2498 break;
2499
2500 case ISD::SRL: {
2501 // Scale must not be used already.
2502 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2503
2504 // We only handle up to 64-bit values here as those are what matter for
2505 // addressing mode optimizations.
2506 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2507 "Unexpected value size!");
2508
2509 SDValue And = N.getOperand(0);
2510 if (And.getOpcode() != ISD::AND) break;
2511 SDValue X = And.getOperand(0);
2512
2513 // The mask used for the transform is expected to be post-shift, but we
2514 // found the shift first so just apply the shift to the mask before passing
2515 // it down.
2516 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2517 !isa<ConstantSDNode>(And.getOperand(1)))
2518 break;
2519 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2520
2521 // Try to fold the mask and shift into the scale, and return false if we
2522 // succeed.
2523 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2524 return false;
2525 break;
2526 }
2527
2528 case ISD::SMUL_LOHI:
2529 case ISD::UMUL_LOHI:
2530 // A mul_lohi where we need the low part can be folded as a plain multiply.
2531 if (N.getResNo() != 0) break;
2532 [[fallthrough]];
2533 case ISD::MUL:
2534 case X86ISD::MUL_IMM:
2535 // X*[3,5,9] -> X+X*[2,4,8]
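 // For example (an illustrative sketch): x * 9 can be selected as
 //   leaq (%rdi,%rdi,8), %rax
 // i.e. base = x, index = x, scale = 8.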
2536 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2537 AM.Base_Reg.getNode() == nullptr &&
2538 AM.IndexReg.getNode() == nullptr) {
2539 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2540 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2541 CN->getZExtValue() == 9) {
2542 AM.Scale = unsigned(CN->getZExtValue())-1;
2543
2544 SDValue MulVal = N.getOperand(0);
2545 SDValue Reg;
2546
2547 // Okay, we know that we have a scale by now. However, if the scaled
2548 // value is an add of something and a constant, we can fold the
2549 // constant into the disp field here.
2550 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2551 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2552 Reg = MulVal.getOperand(0);
2553 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2554 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2555 if (foldOffsetIntoAddress(Disp, AM))
2556 Reg = N.getOperand(0);
2557 } else {
2558 Reg = N.getOperand(0);
2559 }
2560
2561 AM.IndexReg = AM.Base_Reg = Reg;
2562 return false;
2563 }
2564 }
2565 break;
2566
2567 case ISD::SUB: {
2568 // Given A-B, if A can be completely folded into the address and the
2569 // index field is unused, use -B as the index.
2570 // This is a win if A has multiple parts that can be folded into
2571 // the address. Also, this saves a mov if the base register has
2572 // other uses, since it avoids a two-address sub instruction, however
2573 // it costs an additional mov if the index register has other uses.
2574
2575 // Add an artificial use to this node so that we can keep track of
2576 // it if it gets CSE'd with a different node.
2577 HandleSDNode Handle(N);
2578
2579 // Test if the LHS of the sub can be folded.
2580 X86ISelAddressMode Backup = AM;
2581 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2582 N = Handle.getValue();
2583 AM = Backup;
2584 break;
2585 }
2586 N = Handle.getValue();
2587 // Test if the index field is free for use.
2588 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2589 AM = Backup;
2590 break;
2591 }
2592
2593 int Cost = 0;
2594 SDValue RHS = N.getOperand(1);
2595 // If the RHS involves a register with multiple uses, this
2596 // transformation incurs an extra mov, due to the neg instruction
2597 // clobbering its operand.
2598 if (!RHS.getNode()->hasOneUse() ||
2599 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2600 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2601 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2602 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2603 RHS.getOperand(0).getValueType() == MVT::i32))
2604 ++Cost;
2605 // If the base is a register with multiple uses, this
2606 // transformation may save a mov.
2607 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2608 !AM.Base_Reg.getNode()->hasOneUse()) ||
2609 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2610 --Cost;
2611 // If the folded LHS was interesting, this transformation saves
2612 // address arithmetic.
2613 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2614 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2615 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2616 --Cost;
2617 // If it doesn't look like it may be an overall win, don't do it.
2618 if (Cost >= 0) {
2619 AM = Backup;
2620 break;
2621 }
2622
2623 // Ok, the transformation is legal and appears profitable. Go for it.
2624 // Negation will be emitted later to avoid creating dangling nodes if this
2625 // was an unprofitable LEA.
2626 AM.IndexReg = RHS;
2627 AM.NegateIndex = true;
2628 AM.Scale = 1;
2629 return false;
2630 }
2631
2632 case ISD::OR:
2633 case ISD::XOR:
2634 // See if we can treat the OR/XOR node as an ADD node.
2635 if (!CurDAG->isADDLike(N))
2636 break;
2637 [[fallthrough]];
2638 case ISD::ADD:
2639 if (!matchAdd(N, AM, Depth))
2640 return false;
2641 break;
2642
2643 case ISD::AND: {
2644 // Perform some heroic transforms on an and of a constant-count shift
2645 // with a constant to enable use of the scaled offset field.
2646
2647 // Scale must not be used already.
2648 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2649
2650 // We only handle up to 64-bit values here as those are what matter for
2651 // addressing mode optimizations.
2652 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2653 "Unexpected value size!");
2654
2655 if (!isa<ConstantSDNode>(N.getOperand(1)))
2656 break;
2657
2658 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2659 SDValue Shift = N.getOperand(0);
2660 SDValue X = Shift.getOperand(0);
2661
2662 uint64_t Mask = N.getConstantOperandVal(1);
2663
2664 // Try to fold the mask and shift into an extract and scale.
2665 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2666 return false;
2667
2668 // Try to fold the mask and shift directly into the scale.
2669 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2670 return false;
2671
2672 // Try to fold the mask and shift into BEXTR and scale.
2673 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2674 return false;
2675 }
2676
2677 // Try to swap the mask and shift to place shifts which can be done as
2678 // a scale on the outside of the mask.
2679 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2680 return false;
2681
2682 break;
2683 }
2684 case ISD::ZERO_EXTEND: {
2685 // Try to widen a zexted shift left to the same size as its use, so we can
2686 // match the shift as a scale factor.
2687 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2688 break;
2689
2690 SDValue Src = N.getOperand(0);
2691
2692 // See if we can match a zext(addlike(x,c)).
2693 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2694 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2695 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2696 if (Index != N) {
2697 AM.IndexReg = Index;
2698 return false;
2699 }
2700
2701 // Peek through mask: zext(and(shl(x,c1),c2))
2702 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2703 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2704 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2705 Mask = MaskC->getAPIntValue();
2706 Src = Src.getOperand(0);
2707 }
2708
2709 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
2710 // Give up if the shift is not a valid scale factor [1,2,3].
2711 SDValue ShlSrc = Src.getOperand(0);
2712 SDValue ShlAmt = Src.getOperand(1);
2713 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2714 if (!ShAmtC)
2715 break;
2716 unsigned ShAmtV = ShAmtC->getZExtValue();
2717 if (ShAmtV > 3)
2718 break;
2719
2720 // The narrow shift must only shift out zero bits (it must be 'nuw').
2721 // That makes it safe to widen to the destination type.
2722 APInt HighZeros =
2723 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2724 if (!Src->getFlags().hasNoUnsignedWrap() &&
2725 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2726 break;
2727
2728 // zext (shl nuw i8 %x, C1) to i32
2729 // --> shl (zext i8 %x to i32), (zext C1)
2730 // zext (and (shl nuw i8 %x, C1), C2) to i32
2731 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2732 MVT SrcVT = ShlSrc.getSimpleValueType();
2733 MVT VT = N.getSimpleValueType();
2734 SDLoc DL(N);
2735
2736 SDValue Res = ShlSrc;
2737 if (!Mask.isAllOnes()) {
2738 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2739 insertDAGNode(*CurDAG, N, Res);
2740 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2741 insertDAGNode(*CurDAG, N, Res);
2742 }
2743 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2744 insertDAGNode(*CurDAG, N, Zext);
2745 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2746 insertDAGNode(*CurDAG, N, NewShl);
2747 CurDAG->ReplaceAllUsesWith(N, NewShl);
2748 CurDAG->RemoveDeadNode(N.getNode());
2749
2750 // Convert the shift to scale factor.
2751 AM.Scale = 1 << ShAmtV;
2752 // If matchIndexRecursively is not called here, Zext may be replaced
2753 // by other nodes but later still be used to call a builder
2754 // method.
2755 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2756 return false;
2757 }
2758
2759 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2760 // Try to fold the mask and shift into an extract and scale.
2761 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2762 Src.getOperand(0), AM))
2763 return false;
2764
2765 // Try to fold the mask and shift directly into the scale.
2766 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2767 Src.getOperand(0), AM))
2768 return false;
2769
2770 // Try to fold the mask and shift into BEXTR and scale.
2771 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2772 Src.getOperand(0), AM, *Subtarget))
2773 return false;
2774 }
2775
2776 break;
2777 }
2778 }
2779
2780 return matchAddressBase(N, AM);
2781}
2782
2783/// Helper for MatchAddress. Add the specified node to the
2784/// specified addressing mode without any further recursion.
2785bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2786 // Is the base register already occupied?
2787 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2788 // If so, check to see if the scale index register is set.
2789 if (!AM.IndexReg.getNode()) {
2790 AM.IndexReg = N;
2791 AM.Scale = 1;
2792 return false;
2793 }
2794
2795 // Otherwise, we cannot select it.
2796 return true;
2797 }
2798
2799 // Default, generate it as a register.
2800 AM.BaseType = X86ISelAddressMode::RegBase;
2801 AM.Base_Reg = N;
2802 return false;
2803}
2804
2805bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2806 X86ISelAddressMode &AM,
2807 unsigned Depth) {
2808 SDLoc dl(N);
2809 LLVM_DEBUG({
2810 dbgs() << "MatchVectorAddress: ";
2811 AM.dump(CurDAG);
2812 });
2813 // Limit recursion.
2814 if (Depth >= SelectionDAG::MaxRecursionDepth)
2815 return matchAddressBase(N, AM);
2816
2817 // TODO: Support other operations.
2818 switch (N.getOpcode()) {
2819 case ISD::Constant: {
2820 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2821 if (!foldOffsetIntoAddress(Val, AM))
2822 return false;
2823 break;
2824 }
2825 case X86ISD::Wrapper:
2826 if (!matchWrapper(N, AM))
2827 return false;
2828 break;
2829 case ISD::ADD: {
2830 // Add an artificial use to this node so that we can keep track of
2831 // it if it gets CSE'd with a different node.
2832 HandleSDNode Handle(N);
2833
2834 X86ISelAddressMode Backup = AM;
2835 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2836 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2837 Depth + 1))
2838 return false;
2839 AM = Backup;
2840
2841 // Try again after commuting the operands.
2842 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2843 Depth + 1) &&
2844 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2845 Depth + 1))
2846 return false;
2847 AM = Backup;
2848
2849 N = Handle.getValue();
2850 break;
2851 }
2852 }
2853
2854 return matchAddressBase(N, AM);
2855}
2856
2857/// Helper for selectVectorAddr. Handles things that can be folded into a
2858/// gather/scatter address. The index register and scale should have already
2859/// been handled.
2860bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2861 return matchVectorAddressRecursively(N, AM, 0);
2862}
2863
2864bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2865 SDValue IndexOp, SDValue ScaleOp,
2866 SDValue &Base, SDValue &Scale,
2867 SDValue &Index, SDValue &Disp,
2868 SDValue &Segment) {
2869 X86ISelAddressMode AM;
2870 AM.Scale = ScaleOp->getAsZExtVal();
2871
2872 // Attempt to match index patterns, as long as we're not relying on implicit
2873 // sign-extension, which is performed BEFORE scale.
2874 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2875 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2876 else
2877 AM.IndexReg = IndexOp;
2878
2879 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2880 if (AddrSpace == X86AS::GS)
2881 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2882 if (AddrSpace == X86AS::FS)
2883 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2884 if (AddrSpace == X86AS::SS)
2885 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2886
2887 SDLoc DL(BasePtr);
2888 MVT VT = BasePtr.getSimpleValueType();
2889
2890 // Try to match into the base and displacement fields.
2891 if (matchVectorAddress(BasePtr, AM))
2892 return false;
2893
2894 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2895 return true;
2896}
2897
2898/// Returns true if it is able to pattern match an addressing mode.
2899/// It returns the operands which make up the maximal addressing mode it can
2900/// match by reference.
2901///
2902/// Parent is the parent node of the addr operand that is being matched. It
2903/// is always a load, store, atomic node, or null. It is only null when
2904/// checking memory operands for inline asm nodes.
2905bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2906 SDValue &Scale, SDValue &Index,
2907 SDValue &Disp, SDValue &Segment) {
2908 X86ISelAddressMode AM;
2909
2910 if (Parent &&
2911 // This list of opcodes is all the nodes that have an "addr:$ptr" operand
2912 // that are not a MemSDNode, and thus don't have proper addrspace info.
2913 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2914 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2915 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2916 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2917 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2918 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2919 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2920 unsigned AddrSpace =
2921 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2922 if (AddrSpace == X86AS::GS)
2923 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2924 if (AddrSpace == X86AS::FS)
2925 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2926 if (AddrSpace == X86AS::SS)
2927 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2928 }
2929
2930 // Save the DL and VT before calling matchAddress, it can invalidate N.
2931 SDLoc DL(N);
2932 MVT VT = N.getSimpleValueType();
2933
2934 if (matchAddress(N, AM))
2935 return false;
2936
2937 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2938 return true;
2939}
2940
2941bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2942 // Cannot use 32 bit constants to reference objects in kernel/large code
2943 // model.
2944 if (TM.getCodeModel() == CodeModel::Kernel ||
2945 TM.getCodeModel() == CodeModel::Large)
2946 return false;
2947
2948 // In static codegen with small code model, we can get the address of a label
2949 // into a register with 'movl'.
2950 if (N->getOpcode() != X86ISD::Wrapper)
2951 return false;
2952
2953 N = N.getOperand(0);
2954
2955 // At least GNU as does not accept 'movl' for TPOFF relocations.
2956 // FIXME: We could use 'movl' when we know we are targeting MC.
2957 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2958 return false;
2959
2960 Imm = N;
2961 // Small/medium code model can reference non-TargetGlobalAddress objects with
2962 // 32 bit constants.
2963 if (N->getOpcode() != ISD::TargetGlobalAddress) {
2964 return TM.getCodeModel() == CodeModel::Small ||
2965 TM.getCodeModel() == CodeModel::Medium;
2966 }
2967
2968 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
2969 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
2970 return CR->getUnsignedMax().ult(1ull << 32);
2971
2972 return !TM.isLargeGlobalValue(GV);
2973}
2974
2975bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2976 SDValue &Scale, SDValue &Index,
2977 SDValue &Disp, SDValue &Segment) {
2978 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2979 SDLoc DL(N);
2980
2981 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2982 return false;
2983
2984 auto *RN = dyn_cast<RegisterSDNode>(Base);
2985 if (RN && RN->getReg() == 0)
2986 Base = CurDAG->getRegister(0, MVT::i64);
2987 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2988 // Base could already be %rip, particularly in the x32 ABI.
2989 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2990 MVT::i64), 0);
2991 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2992 Base);
2993 }
2994
2995 RN = dyn_cast<RegisterSDNode>(Index);
2996 if (RN && RN->getReg() == 0)
2997 Index = CurDAG->getRegister(0, MVT::i64);
2998 else {
2999 assert(Index.getValueType() == MVT::i32 &&
3000 "Expect to be extending 32-bit registers for use in LEA");
3001 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3002 MVT::i64), 0);
3003 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3004 Index);
3005 }
3006
3007 return true;
3008}
3009
3010/// Calls SelectAddr and determines if the maximal addressing
3011/// mode it matches can be cost effectively emitted as an LEA instruction.
3012bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3013 SDValue &Base, SDValue &Scale,
3014 SDValue &Index, SDValue &Disp,
3015 SDValue &Segment) {
3016 X86ISelAddressMode AM;
3017
3018 // Save the DL and VT before calling matchAddress, it can invalidate N.
3019 SDLoc DL(N);
3020 MVT VT = N.getSimpleValueType();
3021
3022 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3023 // segments.
3024 SDValue Copy = AM.Segment;
3025 SDValue T = CurDAG->getRegister(0, MVT::i32);
3026 AM.Segment = T;
3027 if (matchAddress(N, AM))
3028 return false;
3029 assert (T == AM.Segment);
3030 AM.Segment = Copy;
3031
3032 unsigned Complexity = 0;
3033 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3034 Complexity = 1;
3035 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3036 Complexity = 4;
3037
3038 if (AM.IndexReg.getNode())
3039 Complexity++;
3040
3041 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg or a
3042 // simple shift.
3043 if (AM.Scale > 1)
3044 Complexity++;
3045
3046 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3047 // to a LEA. This is determined with some experimentation but is by no means
3048 // optimal (especially for code size consideration). LEA is nice because of
3049 // its three-address nature. Tweak the cost function again when we can run
3050 // convertToThreeAddress() at register allocation time.
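 // For example (an illustrative sketch, 32-bit code): preferring
 //   leal sym(%eax), %ecx
 // over
 //   addl $sym, %eax
 // keeps %eax unchanged and avoids clobbering EFLAGS, at the cost of a
 // slightly longer LEA encoding.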
3051 if (AM.hasSymbolicDisplacement()) {
3052 // For X86-64, always use LEA to materialize RIP-relative addresses.
3053 if (Subtarget->is64Bit())
3054 Complexity = 4;
3055 else
3056 Complexity += 2;
3057 }
3058
3059 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3060 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3061 // duplicating flag-producing instructions later in the pipeline.
3062 if (N.getOpcode() == ISD::ADD) {
3063 auto isMathWithFlags = [](SDValue V) {
3064 switch (V.getOpcode()) {
3065 case X86ISD::ADD:
3066 case X86ISD::SUB:
3067 case X86ISD::ADC:
3068 case X86ISD::SBB:
3069 case X86ISD::SMUL:
3070 case X86ISD::UMUL:
3071 /* TODO: These opcodes can be added safely, but we may want to justify
3072 their inclusion for different reasons (better for reg-alloc).
3073 case X86ISD::OR:
3074 case X86ISD::XOR:
3075 case X86ISD::AND:
3076 */
3077 // Value 1 is the flag output of the node - verify it's not dead.
3078 return !SDValue(V.getNode(), 1).use_empty();
3079 default:
3080 return false;
3081 }
3082 };
3083 // TODO: We might want to factor in whether there's a load folding
3084 // opportunity for the math op that disappears with LEA.
3085 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3086 Complexity++;
3087 }
3088
3089 if (AM.Disp)
3090 Complexity++;
3091
3092 // If it isn't worth using an LEA, reject it.
3093 if (Complexity <= 2)
3094 return false;
3095
3096 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3097 return true;
3098}
3099
3100 /// This is only run on TargetGlobalTLSAddress or TargetExternalSymbol nodes.
3101bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3102 SDValue &Scale, SDValue &Index,
3103 SDValue &Disp, SDValue &Segment) {
3104 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3105 N.getOpcode() == ISD::TargetExternalSymbol);
3106
3107 X86ISelAddressMode AM;
3108 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3109 AM.GV = GA->getGlobal();
3110 AM.Disp += GA->getOffset();
3111 AM.SymbolFlags = GA->getTargetFlags();
3112 } else {
3113 auto *SA = cast<ExternalSymbolSDNode>(N);
3114 AM.ES = SA->getSymbol();
3115 AM.SymbolFlags = SA->getTargetFlags();
3116 }
3117
3118 if (Subtarget->is32Bit()) {
3119 AM.Scale = 1;
3120 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3121 }
3122
3123 MVT VT = N.getSimpleValueType();
3124 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3125 return true;
3126}
3127
3128bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3129 // Keep track of the original value type and whether this value was
3130 // truncated. If we see a truncation from pointer type to VT that truncates
3131 // bits that are known to be zero, we can use a narrow reference.
3132 EVT VT = N.getValueType();
3133 bool WasTruncated = false;
3134 if (N.getOpcode() == ISD::TRUNCATE) {
3135 WasTruncated = true;
3136 N = N.getOperand(0);
3137 }
3138
3139 if (N.getOpcode() != X86ISD::Wrapper)
3140 return false;
3141
3142 // We can only use non-GlobalValues as immediates if they were not truncated,
3143 // as we do not have any range information. If we have a GlobalValue and the
3144 // address was not truncated, we can select it as an operand directly.
3145 unsigned Opc = N.getOperand(0)->getOpcode();
3146 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3147 Op = N.getOperand(0);
3148 // We can only select the operand directly if we didn't have to look past a
3149 // truncate.
3150 return !WasTruncated;
3151 }
3152
3153 // Check that the global's range fits into VT.
3154 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3155 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3156 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3157 return false;
3158
3159 // Okay, we can use a narrow reference.
3160 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3161 GA->getOffset(), GA->getTargetFlags());
3162 return true;
3163}
3164
3165bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3166 SDValue &Base, SDValue &Scale,
3167 SDValue &Index, SDValue &Disp,
3168 SDValue &Segment) {
3169 assert(Root && P && "Unknown root/parent nodes");
3170 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3171 !IsProfitableToFold(N, P, Root) ||
3172 !IsLegalToFold(N, P, Root, OptLevel))
3173 return false;
3174
3175 return selectAddr(N.getNode(),
3176 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3177}
3178
3179bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3180 SDValue &Base, SDValue &Scale,
3181 SDValue &Index, SDValue &Disp,
3182 SDValue &Segment) {
3183 assert(Root && P && "Unknown root/parent nodes");
3184 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3185 !IsProfitableToFold(N, P, Root) ||
3186 !IsLegalToFold(N, P, Root, OptLevel))
3187 return false;
3188
3189 return selectAddr(N.getNode(),
3190 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3191}
3192
3193/// Return an SDNode that returns the value of the global base register.
3194/// Output instructions required to initialize the global base register,
3195/// if necessary.
3196SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3197 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3198 auto &DL = MF->getDataLayout();
3199 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3200}
3201
3202bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3203 if (N->getOpcode() == ISD::TRUNCATE)
3204 N = N->getOperand(0).getNode();
3205 if (N->getOpcode() != X86ISD::Wrapper)
3206 return false;
3207
3208 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3209 if (!GA)
3210 return false;
3211
3212 auto *GV = GA->getGlobal();
3213 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3214 if (CR)
3215 return CR->getSignedMin().sge(-1ull << Width) &&
3216 CR->getSignedMax().slt(1ull << Width);
3217 // In the kernel code model, globals are in the negative 2GB of the address
3218 // space, so globals can be a sign extended 32-bit immediate.
3219 // In other code models, small globals are in the low 2GB of the address
3220 // space, so sign extending them is equivalent to zero extending them.
3221 return Width == 32 && !TM.isLargeGlobalValue(GV);
3222}
3223
3224X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3225 assert(N->isMachineOpcode() && "Unexpected node");
3226 unsigned Opc = N->getMachineOpcode();
3227 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3228 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3229 if (CondNo < 0)
3230 return X86::COND_INVALID;
3231
3232 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3233}
3234
3235/// Test whether every user of the given X86ISD::CMP node's flag result only
3236/// reads the zero flag (ZF).
3237bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3238 // Examine each user of the node.
3239 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3240 UI != UE; ++UI) {
3241 // Only check things that use the flags.
3242 if (UI.getUse().getResNo() != Flags.getResNo())
3243 continue;
3244 // Only examine CopyToReg uses that copy to EFLAGS.
3245 if (UI->getOpcode() != ISD::CopyToReg ||
3246 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3247 return false;
3248 // Examine each user of the CopyToReg use.
3249 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3250 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3251 // Only examine the Flag result.
3252 if (FlagUI.getUse().getResNo() != 1) continue;
3253 // Anything unusual: assume conservatively.
3254 if (!FlagUI->isMachineOpcode()) return false;
3255 // Examine the condition code of the user.
3256 X86::CondCode CC = getCondFromNode(*FlagUI);
3257
3258 switch (CC) {
3259 // Comparisons which only use the zero flag.
3260 case X86::COND_E: case X86::COND_NE:
3261 continue;
3262 // Anything else: assume conservatively.
3263 default:
3264 return false;
3265 }
3266 }
3267 }
3268 return true;
3269}
3270
3271/// Test whether the given X86ISD::CMP node has no uses which require the SF
3272/// flag to be accurate.
3273bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3274 // Examine each user of the node.
3275 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3276 UI != UE; ++UI) {
3277 // Only check things that use the flags.
3278 if (UI.getUse().getResNo() != Flags.getResNo())
3279 continue;
3280 // Only examine CopyToReg uses that copy to EFLAGS.
3281 if (UI->getOpcode() != ISD::CopyToReg ||
3282 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3283 return false;
3284 // Examine each user of the CopyToReg use.
3285 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3286 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3287 // Only examine the Flag result.
3288 if (FlagUI.getUse().getResNo() != 1) continue;
3289 // Anything unusual: assume conservatively.
3290 if (!FlagUI->isMachineOpcode()) return false;
3291 // Examine the condition code of the user.
3292 X86::CondCode CC = getCondFromNode(*FlagUI);
3293
3294 switch (CC) {
3295 // Comparisons which don't examine the SF flag.
3296 case X86::COND_A: case X86::COND_AE:
3297 case X86::COND_B: case X86::COND_BE:
3298 case X86::COND_E: case X86::COND_NE:
3299 case X86::COND_O: case X86::COND_NO:
3300 case X86::COND_P: case X86::COND_NP:
3301 continue;
3302 // Anything else: assume conservatively.
3303 default:
3304 return false;
3305 }
3306 }
3307 }
3308 return true;
3309}
3310
3311static bool mayUseCarryFlag(X86::CondCode CC) {
3312 switch (CC) {
3313 // Comparisons which don't examine the CF flag.
3314 case X86::COND_O: case X86::COND_NO:
3315 case X86::COND_E: case X86::COND_NE:
3316 case X86::COND_S: case X86::COND_NS:
3317 case X86::COND_P: case X86::COND_NP:
3318 case X86::COND_L: case X86::COND_GE:
3319 case X86::COND_G: case X86::COND_LE:
3320 return false;
3321 // Anything else: assume conservatively.
3322 default:
3323 return true;
3324 }
3325}
3326
3327/// Test whether the given node which sets flags has no uses which require the
3328/// CF flag to be accurate.
3329bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3330 // Examine each user of the node.
3331 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3332 UI != UE; ++UI) {
3333 // Only check things that use the flags.
3334 if (UI.getUse().getResNo() != Flags.getResNo())
3335 continue;
3336
3337 unsigned UIOpc = UI->getOpcode();
3338
3339 if (UIOpc == ISD::CopyToReg) {
3340 // Only examine CopyToReg uses that copy to EFLAGS.
3341 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3342 return false;
3343 // Examine each user of the CopyToReg use.
3344 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3345 FlagUI != FlagUE; ++FlagUI) {
3346 // Only examine the Flag result.
3347 if (FlagUI.getUse().getResNo() != 1)
3348 continue;
3349 // Anything unusual: assume conservatively.
3350 if (!FlagUI->isMachineOpcode())
3351 return false;
3352 // Examine the condition code of the user.
3353 X86::CondCode CC = getCondFromNode(*FlagUI);
3354
3355 if (mayUseCarryFlag(CC))
3356 return false;
3357 }
3358
3359 // This CopyToReg is ok. Move on to the next user.
3360 continue;
3361 }
3362
3363 // This might be an unselected node. So look for the pre-isel opcodes that
3364 // use flags.
3365 unsigned CCOpNo;
3366 switch (UIOpc) {
3367 default:
3368 // Something unusual. Be conservative.
3369 return false;
3370 case X86ISD::SETCC: CCOpNo = 0; break;
3371 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3372 case X86ISD::CMOV: CCOpNo = 2; break;
3373 case X86ISD::BRCOND: CCOpNo = 2; break;
3374 }
3375
3376 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
3377 if (mayUseCarryFlag(CC))
3378 return false;
3379 }
3380 return true;
3381}
3382
3383/// Check whether or not the chain ending in StoreNode is suitable for doing
3384/// the {load; op; store} to modify transformation.
3385static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3386 SDValue StoredVal, SelectionDAG *CurDAG,
3387 unsigned LoadOpNo,
3388 LoadSDNode *&LoadNode,
3389 SDValue &InputChain) {
3390 // Is the stored value result 0 of the operation?
3391 if (StoredVal.getResNo() != 0) return false;
3392
3393 // Are there other uses of the operation other than the store?
3394 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3395
3396 // Is the store non-extending and non-indexed?
3397 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3398 return false;
3399
3400 SDValue Load = StoredVal->getOperand(LoadOpNo);
3401 // Is the stored value a non-extending and non-indexed load?
3402 if (!ISD::isNormalLoad(Load.getNode())) return false;
3403
3404 // Return LoadNode by reference.
3405 LoadNode = cast<LoadSDNode>(Load);
3406
3407 // Is store the only read of the loaded value?
3408 if (!Load.hasOneUse())
3409 return false;
3410
3411 // Is the address of the store the same as the load?
3412 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3413 LoadNode->getOffset() != StoreNode->getOffset())
3414 return false;
3415
3416 bool FoundLoad = false;
3417 SmallVector<SDValue, 4> ChainOps;
3418 SmallVector<const SDNode *, 4> LoopWorklist;
3419 SmallPtrSet<const SDNode *, 16> Visited;
3420 const unsigned int Max = 1024;
3421
3422 // Visualization of Load-Op-Store fusion:
3423 // -------------------------
3424 // Legend:
3425 // *-lines = Chain operand dependencies.
3426 // |-lines = Normal operand dependencies.
3427 // Dependencies flow down and right. n-suffix references multiple nodes.
3428 //
3429 // C Xn C
3430 // * * *
3431 // * * *
3432 // Xn A-LD Yn TF Yn
3433 // * * \ | * |
3434 // * * \ | * |
3435 // * * \ | => A--LD_OP_ST
3436 // * * \| \
3437 // TF OP \
3438 // * | \ Zn
3439 // * | \
3440 // A-ST Zn
3441 //
3442
3443 // This merge induces dependences from: #1: Xn -> LD, OP, Zn
3444 // #2: Yn -> LD
3445 // #3: ST -> Zn
3446
3447 // Ensure the transform is safe by checking for the dual
3448 // dependencies to make sure we do not induce a loop.
3449
3450 // As LD is a predecessor to both OP and ST we can do this by checking:
3451 // a). if LD is a predecessor to a member of Xn or Yn.
3452 // b). if a Zn is a predecessor to ST.
3453
3454 // However, (b) can only occur through being a chain predecessor to
3455 // ST, which is the same as Zn being a member or predecessor of Xn,
3456 // which is a subset of LD being a predecessor of Xn. So it's
3457 // subsumed by check (a).
3458
3459 SDValue Chain = StoreNode->getChain();
3460
3461 // Gather X elements in ChainOps.
3462 if (Chain == Load.getValue(1)) {
3463 FoundLoad = true;
3464 ChainOps.push_back(Load.getOperand(0));
3465 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3466 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3467 SDValue Op = Chain.getOperand(i);
3468 if (Op == Load.getValue(1)) {
3469 FoundLoad = true;
3470 // Drop Load, but keep its chain. No cycle check necessary.
3471 ChainOps.push_back(Load.getOperand(0));
3472 continue;
3473 }
3474 LoopWorklist.push_back(Op.getNode());
3475 ChainOps.push_back(Op);
3476 }
3477 }
3478
3479 if (!FoundLoad)
3480 return false;
3481
3482 // Worklist is currently Xn. Add Yn to worklist.
3483 for (SDValue Op : StoredVal->ops())
3484 if (Op.getNode() != LoadNode)
3485 LoopWorklist.push_back(Op.getNode());
3486
3487 // Check (a) if Load is a predecessor to Xn + Yn
3488 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3489 true))
3490 return false;
3491
3492 InputChain =
3493 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3494 return true;
3495}
3496
3497// Change a chain of {load; op; store} of the same value into a simple op
3498// through memory of that value, if the uses of the modified value and its
3499// address are suitable.
3500//
3501// The tablegen memory operand pattern is currently not able to match
3502// the case where the EFLAGS on the original operation are used.
3503//
3504// To move this to tablegen, we'll need to improve tablegen to allow flags to
3505// be transferred from a node in the pattern to the result node, probably with
3506// a new keyword. For example, we have this
3507// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3508// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3509// (implicit EFLAGS)]>;
3510// but maybe we need something like this
3511// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3512// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3513// (transferrable EFLAGS)]>;
3514//
3515// Until then, we manually fold these and instruction select the operation
3516// here.
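// For example, a DAG of the form (store (X86ISD::ADD (load addr), 1), addr)
// is selected below as a single INC64m/INC32m/INC16m/INC8m, provided the
// target does not have slow INC/DEC and no user of the ADD's flag result
// needs the carry flag (INC does not update CF).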
3517bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3518 auto *StoreNode = cast<StoreSDNode>(Node);
3519 SDValue StoredVal = StoreNode->getOperand(1);
3520 unsigned Opc = StoredVal->getOpcode();
3521
3522 // Before we try to select anything, make sure this is memory operand size
3523 // and opcode we can handle. Note that this must match the code below that
3524 // actually lowers the opcodes.
3525 EVT MemVT = StoreNode->getMemoryVT();
3526 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3527 MemVT != MVT::i8)
3528 return false;
3529
3530 bool IsCommutable = false;
3531 bool IsNegate = false;
3532 switch (Opc) {
3533 default:
3534 return false;
3535 case X86ISD::SUB:
3536 IsNegate = isNullConstant(StoredVal.getOperand(0));
3537 break;
3538 case X86ISD::SBB:
3539 break;
3540 case X86ISD::ADD:
3541 case X86ISD::ADC:
3542 case X86ISD::AND:
3543 case X86ISD::OR:
3544 case X86ISD::XOR:
3545 IsCommutable = true;
3546 break;
3547 }
3548
3549 unsigned LoadOpNo = IsNegate ? 1 : 0;
3550 LoadSDNode *LoadNode = nullptr;
3551 SDValue InputChain;
3552 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3553 LoadNode, InputChain)) {
3554 if (!IsCommutable)
3555 return false;
3556
3557 // This operation is commutable, try the other operand.
3558 LoadOpNo = 1;
3559 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3560 LoadNode, InputChain))
3561 return false;
3562 }
3563
3564 SDValue Base, Scale, Index, Disp, Segment;
3565 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3566 Segment))
3567 return false;
3568
3569 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3570 unsigned Opc8) {
3571 switch (MemVT.getSimpleVT().SimpleTy) {
3572 case MVT::i64:
3573 return Opc64;
3574 case MVT::i32:
3575 return Opc32;
3576 case MVT::i16:
3577 return Opc16;
3578 case MVT::i8:
3579 return Opc8;
3580 default:
3581 llvm_unreachable("Invalid size!");
3582 }
3583 };
3584
3586 switch (Opc) {
3587 case X86ISD::SUB:
3588 // Handle negate.
3589 if (IsNegate) {
3590 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3591 X86::NEG8m);
3592 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3593 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3594 MVT::Other, Ops);
3595 break;
3596 }
3597 [[fallthrough]];
3598 case X86ISD::ADD:
3599 // Try to match inc/dec.
3600 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3601 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3602 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3603 // ADD/SUB with 1/-1 can use INC/DEC when the carry flag isn't used.
3604 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3605 unsigned NewOpc =
3606 ((Opc == X86ISD::ADD) == IsOne)
3607 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3608 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3609 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3610 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3611 MVT::Other, Ops);
3612 break;
3613 }
3614 }
3615 [[fallthrough]];
3616 case X86ISD::ADC:
3617 case X86ISD::SBB:
3618 case X86ISD::AND:
3619 case X86ISD::OR:
3620 case X86ISD::XOR: {
3621 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3622 switch (Opc) {
3623 case X86ISD::ADD:
3624 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3625 X86::ADD8mr);
3626 case X86ISD::ADC:
3627 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3628 X86::ADC8mr);
3629 case X86ISD::SUB:
3630 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3631 X86::SUB8mr);
3632 case X86ISD::SBB:
3633 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3634 X86::SBB8mr);
3635 case X86ISD::AND:
3636 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3637 X86::AND8mr);
3638 case X86ISD::OR:
3639 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3640 case X86ISD::XOR:
3641 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3642 X86::XOR8mr);
3643 default:
3644 llvm_unreachable("Invalid opcode!");
3645 }
3646 };
3647 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3648 switch (Opc) {
3649 case X86ISD::ADD:
3650 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3651 X86::ADD8mi);
3652 case X86ISD::ADC:
3653 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3654 X86::ADC8mi);
3655 case X86ISD::SUB:
3656 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3657 X86::SUB8mi);
3658 case X86ISD::SBB:
3659 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3660 X86::SBB8mi);
3661 case X86ISD::AND:
3662 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3663 X86::AND8mi);
3664 case X86ISD::OR:
3665 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3666 X86::OR8mi);
3667 case X86ISD::XOR:
3668 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3669 X86::XOR8mi);
3670 default:
3671 llvm_unreachable("Invalid opcode!");
3672 }
3673 };
3674
3675 unsigned NewOpc = SelectRegOpcode(Opc);
3676 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3677
3678 // See if the operand is a constant that we can fold into an immediate
3679 // operand.
3680 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3681 int64_t OperandV = OperandC->getSExtValue();
3682
3683 // Check if we can shrink the operand enough to fit in an immediate (or
3684 // fit into a smaller immediate) by negating it and switching the
3685 // operation.
3686 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3687 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3688 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3689 isInt<32>(-OperandV))) &&
3690 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3691 OperandV = -OperandV;
3692 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3693 }
3694
3695 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3696 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3697 NewOpc = SelectImmOpcode(Opc);
3698 }
3699 }
3700
3701 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3702 SDValue CopyTo =
3703 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3704 StoredVal.getOperand(2), SDValue());
3705
3706 const SDValue Ops[] = {Base, Scale, Index, Disp,
3707 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3708 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3709 Ops);
3710 } else {
3711 const SDValue Ops[] = {Base, Scale, Index, Disp,
3712 Segment, Operand, InputChain};
3713 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3714 Ops);
3715 }
3716 break;
3717 }
3718 default:
3719 llvm_unreachable("Invalid opcode!");
3720 }
3721
3722 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3723 LoadNode->getMemOperand()};
3724 CurDAG->setNodeMemRefs(Result, MemOps);
3725
3726 // Update Load Chain uses as well.
3727 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3728 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3729 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3730 CurDAG->RemoveDeadNode(Node);
3731 return true;
3732}
3733
3734// See if this is an X & Mask that we can match to BEXTR/BZHI.
3735// Where Mask is one of the following patterns:
3736// a) x & (1 << nbits) - 1
3737// b) x & ~(-1 << nbits)
3738// c) x & (-1 >> (32 - y))
3739// d) x << (32 - y) >> (32 - y)
3740// e) (1 << nbits) - 1
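// For example, with nbits == 5 patterns a) and b) both reduce to x & 0x1f:
// with BMI2 this becomes BZHI with a bit count of 5, and with BMI1 it becomes
// BEXTR with control (5 << 8) | 0 = 0x0500 (extract 5 bits starting at bit 0).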
3741bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3742 assert(
3743 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3744 Node->getOpcode() == ISD::SRL) &&
3745 "Should be either an and-mask, or right-shift after clearing high bits.");
3746
3747 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3748 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3749 return false;
3750
3751 MVT NVT = Node->getSimpleValueType(0);
3752
3753 // Only supported for 32 and 64 bits.
3754 if (NVT != MVT::i32 && NVT != MVT::i64)
3755 return false;
3756
3757 SDValue NBits;
3758 bool NegateNBits;
3759
3760 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3761 // Else, if we only have BMI1's BEXTR, we require one-use.
3762 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3763 auto checkUses = [AllowExtraUsesByDefault](
3764 SDValue Op, unsigned NUses,
3765 std::optional<bool> AllowExtraUses) {
3766 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3767 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3768 };
3769 auto checkOneUse = [checkUses](SDValue Op,
3770 std::optional<bool> AllowExtraUses =
3771 std::nullopt) {
3772 return checkUses(Op, 1, AllowExtraUses);
3773 };
3774 auto checkTwoUse = [checkUses](SDValue Op,
3775 std::optional<bool> AllowExtraUses =
3776 std::nullopt) {
3777 return checkUses(Op, 2, AllowExtraUses);
3778 };
3779
3780 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3781 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3782 assert(V.getSimpleValueType() == MVT::i32 &&
3783 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3784 "Expected i64 -> i32 truncation");
3785 V = V.getOperand(0);
3786 }
3787 return V;
3788 };
3789
3790 // a) x & ((1 << nbits) + (-1))
3791 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3792 &NegateNBits](SDValue Mask) -> bool {
3793 // Match `add`. Must only have one use!
3794 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3795 return false;
3796 // We should be adding an all-ones constant (i.e. subtracting one).
3797 if (!isAllOnesConstant(Mask->getOperand(1)))
3798 return false;
3799 // Match `1 << nbits`. Might be truncated. Must only have one use!
3800 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3801 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3802 return false;
3803 if (!isOneConstant(M0->getOperand(0)))
3804 return false;
3805 NBits = M0->getOperand(1);
3806 NegateNBits = false;
3807 return true;
3808 };
3809
3810 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3811 V = peekThroughOneUseTruncation(V);
3812 return CurDAG->MaskedValueIsAllOnes(
3813 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3814 NVT.getSizeInBits()));
3815 };
3816
3817 // b) x & ~(-1 << nbits)
3818 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3819 &NBits, &NegateNBits](SDValue Mask) -> bool {
3820 // Match `~()`. Must only have one use!
3821 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3822 return false;
3823 // The -1 only has to be all-ones for the final Node's NVT.
3824 if (!isAllOnes(Mask->getOperand(1)))
3825 return false;
3826 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3827 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3828 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3829 return false;
3830 // The -1 only has to be all-ones for the final Node's NVT.
3831 if (!isAllOnes(M0->getOperand(0)))
3832 return false;
3833 NBits = M0->getOperand(1);
3834 NegateNBits = false;
3835 return true;
3836 };
3837
3838 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3839 // or leave the shift amount as-is, but then we'll have to negate it.
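// For example, for an i32 mask of the form (-1 >> (32 - y)) the SUB below is
// matched and NBits becomes y, while for a plain (-1 >> z) NBits stays z and
// NegateNBits is set, so the bitwidth - z subtraction is emitted later instead.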
3840 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3841 unsigned Bitwidth) {
3842 NBits = ShiftAmt;
3843 NegateNBits = true;
3844 // Skip over a truncate of the shift amount, if any.
3845 if (NBits.getOpcode() == ISD::TRUNCATE)
3846 NBits = NBits.getOperand(0);
3847 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3848 // If it doesn't match, that's fine, we'll just negate it ourselves.
3849 if (NBits.getOpcode() != ISD::SUB)
3850 return;
3851 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3852 if (!V0 || V0->getZExtValue() != Bitwidth)
3853 return;
3854 NBits = NBits.getOperand(1);
3855 NegateNBits = false;
3856 };
3857
3858 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3859 // or
3860 // c) x & (-1 >> (32 - y))
3861 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3862 canonicalizeShiftAmt](SDValue Mask) -> bool {
3863 // The mask itself may be truncated.
3864 Mask = peekThroughOneUseTruncation(Mask);
3865 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3866 // Match `l>>`. Must only have one use!
3867 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3868 return false;
3869 // We should be shifting truly all-ones constant.
3870 if (!isAllOnesConstant(Mask.getOperand(0)))
3871 return false;
3872 SDValue M1 = Mask.getOperand(1);
3873 // The shift amount should not be used externally.
3874 if (!checkOneUse(M1))
3875 return false;
3876 canonicalizeShiftAmt(M1, Bitwidth);
3877 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3878 // is no extra use of the mask. Clearly, there was one since we are here.
3879 // But at the same time, if we need to negate the shift amount,
3880 // then we don't want the mask to stick around, else it's unprofitable.
3881 return !NegateNBits;
3882 };
3883
3884 SDValue X;
3885
3886 // d) x << z >> z but then we'll have to subtract z from bitwidth
3887 // or
3888 // d) x << (32 - y) >> (32 - y)
3889 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3890 AllowExtraUsesByDefault, &NegateNBits,
3891 &X](SDNode *Node) -> bool {
3892 if (Node->getOpcode() != ISD::SRL)
3893 return false;
3894 SDValue N0 = Node->getOperand(0);
3895 if (N0->getOpcode() != ISD::SHL)
3896 return false;
3897 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3898 SDValue N1 = Node->getOperand(1);
3899 SDValue N01 = N0->getOperand(1);
3900 // Both of the shifts must be by the exact same value.
3901 if (N1 != N01)
3902 return false;
3903 canonicalizeShiftAmt(N1, Bitwidth);
3904 // There should not be any external uses of the inner shift / shift amount.
3905 // Note that while we are generally okay with external uses given BMI2,
3906 // iff we need to negate the shift amount, we are not okay with extra uses.
3907 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3908 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3909 return false;
3910 X = N0->getOperand(0);
3911 return true;
3912 };
3913
3914 auto matchLowBitMask = [matchPatternA, matchPatternB,
3915 matchPatternC](SDValue Mask) -> bool {
3916 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3917 };
3918
3919 if (Node->getOpcode() == ISD::AND) {
3920 X = Node->getOperand(0);
3921 SDValue Mask = Node->getOperand(1);
3922
3923 if (matchLowBitMask(Mask)) {
3924 // Great.
3925 } else {
3926 std::swap(X, Mask);
3927 if (!matchLowBitMask(Mask))
3928 return false;
3929 }
3930 } else if (matchLowBitMask(SDValue(Node, 0))) {
3931 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
3932 } else if (!matchPatternD(Node))
3933 return false;
3934
3935 // If we need to negate the shift amount, require BMI2 BZHI support.
3936 // It's just too unprofitable for BMI1 BEXTR.
3937 if (NegateNBits && !Subtarget->hasBMI2())
3938 return false;
3939
3940 SDLoc DL(Node);
3941
3942 // Truncate the shift amount.
3943 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3944 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3945
3946 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3947 // All the other bits are undefined, we do not care about them.
3948 SDValue ImplDef = SDValue(
3949 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3950 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3951
3952 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3953 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3954 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3955 MVT::i32, ImplDef, NBits, SRIdxVal),
3956 0);
3957 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3958
3959 // We might have matched the amount of high bits to be cleared,
3960 // but we want the amount of low bits to be kept, so negate it then.
3961 if (NegateNBits) {
3962 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
3963 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
3964
3965 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
3966 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3967 }
3968
3969 if (Subtarget->hasBMI2()) {
3970 // Great, just emit the BZHI.
3971 if (NVT != MVT::i32) {
3972 // But have to place the bit count into the wide-enough register first.
3973 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3974 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3975 }
3976
3977 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3978 ReplaceNode(Node, Extract.getNode());
3979 SelectCode(Extract.getNode());
3980 return true;
3981 }
3982
3983 // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
3984 // *logically* shifted (potentially with one-use trunc in between),
3985 // and the truncation was the only use of the shift,
3986 // and if so look past one-use truncation.
3987 {
3988 SDValue RealX = peekThroughOneUseTruncation(X);
3989 // FIXME: only if the shift is one-use?
3990 if (RealX != X && RealX.getOpcode() == ISD::SRL)
3991 X = RealX;
3992 }
3993
3994 MVT XVT = X.getSimpleValueType();
3995
3996 // Else, emitting BEXTR requires one more step.
3997 // The 'control' of BEXTR has the pattern of:
3998 // [15...8 bit][ 7...0 bit] location
3999 // [ bit count][ shift] name
4000 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4001
4002 // Shift NBits left by 8 bits, thus producing 'control'.
4003 // This makes the low 8 bits to be zero.
4004 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4005 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4006 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4007 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4008
4009 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4010 // FIXME: only if the shift is one-use?
4011 if (X.getOpcode() == ISD::SRL) {
4012 SDValue ShiftAmt = X.getOperand(1);
4013 X = X.getOperand(0);
4014
4015 assert(ShiftAmt.getValueType() == MVT::i8 &&
4016 "Expected shift amount to be i8");
4017
4018 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4019 // We could zext to i16 in some form, but we intentionally don't do that.
4020 SDValue OrigShiftAmt = ShiftAmt;
4021 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4022 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4023
4024 // And now 'or' these low 8 bits of shift amount into the 'control'.
4025 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4026 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4027 }
4028
4029 // But have to place the 'control' into the wide-enough register first.
4030 if (XVT != MVT::i32) {
4031 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4032 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4033 }
4034
4035 // And finally, form the BEXTR itself.
4036 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4037
4038 // The 'X' was originally truncated. Do that now.
4039 if (XVT != NVT) {
4040 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4041 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4042 }
4043
4044 ReplaceNode(Node, Extract.getNode());
4045 SelectCode(Extract.getNode());
4046
4047 return true;
4048}
4049
4050// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4051MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4052 MVT NVT = Node->getSimpleValueType(0);
4053 SDLoc dl(Node);
4054
4055 SDValue N0 = Node->getOperand(0);
4056 SDValue N1 = Node->getOperand(1);
4057
4058 // If we have TBM we can use an immediate for the control. If we have BMI
4059 // we should only do this if the BEXTR instruction is implemented well.
4060 // Otherwise moving the control into a register makes this more costly.
4061 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4062 // hoisting the move immediate would make it worthwhile with a less optimal
4063 // BEXTR?
4064 bool PreferBEXTR =
4065 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4066 if (!PreferBEXTR && !Subtarget->hasBMI2())
4067 return nullptr;
4068
4069 // Must have a shift right.
4070 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4071 return nullptr;
4072
4073 // Shift can't have additional users.
4074 if (!N0->hasOneUse())
4075 return nullptr;
4076
4077 // Only supported for 32 and 64 bits.
4078 if (NVT != MVT::i32 && NVT != MVT::i64)
4079 return nullptr;
4080
4081 // Shift amount and RHS of and must be constant.
4082 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4083 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4084 if (!MaskCst || !ShiftCst)
4085 return nullptr;
4086
4087 // And RHS must be a mask.
4088 uint64_t Mask = MaskCst->getZExtValue();
4089 if (!isMask_64(Mask))
4090 return nullptr;
4091
4092 uint64_t Shift = ShiftCst->getZExtValue();
4093 uint64_t MaskSize = llvm::popcount(Mask);
4094
4095 // Don't interfere with something that can be handled by extracting AH.
4096 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4097 if (Shift == 8 && MaskSize == 8)
4098 return nullptr;
4099
4100 // Make sure we are only using bits that were in the original value, not
4101 // shifted in.
4102 if (Shift + MaskSize > NVT.getSizeInBits())
4103 return nullptr;
4104
4105 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4106 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4107 // does not fit into 32 bits. Load folding is not a sufficient reason.
4108 if (!PreferBEXTR && MaskSize <= 32)
4109 return nullptr;
4110
4111 SDValue Control;
4112 unsigned ROpc, MOpc;
4113
4114#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4115 if (!PreferBEXTR) {
4116 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4117 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4118 // Let's perform the mask first, and apply shift later. Note that we need to
4119 // widen the mask to account for the fact that we'll apply shift afterwards!
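// For example, for (x >> 4) & 0x3f (Shift = 4, MaskSize = 6) the control is
// 4 + 6 = 10, so BZHI keeps the low 10 bits and the SHR emitted afterwards
// leaves exactly the 6 bits we wanted.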
4120 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4121 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4122 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4123 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4124 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4125 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4126 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4127 } else {
4128 // The 'control' of BEXTR has the pattern of:
4129 // [15...8 bit][ 7...0 bit] location
4130 // [ bit count][ shift] name
4131 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4132 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
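// For example, (x >> 4) & 0xff gives Shift = 4 and MaskSize = 8, so the
// control value is 4 | (8 << 8) = 0x0804.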
4133 if (Subtarget->hasTBM()) {
4134 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4135 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4136 } else {
4137 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4138 // BMI requires the immediate to be placed in a register.
4139 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4140 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4141 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4142 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4143 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4144 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4145 }
4146 }
4147
4148 MachineSDNode *NewNode;
4149 SDValue Input = N0->getOperand(0);
4150 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4151 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4152 SDValue Ops[] = {
4153 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4154 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4155 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4156 // Update the chain.
4157 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4158 // Record the mem-refs
4159 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4160 } else {
4161 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4162 }
4163
4164 if (!PreferBEXTR) {
4165 // We still need to apply the shift.
4166 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4167 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4168 : GET_ND_IF_ENABLED(X86::SHR32ri);
4169 NewNode =
4170 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4171 }
4172
4173 return NewNode;
4174}
4175
4176// Emit a PCMPISTR(I/M) instruction.
4177MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4178 bool MayFoldLoad, const SDLoc &dl,
4179 MVT VT, SDNode *Node) {
4180 SDValue N0 = Node->getOperand(0);
4181 SDValue N1 = Node->getOperand(1);
4182 SDValue Imm = Node->getOperand(2);
4183 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4184 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4185
4186 // Try to fold a load. No need to check alignment.
4187 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4188 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4189 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4190 N1.getOperand(0) };
4191 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4192 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4193 // Update the chain.
4194 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4195 // Record the mem-refs
4196 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4197 return CNode;
4198 }
4199
4200 SDValue Ops[] = { N0, N1, Imm };
4201 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4202 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4203 return CNode;
4204}
4205
4206// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4207// to emit a second instruction after this one. This is needed since we have two
4208// copyToReg nodes glued before this and we need to continue that glue through.
4209MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4210 bool MayFoldLoad, const SDLoc &dl,
4211 MVT VT, SDNode *Node,
4212 SDValue &InGlue) {
4213 SDValue N0 = Node->getOperand(0);
4214 SDValue N2 = Node->getOperand(2);
4215 SDValue Imm = Node->getOperand(4);
4216 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4217 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4218
4219 // Try to fold a load. No need to check alignment.
4220 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4221 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4222 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4223 N2.getOperand(0), InGlue };
4224 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4225 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4226 InGlue = SDValue(CNode, 3);
4227 // Update the chain.
4228 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4229 // Record the mem-refs
4230 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4231 return CNode;
4232 }
4233
4234 SDValue Ops[] = { N0, N2, Imm, InGlue };
4235 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4236 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4237 InGlue = SDValue(CNode, 2);
4238 return CNode;
4239}
4240
4241bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4242 EVT VT = N->getValueType(0);
4243
4244 // Only handle scalar shifts.
4245 if (VT.isVector())
4246 return false;
4247
4248 // Narrower shifts only mask to 5 bits in hardware.
4249 unsigned Size = VT == MVT::i64 ? 64 : 32;
4250
4251 SDValue OrigShiftAmt = N->getOperand(1);
4252 SDValue ShiftAmt = OrigShiftAmt;
4253 SDLoc DL(N);
4254
4255 // Skip over a truncate of the shift amount.
4256 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4257 ShiftAmt = ShiftAmt->getOperand(0);
4258
4259 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4260 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4261
4262 SDValue NewShiftAmt;
4263 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4264 ShiftAmt->getOpcode() == ISD::XOR) {
4265 SDValue Add0 = ShiftAmt->getOperand(0);
4266 SDValue Add1 = ShiftAmt->getOperand(1);
4267 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4268 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4269 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4270 // to avoid the ADD/SUB/XOR.
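// For example, a 64-bit shift by (x + 64) behaves exactly like a shift by x,
// since the hardware masks the amount to the low 6 bits, so the ADD is dropped.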
4271 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4272 NewShiftAmt = Add0;
4273
4274 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4275 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4276 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4277 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4278 // we can replace it with a NOT. In the XOR case it may save some code
4279 // size, in the SUB case it also may save a move.
4280 assert(Add0C == nullptr || Add1C == nullptr);
4281
4282 // We can only do N-X, not X-N
4283 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4284 return false;
4285
4286 EVT OpVT = ShiftAmt.getValueType();
4287
4288 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4289 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4290 Add0C == nullptr ? Add0 : Add1, AllOnes);
4291 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4292 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4293 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4294 // -X to generate a NEG instead of a SUB of a constant.
4295 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4296 Add0C->getZExtValue() != 0) {
4297 EVT SubVT = ShiftAmt.getValueType();
4298 SDValue X;
4299 if (Add0C->getZExtValue() % Size == 0)
4300 X = Add1;
4301 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4302 Add0C->getZExtValue() % 32 == 0) {
4303 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4304 // This is mainly beneficial if we already compute (x+n*32).
4305 if (Add1.getOpcode() == ISD::TRUNCATE) {
4306 Add1 = Add1.getOperand(0);
4307 SubVT = Add1.getValueType();
4308 }
4309 if (Add0.getValueType() != SubVT) {
4310 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4311 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4312 }
4313
4314 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4315 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4316 } else
4317 return false;
4318 // Insert a negate op.
4319 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4320 // that uses it that's not a shift.
4321 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4322 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4323 NewShiftAmt = Neg;
4324
4325 // Insert these operands into a valid topological order so they can
4326 // get selected independently.
4327 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4328 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4329 } else
4330 return false;
4331 } else
4332 return false;
4333
4334 if (NewShiftAmt.getValueType() != MVT::i8) {
4335 // Need to truncate the shift amount.
4336 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4337 // Add to a correct topological ordering.
4338 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4339 }
4340
4341 // Insert a new mask to keep the shift amount legal. This should be removed
4342 // by isel patterns.
4343 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4344 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4345 // Place in a correct topological ordering.
4346 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4347
4348 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4349 NewShiftAmt);
4350 if (UpdatedNode != N) {
4351 // If we found an existing node, we should replace ourselves with that node
4352 // and wait for it to be selected after its other users.
4353 ReplaceNode(N, UpdatedNode);
4354 return true;
4355 }
4356
4357 // If the original shift amount is now dead, delete it so that we don't run
4358 // it through isel.
4359 if (OrigShiftAmt.getNode()->use_empty())
4360 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4361
4362 // Now that we've optimized the shift amount, defer to normal isel to get
4363 // load folding and legacy vs BMI2 selection without repeating it here.
4364 SelectCode(N);
4365 return true;
4366}
4367
4368bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4369 MVT NVT = N->getSimpleValueType(0);
4370 unsigned Opcode = N->getOpcode();
4371 SDLoc dl(N);
4372
4373 // For operations of the form (x << C1) op C2, check if we can use a smaller
4374 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
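// For example, (x << 8) | 0x1F00 can be rewritten as (x | 0x1F) << 8; the
// constant 0x1F fits in a sign-extended 8-bit immediate while 0x1F00 does not.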
4375 SDValue Shift = N->getOperand(0);
4376 SDValue N1 = N->getOperand(1);
4377
4378 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4379 if (!Cst)
4380 return false;
4381
4382 int64_t Val = Cst->getSExtValue();
4383
4384 // If we have an any_extend feeding the AND, look through it to see if there
4385 // is a shift behind it. But only if the AND doesn't use the extended bits.
4386 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4387 bool FoundAnyExtend = false;
4388 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4389 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4390 isUInt<32>(Val)) {
4391 FoundAnyExtend = true;
4392 Shift = Shift.getOperand(0);
4393 }
4394
4395 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4396 return false;
4397
4398 // i8 is unshrinkable, i16 should be promoted to i32.
4399 if (NVT != MVT::i32 && NVT != MVT::i64)
4400 return false;
4401
4402 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4403 if (!ShlCst)
4404 return false;
4405
4406 uint64_t ShAmt = ShlCst->getZExtValue();
4407
4408 // Make sure that we don't change the operation by removing bits.
4409 // This only matters for OR and XOR, AND is unaffected.
4410 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4411 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4412 return false;
4413
4414 // Check the minimum bitwidth for the new constant.
4415 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4416 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4417 if (Opcode == ISD::AND) {
4418 // AND32ri is the same as AND64ri32 with zext imm.
4419 // Try this before sign extended immediates below.
4420 ShiftedVal = (uint64_t)Val >> ShAmt;
4421 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4422 return true;
4423 // Also swap order when the AND can become MOVZX.
4424 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4425 return true;
4426 }
4427 ShiftedVal = Val >> ShAmt;
4428 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4429 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4430 return true;
4431 if (Opcode != ISD::AND) {
4432 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4433 ShiftedVal = (uint64_t)Val >> ShAmt;
4434 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4435 return true;
4436 }
4437 return false;
4438 };
4439
4440 int64_t ShiftedVal;
4441 if (!CanShrinkImmediate(ShiftedVal))
4442 return false;
4443
4444 // Ok, we can reorder to get a smaller immediate.
4445
4446 // But it's possible the original immediate allowed an AND to become MOVZX.
4447 // Do this check late to delay the MaskedValueIsZero call as long as
4448 // possible.
4449 if (Opcode == ISD::AND) {
4450 // Find the smallest zext this could possibly be.
4451 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4452 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4453
4454 // Figure out which bits need to be zero to achieve that mask.
4455 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4456 ZExtWidth);
4457 NeededMask &= ~Cst->getAPIntValue();
4458
4459 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4460 return false;
4461 }
4462
4463 SDValue X = Shift.getOperand(0);
4464 if (FoundAnyExtend) {
4465 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4466 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4467 X = NewX;
4468 }
4469
4470 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
4471 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4472 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4473 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4474 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4475 Shift.getOperand(1));
4476 ReplaceNode(N, NewSHL.getNode());
4477 SelectCode(NewSHL.getNode());
4478 return true;
4479}
4480
4481bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4482 SDNode *ParentB, SDNode *ParentC,
4483 SDValue A, SDValue B, SDValue C,
4484 uint8_t Imm) {
4485 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4486 C.isOperandOf(ParentC) && "Incorrect parent node");
4487
4488 auto tryFoldLoadOrBCast =
4489 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4490 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4491 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4492 return true;
4493
4494 // Not a load, check for broadcast which may be behind a bitcast.
4495 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4496 P = L.getNode();
4497 L = L.getOperand(0);
4498 }
4499
4500 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4501 return false;
4502
4503 // Only 32 and 64 bit broadcasts are supported.
4504 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4505 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4506 if (Size != 32 && Size != 64)
4507 return false;
4508
4509 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4510 };
4511
4512 bool FoldedLoad = false;
4513 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4514 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4515 FoldedLoad = true;
4516 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4517 Tmp4)) {
4518 FoldedLoad = true;
4519 std::swap(A, C);
4520 // Swap bits 1/4 and 3/6.
4521 uint8_t OldImm = Imm;
4522 Imm = OldImm & 0xa5;
4523 if (OldImm & 0x02) Imm |= 0x10;
4524 if (OldImm & 0x10) Imm |= 0x02;
4525 if (OldImm & 0x08) Imm |= 0x40;
4526 if (OldImm & 0x40) Imm |= 0x08;
4527 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4528 Tmp4)) {
4529 FoldedLoad = true;
4530 std::swap(B, C);
4531 // Swap bits 1/2 and 5/6.
4532 uint8_t OldImm = Imm;
4533 Imm = OldImm & 0x99;
4534 if (OldImm & 0x02) Imm |= 0x04;
4535 if (OldImm & 0x04) Imm |= 0x02;
4536 if (OldImm & 0x20) Imm |= 0x40;
4537 if (OldImm & 0x40) Imm |= 0x20;
4538 }
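// Swapping two operands permutes the truth-table index bits of the immediate:
// with A, B and C as bits 2, 1 and 0 of the index, exchanging A and C moves
// imm bit 1 (001) to bit 4 (100) and bit 3 (011) to bit 6 (110) while bits
// 0, 2, 5 and 7 are unchanged, which is exactly the remapping done above; the
// B/C case follows the same rule for bits 1/2 and 5/6.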
4539
4540 SDLoc DL(Root);
4541
4542 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4543
4544 MVT NVT = Root->getSimpleValueType(0);
4545
4546 MachineSDNode *MNode;
4547 if (FoldedLoad) {
4548 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4549
4550 unsigned Opc;
4551 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4552 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4553 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4554 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4555
4556 bool UseD = EltSize == 32;
4557 if (NVT.is128BitVector())
4558 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4559 else if (NVT.is256BitVector())
4560 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4561 else if (NVT.is512BitVector())
4562 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4563 else
4564 llvm_unreachable("Unexpected vector size!");
4565 } else {
4566 bool UseD = NVT.getVectorElementType() == MVT::i32;
4567 if (NVT.is128BitVector())
4568 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4569 else if (NVT.is256BitVector())
4570 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4571 else if (NVT.is512BitVector())
4572 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4573 else
4574 llvm_unreachable("Unexpected vector size!");
4575 }
4576
4577 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4578 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4579
4580 // Update the chain.
4581 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4582 // Record the mem-refs
4583 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4584 } else {
4585 bool UseD = NVT.getVectorElementType() == MVT::i32;
4586 unsigned Opc;
4587 if (NVT.is128BitVector())
4588 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4589 else if (NVT.is256BitVector())
4590 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4591 else if (NVT.is512BitVector())
4592 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4593 else
4594 llvm_unreachable("Unexpected vector size!");
4595
4596 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4597 }
4598
4599 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4600 CurDAG->RemoveDeadNode(Root);
4601 return true;
4602}
4603
4604// Try to match two logic ops to a VPTERNLOG.
4605// FIXME: Handle more complex patterns that use an operand more than once?
4606bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4607 MVT NVT = N->getSimpleValueType(0);
4608
4609 // Make sure we support VPTERNLOG.
4610 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4611 NVT.getVectorElementType() == MVT::i1)
4612 return false;
4613
4614 // We need VLX for 128/256-bit.
4615 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4616 return false;
4617
4618 SDValue N0 = N->getOperand(0);
4619 SDValue N1 = N->getOperand(1);
4620
4621 auto getFoldableLogicOp = [](SDValue Op) {
4622 // Peek through single use bitcast.
4623 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4624 Op = Op.getOperand(0);
4625
4626 if (!Op.hasOneUse())
4627 return SDValue();
4628
4629 unsigned Opc = Op.getOpcode();
4630 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4631 Opc == X86ISD::ANDNP)
4632 return Op;
4633
4634 return SDValue();
4635 };
4636
4637 SDValue A, FoldableOp;
4638 if ((FoldableOp = getFoldableLogicOp(N1))) {
4639 A = N0;
4640 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4641 A = N1;
4642 } else
4643 return false;
4644
4645 SDValue B = FoldableOp.getOperand(0);
4646 SDValue C = FoldableOp.getOperand(1);
4647 SDNode *ParentA = N;
4648 SDNode *ParentB = FoldableOp.getNode();
4649 SDNode *ParentC = FoldableOp.getNode();
4650
4651 // We can build the appropriate control immediate by performing the logic
4652 // operation we're matching using these constants for A, B, and C.
4653 uint8_t TernlogMagicA = 0xf0;
4654 uint8_t TernlogMagicB = 0xcc;
4655 uint8_t TernlogMagicC = 0xaa;
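// For example, matching A & (B | C) gives Imm = (0xcc | 0xaa) & 0xf0 = 0xe0.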
4656
4657 // Some of the inputs may be inverted, peek through them and invert the
4658 // magic values accordingly.
4659 // TODO: There may be a bitcast before the xor that we should peek through.
4660 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4661 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4662 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4663 Magic = ~Magic;
4664 Parent = Op.getNode();
4665 Op = Op.getOperand(0);
4666 }
4667 };
4668
4669 PeekThroughNot(A, ParentA, TernlogMagicA);
4670 PeekThroughNot(B, ParentB, TernlogMagicB);
4671 PeekThroughNot(C, ParentC, TernlogMagicC);
4672
4673 uint8_t Imm;
4674 switch (FoldableOp.getOpcode()) {
4675 default: llvm_unreachable("Unexpected opcode!");
4676 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4677 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4678 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4679 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4680 }
4681
4682 switch (N->getOpcode()) {
4683 default: llvm_unreachable("Unexpected opcode!");
4684 case X86ISD::ANDNP:
4685 if (A == N0)
4686 Imm &= ~TernlogMagicA;
4687 else
4688 Imm = ~(Imm) & TernlogMagicA;
4689 break;
4690 case ISD::AND: Imm &= TernlogMagicA; break;
4691 case ISD::OR: Imm |= TernlogMagicA; break;
4692 case ISD::XOR: Imm ^= TernlogMagicA; break;
4693 }
4694
4695 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4696}
4697
4698 /// If the high bits of an 'and' operand are known zero, try setting the
4699 /// high bits of the 'and' constant operand to produce a smaller encoding by
4700 /// creating a small, sign-extended negative immediate rather than a large
4701 /// positive one. This reverses a transform in SimplifyDemandedBits that
4702 /// shrinks mask constants by clearing bits. There is also a possibility that
4703 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4704 /// case, just replace the 'and'. Return 'true' if the node is replaced.
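/// For example, if the upper 16 bits of the other operand are known zero,
/// (and X, 0x0000fff0) can be rewritten as (and X, 0xfffffff0); the -16 mask
/// encodes as a sign-extended 8-bit immediate instead of a 32-bit one.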
4705bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4706 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4707 // have immediate operands.
4708 MVT VT = And->getSimpleValueType(0);
4709 if (VT != MVT::i32 && VT != MVT::i64)
4710 return false;
4711
4712 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4713 if (!And1C)
4714 return false;
4715
4716 // Bail out if the mask constant is already negative. It can't shrink any more.
4717 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4718 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4719 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4720 // are negative too.
4721 APInt MaskVal = And1C->getAPIntValue();
4722 unsigned MaskLZ = MaskVal.countl_zero();
4723 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4724 return false;
4725
4726 // Don't extend into the upper 32 bits of a 64 bit mask.
4727 if (VT == MVT::i64 && MaskLZ >= 32) {
4728 MaskLZ -= 32;
4729 MaskVal = MaskVal.trunc(32);
4730 }
4731
4732 SDValue And0 = And->getOperand(0);
4733 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4734 APInt NegMaskVal = MaskVal | HighZeros;
4735
4736 // If a negative constant would not allow a smaller encoding, there's no need
4737 // to continue. Only change the constant when we know it's a win.
4738 unsigned MinWidth = NegMaskVal.getSignificantBits();
4739 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4740 return false;
4741
4742 // Extend masks if we truncated above.
4743 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4744 NegMaskVal = NegMaskVal.zext(64);
4745 HighZeros = HighZeros.zext(64);
4746 }
4747
4748 // The variable operand must be all zeros in the top bits to allow using the
4749 // new, negative constant as the mask.
4750 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
4751 return false;
4752
4753 // Check if the mask is -1. In that case, this is an unnecessary instruction
4754 // that escaped earlier analysis.
4755 if (NegMaskVal.isAllOnes()) {
4756 ReplaceNode(And, And0.getNode());
4757 return true;
4758 }
4759
4760 // A negative mask allows a smaller encoding. Create a new 'and' node.
4761 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4762 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4763 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4764 ReplaceNode(And, NewAnd.getNode());
4765 SelectCode(NewAnd.getNode());
4766 return true;
4767}
4768
4769static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4770 bool FoldedBCast, bool Masked) {
4771#define VPTESTM_CASE(VT, SUFFIX) \
4772case MVT::VT: \
4773 if (Masked) \
4774 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4775 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4776
4777
4778#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4779default: llvm_unreachable("Unexpected VT!"); \
4780VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4781VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4782VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4783VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4784VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4785VPTESTM_CASE(v8i64, QZ##SUFFIX)
4786
4787#define VPTESTM_FULL_CASES(SUFFIX) \
4788VPTESTM_BROADCAST_CASES(SUFFIX) \
4789VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4790VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4791VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4792VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4793VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4794VPTESTM_CASE(v32i16, WZ##SUFFIX)
4795
4796 if (FoldedBCast) {
4797 switch (TestVT.SimpleTy) {
4798 VPTESTM_BROADCAST_CASES(rmb)
4799 }
4800 }
4801
4802 if (FoldedLoad) {
4803 switch (TestVT.SimpleTy) {
4804 VPTESTM_FULL_CASES(rm)
4805 }
4806 }
4807
4808 switch (TestVT.SimpleTy) {
4809 VPTESTM_FULL_CASES(rr)
4810 }
4811
4812#undef VPTESTM_FULL_CASES
4813#undef VPTESTM_BROADCAST_CASES
4814#undef VPTESTM_CASE
4815}
4816
4817// Try to create VPTESTM instruction. If InMask is not null, it will be used
4818// to form a masked operation.
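// i.e. an (and Mask, (setcc X, 0, ne)) pattern selects to a single masked
// VPTESTM/VPTESTNM instead of a compare followed by a separate mask AND.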
4819bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4820 SDValue InMask) {
4821 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4822 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4823 "Unexpected VT!");
4824
4825 // Look for equal and not equal compares.
4826 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4827 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4828 return false;
4829
4830 SDValue SetccOp0 = Setcc.getOperand(0);
4831 SDValue SetccOp1 = Setcc.getOperand(1);
4832
4833 // Canonicalize the all zero vector to the RHS.
4834 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4835 std::swap(SetccOp0, SetccOp1);
4836
4837 // See if we're comparing against zero.
4838 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4839 return false;
4840
4841 SDValue N0 = SetccOp0;
4842
4843 MVT CmpVT = N0.getSimpleValueType();
4844 MVT CmpSVT = CmpVT.getVectorElementType();
4845
4846 // Start with both operands the same. We'll try to refine this.
4847 SDValue Src0 = N0;
4848 SDValue Src1 = N0;
4849
4850 {
4851 // Look through single use bitcasts.
4852 SDValue N0Temp = N0;
4853 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4854 N0Temp = N0.getOperand(0);
4855
4856 // Look for single use AND.
4857 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4858 Src0 = N0Temp.getOperand(0);
4859 Src1 = N0Temp.getOperand(1);
4860 }
4861 }
4862
4863 // Without VLX we need to widen the operation.
4864 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4865
4866 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4867 SDValue &Base, SDValue &Scale, SDValue &Index,
4868 SDValue &Disp, SDValue &Segment) {
4869 // If we need to widen, we can't fold the load.
4870 if (!Widen)
4871 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4872 return true;
4873
4874 // If we didn't fold a load, try to match a broadcast. There is no widening
4875 // limitation for this, but only 32 and 64 bit element types are supported.
4876 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4877 return false;
4878
4879 // Look through single use bitcasts.
4880 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4881 P = L.getNode();
4882 L = L.getOperand(0);
4883 }
4884
4885 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4886 return false;
4887
4888 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4889 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4890 return false;
4891
4892 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4893 };
4894
4895 // We can only fold loads if the sources are unique.
4896 bool CanFoldLoads = Src0 != Src1;
4897
4898 bool FoldedLoad = false;
4899 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4900 if (CanFoldLoads) {
4901 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4902 Tmp3, Tmp4);
4903 if (!FoldedLoad) {
4904 // And is commutative.
4905 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4906 Tmp2, Tmp3, Tmp4);
4907 if (FoldedLoad)
4908 std::swap(Src0, Src1);
4909 }
4910 }
4911
4912 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4913
4914 bool IsMasked = InMask.getNode() != nullptr;
4915
4916 SDLoc dl(Root);
4917
4918 MVT ResVT = Setcc.getSimpleValueType();
4919 MVT MaskVT = ResVT;
4920 if (Widen) {
4921 // Widen the inputs using insert_subreg or copy_to_regclass.
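// Without VLX, e.g. a v4i32 compare is widened to v16i32 so the 512-bit
// instruction can be used; the result mask is shrunk back to ResVT below.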
4922 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4923 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4924 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4925 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4926 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4927 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4928 CmpVT), 0);
4929 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4930
4931 if (!FoldedBCast)
4932 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4933
4934 if (IsMasked) {
4935 // Widen the mask.
4936 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
4937 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4938 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4939 dl, MaskVT, InMask, RC), 0);
4940 }
4941 }
4942
4943 bool IsTestN = CC == ISD::SETEQ;
4944 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4945 IsMasked);
4946
4947 MachineSDNode *CNode;
4948 if (FoldedLoad) {
4949 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4950
4951 if (IsMasked) {
4952 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4953 Src1.getOperand(0) };
4954 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4955 } else {
4956 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4957 Src1.getOperand(0) };
4958 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4959 }
4960
4961 // Update the chain.
4962 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
4963 // Record the mem-refs
4964 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
4965 } else {
4966 if (IsMasked)
4967 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4968 else
4969 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4970 }
4971
4972 // If we widened, we need to shrink the mask VT.
4973 if (Widen) {
4974 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
4975 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4976 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4977 dl, ResVT, SDValue(CNode, 0), RC);
4978 }
4979
4980 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4981 CurDAG->RemoveDeadNode(Root);
4982 return true;
4983}
4984
4985// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4986// into vpternlog.
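// With A = 0xf0, B = 0xcc and C = 0xaa, (A & B) | (~A & C) evaluates to 0xca,
// which is the VPTERNLOG immediate used below.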
4987bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4988 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4989
4990 MVT NVT = N->getSimpleValueType(0);
4991
4992 // Make sure we support VPTERNLOG.
4993 if (!NVT.isVector() || !Subtarget->hasAVX512())
4994 return false;
4995
4996 // We need VLX for 128/256-bit.
4997 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4998 return false;
4999
5000 SDValue N0 = N->getOperand(0);
5001 SDValue N1 = N->getOperand(1);
5002
5003 // Canonicalize AND to LHS.
5004 if (N1.getOpcode() == ISD::AND)
5005 std::swap(N0, N1);
5006
5007 if (N0.getOpcode() != ISD::AND ||
5008 N1.getOpcode() != X86ISD::ANDNP ||
5009 !N0.hasOneUse() || !N1.hasOneUse())
5010 return false;
5011
5012 // ANDN is not commutable; use it to pin down A and C.
5013 SDValue A = N1.getOperand(0);
5014 SDValue C = N1.getOperand(1);
5015
5016 // AND is commutable, if one operand matches A, the other operand is B.
5017 // Otherwise this isn't a match.
5018 SDValue B;
5019 if (N0.getOperand(0) == A)
5020 B = N0.getOperand(1);
5021 else if (N0.getOperand(1) == A)
5022 B = N0.getOperand(0);
5023 else
5024 return false;
5025
5026 SDLoc dl(N);
5027 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5028 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5029 ReplaceNode(N, Ternlog.getNode());
5030
5031 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5032 Ternlog.getNode(), A, B, C, 0xCA);
5033}
5034
5035void X86DAGToDAGISel::Select(SDNode *Node) {
5036 MVT NVT = Node->getSimpleValueType(0);
5037 unsigned Opcode = Node->getOpcode();
5038 SDLoc dl(Node);
5039
5040 if (Node->isMachineOpcode()) {
5041 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5042 Node->setNodeId(-1);
5043 return; // Already selected.
5044 }
5045
5046 switch (Opcode) {
5047 default: break;
5048 case ISD::INTRINSIC_W_CHAIN: {
5049 unsigned IntNo = Node->getConstantOperandVal(1);
5050 switch (IntNo) {
5051 default: break;
5052 case Intrinsic::x86_encodekey128:
5053 case Intrinsic::x86_encodekey256: {
5054 if (!Subtarget->hasKL())
5055 break;
5056
5057 unsigned Opcode;
5058 switch (IntNo) {
5059 default: llvm_unreachable("Impossible intrinsic");
5060 case Intrinsic::x86_encodekey128:
5061 Opcode = X86::ENCODEKEY128;
5062 break;
5063 case Intrinsic::x86_encodekey256:
5064 Opcode = X86::ENCODEKEY256;
5065 break;
5066 }
5067
5068 SDValue Chain = Node->getOperand(0);
5069 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5070 SDValue());
5071 if (Opcode == X86::ENCODEKEY256)
5072 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5073 Chain.getValue(1));
5074
5075 MachineSDNode *Res = CurDAG->getMachineNode(
5076 Opcode, dl, Node->getVTList(),
5077 {Node->getOperand(2), Chain, Chain.getValue(1)});
5078 ReplaceNode(Node, Res);
5079 return;
5080 }
5081 case Intrinsic::x86_tileloadd64_internal:
5082 case Intrinsic::x86_tileloaddt164_internal: {
5083 if (!Subtarget->hasAMXTILE())
5084 break;
5085 unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
5086 ? X86::PTILELOADDV
5087 : X86::PTILELOADDT1V;
5088 // _tile_loadd_internal(row, col, buf, STRIDE)
5089 SDValue Base = Node->getOperand(4);
5090 SDValue Scale = getI8Imm(1, dl);
5091 SDValue Index = Node->getOperand(5);
5092 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5093 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5094 SDValue Chain = Node->getOperand(0);
5095 MachineSDNode *CNode;
5096 SDValue Ops[] = {Node->getOperand(2),
5097 Node->getOperand(3),
5098 Base,
5099 Scale,
5100 Index,
5101 Disp,
5102 Segment,
5103 Chain};
5104 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5105 ReplaceNode(Node, CNode);
5106 return;
5107 }
5108 }
5109 break;
5110 }
5111 case ISD::INTRINSIC_VOID: {
5112 unsigned IntNo = Node->getConstantOperandVal(1);
5113 switch (IntNo) {
5114 default: break;
5115 case Intrinsic::x86_sse3_monitor:
5116 case Intrinsic::x86_monitorx:
5117 case Intrinsic::x86_clzero: {
5118 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5119
5120 unsigned Opc = 0;
5121 switch (IntNo) {
5122 default: llvm_unreachable("Unexpected intrinsic!");
5123 case Intrinsic::x86_sse3_monitor:
5124 if (!Subtarget->hasSSE3())
5125 break;
5126 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5127 break;
5128 case Intrinsic::x86_monitorx:
5129 if (!Subtarget->hasMWAITX())
5130 break;
5131 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5132 break;
5133 case Intrinsic::x86_clzero:
5134 if (!Subtarget->hasCLZERO())
5135 break;
5136 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5137 break;
5138 }
5139
5140 if (Opc) {
5141 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5142 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5143 Node->getOperand(2), SDValue());
5144 SDValue InGlue = Chain.getValue(1);
5145
5146 if (IntNo == Intrinsic::x86_sse3_monitor ||
5147 IntNo == Intrinsic::x86_monitorx) {
5148 // Copy the other two operands to ECX and EDX.
5149 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5150 InGlue);
5151 InGlue = Chain.getValue(1);
5152 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5153 InGlue);
5154 InGlue = Chain.getValue(1);
5155 }
5156
5157 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5158 { Chain, InGlue});
5159 ReplaceNode(Node, CNode);
5160 return;
5161 }
5162
5163 break;
5164 }
5165 case Intrinsic::x86_tilestored64_internal: {
5166 unsigned Opc = X86::PTILESTOREDV;
5167 // _tile_stored_internal(row, col, buf, STRIDE, c)
5168 SDValue Base = Node->getOperand(4);
5169 SDValue Scale = getI8Imm(1, dl);
5170 SDValue Index = Node->getOperand(5);
5171 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5172 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5173 SDValue Chain = Node->getOperand(0);
5174 MachineSDNode *CNode;
5175 SDValue Ops[] = {Node->getOperand(2),
5176 Node->getOperand(3),
5177 Base,
5178 Scale,
5179 Index,
5180 Disp,
5181 Segment,
5182 Node->getOperand(6),
5183 Chain};
5184 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5185 ReplaceNode(Node, CNode);
5186 return;
5187 }
5188 case Intrinsic::x86_tileloadd64:
5189 case Intrinsic::x86_tileloaddt164:
5190 case Intrinsic::x86_tilestored64: {
5191 if (!Subtarget->hasAMXTILE())
5192 break;
5193 unsigned Opc;
5194 switch (IntNo) {
5195 default: llvm_unreachable("Unexpected intrinsic!");
5196 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5197 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5198 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5199 }
5200 // FIXME: Match displacement and scale.
5201 unsigned TIndex = Node->getConstantOperandVal(2);
5202 SDValue TReg = getI8Imm(TIndex, dl);
5203 SDValue Base = Node->getOperand(3);
5204 SDValue Scale = getI8Imm(1, dl);
5205 SDValue Index = Node->getOperand(4);
5206 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5207 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5208 SDValue Chain = Node->getOperand(0);
5209 MachineSDNode *CNode;
5210 if (Opc == X86::PTILESTORED) {
5211 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5212 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5213 } else {
5214 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5215 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5216 }
5217 ReplaceNode(Node, CNode);
5218 return;
5219 }
5220 }
5221 break;
5222 }
5223 case ISD::BRIND:
5224 case X86ISD::NT_BRIND: {
5225 if (Subtarget->isTargetNaCl())
5226 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
5227 // leave the instruction alone.
5228 break;
5229 if (Subtarget->isTarget64BitILP32()) {
5230 // Converts a 32-bit register to a 64-bit, zero-extended version of
5231 // it. This is needed because x86-64 can do many things, but jmp %r32
5232 // ain't one of them.
5233 SDValue Target = Node->getOperand(1);
5234 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5235 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5236 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5237 Node->getOperand(0), ZextTarget);
5238 ReplaceNode(Node, Brind.getNode());
5239 SelectCode(ZextTarget.getNode());
5240 SelectCode(Brind.getNode());
5241 return;
5242 }
5243 break;
5244 }
5245 case X86ISD::GlobalBaseReg:
5246 ReplaceNode(Node, getGlobalBaseReg());
5247 return;
5248
5249 case ISD::BITCAST:
5250 // Just drop all 128/256/512-bit bitcasts.
5251 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5252 NVT == MVT::f128) {
5253 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5254 CurDAG->RemoveDeadNode(Node);
5255 return;
5256 }
5257 break;
5258
5259 case ISD::SRL:
5260 if (matchBitExtract(Node))
5261 return;
5262 [[fallthrough]];
5263 case ISD::SRA:
5264 case ISD::SHL:
5265 if (tryShiftAmountMod(Node))
5266 return;
5267 break;
5268
5269 case X86ISD::VPTERNLOG: {
5270 uint8_t Imm = Node->getConstantOperandVal(3);
5271 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5272 Node->getOperand(1), Node->getOperand(2), Imm))
5273 return;
5274 break;
5275 }
5276
5277 case X86ISD::ANDNP:
5278 if (tryVPTERNLOG(Node))
5279 return;
5280 break;
5281
5282 case ISD::AND:
5283 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5284 // Try to form a masked VPTESTM. Operands can be in either order.
5285 SDValue N0 = Node->getOperand(0);
5286 SDValue N1 = Node->getOperand(1);
5287 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5288 tryVPTESTM(Node, N0, N1))
5289 return;
5290 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5291 tryVPTESTM(Node, N1, N0))
5292 return;
5293 }
5294
5295 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5296 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5297 CurDAG->RemoveDeadNode(Node);
5298 return;
5299 }
5300 if (matchBitExtract(Node))
5301 return;
5302 if (AndImmShrink && shrinkAndImmediate(Node))
5303 return;
5304
5305 [[fallthrough]];
5306 case ISD::OR:
5307 case ISD::XOR:
5308 if (tryShrinkShlLogicImm(Node))
5309 return;
5310 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5311 return;
5312 if (tryVPTERNLOG(Node))
5313 return;
5314
5315 [[fallthrough]];
5316 case ISD::ADD:
5317 if (Opcode == ISD::ADD && matchBitExtract(Node))
5318 return;
5319 [[fallthrough]];
5320 case ISD::SUB: {
5321 // Try to avoid folding immediates with multiple uses for optsize.
5322 // This code tries to select to register form directly to avoid going
5323 // through the isel table which might fold the immediate. We can't change
5324 // the add/sub/and/or/xor-with-immediate patterns in the
5325 // tablegen files to check the immediate use count without making the patterns
5326 // unavailable to the fast-isel table.
5327 if (!CurDAG->shouldOptForSize())
5328 break;
5329
5330 // Only handle i8/i16/i32/i64.
5331 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5332 break;
5333
5334 SDValue N0 = Node->getOperand(0);
5335 SDValue N1 = Node->getOperand(1);
5336
5337 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5338 if (!Cst)
5339 break;
5340
5341 int64_t Val = Cst->getSExtValue();
5342
5343 // Make sure it's an immediate that is considered foldable.
5344 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5345 if (!isInt<8>(Val) && !isInt<32>(Val))
5346 break;
5347
5348 // If this can match to INC/DEC, let it go.
5349 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5350 break;
5351
5352 // Check if we should avoid folding this immediate.
5353 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5354 break;
5355
5356 // We should not fold the immediate. So we need a register form instead.
5357 unsigned ROpc, MOpc;
5358 switch (NVT.SimpleTy) {
5359 default: llvm_unreachable("Unexpected VT!");
5360 case MVT::i8:
5361 switch (Opcode) {
5362 default: llvm_unreachable("Unexpected opcode!");
5363 case ISD::ADD:
5364 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5365 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5366 break;
5367 case ISD::SUB:
5368 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5369 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5370 break;
5371 case ISD::AND:
5372 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5373 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5374 break;
5375 case ISD::OR:
5376 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5377 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5378 break;
5379 case ISD::XOR:
5380 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5381 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5382 break;
5383 }
5384 break;
5385 case MVT::i16:
5386 switch (Opcode) {
5387 default: llvm_unreachable("Unexpected opcode!");
5388 case ISD::ADD:
5389 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5390 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5391 break;
5392 case ISD::SUB:
5393 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5394 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5395 break;
5396 case ISD::AND:
5397 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5398 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5399 break;
5400 case ISD::OR:
5401 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5402 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5403 break;
5404 case ISD::XOR:
5405 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5406 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5407 break;
5408 }
5409 break;
5410 case MVT::i32:
5411 switch (Opcode) {
5412 default: llvm_unreachable("Unexpected opcode!");
5413 case ISD::ADD:
5414 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5415 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5416 break;
5417 case ISD::SUB:
5418 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5419 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5420 break;
5421 case ISD::AND:
5422 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5423 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5424 break;
5425 case ISD::OR:
5426 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5427 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5428 break;
5429 case ISD::XOR:
5430 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5431 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5432 break;
5433 }
5434 break;
5435 case MVT::i64:
5436 switch (Opcode) {
5437 default: llvm_unreachable("Unexpected opcode!");
5438 case ISD::ADD:
5439 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5440 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5441 break;
5442 case ISD::SUB:
5443 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5444 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5445 break;
5446 case ISD::AND:
5447 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5448 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5449 break;
5450 case ISD::OR:
5451 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5452 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5453 break;
5454 case ISD::XOR:
5455 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5456 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5457 break;
5458 }
5459 break;
5460 }
5461
5462 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5463
5464 // If this is not a subtract, we can still try to fold a load.
5465 if (Opcode != ISD::SUB) {
5466 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5467 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5468 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5469 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5470 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5471 // Update the chain.
5472 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5473 // Record the mem-refs
5474 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5475 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5476 CurDAG->RemoveDeadNode(Node);
5477 return;
5478 }
5479 }
5480
5481 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5482 return;
5483 }
5484
5485 case X86ISD::SMUL:
5486 // i16/i32/i64 are handled with isel patterns.
5487 if (NVT != MVT::i8)
5488 break;
5489 [[fallthrough]];
5490 case X86ISD::UMUL: {
5491 SDValue N0 = Node->getOperand(0);
5492 SDValue N1 = Node->getOperand(1);
5493
5494 unsigned LoReg, ROpc, MOpc;
5495 switch (NVT.SimpleTy) {
5496 default: llvm_unreachable("Unsupported VT!");
5497 case MVT::i8:
5498 LoReg = X86::AL;
5499 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5500 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5501 break;
5502 case MVT::i16:
5503 LoReg = X86::AX;
5504 ROpc = X86::MUL16r;
5505 MOpc = X86::MUL16m;
5506 break;
5507 case MVT::i32:
5508 LoReg = X86::EAX;
5509 ROpc = X86::MUL32r;
5510 MOpc = X86::MUL32m;
5511 break;
5512 case MVT::i64:
5513 LoReg = X86::RAX;
5514 ROpc = X86::MUL64r;
5515 MOpc = X86::MUL64m;
5516 break;
5517 }
5518
5519 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5520 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5521 // Multiply is commutative.
5522 if (!FoldedLoad) {
5523 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5524 if (FoldedLoad)
5525 std::swap(N0, N1);
5526 }
5527
5528 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5529 N0, SDValue()).getValue(1);
5530
5531 MachineSDNode *CNode;
5532 if (FoldedLoad) {
5533 // i16/i32/i64 use an instruction that produces a low and high result even
5534 // though only the low result is used.
5535 SDVTList VTs;
5536 if (NVT == MVT::i8)
5537 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5538 else
5539 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5540
5541 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5542 InGlue };
5543 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5544
5545 // Update the chain.
5546 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5547 // Record the mem-refs
5548 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5549 } else {
5550 // i16/i32/i64 use an instruction that produces a low and high result even
5551 // though only the low result is used.
5552 SDVTList VTs;
5553 if (NVT == MVT::i8)
5554 VTs = CurDAG->getVTList(NVT, MVT::i32);
5555 else
5556 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5557
5558 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5559 }
5560
5561 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5562 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5563 CurDAG->RemoveDeadNode(Node);
5564 return;
5565 }
5566
5567 case ISD::SMUL_LOHI:
5568 case ISD::UMUL_LOHI: {
5569 SDValue N0 = Node->getOperand(0);
5570 SDValue N1 = Node->getOperand(1);
5571
5572 unsigned Opc, MOpc;
5573 unsigned LoReg, HiReg;
5574 bool IsSigned = Opcode == ISD::SMUL_LOHI;
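// For unsigned multiplies, prefer MULX when BMI2 is available: it takes the
// implicit operand in EDX/RDX, writes the results to arbitrary registers and
// leaves EFLAGS untouched. The Hi-only form is used when the low half is dead.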
5575 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5576 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5577 switch (NVT.SimpleTy) {
5578 default: llvm_unreachable("Unsupported VT!");
5579 case MVT::i32:
5580 Opc = UseMULXHi ? X86::MULX32Hrr
5581 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5582 : IsSigned ? X86::IMUL32r
5583 : X86::MUL32r;
5584 MOpc = UseMULXHi ? X86::MULX32Hrm
5585 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5586 : IsSigned ? X86::IMUL32m
5587 : X86::MUL32m;
5588 LoReg = UseMULX ? X86::EDX : X86::EAX;
5589 HiReg = X86::EDX;
5590 break;
5591 case MVT::i64:
5592 Opc = UseMULXHi ? X86::MULX64Hrr
5593 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5594 : IsSigned ? X86::IMUL64r
5595 : X86::MUL64r;
5596 MOpc = UseMULXHi ? X86::MULX64Hrm
5597 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5598 : IsSigned ? X86::IMUL64m
5599 : X86::MUL64m;
5600 LoReg = UseMULX ? X86::RDX : X86::RAX;
5601 HiReg = X86::RDX;
5602 break;
5603 }
5604
5605 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5606 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5607 // Multiply is commutative.
5608 if (!foldedLoad) {
5609 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5610 if (foldedLoad)
5611 std::swap(N0, N1);
5612 }
5613
5614 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5615 N0, SDValue()).getValue(1);
5616 SDValue ResHi, ResLo;
5617 if (foldedLoad) {
5618 SDValue Chain;
5619 MachineSDNode *CNode = nullptr;
5620 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5621 InGlue };
5622 if (UseMULXHi) {
5623 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5624 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5625 ResHi = SDValue(CNode, 0);
5626 Chain = SDValue(CNode, 1);
5627 } else if (UseMULX) {
5628 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5629 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5630 ResHi = SDValue(CNode, 0);
5631 ResLo = SDValue(CNode, 1);
5632 Chain = SDValue(CNode, 2);
5633 } else {
5634 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5635 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5636 Chain = SDValue(CNode, 0);
5637 InGlue = SDValue(CNode, 1);
5638 }
5639
5640 // Update the chain.
5641 ReplaceUses(N1.getValue(1), Chain);
5642 // Record the mem-refs
5643 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5644 } else {
5645 SDValue Ops[] = { N1, InGlue };
5646 if (UseMULXHi) {
5647 SDVTList VTs = CurDAG->getVTList(NVT);
5648 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5649 ResHi = SDValue(CNode, 0);
5650 } else if (UseMULX) {
5651 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5652 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5653 ResHi = SDValue(CNode, 0);
5654 ResLo = SDValue(CNode, 1);
5655 } else {
5656 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5657 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5658 InGlue = SDValue(CNode, 0);
5659 }
5660 }
5661
5662 // Copy the low half of the result, if it is needed.
5663 if (!SDValue(Node, 0).use_empty()) {
5664 if (!ResLo) {
5665 assert(LoReg && "Register for low half is not defined!");
5666 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5667 NVT, InGlue);
5668 InGlue = ResLo.getValue(2);
5669 }
5670 ReplaceUses(SDValue(Node, 0), ResLo);
5671 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5672 dbgs() << '\n');
5673 }
5674 // Copy the high half of the result, if it is needed.
5675 if (!SDValue(Node, 1).use_empty()) {
5676 if (!ResHi) {
5677 assert(HiReg && "Register for high half is not defined!");
5678 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5679 NVT, InGlue);
5680 InGlue = ResHi.getValue(2);
5681 }
5682 ReplaceUses(SDValue(Node, 1), ResHi);
5683 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5684 dbgs() << '\n');
5685 }
5686
5687 CurDAG->RemoveDeadNode(Node);
5688 return;
5689 }
5690
5691 case ISD::SDIVREM:
5692 case ISD::UDIVREM: {
5693 SDValue N0 = Node->getOperand(0);
5694 SDValue N1 = Node->getOperand(1);
5695
5696 unsigned ROpc, MOpc;
5697 bool isSigned = Opcode == ISD::SDIVREM;
5698 if (!isSigned) {
5699 switch (NVT.SimpleTy) {
5700 default: llvm_unreachable("Unsupported VT!");
5701 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5702 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5703 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5704 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5705 }
5706 } else {
5707 switch (NVT.SimpleTy) {
5708 default: llvm_unreachable("Unsupported VT!");
5709 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5710 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5711 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5712 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5713 }
5714 }
5715
5716 unsigned LoReg, HiReg, ClrReg;
5717 unsigned SExtOpcode;
5718 switch (NVT.SimpleTy) {
5719 default: llvm_unreachable("Unsupported VT!");
5720 case MVT::i8:
5721 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5722 SExtOpcode = 0; // Not used.
5723 break;
5724 case MVT::i16:
5725 LoReg = X86::AX; HiReg = X86::DX;
5726 ClrReg = X86::DX;
5727 SExtOpcode = X86::CWD;
5728 break;
5729 case MVT::i32:
5730 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5731 SExtOpcode = X86::CDQ;
5732 break;
5733 case MVT::i64:
5734 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5735 SExtOpcode = X86::CQO;
5736 break;
5737 }
5738
5739 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5740 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5741 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5742
5743 SDValue InGlue;
5744 if (NVT == MVT::i8) {
5745 // Special case for div8, just use a move with zero extension to AX to
5746 // clear the upper 8 bits (AH).
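// The 8-bit divide then leaves the quotient in AL and the remainder in AH.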
5747 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5748 MachineSDNode *Move;
5749 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5750 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5751 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5752 : X86::MOVZX16rm8;
5753 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5754 Chain = SDValue(Move, 1);
5755 ReplaceUses(N0.getValue(1), Chain);
5756 // Record the mem-refs
5757 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5758 } else {
5759 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5760 : X86::MOVZX16rr8;
5761 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5762 Chain = CurDAG->getEntryNode();
5763 }
5764 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5765 SDValue());
5766 InGlue = Chain.getValue(1);
5767 } else {
5768 InGlue =
5769 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5770 LoReg, N0, SDValue()).getValue(1);
5771 if (isSigned && !signBitIsZero) {
5772 // Sign extend the low part into the high part.
5773 InGlue =
5774 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5775 } else {
5776 // Zero out the high part, effectively zero extending the input.
5777 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5778 SDValue ClrNode = SDValue(
5779 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
5780 switch (NVT.SimpleTy) {
5781 case MVT::i16:
5782 ClrNode =
5783 SDValue(CurDAG->getMachineNode(
5784 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5785 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5786 MVT::i32)),
5787 0);
5788 break;
5789 case MVT::i32:
5790 break;
5791 case MVT::i64:
5792 ClrNode =
5793 SDValue(CurDAG->getMachineNode(
5794 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5795 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5796 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5797 MVT::i32)),
5798 0);
5799 break;
5800 default:
5801 llvm_unreachable("Unexpected division source");
5802 }
5803
5804 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5805 ClrNode, InGlue).getValue(1);
5806 }
5807 }
5808
5809 if (foldedLoad) {
5810 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5811 InGlue };
5812 MachineSDNode *CNode =
5813 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5814 InGlue = SDValue(CNode, 1);
5815 // Update the chain.
5816 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5817 // Record the mem-refs
5818 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5819 } else {
5820 InGlue =
5821 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5822 }
5823
5824 // Prevent use of AH in a REX instruction by explicitly copying it to
5825 // an ABCD_L register.
5826 //
5827 // The current assumption of the register allocator is that isel
5828 // won't generate explicit references to the GR8_ABCD_H registers. If
5829 // the allocator and/or the backend get enhanced to be more robust in
5830 // that regard, this can be, and should be, removed.
5831 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5832 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5833 unsigned AHExtOpcode =
5834 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5835
5836 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5837 MVT::Glue, AHCopy, InGlue);
5838 SDValue Result(RNode, 0);
5839 InGlue = SDValue(RNode, 1);
5840
5841 Result =
5842 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5843
5844 ReplaceUses(SDValue(Node, 1), Result);
5845 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5846 dbgs() << '\n');
5847 }
5848 // Copy the division (low) result, if it is needed.
5849 if (!SDValue(Node, 0).use_empty()) {
5850 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5851 LoReg, NVT, InGlue);
5852 InGlue = Result.getValue(2);
5853 ReplaceUses(SDValue(Node, 0), Result);
5854 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5855 dbgs() << '\n');
5856 }
5857 // Copy the remainder (high) result, if it is needed.
5858 if (!SDValue(Node, 1).use_empty()) {
5859 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5860 HiReg, NVT, InGlue);
5861 InGlue = Result.getValue(2);
5862 ReplaceUses(SDValue(Node, 1), Result);
5863 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5864 dbgs() << '\n');
5865 }
5866 CurDAG->RemoveDeadNode(Node);
5867 return;
5868 }
5869
5870 case X86ISD::FCMP:
5871 case X86ISD::STRICT_FCMP:
5872 case X86ISD::STRICT_FCMPS: {
5873 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5874 Node->getOpcode() == X86ISD::STRICT_FCMPS;
5875 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
5876 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
5877
5878 // Save the original VT of the compare.
5879 MVT CmpVT = N0.getSimpleValueType();
5880
5881 // Floating point needs special handling if we don't have FCOMI.
5882 if (Subtarget->canUseCMOV())
5883 break;
5884
5885 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5886
5887 unsigned Opc;
5888 switch (CmpVT.SimpleTy) {
5889 default: llvm_unreachable("Unexpected type!");
5890 case MVT::f32:
5891 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5892 break;
5893 case MVT::f64:
5894 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5895 break;
5896 case MVT::f80:
5897 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5898 break;
5899 }
5900
5901 SDValue Chain =
5902 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
5903 SDValue Glue;
5904 if (IsStrictCmp) {
5905 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5906 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
5907 Glue = Chain.getValue(1);
5908 } else {
5909 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
5910 }
5911
5912 // Move FPSW to AX.
5913 SDValue FNSTSW =
5914 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
5915
5916 // Extract upper 8-bits of AX.
5917 SDValue Extract =
5918 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
5919
5920 // Move AH into flags.
5921 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
5922 assert(Subtarget->canUseLAHFSAHF() &&
5923 "Target doesn't support SAHF or FCOMI?");
5924 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
5925 Chain = AH;
5926 SDValue SAHF = SDValue(
5927 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
5928
5929 if (IsStrictCmp)
5930 ReplaceUses(SDValue(Node, 1), Chain);
5931
5932 ReplaceUses(SDValue(Node, 0), SAHF);
5933 CurDAG->RemoveDeadNode(Node);
5934 return;
5935 }
5936
5937 case X86ISD::CMP: {
5938 SDValue N0 = Node->getOperand(0);
5939 SDValue N1 = Node->getOperand(1);
5940
5941 // Optimizations for TEST compares.
5942 if (!isNullConstant(N1))
5943 break;
5944
5945 // Save the original VT of the compare.
5946 MVT CmpVT = N0.getSimpleValueType();
5947
5948 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5949 // by a test instruction. The test should be removed later by
5950 // analyzeCompare if we are using only the zero flag.
5951 // TODO: Should we check the users and use the BEXTR flags directly?
5952 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
5953 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
5954 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
5955 : X86::TEST32rr;
5956 SDValue BEXTR = SDValue(NewNode, 0);
5957 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
5958 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5959 CurDAG->RemoveDeadNode(Node);
5960 return;
5961 }
5962 }
5963
5964 // We can peek through truncates, but we need to be careful below.
5965 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5966 N0 = N0.getOperand(0);
5967
5968 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5969 // use a smaller encoding.
5970 // Look past the truncate if CMP is the only use of it.
5971 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
5972 N0.getValueType() != MVT::i8) {
5973 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5974 if (!MaskC)
5975 break;
5976
5977 // We may have looked through a truncate so mask off any bits that
5978 // shouldn't be part of the compare.
5979 uint64_t Mask = MaskC->getZExtValue();
5980 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
5981
5982 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
5983 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
5984 // zero flag.
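// For example, (and X, 0xff00000000000000) == 0 can be tested with
// (shr X, 56) followed by a 64-bit register test, avoiding a movabsq.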
5985 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
5986 onlyUsesZeroFlag(SDValue(Node, 0))) {
5987 unsigned ShiftOpcode = ISD::DELETED_NODE;
5988 unsigned ShiftAmt;
5989 unsigned SubRegIdx;
5990 MVT SubRegVT;
5991 unsigned TestOpcode;
5992 unsigned LeadingZeros = llvm::countl_zero(Mask);
5993 unsigned TrailingZeros = llvm::countr_zero(Mask);
5994
5995 // With leading/trailing zeros, the transform is profitable if we can
5996 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
5997 // incurring any extra register moves.
5998 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
5999 if (LeadingZeros == 0 && SavesBytes) {
6000 // If the mask covers the most significant bit, then we can replace
6001 // TEST+AND with a SHR and check eflags.
6002 // This emits a redundant TEST which is subsequently eliminated.
6003 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6004 ShiftAmt = TrailingZeros;
6005 SubRegIdx = 0;
6006 TestOpcode = X86::TEST64rr;
6007 } else if (TrailingZeros == 0 && SavesBytes) {
6008 // If the mask covers the least significant bit, then we can replace
6009 // TEST+AND with a SHL and check eflags.
6010 // This emits a redundant TEST which is subsequently eliminated.
6011 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6012 ShiftAmt = LeadingZeros;
6013 SubRegIdx = 0;
6014 TestOpcode = X86::TEST64rr;
6015 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6016 // If the shifted mask extends into the high half and is 8/16/32 bits
6017 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6018 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6019 if (PopCount == 8) {
6020 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6021 ShiftAmt = TrailingZeros;
6022 SubRegIdx = X86::sub_8bit;
6023 SubRegVT = MVT::i8;
6024 TestOpcode = X86::TEST8rr;
6025 } else if (PopCount == 16) {
6026 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6027 ShiftAmt = TrailingZeros;
6028 SubRegIdx = X86::sub_16bit;
6029 SubRegVT = MVT::i16;
6030 TestOpcode = X86::TEST16rr;
6031 } else if (PopCount == 32) {
6032 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6033 ShiftAmt = TrailingZeros;
6034 SubRegIdx = X86::sub_32bit;
6035 SubRegVT = MVT::i32;
6036 TestOpcode = X86::TEST32rr;
6037 }
6038 }
6039 if (ShiftOpcode != ISD::DELETED_NODE) {
6040 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6041 SDValue Shift = SDValue(
6042 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6043 N0.getOperand(0), ShiftC),
6044 0);
6045 if (SubRegIdx != 0) {
6046 Shift =
6047 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6048 }
6049 MachineSDNode *Test =
6050 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6051 ReplaceNode(Node, Test);
6052 return;
6053 }
6054 }
6055
6056 MVT VT;
6057 int SubRegOp;
6058 unsigned ROpc, MOpc;
6059
6060 // For each of these checks we need to be careful if the sign flag is
6061 // being used. It is only safe to use the sign flag in two conditions,
6062 // either the sign bit in the shrunken mask is zero or the final test
6063 // size is equal to the original compare size.
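// For example, "testl %eax, $0x80" always leaves SF clear, while
// "testb %al, $0x80" may set it, so that narrowing requires SF to be unused.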
6064
6065 if (isUInt<8>(Mask) &&
6066 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6067 hasNoSignFlagUses(SDValue(Node, 0)))) {
6068 // For example, convert "testl %eax, $8" to "testb %al, $8"
6069 VT = MVT::i8;
6070 SubRegOp = X86::sub_8bit;
6071 ROpc = X86::TEST8ri;
6072 MOpc = X86::TEST8mi;
6073 } else if (OptForMinSize && isUInt<16>(Mask) &&
6074 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6075 hasNoSignFlagUses(SDValue(Node, 0)))) {
6076 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6077 // NOTE: We only want to form TESTW instructions if optimizing for
6078 // min size. Otherwise we only save one byte and possibly get a length
6079 // changing prefix penalty in the decoders.
6080 VT = MVT::i16;
6081 SubRegOp = X86::sub_16bit;
6082 ROpc = X86::TEST16ri;
6083 MOpc = X86::TEST16mi;
6084 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6085 ((!(Mask & 0x80000000) &&
6086 // Without minsize, 16-bit Cmps can get here, so we need to
6087 // be sure we calculate the correct sign flag if needed.
6088 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6089 CmpVT == MVT::i32 ||
6090 hasNoSignFlagUses(SDValue(Node, 0)))) {
6091 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6092 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6093 // Otherwise, we find ourselves in a position where we have to do
6094 // promotion. If previous passes did not promote the and, we assume
6095 // they had a good reason not to and do not promote here.
6096 VT = MVT::i32;
6097 SubRegOp = X86::sub_32bit;
6098 ROpc = X86::TEST32ri;
6099 MOpc = X86::TEST32mi;
6100 } else {
6101 // No eligible transformation was found.
6102 break;
6103 }
6104
6105 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6106 SDValue Reg = N0.getOperand(0);
6107
6108 // Emit a testl or testw.
6109 MachineSDNode *NewNode;
6110 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6111 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6112 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6113 if (!LoadN->isSimple()) {
6114 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6115 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6116 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6117 (MOpc == X86::TEST32mi && NumVolBits != 32))
6118 break;
6119 }
6120 }
6121 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6122 Reg.getOperand(0) };
6123 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6124 // Update the chain.
6125 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6126 // Record the mem-refs
6127 CurDAG->setNodeMemRefs(NewNode,
6128 {cast<LoadSDNode>(Reg)->getMemOperand()});
6129 } else {
6130 // Extract the subregister if necessary.
6131 if (N0.getValueType() != VT)
6132 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6133
6134 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6135 }
6136 // Replace CMP with TEST.
6137 ReplaceNode(Node, NewNode);
6138 return;
6139 }
6140 break;
6141 }
6142 case X86ISD::PCMPISTR: {
6143 if (!Subtarget->hasSSE42())
6144 break;
6145
6146 bool NeedIndex = !SDValue(Node, 0).use_empty();
6147 bool NeedMask = !SDValue(Node, 1).use_empty();
6148 // We can't fold a load if we are going to make two instructions.
6149 bool MayFoldLoad = !NeedIndex || !NeedMask;
6150
6151 MachineSDNode *CNode;
6152 if (NeedMask) {
6153 unsigned ROpc =
6154 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6155 unsigned MOpc =
6156 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6157 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6158 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6159 }
6160 if (NeedIndex || !NeedMask) {
6161 unsigned ROpc =
6162 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6163 unsigned MOpc =
6164 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6165 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6166 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6167 }
6168
6169 // Connect the flag usage to the last instruction created.
6170 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6171 CurDAG->RemoveDeadNode(Node);
6172 return;
6173 }
6174 case X86ISD::PCMPESTR: {
6175 if (!Subtarget->hasSSE42())
6176 break;
6177
6178 // Copy the two implicit register inputs.
6179 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6180 Node->getOperand(1),
6181 SDValue()).getValue(1);
6182 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6183 Node->getOperand(3), InGlue).getValue(1);
6184
6185 bool NeedIndex = !SDValue(Node, 0).use_empty();
6186 bool NeedMask = !SDValue(Node, 1).use_empty();
6187 // We can't fold a load if we are going to make two instructions.
6188 bool MayFoldLoad = !NeedIndex || !NeedMask;
6189
6190 MachineSDNode *CNode;
6191 if (NeedMask) {
6192 unsigned ROpc =
6193 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6194 unsigned MOpc =
6195 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6196 CNode =
6197 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6198 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6199 }
6200 if (NeedIndex || !NeedMask) {
6201 unsigned ROpc =
6202 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6203 unsigned MOpc =
6204 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6205 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6206 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6207 }
6208 // Connect the flag usage to the last instruction created.
6209 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6210 CurDAG->RemoveDeadNode(Node);
6211 return;
6212 }
6213
6214 case ISD::SETCC: {
6215 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6216 return;
6217
6218 break;
6219 }
6220
6221 case ISD::STORE:
6222 if (foldLoadStoreIntoMemOperand(Node))
6223 return;
6224 break;
6225
6226 case X86ISD::SETCC_CARRY: {
6227 MVT VT = Node->getSimpleValueType(0);
6228 SDValue Result;
6229 if (Subtarget->hasSBBDepBreaking()) {
6230 // We have to do this manually because tblgen will put the eflags copy in
6231 // the wrong place if we use an extract_subreg in the pattern.
6232 // Copy flags to the EFLAGS register and glue it to next node.
6233 SDValue EFLAGS =
6234 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6235 Node->getOperand(1), SDValue());
6236
6237 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6238 // 32-bit version.
6239 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6240 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6241 Result = SDValue(
6242 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6243 0);
6244 } else {
6245 // The target does not recognize sbb with the same reg operand as a
6246 // no-source idiom, so we explicitly zero the input values.
6247 Result = getSBBZero(Node);
6248 }
6249
6250 // For less than 32-bits we need to extract from the 32-bit node.
6251 if (VT == MVT::i8 || VT == MVT::i16) {
6252 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6253 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6254 }
6255
6256 ReplaceUses(SDValue(Node, 0), Result);
6257 CurDAG->RemoveDeadNode(Node);
6258 return;
6259 }
6260 case X86ISD::SBB: {
6261 if (isNullConstant(Node->getOperand(0)) &&
6262 isNullConstant(Node->getOperand(1))) {
6263 SDValue Result = getSBBZero(Node);
6264
6265 // Replace the flag use.
6266 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6267
6268 // Replace the result use.
6269 if (!SDValue(Node, 0).use_empty()) {
6270 // For less than 32-bits we need to extract from the 32-bit node.
6271 MVT VT = Node->getSimpleValueType(0);
6272 if (VT == MVT::i8 || VT == MVT::i16) {
6273 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6274 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6275 }
6276 ReplaceUses(SDValue(Node, 0), Result);
6277 }
6278
6279 CurDAG->RemoveDeadNode(Node);
6280 return;
6281 }
6282 break;
6283 }
6284 case X86ISD::MGATHER: {
6285 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6286 SDValue IndexOp = Mgt->getIndex();
6287 SDValue Mask = Mgt->getMask();
6288 MVT IndexVT = IndexOp.getSimpleValueType();
6289 MVT ValueVT = Node->getSimpleValueType(0);
6290 MVT MaskVT = Mask.getSimpleValueType();
6291
6292 // This is just to prevent crashes if the nodes are malformed somehow. We're
6293 // otherwise only doing loose type checking in here, based on what a type
6294 // constraint would say, just like table-based isel.
6295 if (!ValueVT.isVector() || !MaskVT.isVector())
6296 break;
6297
6298 unsigned NumElts = ValueVT.getVectorNumElements();
6299 MVT ValueSVT = ValueVT.getVectorElementType();
6300
6301 bool IsFP = ValueSVT.isFloatingPoint();
6302 unsigned EltSize = ValueSVT.getSizeInBits();
6303
6304 unsigned Opc = 0;
6305 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
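// An i1-element mask means this gather uses an AVX-512 k-register writemask;
// the older AVX2 forms instead take a vector mask whose element sign bits
// select the active lanes.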
6306 if (AVX512Gather) {
6307 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6308 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6309 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6310 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6311 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6312 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6313 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6314 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6315 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6316 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6317 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6318 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6319 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6320 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6321 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6322 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6323 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6324 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6325 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6326 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6327 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6328 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6329 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6330 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6331 } else {
6332 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6333 "Unexpected mask VT!");
6334 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6335 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6336 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6337 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6338 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6339 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6340 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6341 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6342 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6343 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6344 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6345 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6346 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6347 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6348 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6349 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6350 }
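// For example, a v8f32 gather indexed by v8i32 selects VGATHERDPSZ256rm with a
// k-mask and VGATHERDPSYrm with a vector mask (see the tables above).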
6351
6352 if (!Opc)
6353 break;
6354
6355 SDValue Base, Scale, Index, Disp, Segment;
6356 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6357 Base, Scale, Index, Disp, Segment))
6358 break;
6359
6360 SDValue PassThru = Mgt->getPassThru();
6361 SDValue Chain = Mgt->getChain();
6362 // Gather instructions have a mask output not in the ISD node.
6363 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
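// The machine gather also defines the mask register (the hardware clears mask
// elements as loads complete), which the ISD node does not model; hence the
// extra MaskVT result in the VT list above.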
6364
6365 MachineSDNode *NewNode;
6366 if (AVX512Gather) {
6367 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6368 Index, Disp, Segment, Chain};
6369 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6370 } else {
6371 SDValue Ops[] = {PassThru, Base, Scale, Index,
6372 Disp, Segment, Mask, Chain};
6373 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6374 }
6375 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6376 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6377 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6378 CurDAG->RemoveDeadNode(Node);
6379 return;
6380 }
6381 case X86ISD::MSCATTER: {
6382 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6383 SDValue Value = Sc->getValue();
6384 SDValue IndexOp = Sc->getIndex();
6385 MVT IndexVT = IndexOp.getSimpleValueType();
6386 MVT ValueVT = Value.getSimpleValueType();
6387
6388 // This is just to prevent crashes if the nodes are malformed somehow. We're
6389 // otherwise only doing loose type checking in here, based on what a type
6390 // constraint would say, just like table-based isel.
6391 if (!ValueVT.isVector())
6392 break;
6393
6394 unsigned NumElts = ValueVT.getVectorNumElements();
6395 MVT ValueSVT = ValueVT.getVectorElementType();
6396
6397 bool IsFP = ValueSVT.isFloatingPoint();
6398 unsigned EltSize = ValueSVT.getSizeInBits();
6399
6400 unsigned Opc;
6401 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6402 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6403 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6404 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6405 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6406 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6407 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6408 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6409 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6410 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6411 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6412 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6413 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6414 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6415 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6416 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6417 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6418 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6419 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6420 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6421 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6422 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6423 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6424 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6425 else
6426 break;
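// For example, a v16f32 scatter indexed by v16i32 selects VSCATTERDPSZmr; all
// scatter forms handled here are AVX-512 instructions taking a k-register mask.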
6427
6428 SDValue Base, Scale, Index, Disp, Segment;
6429 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6430 Base, Scale, Index, Disp, Segment))
6431 break;
6432
6433 SDValue Mask = Sc->getMask();
6434 SDValue Chain = Sc->getChain();
6435 // Scatter instructions have a mask output not in the ISD node.
6436 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6437 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6438
6439 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6440 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6441 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6442 CurDAG->RemoveDeadNode(Node);
6443 return;
6444 }
6445 case ISD::PREALLOCATED_SETUP: {
6446 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6447 auto CallId = MFI->getPreallocatedIdForCallSite(
6448 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6449 SDValue Chain = Node->getOperand(0);
6450 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6451 MachineSDNode *New = CurDAG->getMachineNode(
6452 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6453 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6454 CurDAG->RemoveDeadNode(Node);
6455 return;
6456 }
6457 case ISD::PREALLOCATED_ARG: {
6458 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6459 auto CallId = MFI->getPreallocatedIdForCallSite(
6460 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6461 SDValue Chain = Node->getOperand(0);
6462 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6463 SDValue ArgIndex = Node->getOperand(2);
6464 SDValue Ops[3];
6465 Ops[0] = CallIdValue;
6466 Ops[1] = ArgIndex;
6467 Ops[2] = Chain;
6468 MachineSDNode *New = CurDAG->getMachineNode(
6469 TargetOpcode::PREALLOCATED_ARG, dl,
6470 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6471 MVT::Other),
6472 Ops);
6473 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6474 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6475 CurDAG->RemoveDeadNode(Node);
6476 return;
6477 }
6478 case X86ISD::AESENCWIDE128KL:
6479 case X86ISD::AESDECWIDE128KL:
6480 case X86ISD::AESENCWIDE256KL:
6481 case X86ISD::AESDECWIDE256KL: {
6482 if (!Subtarget->hasWIDEKL())
6483 break;
6484
6485 unsigned Opcode;
6486 switch (Node->getOpcode()) {
6487 default:
6488 llvm_unreachable("Unexpected opcode!");
6489 case X86ISD::AESENCWIDE128KL:
6490 Opcode = X86::AESENCWIDE128KL;
6491 break;
6492 case X86ISD::AESDECWIDE128KL:
6493 Opcode = X86::AESDECWIDE128KL;
6494 break;
6495 case X86ISD::AESENCWIDE256KL:
6496 Opcode = X86::AESENCWIDE256KL;
6497 break;
6498 case X86ISD::AESDECWIDE256KL:
6499 Opcode = X86::AESDECWIDE256KL;
6500 break;
6501 }
6502
6503 SDValue Chain = Node->getOperand(0);
6504 SDValue Addr = Node->getOperand(1);
6505
6506 SDValue Base, Scale, Index, Disp, Segment;
6507 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6508 break;
6509
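// The wide Key Locker instructions implicitly operate on eight 128-bit blocks
// held in XMM0..XMM7, so the eight value operands are copied into those
// physical registers and the copies are glued to the instruction created below.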
6510 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6511 SDValue());
6512 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6513 Chain.getValue(1));
6514 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6515 Chain.getValue(1));
6516 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6517 Chain.getValue(1));
6518 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6519 Chain.getValue(1));
6520 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6521 Chain.getValue(1));
6522 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6523 Chain.getValue(1));
6524 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6525 Chain.getValue(1));
6526
6527 MachineSDNode *Res = CurDAG->getMachineNode(
6528 Opcode, dl, Node->getVTList(),
6529 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6530 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6531 ReplaceNode(Node, Res);
6532 return;
6533 }
6534 }
6535
6536 SelectCode(Node);
6537}
6538
6539bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6540 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6541 std::vector<SDValue> &OutOps) {
6542 SDValue Op0, Op1, Op2, Op3, Op4;
6543 switch (ConstraintID) {
6544 default:
6545 llvm_unreachable("Unexpected asm memory constraint");
6546 case InlineAsm::ConstraintCode::o: // offsetable ??
6547 case InlineAsm::ConstraintCode::v: // not offsetable ??
6548 case InlineAsm::ConstraintCode::m: // memory
6549 case InlineAsm::ConstraintCode::X:
6550 case InlineAsm::ConstraintCode::p: // address
6551 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6552 return true;
6553 break;
6554 }
6555
6556 OutOps.push_back(Op0);
6557 OutOps.push_back(Op1);
6558 OutOps.push_back(Op2);
6559 OutOps.push_back(Op3);
6560 OutOps.push_back(Op4);
6561 return false;
6562}
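// As an illustration (not part of this file): an inline-asm memory constraint
// such as the following ends up routed through the hook above, with the
// address of 'x' selected into the five OutOps operands
// (base, scale, index, displacement, segment):
//
//   int x = 0;
//   asm volatile("incl %0" : "+m"(x));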
6563
6564/// This pass converts a legalized DAG into a X86-specific DAG,
6565 /// ready for instruction scheduling.
6566 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6567 CodeGenOptLevel OptLevel) {
6568 return new X86DAGToDAGISel(TM, OptLevel);
6569}
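// A minimal sketch of how this factory is typically used, assuming the usual
// X86PassConfig setup in X86TargetMachine.cpp (not part of this file):
//
//   bool X86PassConfig::addInstSelector() {
//     addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
//     ...
//     return false;
//   }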