LLVM 22.0.0git
X86ISelDAGToDAG.cpp
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
22#include "llvm/Config/llvm-config.h"
24#include "llvm/IR/Function.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
34#include <cstdint>
35
36using namespace llvm;
37
38#define DEBUG_TYPE "x86-isel"
39#define PASS_NAME "X86 DAG->DAG Instruction Selection"
40
41STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42
43static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
46
47static cl::opt<bool> EnablePromoteAnyextLoad(
48 "x86-promote-anyext-load", cl::init(true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50
51extern cl::opt<bool> IndirectBranchTracking;
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
57namespace {
58 /// This corresponds to X86AddressMode, but uses SDValues instead of register
59 /// numbers for the leaves of the matched tree.
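 /// For example, the address [rbx + 4*rcx + 16] would be captured here with
 /// Base_Reg holding the rbx value, Scale = 4, IndexReg holding the rcx value,
 /// and Disp = 16, while the symbol and frame-index fields stay at their defaults.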
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
65
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
69
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
83
84 X86ISelAddressMode() = default;
85
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
89 }
90
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
94 }
95
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
103 }
104
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
108 }
109
110#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
151 }
152#endif
153 };
154}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
165
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
168
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
171
172 public:
173 X86DAGToDAGISel() = delete;
174
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 "indirect-tls-seg-refs");
184
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 return SelectionDAGISel::runOnMachineFunction(MF);
188 }
189
190 void emitFunctionEntryCode() override;
191
192 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
193
194 void PreprocessISelDAG() override;
195 void PostprocessISelDAG() override;
196
197// Include the pieces autogenerated from the target description.
198#include "X86GenDAGISel.inc"
199
200 private:
201 void Select(SDNode *N) override;
202
203 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
204 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
205 bool AllowSegmentRegForX32 = false);
206 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
207 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
208 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
209 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
210 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
211 unsigned Depth);
212 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
217 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
218 SDValue &Scale, SDValue &Index, SDValue &Disp,
219 SDValue &Segment);
220 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
221 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
222 SDValue &Index, SDValue &Disp, SDValue &Segment);
223 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
224 bool selectLEAAddr(SDValue N, SDValue &Base,
225 SDValue &Scale, SDValue &Index, SDValue &Disp,
226 SDValue &Segment);
227 bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
228 SDValue &Index, SDValue &Disp, SDValue &Segment);
229 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectRelocImm(SDValue N, SDValue &Op);
233
234 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
235 SDValue &Base, SDValue &Scale,
236 SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238
239 // Convenience method where P is also root.
240 bool tryFoldLoad(SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment) {
244 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
245 }
246
247 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
248 SDValue &Base, SDValue &Scale,
249 SDValue &Index, SDValue &Disp,
250 SDValue &Segment);
251
252 bool isProfitableToFormMaskedOp(SDNode *N) const;
253
254 /// Implement addressing mode selection for inline asm expressions.
255 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
256 InlineAsm::ConstraintCode ConstraintID,
257 std::vector<SDValue> &OutOps) override;
258
259 void emitSpecialCodeForMain();
260
261 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
262 MVT VT, SDValue &Base, SDValue &Scale,
263 SDValue &Index, SDValue &Disp,
264 SDValue &Segment) {
265 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
266 Base = CurDAG->getTargetFrameIndex(
267 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
268 else if (AM.Base_Reg.getNode())
269 Base = AM.Base_Reg;
270 else
271 Base = CurDAG->getRegister(0, VT);
272
273 Scale = getI8Imm(AM.Scale, DL);
274
275#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
276 // Negate the index if needed.
277 if (AM.NegateIndex) {
278 unsigned NegOpc;
279 switch (VT.SimpleTy) {
280 default:
281 llvm_unreachable("Unsupported VT!");
282 case MVT::i64:
283 NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
284 break;
285 case MVT::i32:
286 NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
287 break;
288 case MVT::i16:
289 NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
290 break;
291 case MVT::i8:
292 NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
293 break;
294 }
295 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
296 AM.IndexReg), 0);
297 AM.IndexReg = Neg;
298 }
299
300 if (AM.IndexReg.getNode())
301 Index = AM.IndexReg;
302 else
303 Index = CurDAG->getRegister(0, VT);
304
305 // These are 32-bit even in 64-bit mode since RIP-relative offset
306 // is 32-bit.
307 if (AM.GV)
308 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
309 MVT::i32, AM.Disp,
310 AM.SymbolFlags);
311 else if (AM.CP)
312 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
313 AM.Disp, AM.SymbolFlags);
314 else if (AM.ES) {
315 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
316 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
317 } else if (AM.MCSym) {
318 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
319 assert(AM.SymbolFlags == 0 && "oo");
320 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
321 } else if (AM.JT != -1) {
322 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
323 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
324 } else if (AM.BlockAddr)
325 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
326 AM.SymbolFlags);
327 else
328 Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
329
330 if (AM.Segment.getNode())
331 Segment = AM.Segment;
332 else
333 Segment = CurDAG->getRegister(0, MVT::i16);
334 }
335
336 // Utility function to determine whether N is an AMX SDNode right after
337 // lowering but before ISEL.
338 bool isAMXSDNode(SDNode *N) const {
339 // Check if N is AMX SDNode:
340 // 1. check result type;
341 // 2. check operand type;
342 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
343 if (N->getValueType(Idx) == MVT::x86amx)
344 return true;
345 }
346 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
347 SDValue Op = N->getOperand(Idx);
348 if (Op.getValueType() == MVT::x86amx)
349 return true;
350 }
351 return false;
352 }
353
354 // Utility function to determine whether we should avoid selecting
355 // immediate forms of instructions for better code size.
356 // At a high level, we'd like to avoid such instructions when
357 // we have similar constants used within the same basic block
358 // that can be kept in a register.
359 //
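 // For example, when optimizing for size, if the same 32-bit constant is both
 // stored to memory and added to a register in one block, materializing it in
 // a register once is smaller than encoding the immediate twice.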
360 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
361 uint32_t UseCount = 0;
362
363 // Do not want to hoist if we're not optimizing for size.
364 // TODO: We'd like to remove this restriction.
365 // See the comment in X86InstrInfo.td for more info.
366 if (!CurDAG->shouldOptForSize())
367 return false;
368
369 // Walk all the users of the immediate.
370 for (const SDNode *User : N->users()) {
371 if (UseCount >= 2)
372 break;
373
374 // This user is already selected. Count it as a legitimate use and
375 // move on.
376 if (User->isMachineOpcode()) {
377 UseCount++;
378 continue;
379 }
380
381 // We want to count stores of immediates as real uses.
382 if (User->getOpcode() == ISD::STORE &&
383 User->getOperand(1).getNode() == N) {
384 UseCount++;
385 continue;
386 }
387
388 // We don't currently match users that have > 2 operands (except
389 // for stores, which are handled above).
390 // Those instructions won't match in ISEL, for now, and would
391 // be counted incorrectly.
392 // This may change in the future as we add additional instruction
393 // types.
394 if (User->getNumOperands() != 2)
395 continue;
396
397 // If this is a sign-extended 8-bit integer immediate used in an ALU
398 // instruction, there is probably an opcode encoding to save space.
399 auto *C = dyn_cast<ConstantSDNode>(N);
400 if (C && isInt<8>(C->getSExtValue()))
401 continue;
402
403 // Immediates that are used for offsets as part of stack
404 // manipulation should be left alone. These are typically
405 // used to indicate SP offsets for argument passing and
406 // will get pulled into stores/pushes (implicitly).
407 if (User->getOpcode() == X86ISD::ADD ||
408 User->getOpcode() == ISD::ADD ||
409 User->getOpcode() == X86ISD::SUB ||
410 User->getOpcode() == ISD::SUB) {
411
412 // Find the other operand of the add/sub.
413 SDValue OtherOp = User->getOperand(0);
414 if (OtherOp.getNode() == N)
415 OtherOp = User->getOperand(1);
416
417 // Don't count if the other operand is SP.
418 RegisterSDNode *RegNode;
419 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
420 (RegNode = dyn_cast_or_null<RegisterSDNode>(
421 OtherOp->getOperand(1).getNode())))
422 if ((RegNode->getReg() == X86::ESP) ||
423 (RegNode->getReg() == X86::RSP))
424 continue;
425 }
426
427 // ... otherwise, count this and move on.
428 UseCount++;
429 }
430
431 // If we have more than 1 use, then recommend for hoisting.
432 return (UseCount > 1);
433 }
434
435 /// Return a target constant with the specified value of type i8.
436 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
437 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
438 }
439
440 /// Return a target constant with the specified value, of type i32.
441 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
442 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
443 }
444
445 /// Return a target constant with the specified value, of type i64.
446 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
447 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
448 }
449
450 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
451 const SDLoc &DL) {
452 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
453 uint64_t Index = N->getConstantOperandVal(1);
454 MVT VecVT = N->getOperand(0).getSimpleValueType();
455 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
456 }
457
458 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
459 const SDLoc &DL) {
460 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
461 uint64_t Index = N->getConstantOperandVal(2);
462 MVT VecVT = N->getSimpleValueType(0);
463 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
464 }
465
466 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
467 const SDLoc &DL) {
468 assert(VecWidth == 128 && "Unexpected vector width");
469 uint64_t Index = N->getConstantOperandVal(2);
470 MVT VecVT = N->getSimpleValueType(0);
471 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
472 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
473 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
474 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
475 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
476 }
477
478 SDValue getSBBZero(SDNode *N) {
479 SDLoc dl(N);
480 MVT VT = N->getSimpleValueType(0);
481
482 // Create zero.
483 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
484 SDValue Zero =
485 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
486 if (VT == MVT::i64) {
487 Zero = SDValue(
488 CurDAG->getMachineNode(
489 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
490 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
491 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
492 0);
493 }
494
495 // Copy flags to the EFLAGS register and glue it to next node.
496 unsigned Opcode = N->getOpcode();
497 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
498 "Unexpected opcode for SBB materialization");
499 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
500 SDValue EFLAGS =
501 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
502 N->getOperand(FlagOpIndex), SDValue());
503
504 // Create a 64-bit instruction if the result is 64-bits otherwise use the
505 // 32-bit version.
506 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
507 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
508 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
509 return SDValue(
510 CurDAG->getMachineNode(Opc, dl, VTs,
511 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
512 0);
513 }
514
515 // Helper to detect unneeded AND instructions on shift amounts. Called
516 // from PatFrags in tablegen.
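 // For example, (shl X, (and Y, 31)) on a 32-bit shift does not need the AND,
 // because only the low 5 bits of the shift amount are consumed anyway.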
517 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
518 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
519 const APInt &Val = N->getConstantOperandAPInt(1);
520
521 if (Val.countr_one() >= Width)
522 return true;
523
524 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
525 return Mask.countr_one() >= Width;
526 }
527
528 /// Return an SDNode that returns the value of the global base register.
529 /// Output instructions required to initialize the global base register,
530 /// if necessary.
531 SDNode *getGlobalBaseReg();
532
533 /// Return a reference to the TargetMachine, casted to the target-specific
534 /// type.
535 const X86TargetMachine &getTargetMachine() const {
536 return static_cast<const X86TargetMachine &>(TM);
537 }
538
539 /// Return a reference to the TargetInstrInfo, casted to the target-specific
540 /// type.
541 const X86InstrInfo *getInstrInfo() const {
542 return Subtarget->getInstrInfo();
543 }
544
545 /// Return a condition code of the given SDNode
546 X86::CondCode getCondFromNode(SDNode *N) const;
547
548 /// Address-mode matching performs shift-of-and to and-of-shift
549 /// reassociation in order to expose more scaled addressing
550 /// opportunities.
551 bool ComplexPatternFuncMutatesDAG() const override {
552 return true;
553 }
554
555 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
556
557 // Indicates we should prefer to use a non-temporal load for this load.
558 bool useNonTemporalLoad(LoadSDNode *N) const {
559 if (!N->isNonTemporal())
560 return false;
561
562 unsigned StoreSize = N->getMemoryVT().getStoreSize();
563
564 if (N->getAlign().value() < StoreSize)
565 return false;
566
567 switch (StoreSize) {
568 default: llvm_unreachable("Unsupported store size");
569 case 4:
570 case 8:
571 return false;
572 case 16:
573 return Subtarget->hasSSE41();
574 case 32:
575 return Subtarget->hasAVX2();
576 case 64:
577 return Subtarget->hasAVX512();
578 }
579 }
580
581 bool foldLoadStoreIntoMemOperand(SDNode *Node);
582 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
583 bool matchBitExtract(SDNode *Node);
584 bool shrinkAndImmediate(SDNode *N);
585 bool isMaskZeroExtended(SDNode *N) const;
586 bool tryShiftAmountMod(SDNode *N);
587 bool tryShrinkShlLogicImm(SDNode *N);
588 bool tryVPTERNLOG(SDNode *N);
589 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
590 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
591 uint8_t Imm);
592 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
593 bool tryMatchBitSelect(SDNode *N);
594
595 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
596 const SDLoc &dl, MVT VT, SDNode *Node);
597 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
598 const SDLoc &dl, MVT VT, SDNode *Node,
599 SDValue &InGlue);
600
601 bool tryOptimizeRem8Extend(SDNode *N);
602
603 bool onlyUsesZeroFlag(SDValue Flags) const;
604 bool hasNoSignFlagUses(SDValue Flags) const;
605 bool hasNoCarryFlagUses(SDValue Flags) const;
606 };
607
608 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
609 public:
610 static char ID;
611 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
612 CodeGenOptLevel OptLevel)
613 : SelectionDAGISelLegacy(
614 ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
615 };
616}
617
618char X86DAGToDAGISelLegacy::ID = 0;
619
620INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
621
622// Returns true if this masked compare can be implemented legally with this
623// type.
624static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
625 unsigned Opcode = N->getOpcode();
626 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
627 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
628 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
629 // We can get 256-bit 8 element types here without VLX being enabled. When
630 // this happens we will use 512-bit operations and the mask will not be
631 // zero extended.
632 EVT OpVT = N->getOperand(0).getValueType();
633 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
634 // second operand.
635 if (Opcode == X86ISD::STRICT_CMPM)
636 OpVT = N->getOperand(1).getValueType();
637 if (OpVT.is256BitVector() || OpVT.is128BitVector())
638 return Subtarget->hasVLX();
639
640 return true;
641 }
642 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
643 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
644 Opcode == X86ISD::FSETCCM_SAE)
645 return true;
646
647 return false;
648}
649
650// Returns true if we can assume the writer of the mask has zero extended it
651// for us.
652bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
653 // If this is an AND, check if we have a compare on either side. As long as
654 // one side guarantees the mask is zero extended, the AND will preserve those
655 // zeros.
656 if (N->getOpcode() == ISD::AND)
657 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
658 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
659
660 return isLegalMaskCompare(N, Subtarget);
661}
662
663bool
664X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
665 if (OptLevel == CodeGenOptLevel::None)
666 return false;
667
668 if (!N.hasOneUse())
669 return false;
670
671 if (N.getOpcode() != ISD::LOAD)
672 return true;
673
674 // Don't fold non-temporal loads if we have an instruction for them.
675 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
676 return false;
677
678 // If N is a load, do additional profitability checks.
679 if (U == Root) {
680 switch (U->getOpcode()) {
681 default: break;
682 case X86ISD::ADD:
683 case X86ISD::ADC:
684 case X86ISD::SUB:
685 case X86ISD::SBB:
686 case X86ISD::AND:
687 case X86ISD::XOR:
688 case X86ISD::OR:
689 case ISD::ADD:
690 case ISD::UADDO_CARRY:
691 case ISD::AND:
692 case ISD::OR:
693 case ISD::XOR: {
694 SDValue Op1 = U->getOperand(1);
695
696 // If the other operand is an 8-bit immediate we should fold the immediate
697 // instead. This reduces code size.
698 // e.g.
699 // movl 4(%esp), %eax
700 // addl $4, %eax
701 // vs.
702 // movl $4, %eax
703 // addl 4(%esp), %eax
704 // The former is 2 bytes shorter. In the case where the increment is 1,
705 // the saving can be 4 bytes (by using incl %eax).
706 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
707 if (Imm->getAPIntValue().isSignedIntN(8))
708 return false;
709
710 // If this is a 64-bit AND with an immediate that fits in 32-bits,
711 // prefer using the smaller AND over folding the load. This is needed to
712 // make sure immediates created by shrinkAndImmediate are always folded.
713 // Ideally we would narrow the load during DAG combine and get the
714 // best of both worlds.
715 if (U->getOpcode() == ISD::AND &&
716 Imm->getAPIntValue().getBitWidth() == 64 &&
717 Imm->getAPIntValue().isIntN(32))
718 return false;
719
720 // If this is really a zext_inreg that can be represented with a movzx
721 // instruction, prefer that.
722 // TODO: We could shrink the load and fold if it is non-volatile.
723 if (U->getOpcode() == ISD::AND &&
724 (Imm->getAPIntValue() == UINT8_MAX ||
725 Imm->getAPIntValue() == UINT16_MAX ||
726 Imm->getAPIntValue() == UINT32_MAX))
727 return false;
728
729 // ADD/SUB can negate the immediate and use the opposite operation
730 // to fit 128 into a sign-extended 8-bit immediate.
731 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
732 (-Imm->getAPIntValue()).isSignedIntN(8))
733 return false;
734
735 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
736 (-Imm->getAPIntValue()).isSignedIntN(8) &&
737 hasNoCarryFlagUses(SDValue(U, 1)))
738 return false;
739 }
740
741 // If the other operand is a TLS address, we should fold it instead.
742 // This produces
743 // movl %gs:0, %eax
744 // leal i@NTPOFF(%eax), %eax
745 // instead of
746 // movl $i@NTPOFF, %eax
747 // addl %gs:0, %eax
748 // if the block also has an access to a second TLS address this will save
749 // a load.
750 // FIXME: This is probably also true for non-TLS addresses.
751 if (Op1.getOpcode() == X86ISD::Wrapper) {
752 SDValue Val = Op1.getOperand(0);
753 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
754 return false;
755 }
756
757 // Don't fold load if this matches the BTS/BTR/BTC patterns.
758 // BTS: (or X, (shl 1, n))
759 // BTR: (and X, (rotl -2, n))
760 // BTC: (xor X, (shl 1, n))
761 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
762 if (U->getOperand(0).getOpcode() == ISD::SHL &&
763 isOneConstant(U->getOperand(0).getOperand(0)))
764 return false;
765
766 if (U->getOperand(1).getOpcode() == ISD::SHL &&
767 isOneConstant(U->getOperand(1).getOperand(0)))
768 return false;
769 }
770 if (U->getOpcode() == ISD::AND) {
771 SDValue U0 = U->getOperand(0);
772 SDValue U1 = U->getOperand(1);
773 if (U0.getOpcode() == ISD::ROTL) {
774 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
775 if (C && C->getSExtValue() == -2)
776 return false;
777 }
778
779 if (U1.getOpcode() == ISD::ROTL) {
780 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
781 if (C && C->getSExtValue() == -2)
782 return false;
783 }
784 }
785
786 break;
787 }
788 case ISD::SHL:
789 case ISD::SRA:
790 case ISD::SRL:
791 // Don't fold a load into a shift by immediate. The BMI2 instructions
792 // support folding a load, but not an immediate. The legacy instructions
793 // support folding an immediate, but can't fold a load. Folding an
794 // immediate is preferable to folding a load.
795 if (isa<ConstantSDNode>(U->getOperand(1)))
796 return false;
797
798 break;
799 }
800 }
801
802 // Prevent folding a load if this can be implemented with an insert_subreg or
803 // a move that implicitly zeroes.
804 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
805 isNullConstant(Root->getOperand(2)) &&
806 (Root->getOperand(0).isUndef() ||
807 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
808 return false;
809
810 return true;
811}
812
813// Indicates it is profitable to form an AVX512 masked operation. Returning
814 // false will favor a masked register-register move or vblendm and the
815// operation will be selected separately.
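// For example, folding (add X, Y) into a masked add for
// (vselect M, (add X, Y), X) is only worthwhile when the add has no other
// users; otherwise the add would be duplicated and a separate blend is cheaper.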
816bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
817 assert(
818 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
819 "Unexpected opcode!");
820
821 // If the operation has additional users, the operation will be duplicated.
822 // Check the use count to prevent that.
823 // FIXME: Are there cheap opcodes we might want to duplicate?
824 return N->getOperand(1).hasOneUse();
825}
826
827/// Replace the original chain operand of the call with
828/// load's chain operand and move load below the call's chain operand.
829static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
830 SDValue Call, SDValue OrigChain) {
831 SmallVector<SDValue, 8> Ops;
832 SDValue Chain = OrigChain.getOperand(0);
833 if (Chain.getNode() == Load.getNode())
834 Ops.push_back(Load.getOperand(0));
835 else {
836 assert(Chain.getOpcode() == ISD::TokenFactor &&
837 "Unexpected chain operand");
838 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
839 if (Chain.getOperand(i).getNode() == Load.getNode())
840 Ops.push_back(Load.getOperand(0));
841 else
842 Ops.push_back(Chain.getOperand(i));
843 SDValue NewChain =
844 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
845 Ops.clear();
846 Ops.push_back(NewChain);
847 }
848 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
849 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
850 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
851 Load.getOperand(1), Load.getOperand(2));
852
853 Ops.clear();
854 Ops.push_back(SDValue(Load.getNode(), 1));
855 Ops.append(Call->op_begin() + 1, Call->op_end());
856 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
857}
858
859/// Return true if call address is a load and it can be
860/// moved below CALLSEQ_START and the chains leading up to the call.
861/// Return the CALLSEQ_START by reference as a second output.
862/// In the case of a tail call, there isn't a callseq node between the call
863/// chain and the load.
864static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
865 // The transformation is somewhat dangerous if the call's chain was glued to
866 // the call. After MoveBelowOrigChain the load is moved between the call and
867 // the chain, this can create a cycle if the load is not folded. So it is
868 // *really* important that we are sure the load will be folded.
869 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
870 return false;
871 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
872 if (!LD ||
873 !LD->isSimple() ||
874 LD->getAddressingMode() != ISD::UNINDEXED ||
875 LD->getExtensionType() != ISD::NON_EXTLOAD)
876 return false;
877
878 // Now let's find the callseq_start.
879 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
880 if (!Chain.hasOneUse())
881 return false;
882 Chain = Chain.getOperand(0);
883 }
884
885 if (!Chain.getNumOperands())
886 return false;
887 // Since we are not checking for AA here, conservatively abort if the chain
888 // writes to memory. It's not safe to move the callee (a load) across a store.
889 if (isa<MemSDNode>(Chain.getNode()) &&
890 cast<MemSDNode>(Chain.getNode())->writeMem())
891 return false;
892 if (Chain.getOperand(0).getNode() == Callee.getNode())
893 return true;
894 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
895 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
896 Callee.getValue(1).hasOneUse())
897 return true;
898 return false;
899}
900
901static bool isEndbrImm64(uint64_t Imm) {
902// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
903// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
904 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
905 return false;
906
907 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
908 0x65, 0x66, 0x67, 0xf0, 0xf2};
909 int i = 24; // the low 24 bits (0x0F1EFA) have already been matched
910 while (i < 64) {
911 uint8_t Byte = (Imm >> i) & 0xFF;
912 if (Byte == 0xF3)
913 return true;
914 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
915 return false;
916 i += 8;
917 }
918
919 return false;
920}
921
922static bool needBWI(MVT VT) {
923 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
924}
925
926void X86DAGToDAGISel::PreprocessISelDAG() {
927 bool MadeChange = false;
928 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
929 E = CurDAG->allnodes_end(); I != E; ) {
930 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
931
932 // This is for CET enhancement.
933 //
934 // ENDBR32 and ENDBR64 have specific opcodes:
935 // ENDBR32: F3 0F 1E FB
936 // ENDBR64: F3 0F 1E FA
937 // We want to prevent attackers from finding unintended ENDBR32/64
938 // opcode matches in the binary.
939 // Here’s an example:
940 // If the compiler had to generate asm for the following code:
941 // a = 0xF30F1EFA
942 // it could, for example, generate:
943 // mov 0xF30F1EFA, dword ptr[a]
944 // In such a case, the binary would include a gadget that starts
945 // with a fake ENDBR64 opcode. Therefore, we split such generation
946 // into multiple operations so the pattern does not appear in the binary.
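 // Concretely, the constant C below is replaced by NOT(~C): the stored
 // immediate becomes the bitwise complement, which no longer contains the
 // ENDBR byte sequence, and the NOT is materialized as a separate instruction.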
947 if (N->getOpcode() == ISD::Constant) {
948 MVT VT = N->getSimpleValueType(0);
949 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
950 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
951 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
952 // Check that the cf-protection-branch is enabled.
953 Metadata *CFProtectionBranch =
955 "cf-protection-branch");
956 if (CFProtectionBranch || IndirectBranchTracking) {
957 SDLoc dl(N);
958 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
959 Complement = CurDAG->getNOT(dl, Complement, VT);
960 --I;
961 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
962 ++I;
963 MadeChange = true;
964 continue;
965 }
966 }
967 }
968
969 // If this is a target specific AND node with no flag usages, turn it back
970 // into ISD::AND to enable test instruction matching.
971 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
972 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
973 N->getOperand(0), N->getOperand(1));
974 --I;
975 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
976 ++I;
977 MadeChange = true;
978 continue;
979 }
980
981 // Convert vector increment or decrement to sub/add with an all-ones
982 // constant:
983 // add X, <1, 1...> --> sub X, <-1, -1...>
984 // sub X, <1, 1...> --> add X, <-1, -1...>
985 // The all-ones vector constant can be materialized using a pcmpeq
986 // instruction that is commonly recognized as an idiom (has no register
987 // dependency), so that's better/smaller than loading a splat 1 constant.
988 //
989 // But don't do this if it would inhibit a potentially profitable load
990 // folding opportunity for the other operand. That only occurs with the
991 // intersection of:
992 // (1) The other operand (op0) is load foldable.
993 // (2) The op is an add (otherwise, we are *creating* an add and can still
994 // load fold the other op).
995 // (3) The target has AVX (otherwise, we have a destructive add and can't
996 // load fold the other op without killing the constant op).
997 // (4) The constant 1 vector has multiple uses (so it is profitable to load
998 // into a register anyway).
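 // For example, a v4i32 "add X, <1,1,1,1>" becomes "sub X, <-1,-1,-1,-1>",
 // with the all-ones operand materialized by a pcmpeq idiom.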
999 auto mayPreventLoadFold = [&]() {
1000 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1001 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1002 !N->getOperand(1).hasOneUse();
1003 };
1004 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1005 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1006 APInt SplatVal;
1007 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1008 SplatVal.isOne()) {
1009 SDLoc DL(N);
1010
1011 MVT VT = N->getSimpleValueType(0);
1012 unsigned NumElts = VT.getSizeInBits() / 32;
1013 SDValue AllOnes =
1014 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1015 AllOnes = CurDAG->getBitcast(VT, AllOnes);
1016
1017 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1018 SDValue Res =
1019 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1020 --I;
1021 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1022 ++I;
1023 MadeChange = true;
1024 continue;
1025 }
1026 }
1027
1028 switch (N->getOpcode()) {
1029 case X86ISD::VBROADCAST: {
1030 MVT VT = N->getSimpleValueType(0);
1031 // Emulate v32i16/v64i8 broadcast without BWI.
1032 if (!Subtarget->hasBWI() && needBWI(VT)) {
1033 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1034 SDLoc dl(N);
1035 SDValue NarrowBCast =
1036 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1037 SDValue Res =
1038 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1039 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1040 unsigned Index = NarrowVT.getVectorMinNumElements();
1041 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1042 CurDAG->getIntPtrConstant(Index, dl));
1043
1044 --I;
1045 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1046 ++I;
1047 MadeChange = true;
1048 continue;
1049 }
1050
1051 break;
1052 }
1053 case X86ISD::VBROADCAST_LOAD: {
1054 MVT VT = N->getSimpleValueType(0);
1055 // Emulate v32i16/v64i8 broadcast without BWI.
1056 if (!Subtarget->hasBWI() && needBWI(VT)) {
1057 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1058 auto *MemNode = cast<MemSDNode>(N);
1059 SDLoc dl(N);
1060 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1061 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1062 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1063 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1064 MemNode->getMemOperand());
1065 SDValue Res =
1066 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1067 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1068 unsigned Index = NarrowVT.getVectorMinNumElements();
1069 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1070 CurDAG->getIntPtrConstant(Index, dl));
1071
1072 --I;
1073 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1074 CurDAG->ReplaceAllUsesWith(N, To);
1075 ++I;
1076 MadeChange = true;
1077 continue;
1078 }
1079
1080 break;
1081 }
1082 case ISD::LOAD: {
1083 // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1084 // load, then just extract the lower subvector and avoid the second load.
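 // For example, if a v4f32 load and a v16f32 load share the same pointer and
 // chain, the v4f32 result is rebuilt as an EXTRACT_SUBVECTOR of the wider
 // load's value, and the narrow load becomes dead.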
1085 auto *Ld = cast<LoadSDNode>(N);
1086 MVT VT = N->getSimpleValueType(0);
1087 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1088 !(VT.is128BitVector() || VT.is256BitVector()))
1089 break;
1090
1091 MVT MaxVT = VT;
1092 SDNode *MaxLd = nullptr;
1093 SDValue Ptr = Ld->getBasePtr();
1094 SDValue Chain = Ld->getChain();
1095 for (SDNode *User : Ptr->users()) {
1096 auto *UserLd = dyn_cast<LoadSDNode>(User);
1097 MVT UserVT = User->getSimpleValueType(0);
1098 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1099 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1100 !User->hasAnyUseOfValue(1) &&
1101 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1102 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1103 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1104 MaxLd = User;
1105 MaxVT = UserVT;
1106 }
1107 }
1108 if (MaxLd) {
1109 SDLoc dl(N);
1110 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1111 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1112 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1113 SDValue(MaxLd, 0),
1114 CurDAG->getIntPtrConstant(0, dl));
1115 SDValue Res = CurDAG->getBitcast(VT, Extract);
1116
1117 --I;
1118 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1119 CurDAG->ReplaceAllUsesWith(N, To);
1120 ++I;
1121 MadeChange = true;
1122 continue;
1123 }
1124 break;
1125 }
1126 case ISD::VSELECT: {
1127 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1128 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1129 if (EleVT == MVT::i1)
1130 break;
1131
1132 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1133 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1134 "We can't replace VSELECT with BLENDV in vXi16!");
1135 SDValue R;
1136 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1137 EleVT.getSizeInBits()) {
1138 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1139 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1140 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1141 } else {
1142 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1143 N->getOperand(0), N->getOperand(1),
1144 N->getOperand(2));
1145 }
1146 --I;
1147 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1148 ++I;
1149 MadeChange = true;
1150 continue;
1151 }
1152 case ISD::FP_ROUND:
1153 case ISD::STRICT_FP_ROUND:
1154 case ISD::FP_TO_SINT:
1155 case ISD::FP_TO_UINT:
1156 case ISD::STRICT_FP_TO_SINT:
1157 case ISD::STRICT_FP_TO_UINT: {
1158 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1159 // don't need 2 sets of patterns.
1160 if (!N->getSimpleValueType(0).isVector())
1161 break;
1162
1163 unsigned NewOpc;
1164 switch (N->getOpcode()) {
1165 default: llvm_unreachable("Unexpected opcode!");
1166 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1167 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1168 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1169 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1170 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1171 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1172 }
1173 SDValue Res;
1174 if (N->isStrictFPOpcode())
1175 Res =
1176 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1177 {N->getOperand(0), N->getOperand(1)});
1178 else
1179 Res =
1180 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1181 N->getOperand(0));
1182 --I;
1183 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1184 ++I;
1185 MadeChange = true;
1186 continue;
1187 }
1188 case ISD::SHL:
1189 case ISD::SRA:
1190 case ISD::SRL: {
1191 // Replace vector shifts with their X86 specific equivalent so we don't
1192 // need 2 sets of patterns.
1193 if (!N->getValueType(0).isVector())
1194 break;
1195
1196 unsigned NewOpc;
1197 switch (N->getOpcode()) {
1198 default: llvm_unreachable("Unexpected opcode!");
1199 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1200 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1201 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1202 }
1203 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1204 N->getOperand(0), N->getOperand(1));
1205 --I;
1206 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1207 ++I;
1208 MadeChange = true;
1209 continue;
1210 }
1211 case ISD::ANY_EXTEND:
1212 case ISD::ANY_EXTEND_VECTOR_INREG: {
1213 // Replace vector any extend with the zero extend equivalents so we don't
1214 // need 2 sets of patterns. Ignore vXi1 extensions.
1215 if (!N->getValueType(0).isVector())
1216 break;
1217
1218 unsigned NewOpc;
1219 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1220 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1221 "Unexpected opcode for mask vector!");
1222 NewOpc = ISD::SIGN_EXTEND;
1223 } else {
1224 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1225 ? ISD::ZERO_EXTEND
1226 : ISD::ZERO_EXTEND_VECTOR_INREG;
1227 }
1228
1229 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1230 N->getOperand(0));
1231 --I;
1232 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1233 ++I;
1234 MadeChange = true;
1235 continue;
1236 }
1237 case ISD::FCEIL:
1238 case ISD::STRICT_FCEIL:
1239 case ISD::FFLOOR:
1240 case ISD::STRICT_FFLOOR:
1241 case ISD::FTRUNC:
1242 case ISD::STRICT_FTRUNC:
1243 case ISD::FROUNDEVEN:
1244 case ISD::STRICT_FROUNDEVEN:
1245 case ISD::FNEARBYINT:
1246 case ISD::STRICT_FNEARBYINT:
1247 case ISD::FRINT:
1248 case ISD::STRICT_FRINT: {
1249 // Replace fp rounding with their X86 specific equivalent so we don't
1250 // need 2 sets of patterns.
1251 unsigned Imm;
1252 switch (N->getOpcode()) {
1253 default: llvm_unreachable("Unexpected opcode!");
1254 case ISD::STRICT_FCEIL:
1255 case ISD::FCEIL: Imm = 0xA; break;
1256 case ISD::STRICT_FFLOOR:
1257 case ISD::FFLOOR: Imm = 0x9; break;
1258 case ISD::STRICT_FTRUNC:
1259 case ISD::FTRUNC: Imm = 0xB; break;
1260 case ISD::STRICT_FROUNDEVEN:
1261 case ISD::FROUNDEVEN: Imm = 0x8; break;
1262 case ISD::STRICT_FNEARBYINT:
1263 case ISD::FNEARBYINT: Imm = 0xC; break;
1264 case ISD::STRICT_FRINT:
1265 case ISD::FRINT: Imm = 0x4; break;
1266 }
1267 SDLoc dl(N);
1268 bool IsStrict = N->isStrictFPOpcode();
1269 SDValue Res;
1270 if (IsStrict)
1271 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1272 {N->getValueType(0), MVT::Other},
1273 {N->getOperand(0), N->getOperand(1),
1274 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1275 else
1276 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1277 N->getOperand(0),
1278 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1279 --I;
1280 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1281 ++I;
1282 MadeChange = true;
1283 continue;
1284 }
1285 case X86ISD::FANDN:
1286 case X86ISD::FAND:
1287 case X86ISD::FOR:
1288 case X86ISD::FXOR: {
1289 // Widen scalar fp logic ops to vector to reduce isel patterns.
1290 // FIXME: Can we do this during lowering/combine.
1291 MVT VT = N->getSimpleValueType(0);
1292 if (VT.isVector() || VT == MVT::f128)
1293 break;
1294
1295 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1296 : VT == MVT::f32 ? MVT::v4f32
1297 : MVT::v8f16;
1298
1299 SDLoc dl(N);
1300 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1301 N->getOperand(0));
1302 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1303 N->getOperand(1));
1304
1305 SDValue Res;
1306 if (Subtarget->hasSSE2()) {
1307 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1308 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1309 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1310 unsigned Opc;
1311 switch (N->getOpcode()) {
1312 default: llvm_unreachable("Unexpected opcode!");
1313 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1314 case X86ISD::FAND: Opc = ISD::AND; break;
1315 case X86ISD::FOR: Opc = ISD::OR; break;
1316 case X86ISD::FXOR: Opc = ISD::XOR; break;
1317 }
1318 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1319 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1320 } else {
1321 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1322 }
1323 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1324 CurDAG->getIntPtrConstant(0, dl));
1325 --I;
1326 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1327 ++I;
1328 MadeChange = true;
1329 continue;
1330 }
1331 }
1332
1333 if (OptLevel != CodeGenOptLevel::None &&
1334 // Only do this when the target can fold the load into the call or
1335 // jmp.
1336 !Subtarget->useIndirectThunkCalls() &&
1337 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1338 (N->getOpcode() == X86ISD::TC_RETURN &&
1339 (Subtarget->is64Bit() ||
1340 !getTargetMachine().isPositionIndependent())))) {
1341 /// Also try moving call address load from outside callseq_start to just
1342 /// before the call to allow it to be folded.
1343 ///
1344 /// [Load chain]
1345 /// ^
1346 /// |
1347 /// [Load]
1348 /// ^ ^
1349 /// | |
1350 /// / \--
1351 /// / |
1352 ///[CALLSEQ_START] |
1353 /// ^ |
1354 /// | |
1355 /// [LOAD/C2Reg] |
1356 /// | |
1357 /// \ /
1358 /// \ /
1359 /// [CALL]
1360 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1361 SDValue Chain = N->getOperand(0);
1362 SDValue Load = N->getOperand(1);
1363 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1364 continue;
1365 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1366 ++NumLoadMoved;
1367 MadeChange = true;
1368 continue;
1369 }
1370
1371 // Lower fpround and fpextend nodes that target the FP stack to a store and
1372 // load through the stack. This is a gross hack. We would like to simply mark
1373 // these as being illegal, but when we do that, legalize produces these when
1374 // it expands calls, then expands these in the same legalize pass. We would
1375 // like dag combine to be able to hack on these between the call expansion
1376 // and the node legalization. As such this pass basically does "really
1377 // late" legalization of these inline with the X86 isel pass.
1378 // FIXME: This should only happen when not compiled with -O0.
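 // For example, an f80 -> f32 FP_ROUND is rewritten here as a truncating
 // store of the x87 value to a stack temporary followed by an f32 load of
 // that slot.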
1379 switch (N->getOpcode()) {
1380 default: continue;
1381 case ISD::FP_ROUND:
1382 case ISD::FP_EXTEND:
1383 {
1384 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1385 MVT DstVT = N->getSimpleValueType(0);
1386
1387 // If any of the sources are vectors, no fp stack involved.
1388 if (SrcVT.isVector() || DstVT.isVector())
1389 continue;
1390
1391 // If the source and destination are SSE registers, then this is a legal
1392 // conversion that should not be lowered.
1393 const X86TargetLowering *X86Lowering =
1394 static_cast<const X86TargetLowering *>(TLI);
1395 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1396 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1397 if (SrcIsSSE && DstIsSSE)
1398 continue;
1399
1400 if (!SrcIsSSE && !DstIsSSE) {
1401 // If this is an FPStack extension, it is a noop.
1402 if (N->getOpcode() == ISD::FP_EXTEND)
1403 continue;
1404 // If this is a value-preserving FPStack truncation, it is a noop.
1405 if (N->getConstantOperandVal(1))
1406 continue;
1407 }
1408
1409 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1410 // FPStack has extload and truncstore. SSE can fold direct loads into other
1411 // operations. Based on this, decide what we want to do.
1412 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1413 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1414 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1415 MachinePointerInfo MPI =
1416 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1417 SDLoc dl(N);
1418
1419 // FIXME: optimize the case where the src/dest is a load or store?
1420
1421 SDValue Store = CurDAG->getTruncStore(
1422 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1423 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1424 MemTmp, MPI, MemVT);
1425
1426 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1427 // extload we created. This will cause general havoc on the dag because
1428 // anything below the conversion could be folded into other existing nodes.
1429 // To avoid invalidating 'I', back it up to the convert node.
1430 --I;
1431 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1432 break;
1433 }
1434
1435 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1436 // dealing with the chain differently, as there is already a preexisting chain.
1437 case ISD::STRICT_FP_ROUND:
1438 case ISD::STRICT_FP_EXTEND:
1439 {
1440 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1441 MVT DstVT = N->getSimpleValueType(0);
1442
1443 // If any of the sources are vectors, no fp stack involved.
1444 if (SrcVT.isVector() || DstVT.isVector())
1445 continue;
1446
1447 // If the source and destination are SSE registers, then this is a legal
1448 // conversion that should not be lowered.
1449 const X86TargetLowering *X86Lowering =
1450 static_cast<const X86TargetLowering *>(TLI);
1451 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1452 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1453 if (SrcIsSSE && DstIsSSE)
1454 continue;
1455
1456 if (!SrcIsSSE && !DstIsSSE) {
1457 // If this is an FPStack extension, it is a noop.
1458 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1459 continue;
1460 // If this is a value-preserving FPStack truncation, it is a noop.
1461 if (N->getConstantOperandVal(2))
1462 continue;
1463 }
1464
1465 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1466 // FPStack has extload and truncstore. SSE can fold direct loads into other
1467 // operations. Based on this, decide what we want to do.
1468 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1469 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1470 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1471 MachinePointerInfo MPI =
1472 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1473 SDLoc dl(N);
1474
1475 // FIXME: optimize the case where the src/dest is a load or store?
1476
1477 // Since the operation is StrictFP, use the preexisting chain.
1478 SDValue Store, Result;
1479 if (!SrcIsSSE) {
1480 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1481 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1482 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1483 MPI, /*Align*/ std::nullopt,
1484 MachineMemOperand::MOStore);
1485 if (N->getFlags().hasNoFPExcept()) {
1486 SDNodeFlags Flags = Store->getFlags();
1487 Flags.setNoFPExcept(true);
1488 Store->setFlags(Flags);
1489 }
1490 } else {
1491 assert(SrcVT == MemVT && "Unexpected VT!");
1492 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1493 MPI);
1494 }
1495
1496 if (!DstIsSSE) {
1497 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1498 SDValue Ops[] = {Store, MemTmp};
1499 Result = CurDAG->getMemIntrinsicNode(
1500 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1501 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1502 if (N->getFlags().hasNoFPExcept()) {
1503 SDNodeFlags Flags = Result->getFlags();
1504 Flags.setNoFPExcept(true);
1505 Result->setFlags(Flags);
1506 }
1507 } else {
1508 assert(DstVT == MemVT && "Unexpected VT!");
1509 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1510 }
1511
1512 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1513 // extload we created. This will cause general havoc on the dag because
1514 // anything below the conversion could be folded into other existing nodes.
1515 // To avoid invalidating 'I', back it up to the convert node.
1516 --I;
1517 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1518 break;
1519 }
1520 }
1521
1522
1523 // Now that we did that, the node is dead. Increment the iterator to the
1524 // next node to process, then delete N.
1525 ++I;
1526 MadeChange = true;
1527 }
1528
1529 // Remove any dead nodes that may have been left behind.
1530 if (MadeChange)
1531 CurDAG->RemoveDeadNodes();
1532}
1533
1534// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
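// For example, (MOVZX32rr8 (EXTRACT_SUBREG (MOVZX32rr8_NOREX x), sub_8bit))
// already has its upper bits cleared by the inner extend, so the outer extend
// is dropped (or, for MOVSX64rr8, replaced by a 32->64 MOVSX).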
1535bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1536 unsigned Opc = N->getMachineOpcode();
1537 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1538 Opc != X86::MOVSX64rr8)
1539 return false;
1540
1541 SDValue N0 = N->getOperand(0);
1542
1543 // We need to be extracting the lower bits of an extend.
1544 if (!N0.isMachineOpcode() ||
1545 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1546 N0.getConstantOperandVal(1) != X86::sub_8bit)
1547 return false;
1548
1549 // We're looking for either a movsx or movzx to match the original opcode.
1550 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1551 : X86::MOVSX32rr8_NOREX;
1552 SDValue N00 = N0.getOperand(0);
1553 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1554 return false;
1555
1556 if (Opc == X86::MOVSX64rr8) {
1557 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1558 // to 64.
1559 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1560 MVT::i64, N00);
1561 ReplaceUses(N, Extend);
1562 } else {
1563 // Ok we can drop this extend and just use the original extend.
1564 ReplaceUses(N, N00.getNode());
1565 }
1566
1567 return true;
1568}
1569
1570void X86DAGToDAGISel::PostprocessISelDAG() {
1571 // Skip peepholes at -O0.
1572 if (TM.getOptLevel() == CodeGenOptLevel::None)
1573 return;
1574
1575 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1576
1577 bool MadeChange = false;
1578 while (Position != CurDAG->allnodes_begin()) {
1579 SDNode *N = &*--Position;
1580 // Skip dead nodes and any non-machine opcodes.
1581 if (N->use_empty() || !N->isMachineOpcode())
1582 continue;
1583
1584 if (tryOptimizeRem8Extend(N)) {
1585 MadeChange = true;
1586 continue;
1587 }
1588
1589 unsigned Opc = N->getMachineOpcode();
1590 switch (Opc) {
1591 default:
1592 continue;
1593 // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1594 case X86::TEST8rr:
1595 case X86::TEST16rr:
1596 case X86::TEST32rr:
1597 case X86::TEST64rr:
1598 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1599 case X86::CTEST8rr:
1600 case X86::CTEST16rr:
1601 case X86::CTEST32rr:
1602 case X86::CTEST64rr: {
1603 auto &Op0 = N->getOperand(0);
1604 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1605 !Op0.isMachineOpcode())
1606 continue;
1607 SDValue And = N->getOperand(0);
1608#define CASE_ND(OP) \
1609 case X86::OP: \
1610 case X86::OP##_ND:
1611 switch (And.getMachineOpcode()) {
1612 default:
1613 continue;
1614 CASE_ND(AND8rr)
1615 CASE_ND(AND16rr)
1616 CASE_ND(AND32rr)
1617 CASE_ND(AND64rr) {
1618 if (And->hasAnyUseOfValue(1))
1619 continue;
1620 SmallVector<SDValue> Ops(N->op_values());
1621 Ops[0] = And.getOperand(0);
1622 Ops[1] = And.getOperand(1);
1623 MachineSDNode *Test =
1624 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1625 ReplaceUses(N, Test);
1626 MadeChange = true;
1627 continue;
1628 }
1629 CASE_ND(AND8rm)
1630 CASE_ND(AND16rm)
1631 CASE_ND(AND32rm)
1632 CASE_ND(AND64rm) {
1633 if (And->hasAnyUseOfValue(1))
1634 continue;
1635 unsigned NewOpc;
1636 bool IsCTESTCC = X86::isCTESTCC(Opc);
1637#define FROM_TO(A, B) \
1638 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1639 break;
1640 switch (And.getMachineOpcode()) {
1641 FROM_TO(AND8rm, TEST8mr);
1642 FROM_TO(AND16rm, TEST16mr);
1643 FROM_TO(AND32rm, TEST32mr);
1644 FROM_TO(AND64rm, TEST64mr);
1645 }
1646#undef FROM_TO
1647#undef CASE_ND
1648 // Need to swap the memory and register operand.
1649 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1650 And.getOperand(3), And.getOperand(4),
1651 And.getOperand(5), And.getOperand(0)};
1652 // CC, Cflags.
1653 if (IsCTESTCC) {
1654 Ops.push_back(N->getOperand(2));
1655 Ops.push_back(N->getOperand(3));
1656 }
1657 // Chain of memory load
1658 Ops.push_back(And.getOperand(6));
1659 // Glue
1660 if (IsCTESTCC)
1661 Ops.push_back(N->getOperand(4));
1662
1663 MachineSDNode *Test = CurDAG->getMachineNode(
1664 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1665 CurDAG->setNodeMemRefs(
1666 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1667 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1668 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1669 MadeChange = true;
1670 continue;
1671 }
1672 }
1673 }
1674 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1675 // used. We're doing this late so we can prefer to fold the AND into masked
1676 // comparisons. Doing that can be better for the live range of the mask
1677 // register.
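// e.g. (illustrative):
//   %k = KANDWkk %k1, %k2 ; KORTESTWkk %k, %k
// becomes a single KTESTWkk %k1, %k2 when only the zero flag is consumed.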
1678 case X86::KORTESTBkk:
1679 case X86::KORTESTWkk:
1680 case X86::KORTESTDkk:
1681 case X86::KORTESTQkk: {
1682 SDValue Op0 = N->getOperand(0);
1683 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1684 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1685 continue;
1686#define CASE(A) \
1687 case X86::A: \
1688 break;
1689 switch (Op0.getMachineOpcode()) {
1690 default:
1691 continue;
1692 CASE(KANDBkk)
1693 CASE(KANDWkk)
1694 CASE(KANDDkk)
1695 CASE(KANDQkk)
1696 }
1697 unsigned NewOpc;
1698#define FROM_TO(A, B) \
1699 case X86::A: \
1700 NewOpc = X86::B; \
1701 break;
1702 switch (Opc) {
1703 FROM_TO(KORTESTBkk, KTESTBkk)
1704 FROM_TO(KORTESTWkk, KTESTWkk)
1705 FROM_TO(KORTESTDkk, KTESTDkk)
1706 FROM_TO(KORTESTQkk, KTESTQkk)
1707 }
1708 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1709 // KAND instructions and KTEST use the same ISA feature.
1710 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1711 continue;
1712#undef FROM_TO
1713 MachineSDNode *KTest = CurDAG->getMachineNode(
1714 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1715 ReplaceUses(N, KTest);
1716 MadeChange = true;
1717 continue;
1718 }
1719 // Attempt to remove vector moves that were inserted to zero upper bits.
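// An illustrative sketch of the pattern being cleaned up (opcodes chosen as an
// example):
//   t0: v4f32 = VADDPSrr ...          ; VEX-encoded, already zeroes upper bits
//   t1: v4f32 = VMOVAPSrr t0          ; redundant move
//   SUBREG_TO_REG 0, t1, sub_xmm      ; can use t0 directly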
1720 case TargetOpcode::SUBREG_TO_REG: {
1721 unsigned SubRegIdx = N->getConstantOperandVal(2);
1722 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1723 continue;
1724
1725 SDValue Move = N->getOperand(1);
1726 if (!Move.isMachineOpcode())
1727 continue;
1728
1729 // Make sure it's one of the move opcodes we recognize.
1730 switch (Move.getMachineOpcode()) {
1731 default:
1732 continue;
1733 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1734 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1735 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1736 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1737 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1738 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1739 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1740 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1741 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1742 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1743 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1744 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1745 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1746 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1747 }
1748#undef CASE
1749
1750 SDValue In = Move.getOperand(0);
1751 if (!In.isMachineOpcode() ||
1752 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1753 continue;
1754
1755 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1756 // legacy-encoded instructions such as SHA, which do not zero the upper bits.
1757 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1758 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1759 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1760 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1761 continue;
1762
1763 // The producing instruction is another vector instruction, so we can drop
1764 // the move.
1765 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1766 MadeChange = true;
1767 }
1768 }
1769 }
1770
1771 if (MadeChange)
1772 CurDAG->RemoveDeadNodes();
1773}
1774
1775
1776/// Emit any code that needs to be executed only in the main function.
1777void X86DAGToDAGISel::emitSpecialCodeForMain() {
1778 if (Subtarget->isTargetCygMing()) {
1779 TargetLowering::ArgListTy Args;
1780 auto &DL = CurDAG->getDataLayout();
1781
1782 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1783 CLI.setChain(CurDAG->getRoot())
1784 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1785 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1786 std::move(Args));
1787 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1788 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1789 CurDAG->setRoot(Result.second);
1790 }
1791}
1792
1793void X86DAGToDAGISel::emitFunctionEntryCode() {
1794 // If this is main, emit special code for main.
1795 const Function &F = MF->getFunction();
1796 if (F.hasExternalLinkage() && F.getName() == "main")
1797 emitSpecialCodeForMain();
1798}
1799
1800static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1801 // We can run into an issue where a frame index or a register base
1802 // includes a displacement that, when added to the explicit displacement,
1803 // will overflow the displacement field. Assuming that the
1804 // displacement fits into a 31-bit integer (which is only slightly more
1805 // aggressive than the current fundamental assumption that it fits into
1806 // a 32-bit integer), a 31-bit disp should always be safe.
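// For example (illustrative numbers), a displacement of 0x7fff0000 combined
// with a frame-object offset of 0x20000 would wrap a 32-bit field, so values
// outside [-2^30, 2^30) are rejected up front.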
1807 return isInt<31>(Val);
1808}
1809
1810bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1811 X86ISelAddressMode &AM) {
1812 // We may have already matched a displacement and the caller just added the
1813 // symbolic displacement. So we still need to do the checks even if Offset
1814 // is zero.
1815
1816 int64_t Val = AM.Disp + Offset;
1817
1818 // Cannot combine ExternalSymbol displacements with integer offsets.
1819 if (Val != 0 && (AM.ES || AM.MCSym))
1820 return true;
1821
1822 CodeModel::Model M = TM.getCodeModel();
1823 if (Subtarget->is64Bit()) {
1824 if (Val != 0 &&
1825 !X86::isOffsetSuitableForCodeModel(Val, M,
1826 AM.hasSymbolicDisplacement()))
1827 return true;
1828 // In addition to the checks required for a register base, check that
1829 // we do not try to use an unsafe Disp with a frame index.
1830 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1831 !isDispSafeForFrameIndexOrRegBase(Val))
1832 return true;
1833 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1834 // 64 bits. Instructions with 32-bit register addresses perform this zero
1835 // extension for us and we can safely ignore the high bits of Offset.
1836 // Instructions with only a 32-bit immediate address do not, though: they
1837 // sign extend instead. This means only the low 2GB of the address space
1838 // is directly addressable; we need indirect addressing for the high 2GB
1839 // of address space.
1840 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1841 // implicit zero extension of instructions would cover up any problem.
1842 // However, we have asserts elsewhere that get triggered if we do, so keep
1843 // the checks for now.
1844 // TODO: We would actually be able to accept these, as well as the same
1845 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1846 // to get an address size override to be emitted. However, this
1847 // pseudo-register is not part of any register class and therefore causes
1848 // MIR verification to fail.
1849 if (Subtarget->isTarget64BitILP32() &&
1850 !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) &&
1851 !AM.hasBaseOrIndexReg())
1852 return true;
1853 } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
1854 // For 32-bit X86, make sure the displacement still isn't close to the
1855 // expressible limit.
1856 return true;
1857 AM.Disp = Val;
1858 return false;
1859}
1860
1861bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1862 bool AllowSegmentRegForX32) {
1863 SDValue Address = N->getOperand(1);
1864
1865 // load gs:0 -> GS segment register.
1866 // load fs:0 -> FS segment register.
1867 //
1868 // This optimization is generally valid because the GNU TLS model defines that
1869 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1870 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1871 // zero-extended to 64 bits and then added to the base address, which gives
1872 // unwanted results when the register holds a negative value.
1873 // For more information see http://people.redhat.com/drepper/tls.pdf
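// e.g. (illustrative IR): a load from address-space 256 at address zero,
//   %self = load i64, ptr addrspace(256) null
// matches here and selects to "movq %gs:0, %reg" with AM.Segment = GS.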
1874 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1875 !IndirectTlsSegRefs &&
1876 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1877 Subtarget->isTargetFuchsia())) {
1878 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1879 return true;
1880 switch (N->getPointerInfo().getAddrSpace()) {
1881 case X86AS::GS:
1882 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1883 return false;
1884 case X86AS::FS:
1885 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1886 return false;
1887 // Address space X86AS::SS is not handled here, because it is not used to
1888 // address TLS areas.
1889 }
1890 }
1891
1892 return true;
1893}
1894
1895/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1896/// mode. These wrap things that will resolve down into a symbol reference.
1897/// If no match is possible, this returns true, otherwise it returns false.
1898bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1899 // If the addressing mode already has a symbol as the displacement, we can
1900 // never match another symbol.
1901 if (AM.hasSymbolicDisplacement())
1902 return true;
1903
1904 bool IsRIPRelTLS = false;
1905 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1906 if (IsRIPRel) {
1907 SDValue Val = N.getOperand(0);
1908 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1909 IsRIPRelTLS = true;
1910 }
1911
1912 // We can't use an addressing mode in the 64-bit large code model.
1913 // Global TLS addressing is an exception. In the medium code model,
1914 // we can use a mode when RIP wrappers are present.
1915 // That signifies access to globals that are known to be "near",
1916 // such as the GOT itself.
1917 CodeModel::Model M = TM.getCodeModel();
1918 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1919 return true;
1920
1921 // Base and index reg must be 0 in order to use %rip as base.
1922 if (IsRIPRel && AM.hasBaseOrIndexReg())
1923 return true;
1924
1925 // Make a local copy in case we can't do this fold.
1926 X86ISelAddressMode Backup = AM;
1927
1928 int64_t Offset = 0;
1929 SDValue N0 = N.getOperand(0);
1930 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1931 AM.GV = G->getGlobal();
1932 AM.SymbolFlags = G->getTargetFlags();
1933 Offset = G->getOffset();
1934 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1935 AM.CP = CP->getConstVal();
1936 AM.Alignment = CP->getAlign();
1937 AM.SymbolFlags = CP->getTargetFlags();
1938 Offset = CP->getOffset();
1939 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1940 AM.ES = S->getSymbol();
1941 AM.SymbolFlags = S->getTargetFlags();
1942 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1943 AM.MCSym = S->getMCSymbol();
1944 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1945 AM.JT = J->getIndex();
1946 AM.SymbolFlags = J->getTargetFlags();
1947 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1948 AM.BlockAddr = BA->getBlockAddress();
1949 AM.SymbolFlags = BA->getTargetFlags();
1950 Offset = BA->getOffset();
1951 } else
1952 llvm_unreachable("Unhandled symbol reference node.");
1953
1954 // Can't use an addressing mode with large globals.
1955 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1956 TM.isLargeGlobalValue(AM.GV)) {
1957 AM = Backup;
1958 return true;
1959 }
1960
1961 if (foldOffsetIntoAddress(Offset, AM)) {
1962 AM = Backup;
1963 return true;
1964 }
1965
1966 if (IsRIPRel)
1967 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1968
1969 // Commit the changes now that we know this fold is safe.
1970 return false;
1971}
1972
1973/// Add the specified node to the specified addressing mode, returning true if
1974/// it cannot be done. This just pattern matches for the addressing mode.
1975bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1976 if (matchAddressRecursively(N, AM, 0))
1977 return true;
1978
1979 // Post-processing: Make a second attempt to fold a load, if we now know
1980 // that there will not be any other register. This is only performed for
1981 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1982 // any foldable load the first time.
1983 if (Subtarget->isTarget64BitILP32() &&
1984 AM.BaseType == X86ISelAddressMode::RegBase &&
1985 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1986 SDValue Save_Base_Reg = AM.Base_Reg;
1987 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1988 AM.Base_Reg = SDValue();
1989 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1990 AM.Base_Reg = Save_Base_Reg;
1991 }
1992 }
1993
1994 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1995 // a smaller encoding and avoids a scaled-index.
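// e.g. "leal (,%reg,2), %dst" becomes "leal (%reg,%reg), %dst" (illustrative).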
1996 if (AM.Scale == 2 &&
1997 AM.BaseType == X86ISelAddressMode::RegBase &&
1998 AM.Base_Reg.getNode() == nullptr) {
1999 AM.Base_Reg = AM.IndexReg;
2000 AM.Scale = 1;
2001 }
2002
2003 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2004 // because it has a smaller encoding.
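// e.g. "movl foo, %eax" becomes "movl foo(%rip), %eax" (illustrative), which
// drops the SIB byte required for absolute 32-bit addressing in 64-bit mode.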
2005 if (TM.getCodeModel() != CodeModel::Large &&
2006 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2007 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2008 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2009 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2010 // However, when GV is a local function symbol and in the same section as
2011 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2012 // referencing GV+Disp generates a relocation referencing the section symbol
2013 // with an even smaller offset, which might underflow. We should bail out if
2014 // the negative offset is too close to INT32_MIN. Actually, we are more
2015 // conservative here, using a smaller magic number also used by
2016 // isOffsetSuitableForCodeModel.
2017 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2018 return true;
2019
2020 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2021 }
2022
2023 return false;
2024}
2025
2026bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2027 unsigned Depth) {
2028 // Add an artificial use to this node so that we can keep track of
2029 // it if it gets CSE'd with a different node.
2030 HandleSDNode Handle(N);
2031
2032 X86ISelAddressMode Backup = AM;
2033 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2034 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2035 return false;
2036 AM = Backup;
2037
2038 // Try again after commuting the operands.
2039 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2040 Depth + 1) &&
2041 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2042 return false;
2043 AM = Backup;
2044
2045 // If we couldn't fold both operands into the address at the same time,
2046 // see if we can just put each operand into a register and fold at least
2047 // the add.
2048 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2049 !AM.Base_Reg.getNode() &&
2050 !AM.IndexReg.getNode()) {
2051 N = Handle.getValue();
2052 AM.Base_Reg = N.getOperand(0);
2053 AM.IndexReg = N.getOperand(1);
2054 AM.Scale = 1;
2055 return false;
2056 }
2057 N = Handle.getValue();
2058 return true;
2059}
2060
2061// Insert a node into the DAG at least before the Pos node's position. This
2062// will reposition the node as needed, and will assign it a node ID that is <=
2063// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2064// IDs! The selection DAG must no longer depend on their uniqueness when this
2065// is used.
2066static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2067 if (N->getNodeId() == -1 ||
2068 (SelectionDAG::allnodes_iterator(N.getNode()) >
2069 SelectionDAG::allnodes_iterator(Pos.getNode()))) {
2070 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2071 // Mark Node as invalid for pruning as after this it may be a successor to a
2072 // selected node but otherwise be in the same position of Pos.
2073 // Conservatively mark it with the same -abs(Id) to assure node id
2074 // invariant is preserved.
2075 N->setNodeId(Pos->getNodeId());
2076 SelectionDAGISel::InvalidateNodeId(N.getNode());
2077 }
2078}
2079
2080// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2081// safe. This allows us to convert the shift and and into an h-register
2082// extract and a scaled index. Returns false if the simplification is
2083// performed.
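// e.g. with C1 == 2 (illustrative):
//   (x >> 6) & 0x3fc   -->   ((x >> 8) & 0xff) << 2
// so the "& 0xff" of the second byte becomes an h-register extract and the
// "<< 2" becomes scale 4 in the addressing mode.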
2084 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2085 uint64_t Mask,
2086 SDValue Shift, SDValue X,
2087 X86ISelAddressMode &AM) {
2088 if (Shift.getOpcode() != ISD::SRL ||
2089 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2090 !Shift.hasOneUse())
2091 return true;
2092
2093 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2094 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2095 Mask != (0xffu << ScaleLog))
2096 return true;
2097
2098 MVT XVT = X.getSimpleValueType();
2099 MVT VT = N.getSimpleValueType();
2100 SDLoc DL(N);
2101 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2102 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2103 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2104 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2105 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2106 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2107 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2108
2109 // Insert the new nodes into the topological ordering. We must do this in
2110 // a valid topological ordering as nothing is going to go back and re-sort
2111 // these nodes. We continually insert before 'N' in sequence as this is
2112 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2113 // hierarchy left to express.
2114 insertDAGNode(DAG, N, Eight);
2115 insertDAGNode(DAG, N, NewMask);
2116 insertDAGNode(DAG, N, Srl);
2117 insertDAGNode(DAG, N, And);
2118 insertDAGNode(DAG, N, Ext);
2119 insertDAGNode(DAG, N, ShlCount);
2120 insertDAGNode(DAG, N, Shl);
2121 DAG.ReplaceAllUsesWith(N, Shl);
2122 DAG.RemoveDeadNode(N.getNode());
2123 AM.IndexReg = Ext;
2124 AM.Scale = (1 << ScaleLog);
2125 return false;
2126}
2127
2128// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2129// allows us to fold the shift into this addressing mode. Returns false if the
2130// transform succeeded.
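// e.g. (illustrative): (x << 2) & 0xfc  -->  (x & 0x3f) << 2, so the shl can
// become scale 4 in the addressing mode.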
2131 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2132 X86ISelAddressMode &AM) {
2133 SDValue Shift = N.getOperand(0);
2134
2135 // Use a signed mask so that shifting right will insert sign bits. These
2136 // bits will be removed when we shift the result left so it doesn't matter
2137 // what we use. This might allow a smaller immediate encoding.
2138 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2139
2140 // If we have an any_extend feeding the AND, look through it to see if there
2141 // is a shift behind it. But only if the AND doesn't use the extended bits.
2142 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2143 bool FoundAnyExtend = false;
2144 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2145 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2146 isUInt<32>(Mask)) {
2147 FoundAnyExtend = true;
2148 Shift = Shift.getOperand(0);
2149 }
2150
2151 if (Shift.getOpcode() != ISD::SHL ||
2152 !isa<ConstantSDNode>(Shift.getOperand(1)))
2153 return true;
2154
2155 SDValue X = Shift.getOperand(0);
2156
2157 // Not likely to be profitable if either the AND or SHIFT node has more
2158 // than one use (unless all uses are for address computation). Besides,
2159 // the isel mechanism requires their node ids to be reused.
2160 if (!N.hasOneUse() || !Shift.hasOneUse())
2161 return true;
2162
2163 // Verify that the shift amount is something we can fold.
2164 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2165 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2166 return true;
2167
2168 MVT VT = N.getSimpleValueType();
2169 SDLoc DL(N);
2170 if (FoundAnyExtend) {
2171 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2172 insertDAGNode(DAG, N, NewX);
2173 X = NewX;
2174 }
2175
2176 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2177 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2178 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2179
2180 // Insert the new nodes into the topological ordering. We must do this in
2181 // a valid topological ordering as nothing is going to go back and re-sort
2182 // these nodes. We continually insert before 'N' in sequence as this is
2183 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2184 // hierarchy left to express.
2185 insertDAGNode(DAG, N, NewMask);
2186 insertDAGNode(DAG, N, NewAnd);
2187 insertDAGNode(DAG, N, NewShift);
2188 DAG.ReplaceAllUsesWith(N, NewShift);
2189 DAG.RemoveDeadNode(N.getNode());
2190
2191 AM.Scale = 1 << ShiftAmt;
2192 AM.IndexReg = NewAnd;
2193 return false;
2194}
2195
2196// Implement some heroics to detect shifts of masked values where the mask can
2197// be replaced by extending the shift and undoing that in the addressing mode
2198// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2199// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2200// the addressing mode. This results in code such as:
2201//
2202// int f(short *y, int *lookup_table) {
2203// ...
2204// return *y + lookup_table[*y >> 11];
2205// }
2206//
2207// Turning into:
2208// movzwl (%rdi), %eax
2209// movl %eax, %ecx
2210// shrl $11, %ecx
2211// addl (%rsi,%rcx,4), %eax
2212//
2213// Instead of:
2214// movzwl (%rdi), %eax
2215// movl %eax, %ecx
2216// shrl $9, %ecx
2217// andl $124, %rcx
2218// addl (%rsi,%rcx), %eax
2219//
2220// Note that this function assumes the mask is provided as a mask *after* the
2221// value is shifted. The input chain may or may not match that, but computing
2222// such a mask is trivial.
2223 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2224 uint64_t Mask,
2225 SDValue Shift, SDValue X,
2226 X86ISelAddressMode &AM) {
2227 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2228 !isa<ConstantSDNode>(Shift.getOperand(1)))
2229 return true;
2230
2231 // We need to ensure that the mask is a contiguous run of bits.
2232 unsigned MaskIdx, MaskLen;
2233 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2234 return true;
2235 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2236
2237 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2238
2239 // The amount of shift we're trying to fit into the addressing mode is taken
2240 // from the shifted mask index (number of trailing zeros of the mask).
2241 unsigned AMShiftAmt = MaskIdx;
2242
2243 // There is nothing we can do here unless the mask is removing some bits.
2244 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2245 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2246
2247 // Scale the leading zero count down based on the actual size of the value.
2248 // Also scale it down based on the size of the shift.
2249 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2250 if (MaskLZ < ScaleDown)
2251 return true;
2252 MaskLZ -= ScaleDown;
2253
2254 // The final check is to ensure that any masked out high bits of X are
2255 // already known to be zero. Otherwise, the mask has a semantic impact
2256 // other than masking out a couple of low bits. Unfortunately, because of
2257 // the mask, zero extensions will be removed from operands in some cases.
2258 // This code works extra hard to look through extensions because we can
2259 // replace them with zero extensions cheaply if necessary.
2260 bool ReplacingAnyExtend = false;
2261 if (X.getOpcode() == ISD::ANY_EXTEND) {
2262 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2263 X.getOperand(0).getSimpleValueType().getSizeInBits();
2264 // Assume that we'll replace the any-extend with a zero-extend, and
2265 // narrow the search to the extended value.
2266 X = X.getOperand(0);
2267 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2268 ReplacingAnyExtend = true;
2269 }
2270 APInt MaskedHighBits =
2271 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2272 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2273 return true;
2274
2275 // We've identified a pattern that can be transformed into a single shift
2276 // and an addressing mode. Make it so.
2277 MVT VT = N.getSimpleValueType();
2278 if (ReplacingAnyExtend) {
2279 assert(X.getValueType() != VT);
2280 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2281 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2282 insertDAGNode(DAG, N, NewX);
2283 X = NewX;
2284 }
2285
2286 MVT XVT = X.getSimpleValueType();
2287 SDLoc DL(N);
2288 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2289 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2290 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2291 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2292 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2293
2294 // Insert the new nodes into the topological ordering. We must do this in
2295 // a valid topological ordering as nothing is going to go back and re-sort
2296 // these nodes. We continually insert before 'N' in sequence as this is
2297 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2298 // hierarchy left to express.
2299 insertDAGNode(DAG, N, NewSRLAmt);
2300 insertDAGNode(DAG, N, NewSRL);
2301 insertDAGNode(DAG, N, NewExt);
2302 insertDAGNode(DAG, N, NewSHLAmt);
2303 insertDAGNode(DAG, N, NewSHL);
2304 DAG.ReplaceAllUsesWith(N, NewSHL);
2305 DAG.RemoveDeadNode(N.getNode());
2306
2307 AM.Scale = 1 << AMShiftAmt;
2308 AM.IndexReg = NewExt;
2309 return false;
2310}
2311
2312// Transform "(X >> SHIFT) & (MASK << C1)" to
2313// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2314// matched to a BEXTR later. Returns false if the simplification is performed.
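// e.g. (illustrative): (x >> 4) & (0xff << 2)  -->  ((x >> 6) & 0xff) << 2,
// where the srl+and pair can be matched to BEXTR and the shl becomes scale 4.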
2315 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2316 uint64_t Mask,
2317 SDValue Shift, SDValue X,
2318 X86ISelAddressMode &AM,
2319 const X86Subtarget &Subtarget) {
2320 if (Shift.getOpcode() != ISD::SRL ||
2321 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2322 !Shift.hasOneUse() || !N.hasOneUse())
2323 return true;
2324
2325 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2326 if (!Subtarget.hasTBM() &&
2327 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2328 return true;
2329
2330 // We need to ensure that the mask is a contiguous run of bits.
2331 unsigned MaskIdx, MaskLen;
2332 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2333 return true;
2334
2335 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2336
2337 // The amount of shift we're trying to fit into the addressing mode is taken
2338 // from the shifted mask index (number of trailing zeros of the mask).
2339 unsigned AMShiftAmt = MaskIdx;
2340
2341 // There is nothing we can do here unless the mask is removing some bits.
2342 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2343 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2344
2345 MVT XVT = X.getSimpleValueType();
2346 MVT VT = N.getSimpleValueType();
2347 SDLoc DL(N);
2348 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2349 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2350 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2351 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2352 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2353 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2354 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2355
2356 // Insert the new nodes into the topological ordering. We must do this in
2357 // a valid topological ordering as nothing is going to go back and re-sort
2358 // these nodes. We continually insert before 'N' in sequence as this is
2359 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2360 // hierarchy left to express.
2361 insertDAGNode(DAG, N, NewSRLAmt);
2362 insertDAGNode(DAG, N, NewSRL);
2363 insertDAGNode(DAG, N, NewMask);
2364 insertDAGNode(DAG, N, NewAnd);
2365 insertDAGNode(DAG, N, NewExt);
2366 insertDAGNode(DAG, N, NewSHLAmt);
2367 insertDAGNode(DAG, N, NewSHL);
2368 DAG.ReplaceAllUsesWith(N, NewSHL);
2369 DAG.RemoveDeadNode(N.getNode());
2370
2371 AM.Scale = 1 << AMShiftAmt;
2372 AM.IndexReg = NewExt;
2373 return false;
2374}
2375
2376// Attempt to peek further into a scaled index register, collecting additional
2377 // extensions / offsets / etc. Returns \p N if we can't peek any further.
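// e.g. (illustrative): with scale 4 already matched, peeking through
// "index: (add x, 4)" yields "index: x" and folds 4 * 4 = 16 into AM.Disp.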
2378SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2379 X86ISelAddressMode &AM,
2380 unsigned Depth) {
2381 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2382 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2383 "Illegal index scale");
2384
2385 // Limit recursion.
2386 if (Depth >= SelectionDAG::MaxRecursionDepth)
2387 return N;
2388
2389 EVT VT = N.getValueType();
2390 unsigned Opc = N.getOpcode();
2391
2392 // index: add(x,c) -> index: x, disp + c
2393 if (CurDAG->isBaseWithConstantOffset(N)) {
2394 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2395 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2396 if (!foldOffsetIntoAddress(Offset, AM))
2397 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2398 }
2399
2400 // index: add(x,x) -> index: x, scale * 2
2401 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2402 if (AM.Scale <= 4) {
2403 AM.Scale *= 2;
2404 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2405 }
2406 }
2407
2408 // index: shl(x,i) -> index: x, scale * (1 << i)
2409 if (Opc == X86ISD::VSHLI) {
2410 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2411 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2412 if ((AM.Scale * ScaleAmt) <= 8) {
2413 AM.Scale *= ScaleAmt;
2414 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2415 }
2416 }
2417
2418 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2419 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2420 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2421 SDValue Src = N.getOperand(0);
2422 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2423 Src.hasOneUse()) {
2424 if (CurDAG->isBaseWithConstantOffset(Src)) {
2425 SDValue AddSrc = Src.getOperand(0);
2426 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2427 int64_t Offset = AddVal->getSExtValue();
2428 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2429 SDLoc DL(N);
2430 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2431 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2432 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2433 insertDAGNode(*CurDAG, N, ExtSrc);
2434 insertDAGNode(*CurDAG, N, ExtVal);
2435 insertDAGNode(*CurDAG, N, ExtAdd);
2436 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2437 CurDAG->RemoveDeadNode(N.getNode());
2438 return ExtSrc;
2439 }
2440 }
2441 }
2442 }
2443
2444 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2445 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2446 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2447 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2448 SDValue Src = N.getOperand(0);
2449 unsigned SrcOpc = Src.getOpcode();
2450 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2451 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2452 Src.hasOneUse()) {
2453 if (CurDAG->isBaseWithConstantOffset(Src)) {
2454 SDValue AddSrc = Src.getOperand(0);
2455 uint64_t Offset = Src.getConstantOperandVal(1);
2456 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2457 SDLoc DL(N);
2458 SDValue Res;
2459 // If we're also scaling, see if we can use that as well.
2460 if (AddSrc.getOpcode() == ISD::SHL &&
2461 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2462 SDValue ShVal = AddSrc.getOperand(0);
2463 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2464 APInt HiBits =
2465 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2466 uint64_t ScaleAmt = 1ULL << ShAmt;
2467 if ((AM.Scale * ScaleAmt) <= 8 &&
2468 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2469 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2470 AM.Scale *= ScaleAmt;
2471 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2472 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2473 AddSrc.getOperand(1));
2474 insertDAGNode(*CurDAG, N, ExtShVal);
2475 insertDAGNode(*CurDAG, N, ExtShift);
2476 AddSrc = ExtShift;
2477 Res = ExtShVal;
2478 }
2479 }
2480 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2481 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2482 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2483 insertDAGNode(*CurDAG, N, ExtSrc);
2484 insertDAGNode(*CurDAG, N, ExtVal);
2485 insertDAGNode(*CurDAG, N, ExtAdd);
2486 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2487 CurDAG->RemoveDeadNode(N.getNode());
2488 return Res ? Res : ExtSrc;
2489 }
2490 }
2491 }
2492 }
2493
2494 // TODO: Handle extensions, shifted masks etc.
2495 return N;
2496}
2497
2498bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2499 unsigned Depth) {
2500 LLVM_DEBUG({
2501 dbgs() << "MatchAddress: ";
2502 AM.dump(CurDAG);
2503 });
2504 // Limit recursion.
2505 if (Depth >= SelectionDAG::MaxRecursionDepth)
2506 return matchAddressBase(N, AM);
2507
2508 // If this is already a %rip relative address, we can only merge immediates
2509 // into it. Instead of handling this in every case, we handle it here.
2510 // RIP relative addressing: %rip + 32-bit displacement!
2511 if (AM.isRIPRelative()) {
2512 // FIXME: JumpTable and ExternalSymbol addresses currently don't like
2513 // displacements. It isn't very important, but this should be fixed for
2514 // consistency.
2515 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2516 return true;
2517
2518 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2519 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2520 return false;
2521 return true;
2522 }
2523
2524 switch (N.getOpcode()) {
2525 default: break;
2526 case ISD::LOCAL_RECOVER: {
2527 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2528 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2529 // Use the symbol and don't prefix it.
2530 AM.MCSym = ESNode->getMCSymbol();
2531 return false;
2532 }
2533 break;
2534 }
2535 case ISD::Constant: {
2536 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2537 if (!foldOffsetIntoAddress(Val, AM))
2538 return false;
2539 break;
2540 }
2541
2542 case X86ISD::Wrapper:
2543 case X86ISD::WrapperRIP:
2544 if (!matchWrapper(N, AM))
2545 return false;
2546 break;
2547
2548 case ISD::LOAD:
2549 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2550 return false;
2551 break;
2552
2553 case ISD::FrameIndex:
2554 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2555 AM.Base_Reg.getNode() == nullptr &&
2556 (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) {
2557 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2558 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2559 return false;
2560 }
2561 break;
2562
2563 case ISD::SHL:
2564 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2565 break;
2566
2567 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2568 unsigned Val = CN->getZExtValue();
2569 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2570 // that the base operand remains free for further matching. If
2571 // the base doesn't end up getting used, a post-processing step
2572 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2573 if (Val == 1 || Val == 2 || Val == 3) {
2574 SDValue ShVal = N.getOperand(0);
2575 AM.Scale = 1 << Val;
2576 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2577 return false;
2578 }
2579 }
2580 break;
2581
2582 case ISD::SRL: {
2583 // Scale must not be used already.
2584 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2585
2586 // We only handle up to 64-bit values here as those are what matter for
2587 // addressing mode optimizations.
2588 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2589 "Unexpected value size!");
2590
2591 SDValue And = N.getOperand(0);
2592 if (And.getOpcode() != ISD::AND) break;
2593 SDValue X = And.getOperand(0);
2594
2595 // The mask used for the transform is expected to be post-shift, but we
2596 // found the shift first so just apply the shift to the mask before passing
2597 // it down.
2598 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2599 !isa<ConstantSDNode>(And.getOperand(1)))
2600 break;
2601 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2602
2603 // Try to fold the mask and shift into the scale, and return false if we
2604 // succeed.
2605 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2606 return false;
2607 break;
2608 }
2609
2610 case ISD::SMUL_LOHI:
2611 case ISD::UMUL_LOHI:
2612 // A mul_lohi where we need the low part can be folded as a plain multiply.
2613 if (N.getResNo() != 0) break;
2614 [[fallthrough]];
2615 case ISD::MUL:
2616 case X86ISD::MUL_IMM:
2617 // X*[3,5,9] -> X+X*[2,4,8]
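// e.g. x*5 becomes base = x, index = x, scale 4, i.e.
//   "leaq (%reg,%reg,4), %dst" (illustrative).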
2618 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2619 AM.Base_Reg.getNode() == nullptr &&
2620 AM.IndexReg.getNode() == nullptr) {
2621 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2622 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2623 CN->getZExtValue() == 9) {
2624 AM.Scale = unsigned(CN->getZExtValue())-1;
2625
2626 SDValue MulVal = N.getOperand(0);
2627 SDValue Reg;
2628
2629 // Okay, we know that we have a scale by now. However, if the scaled
2630 // value is an add of something and a constant, we can fold the
2631 // constant into the disp field here.
2632 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2633 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2634 Reg = MulVal.getOperand(0);
2635 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2636 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2637 if (foldOffsetIntoAddress(Disp, AM))
2638 Reg = N.getOperand(0);
2639 } else {
2640 Reg = N.getOperand(0);
2641 }
2642
2643 AM.IndexReg = AM.Base_Reg = Reg;
2644 return false;
2645 }
2646 }
2647 break;
2648
2649 case ISD::SUB: {
2650 // Given A-B, if A can be completely folded into the address (leaving the
2651 // index field unused), use -B as the index.
2652 // This is a win if A has multiple parts that can be folded into
2653 // the address. Also, this saves a mov if the base register has
2654 // other uses, since it avoids a two-address sub instruction, however
2655 // it costs an additional mov if the index register has other uses.
2656
2657 // Add an artificial use to this node so that we can keep track of
2658 // it if it gets CSE'd with a different node.
2659 HandleSDNode Handle(N);
2660
2661 // Test if the LHS of the sub can be folded.
2662 X86ISelAddressMode Backup = AM;
2663 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2664 N = Handle.getValue();
2665 AM = Backup;
2666 break;
2667 }
2668 N = Handle.getValue();
2669 // Test if the index field is free for use.
2670 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2671 AM = Backup;
2672 break;
2673 }
2674
2675 int Cost = 0;
2676 SDValue RHS = N.getOperand(1);
2677 // If the RHS involves a register with multiple uses, this
2678 // transformation incurs an extra mov, due to the neg instruction
2679 // clobbering its operand.
2680 if (!RHS.getNode()->hasOneUse() ||
2681 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2682 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2683 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2684 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2685 RHS.getOperand(0).getValueType() == MVT::i32))
2686 ++Cost;
2687 // If the base is a register with multiple uses, this
2688 // transformation may save a mov.
2689 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2690 !AM.Base_Reg.getNode()->hasOneUse()) ||
2691 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2692 --Cost;
2693 // If the folded LHS was interesting, this transformation saves
2694 // address arithmetic.
2695 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2696 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2697 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2698 --Cost;
2699 // If it doesn't look like it may be an overall win, don't do it.
2700 if (Cost >= 0) {
2701 AM = Backup;
2702 break;
2703 }
2704
2705 // Ok, the transformation is legal and appears profitable. Go for it.
2706 // Negation will be emitted later to avoid creating dangling nodes if this
2707 // was an unprofitable LEA.
2708 AM.IndexReg = RHS;
2709 AM.NegateIndex = true;
2710 AM.Scale = 1;
2711 return false;
2712 }
2713
2714 case ISD::OR:
2715 case ISD::XOR:
2716 // See if we can treat the OR/XOR node as an ADD node.
2717 if (!CurDAG->isADDLike(N))
2718 break;
2719 [[fallthrough]];
2720 case ISD::ADD:
2721 if (!matchAdd(N, AM, Depth))
2722 return false;
2723 break;
2724
2725 case ISD::AND: {
2726 // Perform some heroic transforms on an and of a constant-count shift
2727 // with a constant to enable use of the scaled offset field.
2728
2729 // Scale must not be used already.
2730 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2731
2732 // We only handle up to 64-bit values here as those are what matter for
2733 // addressing mode optimizations.
2734 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2735 "Unexpected value size!");
2736
2737 if (!isa<ConstantSDNode>(N.getOperand(1)))
2738 break;
2739
2740 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2741 SDValue Shift = N.getOperand(0);
2742 SDValue X = Shift.getOperand(0);
2743
2744 uint64_t Mask = N.getConstantOperandVal(1);
2745
2746 // Try to fold the mask and shift into an extract and scale.
2747 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2748 return false;
2749
2750 // Try to fold the mask and shift directly into the scale.
2751 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2752 return false;
2753
2754 // Try to fold the mask and shift into BEXTR and scale.
2755 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2756 return false;
2757 }
2758
2759 // Try to swap the mask and shift to place shifts which can be done as
2760 // a scale on the outside of the mask.
2761 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2762 return false;
2763
2764 break;
2765 }
2766 case ISD::ZERO_EXTEND: {
2767 // Try to widen a zexted shift left to the same size as its use, so we can
2768 // match the shift as a scale factor.
2769 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2770 break;
2771
2772 SDValue Src = N.getOperand(0);
2773
2774 // See if we can match a zext(addlike(x,c)).
2775 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2776 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2777 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2778 if (Index != N) {
2779 AM.IndexReg = Index;
2780 return false;
2781 }
2782
2783 // Peek through mask: zext(and(shl(x,c1),c2))
2784 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2785 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2786 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2787 Mask = MaskC->getAPIntValue();
2788 Src = Src.getOperand(0);
2789 }
2790
2791 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2792 // Give up if the shift is not a valid scale factor [1,2,3].
2793 SDValue ShlSrc = Src.getOperand(0);
2794 SDValue ShlAmt = Src.getOperand(1);
2795 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2796 if (!ShAmtC)
2797 break;
2798 unsigned ShAmtV = ShAmtC->getZExtValue();
2799 if (ShAmtV > 3)
2800 break;
2801
2802 // The narrow shift must only shift out zero bits (it must be 'nuw').
2803 // That makes it safe to widen to the destination type.
2804 APInt HighZeros =
2805 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2806 if (!Src->getFlags().hasNoUnsignedWrap() &&
2807 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2808 break;
2809
2810 // zext (shl nuw i8 %x, C1) to i32
2811 // --> shl (zext i8 %x to i32), (zext C1)
2812 // zext (and (shl nuw i8 %x, C1), C2) to i32
2813 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2814 MVT SrcVT = ShlSrc.getSimpleValueType();
2815 MVT VT = N.getSimpleValueType();
2816 SDLoc DL(N);
2817
2818 SDValue Res = ShlSrc;
2819 if (!Mask.isAllOnes()) {
2820 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2821 insertDAGNode(*CurDAG, N, Res);
2822 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2823 insertDAGNode(*CurDAG, N, Res);
2824 }
2825 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2826 insertDAGNode(*CurDAG, N, Zext);
2827 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2828 insertDAGNode(*CurDAG, N, NewShl);
2829 CurDAG->ReplaceAllUsesWith(N, NewShl);
2830 CurDAG->RemoveDeadNode(N.getNode());
2831
2832 // Convert the shift to scale factor.
2833 AM.Scale = 1 << ShAmtV;
2834 // If matchIndexRecursively is not called here, Zext may be replaced by
2835 // other nodes but later still be used to call a builder method, so match
2836 // through it now.
2837 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2838 return false;
2839 }
2840
2841 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2842 // Try to fold the mask and shift into an extract and scale.
2843 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2844 Src.getOperand(0), AM))
2845 return false;
2846
2847 // Try to fold the mask and shift directly into the scale.
2848 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2849 Src.getOperand(0), AM))
2850 return false;
2851
2852 // Try to fold the mask and shift into BEXTR and scale.
2853 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2854 Src.getOperand(0), AM, *Subtarget))
2855 return false;
2856 }
2857
2858 break;
2859 }
2860 }
2861
2862 return matchAddressBase(N, AM);
2863}
2864
2865/// Helper for MatchAddress. Add the specified node to the
2866/// specified addressing mode without any further recursion.
2867bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2868 // Is the base register already occupied?
2869 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2870 // If so, check to see if the scale index register is set.
2871 if (!AM.IndexReg.getNode()) {
2872 AM.IndexReg = N;
2873 AM.Scale = 1;
2874 return false;
2875 }
2876
2877 // Otherwise, we cannot select it.
2878 return true;
2879 }
2880
2881 // Default, generate it as a register.
2882 AM.BaseType = X86ISelAddressMode::RegBase;
2883 AM.Base_Reg = N;
2884 return false;
2885}
2886
2887bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2888 X86ISelAddressMode &AM,
2889 unsigned Depth) {
2890 LLVM_DEBUG({
2891 dbgs() << "MatchVectorAddress: ";
2892 AM.dump(CurDAG);
2893 });
2894 // Limit recursion.
2895 if (Depth >= SelectionDAG::MaxRecursionDepth)
2896 return matchAddressBase(N, AM);
2897
2898 // TODO: Support other operations.
2899 switch (N.getOpcode()) {
2900 case ISD::Constant: {
2901 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2902 if (!foldOffsetIntoAddress(Val, AM))
2903 return false;
2904 break;
2905 }
2906 case X86ISD::Wrapper:
2907 if (!matchWrapper(N, AM))
2908 return false;
2909 break;
2910 case ISD::ADD: {
2911 // Add an artificial use to this node so that we can keep track of
2912 // it if it gets CSE'd with a different node.
2913 HandleSDNode Handle(N);
2914
2915 X86ISelAddressMode Backup = AM;
2916 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2917 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2918 Depth + 1))
2919 return false;
2920 AM = Backup;
2921
2922 // Try again after commuting the operands.
2923 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2924 Depth + 1) &&
2925 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2926 Depth + 1))
2927 return false;
2928 AM = Backup;
2929
2930 N = Handle.getValue();
2931 break;
2932 }
2933 }
2934
2935 return matchAddressBase(N, AM);
2936}
2937
2938/// Helper for selectVectorAddr. Handles things that can be folded into a
2939/// gather/scatter address. The index register and scale should have already
2940/// been handled.
2941bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2942 return matchVectorAddressRecursively(N, AM, 0);
2943}
2944
2945bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2946 SDValue IndexOp, SDValue ScaleOp,
2947 SDValue &Base, SDValue &Scale,
2948 SDValue &Index, SDValue &Disp,
2949 SDValue &Segment) {
2950 X86ISelAddressMode AM;
2951 AM.Scale = ScaleOp->getAsZExtVal();
2952
2953 // Attempt to match index patterns, as long as we're not relying on implicit
2954 // sign-extension, which is performed BEFORE scale.
2955 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2956 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2957 else
2958 AM.IndexReg = IndexOp;
2959
2960 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2961 if (AddrSpace == X86AS::GS)
2962 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2963 if (AddrSpace == X86AS::FS)
2964 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2965 if (AddrSpace == X86AS::SS)
2966 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2967
2968 SDLoc DL(BasePtr);
2969 MVT VT = BasePtr.getSimpleValueType();
2970
2971 // Try to match into the base and displacement fields.
2972 if (matchVectorAddress(BasePtr, AM))
2973 return false;
2974
2975 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2976 return true;
2977}
2978
2979/// Returns true if it is able to pattern match an addressing mode.
2980/// It returns the operands which make up the maximal addressing mode it can
2981/// match by reference.
2982///
2983/// Parent is the parent node of the addr operand that is being matched. It
2984/// is always a load, store, atomic node, or null. It is only null when
2985/// checking memory operands for inline asm nodes.
2986bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2987 SDValue &Scale, SDValue &Index,
2988 SDValue &Disp, SDValue &Segment) {
2989 X86ISelAddressMode AM;
2990
2991 if (Parent &&
2992 // This list of opcodes is all the nodes that have an "addr:$ptr" operand
2993 // that are not a MemSDNode, and thus don't have proper addrspace info.
2994 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2995 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2996 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2997 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2998 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2999 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3000 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3001 unsigned AddrSpace =
3002 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3003 if (AddrSpace == X86AS::GS)
3004 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3005 if (AddrSpace == X86AS::FS)
3006 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3007 if (AddrSpace == X86AS::SS)
3008 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3009 }
3010
3011 // Save the DL and VT before calling matchAddress, as it can invalidate N.
3012 SDLoc DL(N);
3013 MVT VT = N.getSimpleValueType();
3014
3015 if (matchAddress(N, AM))
3016 return false;
3017
3018 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3019 return true;
3020}
3021
3022bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3023 // Cannot use 32 bit constants to reference objects in kernel/large code
3024 // model.
3025 if (TM.getCodeModel() == CodeModel::Kernel ||
3026 TM.getCodeModel() == CodeModel::Large)
3027 return false;
3028
3029 // In static codegen with small code model, we can get the address of a label
3030 // into a register with 'movl'
3031 if (N->getOpcode() != X86ISD::Wrapper)
3032 return false;
3033
3034 N = N.getOperand(0);
3035
3036 // At least GNU as does not accept 'movl' for TPOFF relocations.
3037 // FIXME: We could use 'movl' when we know we are targeting MC.
3038 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3039 return false;
3040
3041 Imm = N;
3042 // Small/medium code model can reference non-TargetGlobalAddress objects with
3043 // 32 bit constants.
3044 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3045 return TM.getCodeModel() == CodeModel::Small ||
3046 TM.getCodeModel() == CodeModel::Medium;
3047 }
3048
3049 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3050 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3051 return CR->getUnsignedMax().ult(1ull << 32);
3052
3053 return !TM.isLargeGlobalValue(GV);
3054}
3055
3056bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
3057 SDValue &Index, SDValue &Disp,
3058 SDValue &Segment) {
3059 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3060 SDLoc DL(N);
3061
3062 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3063 return false;
3064
3065 EVT BaseType = Base.getValueType();
3066 unsigned SubReg;
3067 if (BaseType == MVT::i8)
3068 SubReg = X86::sub_8bit;
3069 else if (BaseType == MVT::i16)
3070 SubReg = X86::sub_16bit;
3071 else
3072 SubReg = X86::sub_32bit;
3073
3074 auto *RN = dyn_cast<RegisterSDNode>(Base);
3075 if (RN && RN->getReg() == 0)
3076 Base = CurDAG->getRegister(0, MVT::i64);
3077 else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
3078 BaseType == MVT::i32) &&
3079 !isa<FrameIndexSDNode>(Base)) {
3080 // Base could already be %rip, particularly in the x32 ABI.
3081 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3082 MVT::i64), 0);
3083 Base = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Base);
3084 }
3085
3086 [[maybe_unused]] EVT IndexType = Index.getValueType();
3087 RN = dyn_cast<RegisterSDNode>(Index);
3088 if (RN && RN->getReg() == 0)
3089 Index = CurDAG->getRegister(0, MVT::i64);
3090 else {
3091 assert((IndexType == BaseType) &&
3092 "Expect to be extending 8/16/32-bit registers for use in LEA");
3093 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3094 MVT::i64), 0);
3095 Index = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Index);
3096 }
3097
3098 return true;
3099}
3100
3101/// Calls SelectAddr and determines if the maximal addressing
3102/// mode it matches can be cost effectively emitted as an LEA instruction.
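/// For example (illustrative of the scoring below): base + index*4 + disp
/// scores base(1) + index(1) + scale(1) + disp(1) = 4 > 2 and is emitted as an
/// LEA, whereas a lone base register scores 1 and is rejected.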
3103bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3104 SDValue &Base, SDValue &Scale,
3105 SDValue &Index, SDValue &Disp,
3106 SDValue &Segment) {
3107 X86ISelAddressMode AM;
3108
3109 // Save the DL and VT before calling matchAddress, as it can invalidate N.
3110 SDLoc DL(N);
3111 MVT VT = N.getSimpleValueType();
3112
3113 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3114 // segments.
3115 SDValue Copy = AM.Segment;
3116 SDValue T = CurDAG->getRegister(0, MVT::i32);
3117 AM.Segment = T;
3118 if (matchAddress(N, AM))
3119 return false;
3120 assert(T == AM.Segment);
3121 AM.Segment = Copy;
3122
3123 unsigned Complexity = 0;
3124 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3125 Complexity = 1;
3126 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3127 Complexity = 4;
3128
3129 if (AM.IndexReg.getNode())
3130 Complexity++;
3131
3132 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or
3133 // to use a simple shift.
3134 if (AM.Scale > 1)
3135 Complexity++;
3136
3137 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3138 // to a LEA. This is determined with some experimentation but is by no means
3139 // optimal (especially for code size consideration). LEA is nice because of
3140 // its three-address nature. Tweak the cost function again when we can run
3141 // convertToThreeAddress() at register allocation time.
3142 if (AM.hasSymbolicDisplacement()) {
3143 // For X86-64, always use LEA to materialize RIP-relative addresses.
3144 if (Subtarget->is64Bit())
3145 Complexity = 4;
3146 else
3147 Complexity += 2;
3148 }
3149
3150 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3151 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3152 // duplicating flag-producing instructions later in the pipeline.
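  // Roughly (illustrative): if t = (X86ISD::ADD a, b) also feeds a branch
  // through its EFLAGS result, selecting this outer add as
  // 'lea (%t,%c), %dst' leaves those flags intact, whereas selecting it as
  // 'add' would clobber EFLAGS and could force the flag-producing add to be
  // re-emitted.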
3153 if (N.getOpcode() == ISD::ADD) {
3154 auto isMathWithFlags = [](SDValue V) {
3155 switch (V.getOpcode()) {
3156 case X86ISD::ADD:
3157 case X86ISD::SUB:
3158 case X86ISD::ADC:
3159 case X86ISD::SBB:
3160 case X86ISD::SMUL:
3161 case X86ISD::UMUL:
3162 /* TODO: These opcodes can be added safely, but we may want to justify
3163 their inclusion for different reasons (better for reg-alloc).
3164 case X86ISD::OR:
3165 case X86ISD::XOR:
3166 case X86ISD::AND:
3167 */
3168 // Value 1 is the flag output of the node - verify it's not dead.
3169 return !SDValue(V.getNode(), 1).use_empty();
3170 default:
3171 return false;
3172 }
3173 };
3174 // TODO: We might want to factor in whether there's a load folding
3175 // opportunity for the math op that disappears with LEA.
3176 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3177 Complexity++;
3178 }
3179
3180 if (AM.Disp)
3181 Complexity++;
3182
3183 // If it isn't worth using an LEA, reject it.
3184 if (Complexity <= 2)
3185 return false;
3186
3187 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3188 return true;
3189}
3190
3191 /// This is only run on TargetGlobalTLSAddress or TargetExternalSymbol nodes.
3192bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3193 SDValue &Scale, SDValue &Index,
3194 SDValue &Disp, SDValue &Segment) {
3195 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3196 N.getOpcode() == ISD::TargetExternalSymbol);
3197
3198 X86ISelAddressMode AM;
3199 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3200 AM.GV = GA->getGlobal();
3201 AM.Disp += GA->getOffset();
3202 AM.SymbolFlags = GA->getTargetFlags();
3203 } else {
3204 auto *SA = cast<ExternalSymbolSDNode>(N);
3205 AM.ES = SA->getSymbol();
3206 AM.SymbolFlags = SA->getTargetFlags();
3207 }
3208
3209 if (Subtarget->is32Bit()) {
3210 AM.Scale = 1;
3211 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3212 }
3213
3214 MVT VT = N.getSimpleValueType();
3215 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3216 return true;
3217}
3218
3219bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3220 // Keep track of the original value type and whether this value was
3221 // truncated. If we see a truncation from pointer type to VT that truncates
3222 // bits that are known to be zero, we can use a narrow reference.
3223 EVT VT = N.getValueType();
3224 bool WasTruncated = false;
3225 if (N.getOpcode() == ISD::TRUNCATE) {
3226 WasTruncated = true;
3227 N = N.getOperand(0);
3228 }
3229
3230 if (N.getOpcode() != X86ISD::Wrapper)
3231 return false;
3232
3233 // We can only use non-GlobalValues as immediates if they were not truncated,
3234 // as we do not have any range information. If we have a GlobalValue and the
3235 // address was not truncated, we can select it as an operand directly.
3236 unsigned Opc = N.getOperand(0)->getOpcode();
3237 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3238 Op = N.getOperand(0);
3239 // We can only select the operand directly if we didn't have to look past a
3240 // truncate.
3241 return !WasTruncated;
3242 }
3243
3244 // Check that the global's range fits into VT.
3245 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3246 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3247 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3248 return false;
3249
3250 // Okay, we can use a narrow reference.
3251 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3252 GA->getOffset(), GA->getTargetFlags());
3253 return true;
3254}
3255
3256bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3257 SDValue &Base, SDValue &Scale,
3258 SDValue &Index, SDValue &Disp,
3259 SDValue &Segment) {
3260 assert(Root && P && "Unknown root/parent nodes");
3261 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3262 !IsProfitableToFold(N, P, Root) ||
3263 !IsLegalToFold(N, P, Root, OptLevel))
3264 return false;
3265
3266 return selectAddr(N.getNode(),
3267 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3268}
3269
3270bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3271 SDValue &Base, SDValue &Scale,
3272 SDValue &Index, SDValue &Disp,
3273 SDValue &Segment) {
3274 assert(Root && P && "Unknown root/parent nodes");
3275 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3276 !IsProfitableToFold(N, P, Root) ||
3277 !IsLegalToFold(N, P, Root, OptLevel))
3278 return false;
3279
3280 return selectAddr(N.getNode(),
3281 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3282}
3283
3284/// Return an SDNode that returns the value of the global base register.
3285/// Output instructions required to initialize the global base register,
3286/// if necessary.
3287SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3288 Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3289 auto &DL = MF->getDataLayout();
3290 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3291}
3292
3293bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3294 if (N->getOpcode() == ISD::TRUNCATE)
3295 N = N->getOperand(0).getNode();
3296 if (N->getOpcode() != X86ISD::Wrapper)
3297 return false;
3298
3299 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3300 if (!GA)
3301 return false;
3302
3303 auto *GV = GA->getGlobal();
3304 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3305 if (CR)
3306 return CR->getSignedMin().sge(-1ull << Width) &&
3307 CR->getSignedMax().slt(1ull << Width);
3308 // In the kernel code model, globals are in the negative 2GB of the address
3309 // space, so globals can be a sign extended 32-bit immediate.
3310 // In other code models, small globals are in the low 2GB of the address
3311 // space, so sign extending them is equivalent to zero extending them.
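  // For example (illustrative): a kernel-model global at 0xffffffff80001000
  // lies in the negative 2GB and is encodable as a sign-extended 32-bit
  // immediate, while a small-code-model global below 2GB sign- and
  // zero-extends to the same 64-bit value.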
3312 return Width == 32 && !TM.isLargeGlobalValue(GV);
3313}
3314
3315X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3316 assert(N->isMachineOpcode() && "Unexpected node");
3317 unsigned Opc = N->getMachineOpcode();
3318 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3319 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3320 if (CondNo < 0)
3321 return X86::COND_INVALID;
3322
3323 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3324}
3325
3326 /// Return true if no user of the given X86ISD::CMP node's flags reads a flag
3327 /// other than ZF (i.e. every flag user only tests the zero flag).
3328bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3329 // Examine each user of the node.
3330 for (SDUse &Use : Flags->uses()) {
3331 // Only check things that use the flags.
3332 if (Use.getResNo() != Flags.getResNo())
3333 continue;
3334 SDNode *User = Use.getUser();
3335 // Only examine CopyToReg uses that copy to EFLAGS.
3336 if (User->getOpcode() != ISD::CopyToReg ||
3337 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3338 return false;
3339 // Examine each user of the CopyToReg use.
3340 for (SDUse &FlagUse : User->uses()) {
3341 // Only examine the Flag result.
3342 if (FlagUse.getResNo() != 1)
3343 continue;
3344 // Anything unusual: assume conservatively.
3345 if (!FlagUse.getUser()->isMachineOpcode())
3346 return false;
3347 // Examine the condition code of the user.
3348 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3349
3350 switch (CC) {
3351 // Comparisons which only use the zero flag.
3352 case X86::COND_E: case X86::COND_NE:
3353 continue;
3354 // Anything else: assume conservatively.
3355 default:
3356 return false;
3357 }
3358 }
3359 }
3360 return true;
3361}
3362
3363/// Test whether the given X86ISD::CMP node has any uses which require the SF
3364/// flag to be accurate.
3365bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3366 // Examine each user of the node.
3367 for (SDUse &Use : Flags->uses()) {
3368 // Only check things that use the flags.
3369 if (Use.getResNo() != Flags.getResNo())
3370 continue;
3371 SDNode *User = Use.getUser();
3372 // Only examine CopyToReg uses that copy to EFLAGS.
3373 if (User->getOpcode() != ISD::CopyToReg ||
3374 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3375 return false;
3376 // Examine each user of the CopyToReg use.
3377 for (SDUse &FlagUse : User->uses()) {
3378 // Only examine the Flag result.
3379 if (FlagUse.getResNo() != 1)
3380 continue;
3381 // Anything unusual: assume conservatively.
3382 if (!FlagUse.getUser()->isMachineOpcode())
3383 return false;
3384 // Examine the condition code of the user.
3385 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3386
3387 switch (CC) {
3388 // Comparisons which don't examine the SF flag.
3389 case X86::COND_A: case X86::COND_AE:
3390 case X86::COND_B: case X86::COND_BE:
3391 case X86::COND_E: case X86::COND_NE:
3392 case X86::COND_O: case X86::COND_NO:
3393 case X86::COND_P: case X86::COND_NP:
3394 continue;
3395 // Anything else: assume conservatively.
3396 default:
3397 return false;
3398 }
3399 }
3400 }
3401 return true;
3402}
3403
3404 static bool mayUseCarryFlag(X86::CondCode CC) {
3405 switch (CC) {
3406 // Comparisons which don't examine the CF flag.
3407 case X86::COND_O: case X86::COND_NO:
3408 case X86::COND_E: case X86::COND_NE:
3409 case X86::COND_S: case X86::COND_NS:
3410 case X86::COND_P: case X86::COND_NP:
3411 case X86::COND_L: case X86::COND_GE:
3412 case X86::COND_G: case X86::COND_LE:
3413 return false;
3414 // Anything else: assume conservatively.
3415 default:
3416 return true;
3417 }
3418}
3419
3420/// Test whether the given node which sets flags has any uses which require the
3421/// CF flag to be accurate.
3422 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3423 // Examine each user of the node.
3424 for (SDUse &Use : Flags->uses()) {
3425 // Only check things that use the flags.
3426 if (Use.getResNo() != Flags.getResNo())
3427 continue;
3428
3429 SDNode *User = Use.getUser();
3430 unsigned UserOpc = User->getOpcode();
3431
3432 if (UserOpc == ISD::CopyToReg) {
3433 // Only examine CopyToReg uses that copy to EFLAGS.
3434 if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3435 return false;
3436 // Examine each user of the CopyToReg use.
3437 for (SDUse &FlagUse : User->uses()) {
3438 // Only examine the Flag result.
3439 if (FlagUse.getResNo() != 1)
3440 continue;
3441 // Anything unusual: assume conservatively.
3442 if (!FlagUse.getUser()->isMachineOpcode())
3443 return false;
3444 // Examine the condition code of the user.
3445 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3446
3447 if (mayUseCarryFlag(CC))
3448 return false;
3449 }
3450
3451 // This CopyToReg is ok. Move on to the next user.
3452 continue;
3453 }
3454
3455 // This might be an unselected node. So look for the pre-isel opcodes that
3456 // use flags.
3457 unsigned CCOpNo;
3458 switch (UserOpc) {
3459 default:
3460 // Something unusual. Be conservative.
3461 return false;
3462 case X86ISD::SETCC: CCOpNo = 0; break;
3463 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3464 case X86ISD::CMOV: CCOpNo = 2; break;
3465 case X86ISD::BRCOND: CCOpNo = 2; break;
3466 }
3467
3468 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3469 if (mayUseCarryFlag(CC))
3470 return false;
3471 }
3472 return true;
3473}
3474
3475/// Check whether or not the chain ending in StoreNode is suitable for doing
3476/// the {load; op; store} to modify transformation.
3477 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3478 SDValue StoredVal, SelectionDAG *CurDAG,
3479 unsigned LoadOpNo,
3480 LoadSDNode *&LoadNode,
3481 SDValue &InputChain) {
3482 // Is the stored value result 0 of the operation?
3483 if (StoredVal.getResNo() != 0) return false;
3484
3485 // Are there other uses of the operation other than the store?
3486 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3487
3488 // Is the store non-extending and non-indexed?
3489 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3490 return false;
3491
3492 SDValue Load = StoredVal->getOperand(LoadOpNo);
3493 // Is the stored value a non-extending and non-indexed load?
3494 if (!ISD::isNormalLoad(Load.getNode())) return false;
3495
3496 // Return LoadNode by reference.
3497 LoadNode = cast<LoadSDNode>(Load);
3498
3499 // Is store the only read of the loaded value?
3500 if (!Load.hasOneUse())
3501 return false;
3502
3503 // Is the address of the store the same as the load?
3504 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3505 LoadNode->getOffset() != StoreNode->getOffset())
3506 return false;
3507
3508 bool FoundLoad = false;
3509 SmallVector<SDValue, 4> ChainOps;
3510 SmallVector<const SDNode *, 4> LoopWorklist;
3511 SmallPtrSet<const SDNode *, 16> Visited;
3512 const unsigned int Max = 1024;
3513
3514 // Visualization of Load-Op-Store fusion:
3515 // -------------------------
3516 // Legend:
3517 // *-lines = Chain operand dependencies.
3518 // |-lines = Normal operand dependencies.
3519 // Dependencies flow down and right. n-suffix references multiple nodes.
3520 //
3521 // C Xn C
3522 // * * *
3523 // * * *
3524 // Xn A-LD Yn TF Yn
3525 // * * \ | * |
3526 // * * \ | * |
3527 // * * \ | => A--LD_OP_ST
3528 // * * \| \
3529 // TF OP \
3530 // * | \ Zn
3531 // * | \
3532 // A-ST Zn
3533 //
3534
3535 // This merge induces dependencies from: #1: Xn -> LD, OP, Zn
3536 // #2: Yn -> LD
3537 // #3: ST -> Zn
3538
3539 // Ensure the transform is safe by checking for the dual
3540 // dependencies to make sure we do not induce a loop.
3541
3542 // As LD is a predecessor to both OP and ST we can do this by checking:
3543 // a). if LD is a predecessor to a member of Xn or Yn.
3544 // b). if a Zn is a predecessor to ST.
3545
3546 // However, (b) can only occur through being a chain predecessor to
3547 // ST, which is the same as Zn being a member or predecessor of Xn,
3548 // which is a subset of LD being a predecessor of Xn. So it's
3549 // subsumed by check (a).
3550
3551 SDValue Chain = StoreNode->getChain();
3552
3553 // Gather X elements in ChainOps.
3554 if (Chain == Load.getValue(1)) {
3555 FoundLoad = true;
3556 ChainOps.push_back(Load.getOperand(0));
3557 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3558 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3559 SDValue Op = Chain.getOperand(i);
3560 if (Op == Load.getValue(1)) {
3561 FoundLoad = true;
3562 // Drop Load, but keep its chain. No cycle check necessary.
3563 ChainOps.push_back(Load.getOperand(0));
3564 continue;
3565 }
3566 LoopWorklist.push_back(Op.getNode());
3567 ChainOps.push_back(Op);
3568 }
3569 }
3570
3571 if (!FoundLoad)
3572 return false;
3573
3574 // Worklist is currently Xn. Add Yn to worklist.
3575 for (SDValue Op : StoredVal->ops())
3576 if (Op.getNode() != LoadNode)
3577 LoopWorklist.push_back(Op.getNode());
3578
3579 // Check (a) if Load is a predecessor to Xn + Yn
3580 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3581 true))
3582 return false;
3583
3584 InputChain =
3585 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3586 return true;
3587}
3588
3589// Change a chain of {load; op; store} of the same value into a simple op
3590// through memory of that value, if the uses of the modified value and its
3591// address are suitable.
3592//
3593 // The tablegen memory-operand pattern is currently not able to match the
3594 // case where the EFLAGS of the original operation are used.
3595//
3596// To move this to tablegen, we'll need to improve tablegen to allow flags to
3597// be transferred from a node in the pattern to the result node, probably with
3598// a new keyword. For example, we have this
3599// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3600// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3601// but maybe need something like this
3602// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3603// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3604// (transferrable EFLAGS)]>;
3605//
3606// Until then, we manually fold these and instruction select the operation
3607// here.
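// A rough sketch of the transform (illustrative operands only):
//   t0 = load [addr]
//   t1 = X86ISD::ADD t0, 5
//   store t1, [addr]
// becomes a single read-modify-write instruction such as 'addl $5, (addr)',
// with the load and store memory operands attached to the new node.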
3608bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3609 auto *StoreNode = cast<StoreSDNode>(Node);
3610 SDValue StoredVal = StoreNode->getOperand(1);
3611 unsigned Opc = StoredVal->getOpcode();
3612
3613 // Before we try to select anything, make sure this is a memory operand size
3614 // and opcode we can handle. Note that this must match the code below that
3615 // actually lowers the opcodes.
3616 EVT MemVT = StoreNode->getMemoryVT();
3617 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3618 MemVT != MVT::i8)
3619 return false;
3620
3621 bool IsCommutable = false;
3622 bool IsNegate = false;
3623 switch (Opc) {
3624 default:
3625 return false;
3626 case X86ISD::SUB:
3627 IsNegate = isNullConstant(StoredVal.getOperand(0));
3628 break;
3629 case X86ISD::SBB:
3630 break;
3631 case X86ISD::ADD:
3632 case X86ISD::ADC:
3633 case X86ISD::AND:
3634 case X86ISD::OR:
3635 case X86ISD::XOR:
3636 IsCommutable = true;
3637 break;
3638 }
3639
3640 unsigned LoadOpNo = IsNegate ? 1 : 0;
3641 LoadSDNode *LoadNode = nullptr;
3642 SDValue InputChain;
3643 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3644 LoadNode, InputChain)) {
3645 if (!IsCommutable)
3646 return false;
3647
3648 // This operation is commutable, try the other operand.
3649 LoadOpNo = 1;
3650 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3651 LoadNode, InputChain))
3652 return false;
3653 }
3654
3655 SDValue Base, Scale, Index, Disp, Segment;
3656 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3657 Segment))
3658 return false;
3659
3660 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3661 unsigned Opc8) {
3662 switch (MemVT.getSimpleVT().SimpleTy) {
3663 case MVT::i64:
3664 return Opc64;
3665 case MVT::i32:
3666 return Opc32;
3667 case MVT::i16:
3668 return Opc16;
3669 case MVT::i8:
3670 return Opc8;
3671 default:
3672 llvm_unreachable("Invalid size!");
3673 }
3674 };
3675
3676 MachineSDNode *Result;
3677 switch (Opc) {
3678 case X86ISD::SUB:
3679 // Handle negate.
3680 if (IsNegate) {
3681 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3682 X86::NEG8m);
3683 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3684 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3685 MVT::Other, Ops);
3686 break;
3687 }
3688 [[fallthrough]];
3689 case X86ISD::ADD:
3690 // Try to match inc/dec.
3691 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3692 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3693 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3694 // ADD/SUB with 1/-1 can use inc/dec if the carry flag isn't used.
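      // E.g. (illustrative) 'addl $1, (%mem)' can instead be selected as
      // 'incl (%mem)'; INC/DEC leave CF unchanged, so this is only valid
      // when no flag user reads CF.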
3695 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3696 unsigned NewOpc =
3697 ((Opc == X86ISD::ADD) == IsOne)
3698 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3699 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3700 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3701 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3702 MVT::Other, Ops);
3703 break;
3704 }
3705 }
3706 [[fallthrough]];
3707 case X86ISD::ADC:
3708 case X86ISD::SBB:
3709 case X86ISD::AND:
3710 case X86ISD::OR:
3711 case X86ISD::XOR: {
3712 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3713 switch (Opc) {
3714 case X86ISD::ADD:
3715 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3716 X86::ADD8mr);
3717 case X86ISD::ADC:
3718 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3719 X86::ADC8mr);
3720 case X86ISD::SUB:
3721 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3722 X86::SUB8mr);
3723 case X86ISD::SBB:
3724 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3725 X86::SBB8mr);
3726 case X86ISD::AND:
3727 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3728 X86::AND8mr);
3729 case X86ISD::OR:
3730 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3731 case X86ISD::XOR:
3732 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3733 X86::XOR8mr);
3734 default:
3735 llvm_unreachable("Invalid opcode!");
3736 }
3737 };
3738 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3739 switch (Opc) {
3740 case X86ISD::ADD:
3741 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3742 X86::ADD8mi);
3743 case X86ISD::ADC:
3744 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3745 X86::ADC8mi);
3746 case X86ISD::SUB:
3747 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3748 X86::SUB8mi);
3749 case X86ISD::SBB:
3750 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3751 X86::SBB8mi);
3752 case X86ISD::AND:
3753 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3754 X86::AND8mi);
3755 case X86ISD::OR:
3756 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3757 X86::OR8mi);
3758 case X86ISD::XOR:
3759 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3760 X86::XOR8mi);
3761 default:
3762 llvm_unreachable("Invalid opcode!");
3763 }
3764 };
3765
3766 unsigned NewOpc = SelectRegOpcode(Opc);
3767 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3768
3769 // See if the operand is a constant that we can fold into an immediate
3770 // operand.
3771 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3772 int64_t OperandV = OperandC->getSExtValue();
3773
3774 // Check if we can shrink the operand enough to fit in an immediate (or
3775 // fit into a smaller immediate) by negating it and switching the
3776 // operation.
3777 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3778 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3779 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3780 isInt<32>(-OperandV))) &&
3781 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3782 OperandV = -OperandV;
3783 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3784 }
3785
3786 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3787 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3788 NewOpc = SelectImmOpcode(Opc);
3789 }
3790 }
3791
3792 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3793 SDValue CopyTo =
3794 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3795 StoredVal.getOperand(2), SDValue());
3796
3797 const SDValue Ops[] = {Base, Scale, Index, Disp,
3798 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3799 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3800 Ops);
3801 } else {
3802 const SDValue Ops[] = {Base, Scale, Index, Disp,
3803 Segment, Operand, InputChain};
3804 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3805 Ops);
3806 }
3807 break;
3808 }
3809 default:
3810 llvm_unreachable("Invalid opcode!");
3811 }
3812
3813 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3814 LoadNode->getMemOperand()};
3815 CurDAG->setNodeMemRefs(Result, MemOps);
3816
3817 // Update Load Chain uses as well.
3818 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3819 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3820 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3821 CurDAG->RemoveDeadNode(Node);
3822 return true;
3823}
3824
3825// See if this is an X & Mask that we can match to BEXTR/BZHI.
3826// Where Mask is one of the following patterns:
3827// a) x & (1 << nbits) - 1
3828// b) x & ~(-1 << nbits)
3829// c) x & (-1 >> (32 - y))
3830// d) x << (32 - y) >> (32 - y)
3831// e) (1 << nbits) - 1
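// A rough example (illustrative registers): with x in %rdi and nbits in %esi,
// pattern a) keeps the low nbits of x, which BMI2 can select directly as
//   bzhiq %rsi, %rdi, %rax
// while BMI1 BEXTR expresses the same extraction via a packed control value.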
3832bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3833 assert(
3834 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3835 Node->getOpcode() == ISD::SRL) &&
3836 "Should be either an and-mask, or right-shift after clearing high bits.");
3837
3838 // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3839 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3840 return false;
3841
3842 MVT NVT = Node->getSimpleValueType(0);
3843
3844 // Only supported for 32 and 64 bits.
3845 if (NVT != MVT::i32 && NVT != MVT::i64)
3846 return false;
3847
3848 SDValue NBits;
3849 bool NegateNBits;
3850
3851 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3852 // Else, if we only have BMI1's BEXTR, we require one-use.
3853 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3854 auto checkUses = [AllowExtraUsesByDefault](
3855 SDValue Op, unsigned NUses,
3856 std::optional<bool> AllowExtraUses) {
3857 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3858 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3859 };
3860 auto checkOneUse = [checkUses](SDValue Op,
3861 std::optional<bool> AllowExtraUses =
3862 std::nullopt) {
3863 return checkUses(Op, 1, AllowExtraUses);
3864 };
3865 auto checkTwoUse = [checkUses](SDValue Op,
3866 std::optional<bool> AllowExtraUses =
3867 std::nullopt) {
3868 return checkUses(Op, 2, AllowExtraUses);
3869 };
3870
3871 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3872 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3873 assert(V.getSimpleValueType() == MVT::i32 &&
3874 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3875 "Expected i64 -> i32 truncation");
3876 V = V.getOperand(0);
3877 }
3878 return V;
3879 };
3880
3881 // a) x & ((1 << nbits) + (-1))
3882 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3883 &NegateNBits](SDValue Mask) -> bool {
3884 // Match `add`. Must only have one use!
3885 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3886 return false;
3887 // We should be adding an all-ones constant (i.e. subtracting one).
3888 if (!isAllOnesConstant(Mask->getOperand(1)))
3889 return false;
3890 // Match `1 << nbits`. Might be truncated. Must only have one use!
3891 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3892 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3893 return false;
3894 if (!isOneConstant(M0->getOperand(0)))
3895 return false;
3896 NBits = M0->getOperand(1);
3897 NegateNBits = false;
3898 return true;
3899 };
3900
3901 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3902 V = peekThroughOneUseTruncation(V);
3903 return CurDAG->MaskedValueIsAllOnes(
3904 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3905 NVT.getSizeInBits()));
3906 };
3907
3908 // b) x & ~(-1 << nbits)
3909 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3910 &NBits, &NegateNBits](SDValue Mask) -> bool {
3911 // Match `~()`. Must only have one use!
3912 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3913 return false;
3914 // The -1 only has to be all-ones for the final Node's NVT.
3915 if (!isAllOnes(Mask->getOperand(1)))
3916 return false;
3917 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3918 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3919 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3920 return false;
3921 // The -1 only has to be all-ones for the final Node's NVT.
3922 if (!isAllOnes(M0->getOperand(0)))
3923 return false;
3924 NBits = M0->getOperand(1);
3925 NegateNBits = false;
3926 return true;
3927 };
3928
3929 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3930 // or leave the shift amount as-is, but then we'll have to negate it.
3931 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3932 unsigned Bitwidth) {
3933 NBits = ShiftAmt;
3934 NegateNBits = true;
3935 // Skip over a truncate of the shift amount, if any.
3936 if (NBits.getOpcode() == ISD::TRUNCATE)
3937 NBits = NBits.getOperand(0);
3938 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3939 // If it doesn't match, that's fine, we'll just negate it ourselves.
3940 if (NBits.getOpcode() != ISD::SUB)
3941 return;
3942 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3943 if (!V0 || V0->getZExtValue() != Bitwidth)
3944 return;
3945 NBits = NBits.getOperand(1);
3946 NegateNBits = false;
3947 };
3948
3949 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3950 // or
3951 // c) x & (-1 >> (32 - y))
3952 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3953 canonicalizeShiftAmt](SDValue Mask) -> bool {
3954 // The mask itself may be truncated.
3955 Mask = peekThroughOneUseTruncation(Mask);
3956 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3957 // Match `l>>`. Must only have one use!
3958 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3959 return false;
3960 // We should be shifting truly all-ones constant.
3961 if (!isAllOnesConstant(Mask.getOperand(0)))
3962 return false;
3963 SDValue M1 = Mask.getOperand(1);
3964 // The shift amount should not be used externally.
3965 if (!checkOneUse(M1))
3966 return false;
3967 canonicalizeShiftAmt(M1, Bitwidth);
3968 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3969 // is no extra use of the mask. Clearly, there was one since we are here.
3970 // But at the same time, if we need to negate the shift amount,
3971 // then we don't want the mask to stick around, else it's unprofitable.
3972 return !NegateNBits;
3973 };
3974
3975 SDValue X;
3976
3977 // d) x << z >> z but then we'll have to subtract z from bitwidth
3978 // or
3979 // d) x << (32 - y) >> (32 - y)
3980 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3981 AllowExtraUsesByDefault, &NegateNBits,
3982 &X](SDNode *Node) -> bool {
3983 if (Node->getOpcode() != ISD::SRL)
3984 return false;
3985 SDValue N0 = Node->getOperand(0);
3986 if (N0->getOpcode() != ISD::SHL)
3987 return false;
3988 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3989 SDValue N1 = Node->getOperand(1);
3990 SDValue N01 = N0->getOperand(1);
3991 // Both of the shifts must be by the exact same value.
3992 if (N1 != N01)
3993 return false;
3994 canonicalizeShiftAmt(N1, Bitwidth);
3995 // There should not be any external uses of the inner shift / shift amount.
3996 // Note that while we are generally okay with external uses given BMI2,
3997 // iff we need to negate the shift amount, we are not okay with extra uses.
3998 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3999 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
4000 return false;
4001 X = N0->getOperand(0);
4002 return true;
4003 };
4004
4005 auto matchLowBitMask = [matchPatternA, matchPatternB,
4006 matchPatternC](SDValue Mask) -> bool {
4007 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4008 };
4009
4010 if (Node->getOpcode() == ISD::AND) {
4011 X = Node->getOperand(0);
4012 SDValue Mask = Node->getOperand(1);
4013
4014 if (matchLowBitMask(Mask)) {
4015 // Great.
4016 } else {
4017 std::swap(X, Mask);
4018 if (!matchLowBitMask(Mask))
4019 return false;
4020 }
4021 } else if (matchLowBitMask(SDValue(Node, 0))) {
4022 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4023 } else if (!matchPatternD(Node))
4024 return false;
4025
4026 // If we need to negate the shift amount, require BMI2 BZHI support.
4027 // It's just too unprofitable for BMI1 BEXTR.
4028 if (NegateNBits && !Subtarget->hasBMI2())
4029 return false;
4030
4031 SDLoc DL(Node);
4032
4033 // Truncate the shift amount.
4034 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4035 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4036
4037 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4038 // All the other bits are undefined, we do not care about them.
4039 SDValue ImplDef = SDValue(
4040 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4041 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4042
4043 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4044 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4045 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4046 MVT::i32, ImplDef, NBits, SRIdxVal),
4047 0);
4048 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4049
4050 // We might have matched the amount of high bits to be cleared,
4051 // but we want the amount of low bits to be kept, so negate it then.
4052 if (NegateNBits) {
4053 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4054 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4055
4056 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4057 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4058 }
4059
4060 if (Subtarget->hasBMI2()) {
4061 // Great, just emit the BZHI..
4062 if (NVT != MVT::i32) {
4063 // But have to place the bit count into the wide-enough register first.
4064 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4065 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4066 }
4067
4068 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4069 ReplaceNode(Node, Extract.getNode());
4070 SelectCode(Extract.getNode());
4071 return true;
4072 }
4073
4074 // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
4075 // shifted (potentially with a one-use trunc in between), and if the
4076 // truncation was the only use of the shift; if so, look past the one-use
4077 // truncation.
4078 {
4079 SDValue RealX = peekThroughOneUseTruncation(X);
4080 // FIXME: only if the shift is one-use?
4081 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4082 X = RealX;
4083 }
4084
4085 MVT XVT = X.getSimpleValueType();
4086
4087 // Else, emitting BEXTR requires one more step.
4088 // The 'control' of BEXTR has the pattern of:
4089 // [15...8 bit][ 7...0 bit] location
4090 // [ bit count][ shift] name
4091 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
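  // E.g. (illustrative) a control value of 0x0803 selects 8 bits starting at
  // bit 3, i.e. (x >> 3) & 0xff.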
4092
4093 // Shift NBits left by 8 bits, thus producing 'control'.
4094 // This makes the low 8 bits to be zero.
4095 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4096 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4097 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4098 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4099
4100 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4101 // FIXME: only if the shift is one-use?
4102 if (X.getOpcode() == ISD::SRL) {
4103 SDValue ShiftAmt = X.getOperand(1);
4104 X = X.getOperand(0);
4105
4106 assert(ShiftAmt.getValueType() == MVT::i8 &&
4107 "Expected shift amount to be i8");
4108
4109 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4110 // We could zext to i16 in some form, but we intentionally don't do that.
4111 SDValue OrigShiftAmt = ShiftAmt;
4112 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4113 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4114
4115 // And now 'or' these low 8 bits of shift amount into the 'control'.
4116 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4117 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4118 }
4119
4120 // But have to place the 'control' into the wide-enough register first.
4121 if (XVT != MVT::i32) {
4122 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4123 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4124 }
4125
4126 // And finally, form the BEXTR itself.
4127 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4128
4129 // The 'X' was originally truncated. Do that now.
4130 if (XVT != NVT) {
4131 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4132 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4133 }
4134
4135 ReplaceNode(Node, Extract.getNode());
4136 SelectCode(Extract.getNode());
4137
4138 return true;
4139}
4140
4141// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4142MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4143 MVT NVT = Node->getSimpleValueType(0);
4144 SDLoc dl(Node);
4145
4146 SDValue N0 = Node->getOperand(0);
4147 SDValue N1 = Node->getOperand(1);
4148
4149 // If we have TBM we can use an immediate for the control. If we have BMI
4150 // we should only do this if the BEXTR instruction is implemented well.
4151 // Otherwise moving the control into a register makes this more costly.
4152 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4153 // hoisting the move immediate would make it worthwhile with a less optimal
4154 // BEXTR?
4155 bool PreferBEXTR =
4156 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4157 if (!PreferBEXTR && !Subtarget->hasBMI2())
4158 return nullptr;
4159
4160 // Must have a shift right.
4161 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4162 return nullptr;
4163
4164 // Shift can't have additional users.
4165 if (!N0->hasOneUse())
4166 return nullptr;
4167
4168 // Only supported for 32 and 64 bits.
4169 if (NVT != MVT::i32 && NVT != MVT::i64)
4170 return nullptr;
4171
4172 // Shift amount and RHS of and must be constant.
4173 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4174 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4175 if (!MaskCst || !ShiftCst)
4176 return nullptr;
4177
4178 // And RHS must be a mask.
4179 uint64_t Mask = MaskCst->getZExtValue();
4180 if (!isMask_64(Mask))
4181 return nullptr;
4182
4183 uint64_t Shift = ShiftCst->getZExtValue();
4184 uint64_t MaskSize = llvm::popcount(Mask);
4185
4186 // Don't interfere with something that can be handled by extracting AH.
4187 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4188 if (Shift == 8 && MaskSize == 8)
4189 return nullptr;
4190
4191 // Make sure we are only using bits that were in the original value, not
4192 // shifted in.
4193 if (Shift + MaskSize > NVT.getSizeInBits())
4194 return nullptr;
4195
4196 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4197 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4198 // does not fit into 32 bits. Load folding is not a sufficient reason.
4199 if (!PreferBEXTR && MaskSize <= 32)
4200 return nullptr;
4201
4202 SDValue Control;
4203 unsigned ROpc, MOpc;
4204
4205#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4206 if (!PreferBEXTR) {
4207 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4208 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4209 // Let's perform the mask first, and apply shift later. Note that we need to
4210 // widen the mask to account for the fact that we'll apply shift afterwards!
4211 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4212 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4213 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4214 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4215 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4216 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4217 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4218 } else {
4219 // The 'control' of BEXTR has the pattern of:
4220 // [15...8 bit][ 7...0 bit] location
4221 // [ bit count][ shift] name
4222 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4223 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4224 if (Subtarget->hasTBM()) {
4225 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4226 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4227 } else {
4228 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4229 // BMI requires the immediate to be placed in a register.
4230 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4231 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4232 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4233 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4234 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4235 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4236 }
4237 }
4238
4239 MachineSDNode *NewNode;
4240 SDValue Input = N0->getOperand(0);
4241 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4242 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4243 SDValue Ops[] = {
4244 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4245 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4246 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4247 // Update the chain.
4248 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4249 // Record the mem-refs
4250 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4251 } else {
4252 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4253 }
4254
4255 if (!PreferBEXTR) {
4256 // We still need to apply the shift.
4257 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4258 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4259 : GET_ND_IF_ENABLED(X86::SHR32ri);
4260 NewNode =
4261 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4262 }
4263
4264 return NewNode;
4265}
4266
4267 // Emit a PCMPISTR(I/M) instruction.
4268MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4269 bool MayFoldLoad, const SDLoc &dl,
4270 MVT VT, SDNode *Node) {
4271 SDValue N0 = Node->getOperand(0);
4272 SDValue N1 = Node->getOperand(1);
4273 SDValue Imm = Node->getOperand(2);
4274 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4275 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4276
4277 // Try to fold a load. No need to check alignment.
4278 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4279 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4280 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4281 N1.getOperand(0) };
4282 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4283 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4284 // Update the chain.
4285 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4286 // Record the mem-refs
4287 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4288 return CNode;
4289 }
4290
4291 SDValue Ops[] = { N0, N1, Imm };
4292 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4293 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4294 return CNode;
4295}
4296
4297 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4298// to emit a second instruction after this one. This is needed since we have two
4299// copyToReg nodes glued before this and we need to continue that glue through.
4300MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4301 bool MayFoldLoad, const SDLoc &dl,
4302 MVT VT, SDNode *Node,
4303 SDValue &InGlue) {
4304 SDValue N0 = Node->getOperand(0);
4305 SDValue N2 = Node->getOperand(2);
4306 SDValue Imm = Node->getOperand(4);
4307 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4308 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4309
4310 // Try to fold a load. No need to check alignment.
4311 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4312 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4313 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4314 N2.getOperand(0), InGlue };
4315 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4316 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4317 InGlue = SDValue(CNode, 3);
4318 // Update the chain.
4319 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4320 // Record the mem-refs
4321 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4322 return CNode;
4323 }
4324
4325 SDValue Ops[] = { N0, N2, Imm, InGlue };
4326 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4327 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4328 InGlue = SDValue(CNode, 2);
4329 return CNode;
4330}
4331
4332bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4333 EVT VT = N->getValueType(0);
4334
4335 // Only handle scalar shifts.
4336 if (VT.isVector())
4337 return false;
4338
4339 // Narrower shifts only mask to 5 bits in hardware.
4340 unsigned Size = VT == MVT::i64 ? 64 : 32;
4341
4342 SDValue OrigShiftAmt = N->getOperand(1);
4343 SDValue ShiftAmt = OrigShiftAmt;
4344 SDLoc DL(N);
4345
4346 // Skip over a truncate of the shift amount.
4347 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4348 ShiftAmt = ShiftAmt->getOperand(0);
4349
4350 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4351 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4352
4353 SDValue NewShiftAmt;
4354 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4355 ShiftAmt->getOpcode() == ISD::XOR) {
4356 SDValue Add0 = ShiftAmt->getOperand(0);
4357 SDValue Add1 = ShiftAmt->getOperand(1);
4358 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4359 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4360 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4361 // to avoid the ADD/SUB/XOR.
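    // E.g. (illustrative) for a 32-bit shift, (shl %x, (add %amt, 32)) can
    // simply shift by %amt, since the hardware only consumes the low 5 bits
    // of the count.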
4362 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4363 NewShiftAmt = Add0;
4364
4365 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4366 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4367 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4368 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X,
4369 // we can replace it with a NOT. In the XOR case it may save some code
4370 // size; in the SUB case it may also save a move.
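      // E.g. (illustrative) for a 32-bit shift, (31 - x) and (31 ^ x) both
      // agree with ~x in the low 5 bits the hardware actually uses, so a
      // single NOT suffices.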
4371 assert(Add0C == nullptr || Add1C == nullptr);
4372
4373 // We can only do N-X, not X-N
4374 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4375 return false;
4376
4377 EVT OpVT = ShiftAmt.getValueType();
4378
4379 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4380 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4381 Add0C == nullptr ? Add0 : Add1, AllOnes);
4382 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4383 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4384 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4385 // -X to generate a NEG instead of a SUB of a constant.
4386 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4387 Add0C->getZExtValue() != 0) {
4388 EVT SubVT = ShiftAmt.getValueType();
4389 SDValue X;
4390 if (Add0C->getZExtValue() % Size == 0)
4391 X = Add1;
4392 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4393 Add0C->getZExtValue() % 32 == 0) {
4394 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4395 // This is mainly beneficial if we already compute (x+n*32).
4396 if (Add1.getOpcode() == ISD::TRUNCATE) {
4397 Add1 = Add1.getOperand(0);
4398 SubVT = Add1.getValueType();
4399 }
4400 if (Add0.getValueType() != SubVT) {
4401 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4402 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4403 }
4404
4405 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4406 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4407 } else
4408 return false;
4409 // Insert a negate op.
4410 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4411 // that uses it that's not a shift.
4412 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4413 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4414 NewShiftAmt = Neg;
4415
4416 // Insert these operands into a valid topological order so they can
4417 // get selected independently.
4418 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4419 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4420 } else
4421 return false;
4422 } else
4423 return false;
4424
4425 if (NewShiftAmt.getValueType() != MVT::i8) {
4426 // Need to truncate the shift amount.
4427 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4428 // Add to a correct topological ordering.
4429 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4430 }
4431
4432 // Insert a new mask to keep the shift amount legal. This should be removed
4433 // by isel patterns.
4434 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4435 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4436 // Place in a correct topological ordering.
4437 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4438
4439 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4440 NewShiftAmt);
4441 if (UpdatedNode != N) {
4442 // If we found an existing node, we should replace ourselves with that node
4443 // and wait for it to be selected after its other users.
4444 ReplaceNode(N, UpdatedNode);
4445 return true;
4446 }
4447
4448 // If the original shift amount is now dead, delete it so that we don't run
4449 // it through isel.
4450 if (OrigShiftAmt.getNode()->use_empty())
4451 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4452
4453 // Now that we've optimized the shift amount, defer to normal isel to get
4454 // load folding and legacy vs BMI2 selection without repeating it here.
4455 SelectCode(N);
4456 return true;
4457}
4458
4459bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4460 MVT NVT = N->getSimpleValueType(0);
4461 unsigned Opcode = N->getOpcode();
4462 SDLoc dl(N);
4463
4464 // For operations of the form (x << C1) op C2, check if we can use a smaller
4465 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
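  // E.g. (illustrative) (and (shl %x, 8), 0xff00) can be rewritten as
  // (shl (and %x, 0xff), 8), where the inner AND with 0xff may then be
  // selected as a zero-extending move instead of a wider immediate AND.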
4466 SDValue Shift = N->getOperand(0);
4467 SDValue N1 = N->getOperand(1);
4468
4469 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4470 if (!Cst)
4471 return false;
4472
4473 int64_t Val = Cst->getSExtValue();
4474
4475 // If we have an any_extend feeding the AND, look through it to see if there
4476 // is a shift behind it. But only if the AND doesn't use the extended bits.
4477 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4478 bool FoundAnyExtend = false;
4479 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4480 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4481 isUInt<32>(Val)) {
4482 FoundAnyExtend = true;
4483 Shift = Shift.getOperand(0);
4484 }
4485
4486 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4487 return false;
4488
4489 // i8 is unshrinkable, i16 should be promoted to i32.
4490 if (NVT != MVT::i32 && NVT != MVT::i64)
4491 return false;
4492
4493 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4494 if (!ShlCst)
4495 return false;
4496
4497 uint64_t ShAmt = ShlCst->getZExtValue();
4498
4499 // Make sure that we don't change the operation by removing bits.
4500 // This only matters for OR and XOR, AND is unaffected.
4501 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4502 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4503 return false;
4504
4505 // Check the minimum bitwidth for the new constant.
4506 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4507 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4508 if (Opcode == ISD::AND) {
4509 // AND32ri is the same as AND64ri32 with zext imm.
4510 // Try this before sign extended immediates below.
4511 ShiftedVal = (uint64_t)Val >> ShAmt;
4512 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4513 return true;
4514 // Also swap order when the AND can become MOVZX.
4515 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4516 return true;
4517 }
4518 ShiftedVal = Val >> ShAmt;
4519 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4520 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4521 return true;
4522 if (Opcode != ISD::AND) {
4523 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4524 ShiftedVal = (uint64_t)Val >> ShAmt;
4525 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4526 return true;
4527 }
4528 return false;
4529 };
4530
4531 int64_t ShiftedVal;
4532 if (!CanShrinkImmediate(ShiftedVal))
4533 return false;
4534
4535 // Ok, we can reorder to get a smaller immediate.
4536
4537 // But, it's possible the original immediate allowed an AND to become MOVZX.
4538 // Do this check late so the MaskedValueIsZero call below happens as late as
4539 // possible.
4540 if (Opcode == ISD::AND) {
4541 // Find the smallest zext this could possibly be.
4542 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4543 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4544
4545 // Figure out which bits need to be zero to achieve that mask.
4546 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4547 ZExtWidth);
4548 NeededMask &= ~Cst->getAPIntValue();
4549
4550 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4551 return false;
4552 }
4553
4554 SDValue X = Shift.getOperand(0);
4555 if (FoundAnyExtend) {
4556 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4557 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4558 X = NewX;
4559 }
4560
4561 SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4562 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4563 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4564 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4565 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4566 Shift.getOperand(1));
4567 ReplaceNode(N, NewSHL.getNode());
4568 SelectCode(NewSHL.getNode());
4569 return true;
4570}
4571
4572bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4573 SDNode *ParentB, SDNode *ParentC,
4574 SDValue A, SDValue B, SDValue C,
4575 uint8_t Imm) {
4576 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4577 C.isOperandOf(ParentC) && "Incorrect parent node");
4578
4579 auto tryFoldLoadOrBCast =
4580 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4581 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4582 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4583 return true;
4584
4585 // Not a load, check for broadcast which may be behind a bitcast.
4586 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4587 P = L.getNode();
4588 L = L.getOperand(0);
4589 }
4590
4591 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4592 return false;
4593
4594 // Only 32 and 64 bit broadcasts are supported.
4595 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4596 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4597 if (Size != 32 && Size != 64)
4598 return false;
4599
4600 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4601 };
4602
4603 bool FoldedLoad = false;
4604 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4605 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4606 FoldedLoad = true;
4607 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4608 Tmp4)) {
4609 FoldedLoad = true;
4610 std::swap(A, C);
4611 // Swap bits 1/4 and 3/6.
4612 uint8_t OldImm = Imm;
4613 Imm = OldImm & 0xa5;
4614 if (OldImm & 0x02) Imm |= 0x10;
4615 if (OldImm & 0x10) Imm |= 0x02;
4616 if (OldImm & 0x08) Imm |= 0x40;
4617 if (OldImm & 0x40) Imm |= 0x08;
4618 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4619 Tmp4)) {
4620 FoldedLoad = true;
4621 std::swap(B, C);
4622 // Swap bits 1/2 and 5/6.
4623 uint8_t OldImm = Imm;
4624 Imm = OldImm & 0x99;
4625 if (OldImm & 0x02) Imm |= 0x04;
4626 if (OldImm & 0x04) Imm |= 0x02;
4627 if (OldImm & 0x20) Imm |= 0x40;
4628 if (OldImm & 0x40) Imm |= 0x20;
4629 }
4630
4631 SDLoc DL(Root);
4632
4633 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4634
4635 MVT NVT = Root->getSimpleValueType(0);
4636
4637 MachineSDNode *MNode;
4638 if (FoldedLoad) {
4639 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4640
4641 unsigned Opc;
4642 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4643 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4644 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4645 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4646
4647 bool UseD = EltSize == 32;
4648 if (NVT.is128BitVector())
4649 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4650 else if (NVT.is256BitVector())
4651 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4652 else if (NVT.is512BitVector())
4653 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4654 else
4655 llvm_unreachable("Unexpected vector size!");
4656 } else {
4657 bool UseD = NVT.getVectorElementType() == MVT::i32;
4658 if (NVT.is128BitVector())
4659 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4660 else if (NVT.is256BitVector())
4661 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4662 else if (NVT.is512BitVector())
4663 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4664 else
4665 llvm_unreachable("Unexpected vector size!");
4666 }
4667
4668 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4669 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4670
4671 // Update the chain.
4672 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4673 // Record the mem-refs
4674 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4675 } else {
4676 bool UseD = NVT.getVectorElementType() == MVT::i32;
4677 unsigned Opc;
4678 if (NVT.is128BitVector())
4679 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4680 else if (NVT.is256BitVector())
4681 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4682 else if (NVT.is512BitVector())
4683 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4684 else
4685 llvm_unreachable("Unexpected vector size!");
4686
4687 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4688 }
4689
4690 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4691 CurDAG->RemoveDeadNode(Root);
4692 return true;
4693}
4694
4695// Try to match two logic ops to a VPTERNLOG.
4696// FIXME: Handle more complex patterns that use an operand more than once?
4697bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4698 MVT NVT = N->getSimpleValueType(0);
4699
4700 // Make sure we support VPTERNLOG.
4701 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4702 NVT.getVectorElementType() == MVT::i1)
4703 return false;
4704
4705 // We need VLX for 128/256-bit.
4706 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4707 return false;
4708
4709 auto getFoldableLogicOp = [](SDValue Op) {
4710 // Peek through single use bitcast.
4711 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4712 Op = Op.getOperand(0);
4713
4714 if (!Op.hasOneUse())
4715 return SDValue();
4716
4717 unsigned Opc = Op.getOpcode();
4718 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4719 Opc == X86ISD::ANDNP)
4720 return Op;
4721
4722 return SDValue();
4723 };
4724
4725 SDValue N0, N1, A, FoldableOp;
4726
4727 // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
4728 auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
4729 if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
4730 ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
4731 SDValue InnerOp = Op->getOperand(0);
4732
4733 if (!getFoldableLogicOp(InnerOp))
4734 return SDValue();
4735
4736 N0 = InnerOp.getOperand(0);
4737 N1 = InnerOp.getOperand(1);
4738 if ((FoldableOp = getFoldableLogicOp(N1))) {
4739 A = N0;
4740 return InnerOp;
4741 }
4742 if ((FoldableOp = getFoldableLogicOp(N0))) {
4743 A = N1;
4744 return InnerOp;
4745 }
4746 }
4747 return SDValue();
4748 };
4749
4750 bool PeeledOuterNot = false;
4751 SDNode *OriN = N;
4752 if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
4753 PeeledOuterNot = true;
4754 N = InnerOp.getNode();
4755 } else {
4756 N0 = N->getOperand(0);
4757 N1 = N->getOperand(1);
4758
4759 if ((FoldableOp = getFoldableLogicOp(N1)))
4760 A = N0;
4761 else if ((FoldableOp = getFoldableLogicOp(N0)))
4762 A = N1;
4763 else
4764 return false;
4765 }
4766
4767 SDValue B = FoldableOp.getOperand(0);
4768 SDValue C = FoldableOp.getOperand(1);
4769 SDNode *ParentA = N;
4770 SDNode *ParentB = FoldableOp.getNode();
4771 SDNode *ParentC = FoldableOp.getNode();
4772
4773 // We can build the appropriate control immediate by performing the logic
4774 // operation we're matching using these constants for A, B, and C.
4775 uint8_t TernlogMagicA = 0xf0;
4776 uint8_t TernlogMagicB = 0xcc;
4777 uint8_t TernlogMagicC = 0xaa;
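  // Put differently, each magic value is the truth table of its own input over
  // the eight (A,B,C) index combinations, with A supplying bit 2 of the index:
  // A = 0b11110000, B = 0b11001100, C = 0b10101010. As a worked example,
  // matching `or A, (and B, C)` yields Imm = (0xcc & 0xaa) | 0xf0 = 0xf8.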
4778
4779 // Some of the inputs may be inverted, peek through them and invert the
4780 // magic values accordingly.
4781 // TODO: There may be a bitcast before the xor that we should peek through.
4782 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4783 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4784 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4785 Magic = ~Magic;
4786 Parent = Op.getNode();
4787 Op = Op.getOperand(0);
4788 }
4789 };
4790
 4791  PeekThroughNot(A, ParentA, TernlogMagicA);
4792 PeekThroughNot(B, ParentB, TernlogMagicB);
4793 PeekThroughNot(C, ParentC, TernlogMagicC);
4794
4795 uint8_t Imm;
4796 switch (FoldableOp.getOpcode()) {
4797 default: llvm_unreachable("Unexpected opcode!");
4798 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4799 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4800 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4801 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4802 }
4803
4804 switch (N->getOpcode()) {
4805 default: llvm_unreachable("Unexpected opcode!");
4806 case X86ISD::ANDNP:
4807 if (A == N0)
4808 Imm &= ~TernlogMagicA;
4809 else
4810 Imm = ~(Imm) & TernlogMagicA;
4811 break;
4812 case ISD::AND: Imm &= TernlogMagicA; break;
4813 case ISD::OR: Imm |= TernlogMagicA; break;
4814 case ISD::XOR: Imm ^= TernlogMagicA; break;
4815 }
4816
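  // A NOT wrapped around the whole expression simply complements the truth
  // table. For example, ~(A | B | C) first matches to 0xf0 | 0xcc | 0xaa =
  // 0xfe, and ~0xfe = 0x01, the NOR table (only the A=B=C=0 row is one).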
4817 if (PeeledOuterNot)
4818 Imm = ~Imm;
4819
4820 return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
4821}
4822
4823/// If the high bits of an 'and' operand are known zero, try setting the
4824/// high bits of an 'and' constant operand to produce a smaller encoding by
4825/// creating a small, sign-extended negative immediate rather than a large
4826/// positive one. This reverses a transform in SimplifyDemandedBits that
4827/// shrinks mask constants by clearing bits. There is also a possibility that
4828/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4829/// case, just replace the 'and'. Return 'true' if the node is replaced.
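/// Illustrative case (assuming the known-bits query succeeds): for
/// (and i32 X, 0x7ffffff0) where the sign bit of X is known zero, the mask can
/// be widened to 0xfffffff0 (-16), which encodes as a sign-extended 8-bit
/// immediate instead of a 32-bit one.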
4830bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4831 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4832 // have immediate operands.
4833 MVT VT = And->getSimpleValueType(0);
4834 if (VT != MVT::i32 && VT != MVT::i64)
4835 return false;
4836
4837 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4838 if (!And1C)
4839 return false;
4840
4842  // Bail out if the mask constant is already negative; it can't shrink any more.
4842 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4843 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4844 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4845 // are negative too.
4846 APInt MaskVal = And1C->getAPIntValue();
4847 unsigned MaskLZ = MaskVal.countl_zero();
4848 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4849 return false;
4850
4851 // Don't extend into the upper 32 bits of a 64 bit mask.
4852 if (VT == MVT::i64 && MaskLZ >= 32) {
4853 MaskLZ -= 32;
4854 MaskVal = MaskVal.trunc(32);
4855 }
4856
4857 SDValue And0 = And->getOperand(0);
4858 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4859 APInt NegMaskVal = MaskVal | HighZeros;
4860
4861 // If a negative constant would not allow a smaller encoding, there's no need
4862 // to continue. Only change the constant when we know it's a win.
4863 unsigned MinWidth = NegMaskVal.getSignificantBits();
4864 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4865 return false;
4866
4867 // Extend masks if we truncated above.
4868 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4869 NegMaskVal = NegMaskVal.zext(64);
4870 HighZeros = HighZeros.zext(64);
4871 }
4872
4873 // The variable operand must be all zeros in the top bits to allow using the
4874 // new, negative constant as the mask.
4875 // TODO: Handle constant folding?
4876 KnownBits Known0 = CurDAG->computeKnownBits(And0);
4877 if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
4878 return false;
4879
4880 // Check if the mask is -1. In that case, this is an unnecessary instruction
4881 // that escaped earlier analysis.
4882 if (NegMaskVal.isAllOnes()) {
4883 ReplaceNode(And, And0.getNode());
4884 return true;
4885 }
4886
4887 // A negative mask allows a smaller encoding. Create a new 'and' node.
4888 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4889 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4890 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4891 ReplaceNode(And, NewAnd.getNode());
4892 SelectCode(NewAnd.getNode());
4893 return true;
4894}
4895
4896static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4897 bool FoldedBCast, bool Masked) {
4898#define VPTESTM_CASE(VT, SUFFIX) \
4899case MVT::VT: \
4900 if (Masked) \
4901 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4902 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4903
4904
4905#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4906default: llvm_unreachable("Unexpected VT!"); \
4907VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4908VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4909VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4910VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4911VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4912VPTESTM_CASE(v8i64, QZ##SUFFIX)
4913
4914#define VPTESTM_FULL_CASES(SUFFIX) \
4915VPTESTM_BROADCAST_CASES(SUFFIX) \
4916VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4917VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4918VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4919VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4920VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4921VPTESTM_CASE(v32i16, WZ##SUFFIX)
4922
4923 if (FoldedBCast) {
4924 switch (TestVT.SimpleTy) {
4925 VPTESTM_BROADCAST_CASES(rmb)
4926 }
4927 }
4928
4929 if (FoldedLoad) {
4930 switch (TestVT.SimpleTy) {
4931 VPTESTM_FULL_CASES(rm)
4932 }
4933 }
4934
4935 switch (TestVT.SimpleTy) {
4936 VPTESTM_FULL_CASES(rr)
4937 }
4938
4939#undef VPTESTM_FULL_CASES
4940#undef VPTESTM_BROADCAST_CASES
4941#undef VPTESTM_CASE
4942}
4943
4944// Try to create VPTESTM instruction. If InMask is not null, it will be used
4945// to form a masked operation.
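// VPTESTM sets mask bit i when (Src1[i] & Src2[i]) != 0 and VPTESTNM sets it
// when that AND is zero, so a vector setcc of (and X, Y) against an all-zeros
// vector with an eq/ne predicate maps directly onto one of these instructions
// with X and Y as the two sources.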
4946bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4947 SDValue InMask) {
4948 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4949 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4950 "Unexpected VT!");
4951
4952 // Look for equal and not equal compares.
4953 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4954 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4955 return false;
4956
4957 SDValue SetccOp0 = Setcc.getOperand(0);
4958 SDValue SetccOp1 = Setcc.getOperand(1);
4959
4960 // Canonicalize the all zero vector to the RHS.
4961 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4962 std::swap(SetccOp0, SetccOp1);
4963
4964 // See if we're comparing against zero.
4965 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4966 return false;
4967
4968 SDValue N0 = SetccOp0;
4969
4970 MVT CmpVT = N0.getSimpleValueType();
4971 MVT CmpSVT = CmpVT.getVectorElementType();
4972
4973 // Start with both operands the same. We'll try to refine this.
4974 SDValue Src0 = N0;
4975 SDValue Src1 = N0;
4976
4977 {
4978 // Look through single use bitcasts.
4979 SDValue N0Temp = N0;
4980 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4981 N0Temp = N0.getOperand(0);
4982
4983 // Look for single use AND.
4984 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4985 Src0 = N0Temp.getOperand(0);
4986 Src1 = N0Temp.getOperand(1);
4987 }
4988 }
4989
4990 // Without VLX we need to widen the operation.
4991 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4992
4993 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4994 SDValue &Base, SDValue &Scale, SDValue &Index,
4995 SDValue &Disp, SDValue &Segment) {
4996 // If we need to widen, we can't fold the load.
4997 if (!Widen)
4998 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4999 return true;
5000
5001 // If we didn't fold a load, try to match a broadcast. There is no widening
5002 // limitation for this, but only 32 and 64 bit element types are supported.
5003 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
5004 return false;
5005
5006 // Look through single use bitcasts.
5007 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
5008 P = L.getNode();
5009 L = L.getOperand(0);
5010 }
5011
5012 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
5013 return false;
5014
5015 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
5016 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
5017 return false;
5018
5019 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
5020 };
5021
5022 // We can only fold loads if the sources are unique.
5023 bool CanFoldLoads = Src0 != Src1;
5024
5025 bool FoldedLoad = false;
5026 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5027 if (CanFoldLoads) {
5028 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
5029 Tmp3, Tmp4);
5030 if (!FoldedLoad) {
5031 // And is commutative.
5032 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
5033 Tmp2, Tmp3, Tmp4);
5034 if (FoldedLoad)
5035 std::swap(Src0, Src1);
5036 }
5037 }
5038
5039 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
5040
5041 bool IsMasked = InMask.getNode() != nullptr;
5042
5043 SDLoc dl(Root);
5044
5045 MVT ResVT = Setcc.getSimpleValueType();
5046 MVT MaskVT = ResVT;
5047 if (Widen) {
5048 // Widen the inputs using insert_subreg or copy_to_regclass.
5049 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5050 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5051 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5052 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5053 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5054 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5055 CmpVT), 0);
5056 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5057
5058 if (!FoldedBCast)
5059 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5060
5061 if (IsMasked) {
5062 // Widen the mask.
5063 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5064 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5065 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5066 dl, MaskVT, InMask, RC), 0);
5067 }
5068 }
5069
5070 bool IsTestN = CC == ISD::SETEQ;
5071 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5072 IsMasked);
5073
5074 MachineSDNode *CNode;
5075 if (FoldedLoad) {
5076 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5077
5078 if (IsMasked) {
5079 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5080 Src1.getOperand(0) };
5081 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5082 } else {
5083 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5084 Src1.getOperand(0) };
5085 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5086 }
5087
5088 // Update the chain.
5089 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5090 // Record the mem-refs
5091 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5092 } else {
5093 if (IsMasked)
5094 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5095 else
5096 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5097 }
5098
5099 // If we widened, we need to shrink the mask VT.
5100 if (Widen) {
5101 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5102 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5103 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5104 dl, ResVT, SDValue(CNode, 0), RC);
5105 }
5106
5107 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5108 CurDAG->RemoveDeadNode(Root);
5109 return true;
5110}
5111
5112// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5113// into vpternlog.
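// The 0xCA constant used below is the truth table of the bitselect itself:
// evaluating (A & B) | (~A & C) with the usual VPTERNLOG magic values gives
// (0xf0 & 0xcc) | (~0xf0 & 0xaa) = 0xc0 | 0x0a = 0xca.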
5114bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5115 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5116
5117 MVT NVT = N->getSimpleValueType(0);
5118
5119 // Make sure we support VPTERNLOG.
5120 if (!NVT.isVector() || !Subtarget->hasAVX512())
5121 return false;
5122
5123 // We need VLX for 128/256-bit.
5124 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5125 return false;
5126
5127 SDValue N0 = N->getOperand(0);
5128 SDValue N1 = N->getOperand(1);
5129
5130 // Canonicalize AND to LHS.
5131 if (N1.getOpcode() == ISD::AND)
5132 std::swap(N0, N1);
5133
5134 if (N0.getOpcode() != ISD::AND ||
5135 N1.getOpcode() != X86ISD::ANDNP ||
5136 !N0.hasOneUse() || !N1.hasOneUse())
5137 return false;
5138
5139  // ANDN is not commutable, so use it to pin down A and C.
5140 SDValue A = N1.getOperand(0);
5141 SDValue C = N1.getOperand(1);
5142
5143  // AND is commutable: if one operand matches A, the other operand is B.
5144 // Otherwise this isn't a match.
5145 SDValue B;
5146 if (N0.getOperand(0) == A)
5147 B = N0.getOperand(1);
5148 else if (N0.getOperand(1) == A)
5149 B = N0.getOperand(0);
5150 else
5151 return false;
5152
5153 SDLoc dl(N);
5154 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5155 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5156 ReplaceNode(N, Ternlog.getNode());
5157
5158 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5159 Ternlog.getNode(), A, B, C, 0xCA);
5160}
5161
5162void X86DAGToDAGISel::Select(SDNode *Node) {
5163 MVT NVT = Node->getSimpleValueType(0);
5164 unsigned Opcode = Node->getOpcode();
5165 SDLoc dl(Node);
5166
5167 if (Node->isMachineOpcode()) {
5168 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5169 Node->setNodeId(-1);
5170 return; // Already selected.
5171 }
5172
5173 switch (Opcode) {
5174 default: break;
5175 case ISD::INTRINSIC_W_CHAIN: {
5176 unsigned IntNo = Node->getConstantOperandVal(1);
5177 switch (IntNo) {
5178 default: break;
5179 case Intrinsic::x86_encodekey128:
5180 case Intrinsic::x86_encodekey256: {
5181 if (!Subtarget->hasKL())
5182 break;
5183
5184 unsigned Opcode;
5185 switch (IntNo) {
5186 default: llvm_unreachable("Impossible intrinsic");
5187 case Intrinsic::x86_encodekey128:
5188 Opcode = X86::ENCODEKEY128;
5189 break;
5190 case Intrinsic::x86_encodekey256:
5191 Opcode = X86::ENCODEKEY256;
5192 break;
5193 }
5194
5195 SDValue Chain = Node->getOperand(0);
5196 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5197 SDValue());
5198 if (Opcode == X86::ENCODEKEY256)
5199 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5200 Chain.getValue(1));
5201
5202 MachineSDNode *Res = CurDAG->getMachineNode(
5203 Opcode, dl, Node->getVTList(),
5204 {Node->getOperand(2), Chain, Chain.getValue(1)});
5205 ReplaceNode(Node, Res);
5206 return;
5207 }
5208 case Intrinsic::x86_tileloaddrs64_internal:
5209 case Intrinsic::x86_tileloaddrst164_internal:
5210 if (!Subtarget->hasAMXMOVRS())
5211 break;
5212 [[fallthrough]];
5213 case Intrinsic::x86_tileloadd64_internal:
5214 case Intrinsic::x86_tileloaddt164_internal: {
5215 if (!Subtarget->hasAMXTILE())
5216 break;
5217 auto *MFI =
5218 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5219 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5220 unsigned Opc;
5221 switch (IntNo) {
5222 default:
5223 llvm_unreachable("Unexpected intrinsic!");
5224 case Intrinsic::x86_tileloaddrs64_internal:
5225 Opc = X86::PTILELOADDRSV;
5226 break;
5227 case Intrinsic::x86_tileloaddrst164_internal:
5228 Opc = X86::PTILELOADDRST1V;
5229 break;
5230 case Intrinsic::x86_tileloadd64_internal:
5231 Opc = X86::PTILELOADDV;
5232 break;
5233 case Intrinsic::x86_tileloaddt164_internal:
5234 Opc = X86::PTILELOADDT1V;
5235 break;
5236 }
5237 // _tile_loadd_internal(row, col, buf, STRIDE)
5238 SDValue Base = Node->getOperand(4);
5239 SDValue Scale = getI8Imm(1, dl);
5240 SDValue Index = Node->getOperand(5);
5241 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5242 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5243 SDValue Chain = Node->getOperand(0);
5244 MachineSDNode *CNode;
5245 SDValue Ops[] = {Node->getOperand(2),
5246 Node->getOperand(3),
5247 Base,
5248 Scale,
5249 Index,
5250 Disp,
5251 Segment,
5252 Chain};
5253 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5254 ReplaceNode(Node, CNode);
5255 return;
5256 }
5257 }
5258 break;
5259 }
5260 case ISD::INTRINSIC_VOID: {
5261 unsigned IntNo = Node->getConstantOperandVal(1);
5262 switch (IntNo) {
5263 default: break;
5264 case Intrinsic::x86_sse3_monitor:
5265 case Intrinsic::x86_monitorx:
5266 case Intrinsic::x86_clzero: {
5267 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5268
5269 unsigned Opc = 0;
5270 switch (IntNo) {
5271 default: llvm_unreachable("Unexpected intrinsic!");
5272 case Intrinsic::x86_sse3_monitor:
5273 if (!Subtarget->hasSSE3())
5274 break;
5275 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5276 break;
5277 case Intrinsic::x86_monitorx:
5278 if (!Subtarget->hasMWAITX())
5279 break;
5280 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5281 break;
5282 case Intrinsic::x86_clzero:
5283 if (!Subtarget->hasCLZERO())
5284 break;
5285 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5286 break;
5287 }
5288
5289 if (Opc) {
5290 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5291 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5292 Node->getOperand(2), SDValue());
5293 SDValue InGlue = Chain.getValue(1);
5294
5295 if (IntNo == Intrinsic::x86_sse3_monitor ||
5296 IntNo == Intrinsic::x86_monitorx) {
5297 // Copy the other two operands to ECX and EDX.
5298 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5299 InGlue);
5300 InGlue = Chain.getValue(1);
5301 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5302 InGlue);
5303 InGlue = Chain.getValue(1);
5304 }
5305
5306 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5307 { Chain, InGlue});
5308 ReplaceNode(Node, CNode);
5309 return;
5310 }
5311
5312 break;
5313 }
5314 case Intrinsic::x86_tilestored64_internal: {
5315 auto *MFI =
5316 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5317 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5318 unsigned Opc = X86::PTILESTOREDV;
5319 // _tile_stored_internal(row, col, buf, STRIDE, c)
5320 SDValue Base = Node->getOperand(4);
5321 SDValue Scale = getI8Imm(1, dl);
5322 SDValue Index = Node->getOperand(5);
5323 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5324 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5325 SDValue Chain = Node->getOperand(0);
5326 MachineSDNode *CNode;
5327 SDValue Ops[] = {Node->getOperand(2),
5328 Node->getOperand(3),
5329 Base,
5330 Scale,
5331 Index,
5332 Disp,
5333 Segment,
5334 Node->getOperand(6),
5335 Chain};
5336 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5337 ReplaceNode(Node, CNode);
5338 return;
5339 }
5340 case Intrinsic::x86_tileloaddrs64:
5341 case Intrinsic::x86_tileloaddrst164:
5342 if (!Subtarget->hasAMXMOVRS())
5343 break;
5344 [[fallthrough]];
5345 case Intrinsic::x86_tileloadd64:
5346 case Intrinsic::x86_tileloaddt164:
5347 case Intrinsic::x86_tilestored64: {
5348 if (!Subtarget->hasAMXTILE())
5349 break;
5350 auto *MFI =
5351 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5352 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5353 unsigned Opc;
5354 switch (IntNo) {
5355 default: llvm_unreachable("Unexpected intrinsic!");
5356 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5357 case Intrinsic::x86_tileloaddrs64:
5358 Opc = X86::PTILELOADDRS;
5359 break;
5360 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5361 case Intrinsic::x86_tileloaddrst164:
5362 Opc = X86::PTILELOADDRST1;
5363 break;
5364 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5365 }
5366 // FIXME: Match displacement and scale.
5367 unsigned TIndex = Node->getConstantOperandVal(2);
5368 SDValue TReg = getI8Imm(TIndex, dl);
5369 SDValue Base = Node->getOperand(3);
5370 SDValue Scale = getI8Imm(1, dl);
5371 SDValue Index = Node->getOperand(4);
5372 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5373 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5374 SDValue Chain = Node->getOperand(0);
5375 MachineSDNode *CNode;
5376 if (Opc == X86::PTILESTORED) {
5377 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5378 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5379 } else {
5380 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5381 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5382 }
5383 ReplaceNode(Node, CNode);
5384 return;
5385 }
5386 }
5387 break;
5388 }
5389 case ISD::BRIND:
5390 case X86ISD::NT_BRIND: {
5391 if (Subtarget->isTarget64BitILP32()) {
5392 // Converts a 32-bit register to a 64-bit, zero-extended version of
5393 // it. This is needed because x86-64 can do many things, but jmp %r32
5394 // ain't one of them.
5395 SDValue Target = Node->getOperand(1);
5396 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5397 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5398 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5399 Node->getOperand(0), ZextTarget);
5400 ReplaceNode(Node, Brind.getNode());
5401 SelectCode(ZextTarget.getNode());
5402 SelectCode(Brind.getNode());
5403 return;
5404 }
5405 break;
5406 }
5407 case X86ISD::GlobalBaseReg:
5408 ReplaceNode(Node, getGlobalBaseReg());
5409 return;
5410
5411 case ISD::BITCAST:
5412 // Just drop all 128/256/512-bit bitcasts.
5413 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5414 NVT == MVT::f128) {
5415 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5416 CurDAG->RemoveDeadNode(Node);
5417 return;
5418 }
5419 break;
5420
5421 case ISD::SRL:
5422 if (matchBitExtract(Node))
5423 return;
5424 [[fallthrough]];
5425 case ISD::SRA:
5426 case ISD::SHL:
5427 if (tryShiftAmountMod(Node))
5428 return;
5429 break;
5430
5431 case X86ISD::VPTERNLOG: {
5432 uint8_t Imm = Node->getConstantOperandVal(3);
5433 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5434 Node->getOperand(1), Node->getOperand(2), Imm))
5435 return;
5436 break;
5437 }
5438
5439 case X86ISD::ANDNP:
5440 if (tryVPTERNLOG(Node))
5441 return;
5442 break;
5443
5444 case ISD::AND:
5445 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5446 // Try to form a masked VPTESTM. Operands can be in either order.
5447 SDValue N0 = Node->getOperand(0);
5448 SDValue N1 = Node->getOperand(1);
5449 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5450 tryVPTESTM(Node, N0, N1))
5451 return;
5452 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5453 tryVPTESTM(Node, N1, N0))
5454 return;
5455 }
5456
5457 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5458 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5459 CurDAG->RemoveDeadNode(Node);
5460 return;
5461 }
5462 if (matchBitExtract(Node))
5463 return;
5464 if (AndImmShrink && shrinkAndImmediate(Node))
5465 return;
5466
5467 [[fallthrough]];
5468 case ISD::OR:
5469 case ISD::XOR:
5470 if (tryShrinkShlLogicImm(Node))
5471 return;
5472 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5473 return;
5474 if (tryVPTERNLOG(Node))
5475 return;
5476
5477 [[fallthrough]];
5478 case ISD::ADD:
5479 if (Opcode == ISD::ADD && matchBitExtract(Node))
5480 return;
5481 [[fallthrough]];
5482 case ISD::SUB: {
5483 // Try to avoid folding immediates with multiple uses for optsize.
5484 // This code tries to select the register form directly to avoid going
5485 // through the isel table, which might fold the immediate. We can't change
5486 // the add/sub/and/or/xor with-immediate patterns in the tablegen files to
5487 // check the immediate use count without making those patterns unavailable
5488 // to the fast-isel table.
5489 if (!CurDAG->shouldOptForSize())
5490 break;
5491
5492 // Only handle i8/i16/i32/i64.
5493 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5494 break;
5495
5496 SDValue N0 = Node->getOperand(0);
5497 SDValue N1 = Node->getOperand(1);
5498
5499 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5500 if (!Cst)
5501 break;
5502
5503 int64_t Val = Cst->getSExtValue();
5504
5505 // Make sure it's an immediate that is considered foldable.
5506 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5507 if (!isInt<8>(Val) && !isInt<32>(Val))
5508 break;
5509
5510 // If this can match to INC/DEC, let it go.
5511 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5512 break;
5513
5514 // Check if we should avoid folding this immediate.
5515 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5516 break;
5517
5518 // We should not fold the immediate. So we need a register form instead.
5519 unsigned ROpc, MOpc;
5520 switch (NVT.SimpleTy) {
5521 default: llvm_unreachable("Unexpected VT!");
5522 case MVT::i8:
5523 switch (Opcode) {
5524 default: llvm_unreachable("Unexpected opcode!");
5525 case ISD::ADD:
5526 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5527 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5528 break;
5529 case ISD::SUB:
5530 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5531 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5532 break;
5533 case ISD::AND:
5534 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5535 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5536 break;
5537 case ISD::OR:
5538 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5539 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5540 break;
5541 case ISD::XOR:
5542 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5543 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5544 break;
5545 }
5546 break;
5547 case MVT::i16:
5548 switch (Opcode) {
5549 default: llvm_unreachable("Unexpected opcode!");
5550 case ISD::ADD:
5551 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5552 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5553 break;
5554 case ISD::SUB:
5555 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5556 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5557 break;
5558 case ISD::AND:
5559 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5560 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5561 break;
5562 case ISD::OR:
5563 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5564 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5565 break;
5566 case ISD::XOR:
5567 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5568 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5569 break;
5570 }
5571 break;
5572 case MVT::i32:
5573 switch (Opcode) {
5574 default: llvm_unreachable("Unexpected opcode!");
5575 case ISD::ADD:
5576 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5577 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5578 break;
5579 case ISD::SUB:
5580 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5581 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5582 break;
5583 case ISD::AND:
5584 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5585 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5586 break;
5587 case ISD::OR:
5588 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5589 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5590 break;
5591 case ISD::XOR:
5592 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5593 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5594 break;
5595 }
5596 break;
5597 case MVT::i64:
5598 switch (Opcode) {
5599 default: llvm_unreachable("Unexpected opcode!");
5600 case ISD::ADD:
5601 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5602 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5603 break;
5604 case ISD::SUB:
5605 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5606 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5607 break;
5608 case ISD::AND:
5609 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5610 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5611 break;
5612 case ISD::OR:
5613 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5614 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5615 break;
5616 case ISD::XOR:
5617 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5618 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5619 break;
5620 }
5621 break;
5622 }
5623
5624 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5625
5626 // If this is not a subtract, we can still try to fold a load.
5627 if (Opcode != ISD::SUB) {
5628 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5629 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5630 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5631 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5632 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5633 // Update the chain.
5634 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5635 // Record the mem-refs
5636 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5637 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5638 CurDAG->RemoveDeadNode(Node);
5639 return;
5640 }
5641 }
5642
5643 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5644 return;
5645 }
5646
5647 case X86ISD::SMUL:
5648 // i16/i32/i64 are handled with isel patterns.
5649 if (NVT != MVT::i8)
5650 break;
5651 [[fallthrough]];
5652 case X86ISD::UMUL: {
5653 SDValue N0 = Node->getOperand(0);
5654 SDValue N1 = Node->getOperand(1);
5655
5656 unsigned LoReg, ROpc, MOpc;
5657 switch (NVT.SimpleTy) {
5658 default: llvm_unreachable("Unsupported VT!");
5659 case MVT::i8:
5660 LoReg = X86::AL;
5661 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5662 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5663 break;
5664 case MVT::i16:
5665 LoReg = X86::AX;
5666 ROpc = X86::MUL16r;
5667 MOpc = X86::MUL16m;
5668 break;
5669 case MVT::i32:
5670 LoReg = X86::EAX;
5671 ROpc = X86::MUL32r;
5672 MOpc = X86::MUL32m;
5673 break;
5674 case MVT::i64:
5675 LoReg = X86::RAX;
5676 ROpc = X86::MUL64r;
5677 MOpc = X86::MUL64m;
5678 break;
5679 }
5680
5681 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5682 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5683 // Multiply is commutative.
5684 if (!FoldedLoad) {
5685 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5686 if (FoldedLoad)
5687 std::swap(N0, N1);
5688 }
5689
5690 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5691 N0, SDValue()).getValue(1);
5692
5693 MachineSDNode *CNode;
5694 if (FoldedLoad) {
5695 // i16/i32/i64 use an instruction that produces a low and high result even
5696 // though only the low result is used.
5697 SDVTList VTs;
5698 if (NVT == MVT::i8)
5699 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5700 else
5701 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5702
5703 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5704 InGlue };
5705 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5706
5707 // Update the chain.
5708 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5709 // Record the mem-refs
5710 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5711 } else {
5712 // i16/i32/i64 use an instruction that produces a low and high result even
5713 // though only the low result is used.
5714 SDVTList VTs;
5715 if (NVT == MVT::i8)
5716 VTs = CurDAG->getVTList(NVT, MVT::i32);
5717 else
5718 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5719
5720 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5721 }
5722
5723 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5724 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5725 CurDAG->RemoveDeadNode(Node);
5726 return;
5727 }
5728
5729 case ISD::SMUL_LOHI:
5730 case ISD::UMUL_LOHI: {
5731 SDValue N0 = Node->getOperand(0);
5732 SDValue N1 = Node->getOperand(1);
5733
5734 unsigned Opc, MOpc;
5735 unsigned LoReg, HiReg;
5736 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5737 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5738 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
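    // MULX (BMI2) multiplies the implicit EDX/RDX source by its explicit
    // operand, writes the high and low halves of the product to two
    // destination registers, and leaves EFLAGS untouched; that is why LoReg
    // becomes EDX/RDX when UseMULX is set. The "H" variants above are selected
    // when only the high half of the product is actually used.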
5739 switch (NVT.SimpleTy) {
5740 default: llvm_unreachable("Unsupported VT!");
5741 case MVT::i32:
5742 Opc = UseMULXHi ? X86::MULX32Hrr
5743 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5744 : IsSigned ? X86::IMUL32r
5745 : X86::MUL32r;
5746 MOpc = UseMULXHi ? X86::MULX32Hrm
5747 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5748 : IsSigned ? X86::IMUL32m
5749 : X86::MUL32m;
5750 LoReg = UseMULX ? X86::EDX : X86::EAX;
5751 HiReg = X86::EDX;
5752 break;
5753 case MVT::i64:
5754 Opc = UseMULXHi ? X86::MULX64Hrr
5755 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5756 : IsSigned ? X86::IMUL64r
5757 : X86::MUL64r;
5758 MOpc = UseMULXHi ? X86::MULX64Hrm
5759 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5760 : IsSigned ? X86::IMUL64m
5761 : X86::MUL64m;
5762 LoReg = UseMULX ? X86::RDX : X86::RAX;
5763 HiReg = X86::RDX;
5764 break;
5765 }
5766
5767 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5768 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5769 // Multiply is commutative.
5770 if (!foldedLoad) {
5771 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5772 if (foldedLoad)
5773 std::swap(N0, N1);
5774 }
5775
5776 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5777 N0, SDValue()).getValue(1);
5778 SDValue ResHi, ResLo;
5779 if (foldedLoad) {
5780 SDValue Chain;
5781 MachineSDNode *CNode = nullptr;
5782 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5783 InGlue };
5784 if (UseMULXHi) {
5785 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5786 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5787 ResHi = SDValue(CNode, 0);
5788 Chain = SDValue(CNode, 1);
5789 } else if (UseMULX) {
5790 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5791 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5792 ResHi = SDValue(CNode, 0);
5793 ResLo = SDValue(CNode, 1);
5794 Chain = SDValue(CNode, 2);
5795 } else {
5796 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5797 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5798 Chain = SDValue(CNode, 0);
5799 InGlue = SDValue(CNode, 1);
5800 }
5801
5802 // Update the chain.
5803 ReplaceUses(N1.getValue(1), Chain);
5804 // Record the mem-refs
5805 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5806 } else {
5807 SDValue Ops[] = { N1, InGlue };
5808 if (UseMULXHi) {
5809 SDVTList VTs = CurDAG->getVTList(NVT);
5810 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5811 ResHi = SDValue(CNode, 0);
5812 } else if (UseMULX) {
5813 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5814 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5815 ResHi = SDValue(CNode, 0);
5816 ResLo = SDValue(CNode, 1);
5817 } else {
5818 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5819 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5820 InGlue = SDValue(CNode, 0);
5821 }
5822 }
5823
5824 // Copy the low half of the result, if it is needed.
5825 if (!SDValue(Node, 0).use_empty()) {
5826 if (!ResLo) {
5827 assert(LoReg && "Register for low half is not defined!");
5828 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5829 NVT, InGlue);
5830 InGlue = ResLo.getValue(2);
5831 }
5832 ReplaceUses(SDValue(Node, 0), ResLo);
5833 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5834 dbgs() << '\n');
5835 }
5836 // Copy the high half of the result, if it is needed.
5837 if (!SDValue(Node, 1).use_empty()) {
5838 if (!ResHi) {
5839 assert(HiReg && "Register for high half is not defined!");
5840 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5841 NVT, InGlue);
5842 InGlue = ResHi.getValue(2);
5843 }
5844 ReplaceUses(SDValue(Node, 1), ResHi);
5845 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5846 dbgs() << '\n');
5847 }
5848
5849 CurDAG->RemoveDeadNode(Node);
5850 return;
5851 }
5852
5853 case ISD::SDIVREM:
5854 case ISD::UDIVREM: {
5855 SDValue N0 = Node->getOperand(0);
5856 SDValue N1 = Node->getOperand(1);
5857
5858 unsigned ROpc, MOpc;
5859 bool isSigned = Opcode == ISD::SDIVREM;
5860 if (!isSigned) {
5861 switch (NVT.SimpleTy) {
5862 default: llvm_unreachable("Unsupported VT!");
5863 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5864 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5865 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5866 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5867 }
5868 } else {
5869 switch (NVT.SimpleTy) {
5870 default: llvm_unreachable("Unsupported VT!");
5871 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5872 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5873 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5874 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5875 }
5876 }
5877
5878 unsigned LoReg, HiReg, ClrReg;
5879 unsigned SExtOpcode;
5880 switch (NVT.SimpleTy) {
5881 default: llvm_unreachable("Unsupported VT!");
5882 case MVT::i8:
5883 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5884 SExtOpcode = 0; // Not used.
5885 break;
5886 case MVT::i16:
5887 LoReg = X86::AX; HiReg = X86::DX;
5888 ClrReg = X86::DX;
5889 SExtOpcode = X86::CWD;
5890 break;
5891 case MVT::i32:
5892 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5893 SExtOpcode = X86::CDQ;
5894 break;
5895 case MVT::i64:
5896 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5897 SExtOpcode = X86::CQO;
5898 break;
5899 }
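    // DIV/IDIV divide the double-width dividend held in AH:AL, DX:AX, EDX:EAX
    // or RDX:RAX by the operand, leaving the quotient in the low register and
    // the remainder in the high one, so the code below first widens the
    // dividend: it zeroes the high half (unsigned division, or when the sign
    // bit is known zero) or sign-extends into it with CWD/CDQ/CQO.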
5900
5901 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5902 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5903 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5904
5905 SDValue InGlue;
5906 if (NVT == MVT::i8) {
5907 // Special case for div8, just use a move with zero extension to AX to
5908 // clear the upper 8 bits (AH).
5909 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5910 MachineSDNode *Move;
5911 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5912 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5913 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5914 : X86::MOVZX16rm8;
5915 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5916 Chain = SDValue(Move, 1);
5917 ReplaceUses(N0.getValue(1), Chain);
5918 // Record the mem-refs
5919 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5920 } else {
5921 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5922 : X86::MOVZX16rr8;
5923 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5924 Chain = CurDAG->getEntryNode();
5925 }
5926 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5927 SDValue());
5928 InGlue = Chain.getValue(1);
5929 } else {
5930 InGlue =
5931 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5932 LoReg, N0, SDValue()).getValue(1);
5933 if (isSigned && !signBitIsZero) {
5934 // Sign extend the low part into the high part.
5935 InGlue =
5936 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5937 } else {
5938 // Zero out the high part, effectively zero extending the input.
5939 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5940 SDValue ClrNode =
5941 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
5942 switch (NVT.SimpleTy) {
5943 case MVT::i16:
5944 ClrNode =
5945 SDValue(CurDAG->getMachineNode(
5946 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5947 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5948 MVT::i32)),
5949 0);
5950 break;
5951 case MVT::i32:
5952 break;
5953 case MVT::i64:
5954 ClrNode =
5955 SDValue(CurDAG->getMachineNode(
5956 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5957 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5958 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5959 MVT::i32)),
5960 0);
5961 break;
5962 default:
5963 llvm_unreachable("Unexpected division source");
5964 }
5965
5966 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5967 ClrNode, InGlue).getValue(1);
5968 }
5969 }
5970
5971 if (foldedLoad) {
5972 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5973 InGlue };
5974 MachineSDNode *CNode =
5975 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5976 InGlue = SDValue(CNode, 1);
5977 // Update the chain.
5978 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5979 // Record the mem-refs
5980 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5981 } else {
5982 InGlue =
5983 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5984 }
5985
5986 // Prevent use of AH in a REX instruction by explicitly copying it to
5987 // an ABCD_L register.
5988 //
5989 // The current assumption of the register allocator is that isel
5990 // won't generate explicit references to the GR8_ABCD_H registers. If
5991 // the allocator and/or the backend get enhanced to be more robust in
5992 // that regard, this can be, and should be, removed.
5993 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5994 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5995 unsigned AHExtOpcode =
5996 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5997
5998 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5999 MVT::Glue, AHCopy, InGlue);
6000 SDValue Result(RNode, 0);
6001 InGlue = SDValue(RNode, 1);
6002
6003 Result =
6004 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6005
6006 ReplaceUses(SDValue(Node, 1), Result);
6007 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6008 dbgs() << '\n');
6009 }
6010 // Copy the division (low) result, if it is needed.
6011 if (!SDValue(Node, 0).use_empty()) {
6012 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6013 LoReg, NVT, InGlue);
6014 InGlue = Result.getValue(2);
6015 ReplaceUses(SDValue(Node, 0), Result);
6016 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6017 dbgs() << '\n');
6018 }
6019 // Copy the remainder (high) result, if it is needed.
6020 if (!SDValue(Node, 1).use_empty()) {
6021 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6022 HiReg, NVT, InGlue);
6023 InGlue = Result.getValue(2);
6024 ReplaceUses(SDValue(Node, 1), Result);
6025 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6026 dbgs() << '\n');
6027 }
6028 CurDAG->RemoveDeadNode(Node);
6029 return;
6030 }
6031
6032 case X86ISD::FCMP:
6033 case X86ISD::STRICT_FCMP:
6034 case X86ISD::STRICT_FCMPS: {
6035 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6036 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6037 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6038 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6039
6040 // Save the original VT of the compare.
6041 MVT CmpVT = N0.getSimpleValueType();
6042
6043 // Floating point needs special handling if we don't have FCOMI.
6044 if (Subtarget->canUseCMOV())
6045 break;
6046
6047 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6048
6049 unsigned Opc;
6050 switch (CmpVT.SimpleTy) {
6051 default: llvm_unreachable("Unexpected type!");
6052 case MVT::f32:
6053 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6054 break;
6055 case MVT::f64:
6056 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6057 break;
6058 case MVT::f80:
6059 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6060 break;
6061 }
6062
6063 SDValue Chain =
6064 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6065 SDValue Glue;
6066 if (IsStrictCmp) {
6067 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6068 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6069 Glue = Chain.getValue(1);
6070 } else {
6071 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6072 }
6073
6074 // Move FPSW to AX.
6075 SDValue FNSTSW =
6076 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6077
6078 // Extract upper 8-bits of AX.
6079 SDValue Extract =
6080 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6081
6082 // Move AH into flags.
6083 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6084 assert(Subtarget->canUseLAHFSAHF() &&
6085 "Target doesn't support SAHF or FCOMI?");
6086 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6087 Chain = AH;
6088 SDValue SAHF = SDValue(
6089 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
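    // SAHF loads AH into SF:ZF:AF:PF:CF, so after FNSTSW the FPU condition
    // bits C3/C2/C0 (bits 6/2/0 of AH) land in ZF/PF/CF, the same flag pattern
    // FCOMI would produce directly.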
6090
6091 if (IsStrictCmp)
6092 ReplaceUses(SDValue(Node, 1), Chain);
6093
6094 ReplaceUses(SDValue(Node, 0), SAHF);
6095 CurDAG->RemoveDeadNode(Node);
6096 return;
6097 }
6098
6099 case X86ISD::CMP: {
6100 SDValue N0 = Node->getOperand(0);
6101 SDValue N1 = Node->getOperand(1);
6102
6103 // Optimizations for TEST compares.
6104 if (!isNullConstant(N1))
6105 break;
6106
6107 // Save the original VT of the compare.
6108 MVT CmpVT = N0.getSimpleValueType();
6109
6110 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6111 // by a test instruction. The test should be removed later by
6112 // analyzeCompare if we are using only the zero flag.
6113 // TODO: Should we check the users and use the BEXTR flags directly?
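    // For instance, a pattern like (and (srl X, 8), 0xff) compared against
    // zero can become a BEXTR whose control value encodes start = 8 and
    // length = 8 (0x0808), followed by a TEST of the result against itself.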
6114 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6115 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6116 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6117 : X86::TEST32rr;
6118 SDValue BEXTR = SDValue(NewNode, 0);
6119 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6120 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6121 CurDAG->RemoveDeadNode(Node);
6122 return;
6123 }
6124 }
6125
6126 // We can peek through truncates, but we need to be careful below.
6127 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6128 N0 = N0.getOperand(0);
6129
6130 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6131 // use a smaller encoding.
6132 // Look past the truncate if CMP is the only use of it.
6133 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6134 N0.getValueType() != MVT::i8) {
6135 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6136 if (!MaskC)
6137 break;
6138
6139 // We may have looked through a truncate so mask off any bits that
6140 // shouldn't be part of the compare.
6141 uint64_t Mask = MaskC->getZExtValue();
6142 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6143
6144 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6145 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6146 // zero flag.
6147 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6148 onlyUsesZeroFlag(SDValue(Node, 0))) {
6149 unsigned ShiftOpcode = ISD::DELETED_NODE;
6150 unsigned ShiftAmt;
6151 unsigned SubRegIdx;
6152 MVT SubRegVT;
6153 unsigned TestOpcode;
6154 unsigned LeadingZeros = llvm::countl_zero(Mask);
6155 unsigned TrailingZeros = llvm::countr_zero(Mask);
6156
6157 // With leading/trailing zeros, the transform is profitable if we can
6158 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6159 // incurring any extra register moves.
6160 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6161 if (LeadingZeros == 0 && SavesBytes) {
6162 // If the mask covers the most significant bit, then we can replace
6163 // TEST+AND with a SHR and check eflags.
6164 // This emits a redundant TEST which is subsequently eliminated.
6165 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6166 ShiftAmt = TrailingZeros;
6167 SubRegIdx = 0;
6168 TestOpcode = X86::TEST64rr;
6169 } else if (TrailingZeros == 0 && SavesBytes) {
6170 // If the mask covers the least significant bit, then we can replace
6171 // TEST+AND with a SHL and check eflags.
6172 // This emits a redundant TEST which is subsequently eliminated.
6173 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6174 ShiftAmt = LeadingZeros;
6175 SubRegIdx = 0;
6176 TestOpcode = X86::TEST64rr;
6177 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6178 // If the shifted mask extends into the high half and is 8/16/32 bits
6179 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6180 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6181 if (PopCount == 8) {
6182 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6183 ShiftAmt = TrailingZeros;
6184 SubRegIdx = X86::sub_8bit;
6185 SubRegVT = MVT::i8;
6186 TestOpcode = X86::TEST8rr;
6187 } else if (PopCount == 16) {
6188 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6189 ShiftAmt = TrailingZeros;
6190 SubRegIdx = X86::sub_16bit;
6191 SubRegVT = MVT::i16;
6192 TestOpcode = X86::TEST16rr;
6193 } else if (PopCount == 32) {
6194 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6195 ShiftAmt = TrailingZeros;
6196 SubRegIdx = X86::sub_32bit;
6197 SubRegVT = MVT::i32;
6198 TestOpcode = X86::TEST32rr;
6199 }
6200 }
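        // Worked example: testing a 64-bit value against the shifted mask
        // 0x0000ff0000000000 (16 leading and 40 trailing zeros, so an 8-bit
        // wide mask) becomes a shrq by 40 followed by a testb on the 8-bit
        // subregister, avoiding the movabsq otherwise needed to materialize
        // the immediate.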
6201 if (ShiftOpcode != ISD::DELETED_NODE) {
6202 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6203 SDValue Shift = SDValue(
6204 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6205 N0.getOperand(0), ShiftC),
6206 0);
6207 if (SubRegIdx != 0) {
6208 Shift =
6209 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6210 }
6211 MachineSDNode *Test =
6212 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6213 ReplaceNode(Node, Test);
6214 return;
6215 }
6216 }
6217
6218 MVT VT;
6219 int SubRegOp;
6220 unsigned ROpc, MOpc;
6221
6222 // For each of these checks we need to be careful if the sign flag is
6223 // being used. It is only safe to use the sign flag in two cases: either
6224 // the sign bit in the shrunken mask is zero, or the final test size is
6225 // equal to the original compare size.
6226
6227 if (isUInt<8>(Mask) &&
6228 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6229 hasNoSignFlagUses(SDValue(Node, 0)))) {
6230 // For example, convert "testl %eax, $8" to "testb %al, $8"
6231 VT = MVT::i8;
6232 SubRegOp = X86::sub_8bit;
6233 ROpc = X86::TEST8ri;
6234 MOpc = X86::TEST8mi;
6235 } else if (OptForMinSize && isUInt<16>(Mask) &&
6236 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6237 hasNoSignFlagUses(SDValue(Node, 0)))) {
6238 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6239 // NOTE: We only want to form TESTW instructions if optimizing for
6240 // min size. Otherwise we only save one byte and possibly get a length
6241 // changing prefix penalty in the decoders.
6242 VT = MVT::i16;
6243 SubRegOp = X86::sub_16bit;
6244 ROpc = X86::TEST16ri;
6245 MOpc = X86::TEST16mi;
6246 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6247 ((!(Mask & 0x80000000) &&
6248 // Without minsize 16-bit Cmps can get here so we need to
6249 // be sure we calculate the correct sign flag if needed.
6250 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6251 CmpVT == MVT::i32 ||
6252 hasNoSignFlagUses(SDValue(Node, 0)))) {
6253 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6254 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6255 // Otherwise, we find ourselves in a position where we have to do
6256 // promotion. If previous passes did not promote the and, we assume
6257 // they had a good reason not to and do not promote here.
6258 VT = MVT::i32;
6259 SubRegOp = X86::sub_32bit;
6260 ROpc = X86::TEST32ri;
6261 MOpc = X86::TEST32mi;
6262 } else {
6263 // No eligible transformation was found.
6264 break;
6265 }
6266
6267 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6268 SDValue Reg = N0.getOperand(0);
6269
6270 // Emit a testl or testw.
6271 MachineSDNode *NewNode;
6272 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6273 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6274 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6275 if (!LoadN->isSimple()) {
6276 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6277 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6278 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6279 (MOpc == X86::TEST32mi && NumVolBits != 32))
6280 break;
6281 }
6282 }
6283 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6284 Reg.getOperand(0) };
6285 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6286 // Update the chain.
6287 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6288 // Record the mem-refs
6289 CurDAG->setNodeMemRefs(NewNode,
6290 {cast<LoadSDNode>(Reg)->getMemOperand()});
6291 } else {
6292 // Extract the subregister if necessary.
6293 if (N0.getValueType() != VT)
6294 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6295
6296 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6297 }
6298 // Replace CMP with TEST.
6299 ReplaceNode(Node, NewNode);
6300 return;
6301 }
6302 break;
6303 }
6304 case X86ISD::PCMPISTR: {
6305 if (!Subtarget->hasSSE42())
6306 break;
6307
6308 bool NeedIndex = !SDValue(Node, 0).use_empty();
6309 bool NeedMask = !SDValue(Node, 1).use_empty();
6310 // We can't fold a load if we are going to make two instructions.
6311 bool MayFoldLoad = !NeedIndex || !NeedMask;
6312
6313 MachineSDNode *CNode;
6314 if (NeedMask) {
6315 unsigned ROpc =
6316 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6317 unsigned MOpc =
6318 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6319 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6320 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6321 }
6322 if (NeedIndex || !NeedMask) {
6323 unsigned ROpc =
6324 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6325 unsigned MOpc =
6326 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6327 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6328 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6329 }
6330
6331 // Connect the flag usage to the last instruction created.
6332 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6333 CurDAG->RemoveDeadNode(Node);
6334 return;
6335 }
6336 case X86ISD::PCMPESTR: {
6337 if (!Subtarget->hasSSE42())
6338 break;
6339
6340 // Copy the two implicit register inputs.
6341 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6342 Node->getOperand(1),
6343 SDValue()).getValue(1);
6344 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6345 Node->getOperand(3), InGlue).getValue(1);
6346
6347 bool NeedIndex = !SDValue(Node, 0).use_empty();
6348 bool NeedMask = !SDValue(Node, 1).use_empty();
6349 // We can't fold a load if we are going to make two instructions.
6350 bool MayFoldLoad = !NeedIndex || !NeedMask;
6351
6352 MachineSDNode *CNode;
6353 if (NeedMask) {
6354 unsigned ROpc =
6355 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6356 unsigned MOpc =
6357 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6358 CNode =
6359 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6360 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6361 }
6362 if (NeedIndex || !NeedMask) {
6363 unsigned ROpc =
6364 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6365 unsigned MOpc =
6366 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6367 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6368 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6369 }
6370 // Connect the flag usage to the last instruction created.
6371 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6372 CurDAG->RemoveDeadNode(Node);
6373 return;
6374 }
6375
6376 case ISD::SETCC: {
6377 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6378 return;
6379
6380 break;
6381 }
6382
6383 case ISD::STORE:
6384 if (foldLoadStoreIntoMemOperand(Node))
6385 return;
6386 break;
6387
6388 case X86ISD::SETCC_CARRY: {
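    // Note: SETCC_CARRY materializes the carry flag as an all-zeros or
    // all-ones value. The classic idiom is "sbb %reg, %reg", which computes
    // reg - reg - CF = -CF, i.e. 0 or -1; SETB_C32r/SETB_C64r below are
    // pseudo forms of that idiom for targets where it is dependency-breaking.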
6389 MVT VT = Node->getSimpleValueType(0);
6390 SDValue Result;
6391 if (Subtarget->hasSBBDepBreaking()) {
6392 // We have to do this manually because tblgen will put the eflags copy in
6393 // the wrong place if we use an extract_subreg in the pattern.
6394 // Copy flags to the EFLAGS register and glue it to next node.
6395 SDValue EFLAGS =
6396 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6397 Node->getOperand(1), SDValue());
6398
6399 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6400 // 32-bit version.
6401 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6402 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6403 Result = SDValue(
6404 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6405 0);
6406 } else {
6407 // The target does not recognize 'sbb' with identical register operands as a
6408 // no-source (dependency-breaking) idiom, so we explicitly zero the inputs.
6409 Result = getSBBZero(Node);
6410 }
6411
6412 // For less than 32-bits we need to extract from the 32-bit node.
6413 if (VT == MVT::i8 || VT == MVT::i16) {
6414 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6415 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6416 }
6417
6418 ReplaceUses(SDValue(Node, 0), Result);
6419 CurDAG->RemoveDeadNode(Node);
6420 return;
6421 }
6422 case X86ISD::SBB: {
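    // Note: an SBB whose two inputs are both zero computes 0 - 0 - CF, i.e.
    // it turns the incoming carry into 0 or -1, so it can reuse the same
    // getSBBZero helper as the SETCC_CARRY case above.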
6423 if (isNullConstant(Node->getOperand(0)) &&
6424 isNullConstant(Node->getOperand(1))) {
6425 SDValue Result = getSBBZero(Node);
6426
6427 // Replace the flag use.
6428 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6429
6430 // Replace the result use.
6431 if (!SDValue(Node, 0).use_empty()) {
6432 // For less than 32-bits we need to extract from the 32-bit node.
6433 MVT VT = Node->getSimpleValueType(0);
6434 if (VT == MVT::i8 || VT == MVT::i16) {
6435 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6436 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6437 }
6438 ReplaceUses(SDValue(Node, 0), Result);
6439 }
6440
6441 CurDAG->RemoveDeadNode(Node);
6442 return;
6443 }
6444 break;
6445 }
6446 case X86ISD::MGATHER: {
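    // Note: the opcode tables below pick a gather instruction from four
    // properties: the index vector type, the number of result elements, the
    // element size, and FP vs. integer elements. The AVX-512 forms (i1 mask
    // elements, i.e. a k-register mask) are handled first; the VEX-encoded
    // AVX2 forms, which take the mask as an ordinary vector operand, follow
    // in the else branch.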
6447 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6448 SDValue IndexOp = Mgt->getIndex();
6449 SDValue Mask = Mgt->getMask();
6450 MVT IndexVT = IndexOp.getSimpleValueType();
6451 MVT ValueVT = Node->getSimpleValueType(0);
6452 MVT MaskVT = Mask.getSimpleValueType();
6453
6454 // This is just to prevent crashes if the nodes are malformed somehow. We
6455 // otherwise only do loose type checking here, based on what a type
6456 // constraint would say, just like table-based isel.
6457 if (!ValueVT.isVector() || !MaskVT.isVector())
6458 break;
6459
6460 unsigned NumElts = ValueVT.getVectorNumElements();
6461 MVT ValueSVT = ValueVT.getVectorElementType();
6462
6463 bool IsFP = ValueSVT.isFloatingPoint();
6464 unsigned EltSize = ValueSVT.getSizeInBits();
6465
6466 unsigned Opc = 0;
6467 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6468 if (AVX512Gather) {
6469 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6470 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6471 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6472 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6473 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6474 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6475 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6476 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6477 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6478 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6479 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6480 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6481 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6482 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6483 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6484 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6485 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6486 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6487 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6488 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6489 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6490 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6491 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6492 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6493 } else {
6494 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6495 "Unexpected mask VT!");
6496 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6497 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6498 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6499 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6500 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6501 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6502 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6503 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6504 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6505 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6506 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6507 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6508 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6509 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6510 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6511 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6512 }
6513
6514 if (!Opc)
6515 break;
6516
6517 SDValue Base, Scale, Index, Disp, Segment;
6518 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6519 Base, Scale, Index, Disp, Segment))
6520 break;
6521
6522 SDValue PassThru = Mgt->getPassThru();
6523 SDValue Chain = Mgt->getChain();
6524 // The machine gather instructions have a mask output that is not present in the ISD node.
6525 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6526
6527 MachineSDNode *NewNode;
6528 if (AVX512Gather) {
6529 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6530 Index, Disp, Segment, Chain};
6531 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6532 } else {
6533 SDValue Ops[] = {PassThru, Base, Scale, Index,
6534 Disp, Segment, Mask, Chain};
6535 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6536 }
6537 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6538 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6539 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6540 CurDAG->RemoveDeadNode(Node);
6541 return;
6542 }
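  // Note: scatter instructions only exist in AVX-512, so unlike the gather
  // case above there is no VEX-encoded fallback table and the mask is always
  // a k-register.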
6543 case X86ISD::MSCATTER: {
6544 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6545 SDValue Value = Sc->getValue();
6546 SDValue IndexOp = Sc->getIndex();
6547 MVT IndexVT = IndexOp.getSimpleValueType();
6548 MVT ValueVT = Value.getSimpleValueType();
6549
6550 // This is just to prevent crashes if the nodes are malformed somehow. We
6551 // otherwise only do loose type checking here, based on what a type
6552 // constraint would say, just like table-based isel.
6553 if (!ValueVT.isVector())
6554 break;
6555
6556 unsigned NumElts = ValueVT.getVectorNumElements();
6557 MVT ValueSVT = ValueVT.getVectorElementType();
6558
6559 bool IsFP = ValueSVT.isFloatingPoint();
6560 unsigned EltSize = ValueSVT.getSizeInBits();
6561
6562 unsigned Opc;
6563 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6564 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6565 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6566 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6567 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6568 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6569 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6570 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6571 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6572 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6573 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6574 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6575 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6576 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6577 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6578 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6579 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6580 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6581 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6582 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6583 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6584 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6585 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6586 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6587 else
6588 break;
6589
6590 SDValue Base, Scale, Index, Disp, Segment;
6591 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6592 Base, Scale, Index, Disp, Segment))
6593 break;
6594
6595 SDValue Mask = Sc->getMask();
6596 SDValue Chain = Sc->getChain();
6597 // The machine scatter instructions have a mask output that is not present in the ISD node.
6598 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6599 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6600
6601 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6602 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6603 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6604 CurDAG->RemoveDeadNode(Node);
6605 return;
6606 }
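  // Note: the two cases below handle the preallocated-call nodes (produced
  // for the llvm.call.preallocated.* intrinsics). The call-site token operand
  // is mapped to a small integer id kept in X86MachineFunctionInfo, and the
  // generic PREALLOCATED_SETUP / PREALLOCATED_ARG pseudos carry that id for
  // later expansion.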
6607 case ISD::PREALLOCATED_SETUP: {
6608 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6609 auto CallId = MFI->getPreallocatedIdForCallSite(
6610 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6611 SDValue Chain = Node->getOperand(0);
6612 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6613 MachineSDNode *New = CurDAG->getMachineNode(
6614 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6615 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6616 CurDAG->RemoveDeadNode(Node);
6617 return;
6618 }
6619 case ISD::PREALLOCATED_ARG: {
6620 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6621 auto CallId = MFI->getPreallocatedIdForCallSite(
6622 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6623 SDValue Chain = Node->getOperand(0);
6624 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6625 SDValue ArgIndex = Node->getOperand(2);
6626 SDValue Ops[3];
6627 Ops[0] = CallIdValue;
6628 Ops[1] = ArgIndex;
6629 Ops[2] = Chain;
6630 MachineSDNode *New = CurDAG->getMachineNode(
6631 TargetOpcode::PREALLOCATED_ARG, dl,
6632 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6633 MVT::Other),
6634 Ops);
6635 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6636 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6637 CurDAG->RemoveDeadNode(Node);
6638 return;
6639 }
6640 case X86ISD::AESENCWIDE128KL:
6641 case X86ISD::AESDECWIDE128KL:
6642 case X86ISD::AESENCWIDE256KL:
6643 case X86ISD::AESDECWIDE256KL: {
6644 if (!Subtarget->hasWIDEKL())
6645 break;
6646
6647 unsigned Opcode;
6648 switch (Node->getOpcode()) {
6649 default:
6650 llvm_unreachable("Unexpected opcode!");
6651 case X86ISD::AESENCWIDE128KL:
6652 Opcode = X86::AESENCWIDE128KL;
6653 break;
6654 case X86ISD::AESDECWIDE128KL:
6655 Opcode = X86::AESDECWIDE128KL;
6656 break;
6657 case X86ISD::AESENCWIDE256KL:
6658 Opcode = X86::AESENCWIDE256KL;
6659 break;
6660 case X86ISD::AESDECWIDE256KL:
6661 Opcode = X86::AESDECWIDE256KL;
6662 break;
6663 }
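    // Note: the WIDE Key Locker instructions process eight 128-bit blocks at
    // once in XMM0-XMM7, which is why a CopyToReg chain is threaded through
    // all eight registers below before the instruction is emitted.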
6664
6665 SDValue Chain = Node->getOperand(0);
6666 SDValue Addr = Node->getOperand(1);
6667
6668 SDValue Base, Scale, Index, Disp, Segment;
6669 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6670 break;
6671
6672 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6673 SDValue());
6674 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6675 Chain.getValue(1));
6676 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6677 Chain.getValue(1));
6678 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6679 Chain.getValue(1));
6680 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6681 Chain.getValue(1));
6682 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6683 Chain.getValue(1));
6684 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6685 Chain.getValue(1));
6686 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6687 Chain.getValue(1));
6688
6689 MachineSDNode *Res = CurDAG->getMachineNode(
6690 Opcode, dl, Node->getVTList(),
6691 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6692 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6693 ReplaceNode(Node, Res);
6694 return;
6695 }
6696 case X86ISD::POP_FROM_X87_REG: {
6697 SDValue Chain = Node->getOperand(0);
6698 Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
6699 SDValue Glue;
6700 if (Node->getNumValues() == 3)
6701 Glue = Node->getOperand(2);
6702 SDValue Copy =
6703 CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
6704 ReplaceNode(Node, Copy.getNode());
6705 return;
6706 }
6707 }
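  // Anything not handled by the manual cases above falls through to the
  // TableGen-generated matcher.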
6708
6709 SelectCode(Node);
6710}
6711
6712bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6713 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6714 std::vector<SDValue> &OutOps) {
6715 SDValue Op0, Op1, Op2, Op3, Op4;
6716 switch (ConstraintID) {
6717 default:
6718 llvm_unreachable("Unexpected asm memory constraint");
6719 case InlineAsm::ConstraintCode::o: // offsetable ??
6720 case InlineAsm::ConstraintCode::v: // not offsetable ??
6721 case InlineAsm::ConstraintCode::m: // memory
6722 case InlineAsm::ConstraintCode::X:
6723 case InlineAsm::ConstraintCode::p: // address
6724 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6725 return true;
6726 break;
6727 }
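  // The five operands produced by selectAddr follow the usual X86 memory
  // reference layout: base, scale, index, displacement, segment.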
6728
6729 OutOps.push_back(Op0);
6730 OutOps.push_back(Op1);
6731 OutOps.push_back(Op2);
6732 OutOps.push_back(Op3);
6733 OutOps.push_back(Op4);
6734 return false;
6735}
6736
6737 X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6738     : SelectionDAGISelPass(
6739           std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6740
6741/// This pass converts a legalized DAG into a X86-specific DAG,
6742/// ready for instruction scheduling.
6743 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6744                                      CodeGenOptLevel OptLevel) {
6745 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6746}