1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized DAG to an X86 DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86.h"
15#include "X86MachineFunctionInfo.h"
16#include "X86RegisterInfo.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/CodeGen/MachineModuleInfo.h"
21#include "llvm/CodeGen/SelectionDAGISel.h"
22#include "llvm/Config/llvm-config.h"
23#include "llvm/IR/ConstantRange.h"
24#include "llvm/IR/Function.h"
25#include "llvm/IR/Instructions.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/Debug.h"
30#include "llvm/Support/ErrorHandling.h"
31#include "llvm/Support/KnownBits.h"
32#include "llvm/Support/MathExtras.h"
33#include <cstdint>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "x86-isel"
38#define PASS_NAME "X86 DAG->DAG Instruction Selection"
39
40STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
41
42 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
43 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
44 cl::Hidden);
45
46 static cl::opt<bool> EnablePromoteAnyextLoad(
47 "x86-promote-anyext-load", cl::init(true),
48 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
49
50 extern cl::opt<bool> IndirectBranchTracking;
51
52//===----------------------------------------------------------------------===//
53// Pattern Matcher Implementation
54//===----------------------------------------------------------------------===//
55
56namespace {
57 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
58 /// numbers for the leaves of the matched tree.
59 struct X86ISelAddressMode {
60 enum {
61 RegBase,
62 FrameIndexBase
63 } BaseType = RegBase;
64
65 // This is really a union, discriminated by BaseType!
66 SDValue Base_Reg;
67 int Base_FrameIndex = 0;
68
69 unsigned Scale = 1;
70 SDValue IndexReg;
71 int32_t Disp = 0;
72 SDValue Segment;
73 const GlobalValue *GV = nullptr;
74 const Constant *CP = nullptr;
75 const BlockAddress *BlockAddr = nullptr;
76 const char *ES = nullptr;
77 MCSymbol *MCSym = nullptr;
78 int JT = -1;
79 Align Alignment; // CP alignment.
80 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
81 bool NegateIndex = false;
82
83 X86ISelAddressMode() = default;
84
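 /// Return true if a symbolic displacement (global value, constant pool,
 /// external symbol, MCSymbol, jump table, or block address) has already
 /// been recorded for this addressing mode.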
85 bool hasSymbolicDisplacement() const {
86 return GV != nullptr || CP != nullptr || ES != nullptr ||
87 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
88 }
89
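 /// Return true if a base (register or frame index) or an index register
 /// has already been assigned.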
90 bool hasBaseOrIndexReg() const {
91 return BaseType == FrameIndexBase ||
92 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
93 }
94
95 /// Return true if this addressing mode is already RIP-relative.
96 bool isRIPRelative() const {
97 if (BaseType != RegBase) return false;
98 if (RegisterSDNode *RegNode =
99 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
100 return RegNode->getReg() == X86::RIP;
101 return false;
102 }
103
104 void setBaseReg(SDValue Reg) {
105 BaseType = RegBase;
106 Base_Reg = Reg;
107 }
108
109#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
110 void dump(SelectionDAG *DAG = nullptr) {
111 dbgs() << "X86ISelAddressMode " << this << '\n';
112 dbgs() << "Base_Reg ";
113 if (Base_Reg.getNode())
114 Base_Reg.getNode()->dump(DAG);
115 else
116 dbgs() << "nul\n";
117 if (BaseType == FrameIndexBase)
118 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
119 dbgs() << " Scale " << Scale << '\n'
120 << "IndexReg ";
121 if (NegateIndex)
122 dbgs() << "negate ";
123 if (IndexReg.getNode())
124 IndexReg.getNode()->dump(DAG);
125 else
126 dbgs() << "nul\n";
127 dbgs() << " Disp " << Disp << '\n'
128 << "GV ";
129 if (GV)
130 GV->dump();
131 else
132 dbgs() << "nul";
133 dbgs() << " CP ";
134 if (CP)
135 CP->dump();
136 else
137 dbgs() << "nul";
138 dbgs() << '\n'
139 << "ES ";
140 if (ES)
141 dbgs() << ES;
142 else
143 dbgs() << "nul";
144 dbgs() << " MCSym ";
145 if (MCSym)
146 dbgs() << MCSym;
147 else
148 dbgs() << "nul";
149 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
150 }
151#endif
152 };
153}
154
155namespace {
156 //===--------------------------------------------------------------------===//
157 /// ISel - X86-specific code to select X86 machine instructions for
158 /// SelectionDAG operations.
159 ///
160 class X86DAGToDAGISel final : public SelectionDAGISel {
161 /// Keep a pointer to the X86Subtarget around so that we can
162 /// make the right decision when generating code for different targets.
163 const X86Subtarget *Subtarget;
164
165 /// If true, selector should try to optimize for minimum code size.
166 bool OptForMinSize;
167
168 /// Disable direct TLS access through segment registers.
169 bool IndirectTlsSegRefs;
170
171 public:
172 static char ID;
173
174 X86DAGToDAGISel() = delete;
175
176 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
177 : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr),
178 OptForMinSize(false), IndirectTlsSegRefs(false) {}
179
180 bool runOnMachineFunction(MachineFunction &MF) override {
181 // Reset the subtarget each time through.
182 Subtarget = &MF.getSubtarget<X86Subtarget>();
183 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
184 "indirect-tls-seg-refs");
185
186 // OptFor[Min]Size are used in pattern predicates that isel is matching.
187 OptForMinSize = MF.getFunction().hasMinSize();
188 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
189 "OptForMinSize implies OptForSize");
190
191 SelectionDAGISel::runOnMachineFunction(MF);
192 return true;
193 }
194
195 void emitFunctionEntryCode() override;
196
197 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
198
199 void PreprocessISelDAG() override;
200 void PostprocessISelDAG() override;
201
202// Include the pieces autogenerated from the target description.
203#include "X86GenDAGISel.inc"
204
205 private:
206 void Select(SDNode *N) override;
207
208 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
209 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
210 bool AllowSegmentRegForX32 = false);
211 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
212 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
213 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
214 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
215 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
216 unsigned Depth);
217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
220 unsigned Depth);
221 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
222 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
223 SDValue &Scale, SDValue &Index, SDValue &Disp,
224 SDValue &Segment);
225 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
226 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
227 SDValue &Index, SDValue &Disp, SDValue &Segment);
228 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
229 bool selectLEAAddr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
236 SDValue &Scale, SDValue &Index, SDValue &Disp,
237 SDValue &Segment);
238 bool selectRelocImm(SDValue N, SDValue &Op);
239
240 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
241 SDValue &Base, SDValue &Scale,
242 SDValue &Index, SDValue &Disp,
243 SDValue &Segment);
244
245 // Convenience method where P is also root.
246 bool tryFoldLoad(SDNode *P, SDValue N,
247 SDValue &Base, SDValue &Scale,
248 SDValue &Index, SDValue &Disp,
249 SDValue &Segment) {
250 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
251 }
252
253 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
254 SDValue &Base, SDValue &Scale,
255 SDValue &Index, SDValue &Disp,
256 SDValue &Segment);
257
258 bool isProfitableToFormMaskedOp(SDNode *N) const;
259
260 /// Implement addressing mode selection for inline asm expressions.
261 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
262 InlineAsm::ConstraintCode ConstraintID,
263 std::vector<SDValue> &OutOps) override;
264
265 void emitSpecialCodeForMain();
266
267 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
268 MVT VT, SDValue &Base, SDValue &Scale,
269 SDValue &Index, SDValue &Disp,
270 SDValue &Segment) {
271 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
272 Base = CurDAG->getTargetFrameIndex(
273 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
274 else if (AM.Base_Reg.getNode())
275 Base = AM.Base_Reg;
276 else
277 Base = CurDAG->getRegister(0, VT);
278
279 Scale = getI8Imm(AM.Scale, DL);
280
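 // Select the NDD ("new data destination") form of an opcode when the
 // subtarget provides it (APX); otherwise fall back to the legacy form.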
281#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
282 // Negate the index if needed.
283 if (AM.NegateIndex) {
284 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
285 : GET_ND_IF_ENABLED(X86::NEG32r);
286 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
287 AM.IndexReg), 0);
288 AM.IndexReg = Neg;
289 }
290
291 if (AM.IndexReg.getNode())
292 Index = AM.IndexReg;
293 else
294 Index = CurDAG->getRegister(0, VT);
295
296 // These are 32-bit even in 64-bit mode since RIP-relative offset
297 // is 32-bit.
298 if (AM.GV)
299 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
300 MVT::i32, AM.Disp,
301 AM.SymbolFlags);
302 else if (AM.CP)
303 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
304 AM.Disp, AM.SymbolFlags);
305 else if (AM.ES) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
307 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
308 } else if (AM.MCSym) {
309 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
310 assert(AM.SymbolFlags == 0 && "oo");
311 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
312 } else if (AM.JT != -1) {
313 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
314 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
315 } else if (AM.BlockAddr)
316 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
317 AM.SymbolFlags);
318 else
319 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
320
321 if (AM.Segment.getNode())
322 Segment = AM.Segment;
323 else
324 Segment = CurDAG->getRegister(0, MVT::i16);
325 }
326
327 // Utility function to determine whether we should avoid selecting
328 // immediate forms of instructions for better code size or not.
329 // At a high level, we'd like to avoid such instructions when
330 // we have similar constants used within the same basic block
331 // that can be kept in a register.
332 //
333 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
334 uint32_t UseCount = 0;
335
336 // Do not want to hoist if we're not optimizing for size.
337 // TODO: We'd like to remove this restriction.
338 // See the comment in X86InstrInfo.td for more info.
339 if (!CurDAG->shouldOptForSize())
340 return false;
341
342 // Walk all the users of the immediate.
343 for (const SDNode *User : N->uses()) {
344 if (UseCount >= 2)
345 break;
346
347 // This user is already selected. Count it as a legitimate use and
348 // move on.
349 if (User->isMachineOpcode()) {
350 UseCount++;
351 continue;
352 }
353
354 // We want to count stores of immediates as real uses.
355 if (User->getOpcode() == ISD::STORE &&
356 User->getOperand(1).getNode() == N) {
357 UseCount++;
358 continue;
359 }
360
361 // We don't currently match users that have > 2 operands (except
362 // for stores, which are handled above)
363 // Those instructions won't match in ISel, for now, and would
364 // be counted incorrectly.
365 // This may change in the future as we add additional instruction
366 // types.
367 if (User->getNumOperands() != 2)
368 continue;
369
370 // If this is a sign-extended 8-bit integer immediate used in an ALU
371 // instruction, there is probably an opcode encoding to save space.
372 auto *C = dyn_cast<ConstantSDNode>(N);
373 if (C && isInt<8>(C->getSExtValue()))
374 continue;
375
376 // Immediates that are used for offsets as part of stack
377 // manipulation should be left alone. These are typically
378 // used to indicate SP offsets for argument passing and
379 // will get pulled into stores/pushes (implicitly).
380 if (User->getOpcode() == X86ISD::ADD ||
381 User->getOpcode() == ISD::ADD ||
382 User->getOpcode() == X86ISD::SUB ||
383 User->getOpcode() == ISD::SUB) {
384
385 // Find the other operand of the add/sub.
386 SDValue OtherOp = User->getOperand(0);
387 if (OtherOp.getNode() == N)
388 OtherOp = User->getOperand(1);
389
390 // Don't count if the other operand is SP.
391 RegisterSDNode *RegNode;
392 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
393 (RegNode = dyn_cast_or_null<RegisterSDNode>(
394 OtherOp->getOperand(1).getNode())))
395 if ((RegNode->getReg() == X86::ESP) ||
396 (RegNode->getReg() == X86::RSP))
397 continue;
398 }
399
400 // ... otherwise, count this and move on.
401 UseCount++;
402 }
403
404 // If we have more than 1 use, then recommend for hoisting.
405 return (UseCount > 1);
406 }
407
408 /// Return a target constant with the specified value of type i8.
409 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
410 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
411 }
412
413 /// Return a target constant with the specified value, of type i32.
414 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
415 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
416 }
417
418 /// Return a target constant with the specified value, of type i64.
419 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
420 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
421 }
422
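 /// Convert the element index of an EXTRACT_SUBVECTOR into the 128/256-bit
 /// lane immediate expected by the VEXTRACT instructions.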
423 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
424 const SDLoc &DL) {
425 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
426 uint64_t Index = N->getConstantOperandVal(1);
427 MVT VecVT = N->getOperand(0).getSimpleValueType();
428 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
429 }
430
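 /// Convert the element index of an INSERT_SUBVECTOR into the 128/256-bit
 /// lane immediate expected by the VINSERT instructions.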
431 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
432 const SDLoc &DL) {
433 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
434 uint64_t Index = N->getConstantOperandVal(2);
435 MVT VecVT = N->getSimpleValueType(0);
436 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
437 }
438
439 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
440 const SDLoc &DL) {
441 assert(VecWidth == 128 && "Unexpected vector width");
442 uint64_t Index = N->getConstantOperandVal(2);
443 MVT VecVT = N->getSimpleValueType(0);
444 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
445 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
446 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
447 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
448 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
449 }
450
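 /// Materialize the carry/borrow consumed by an SBB or SETCC_CARRY node as
 /// an all-zeros/all-ones value by emitting "sbb zero, zero" against EFLAGS.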
451 SDValue getSBBZero(SDNode *N) {
452 SDLoc dl(N);
453 MVT VT = N->getSimpleValueType(0);
454
455 // Create zero.
456 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
457 SDValue Zero = SDValue(
458 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
459 if (VT == MVT::i64) {
460 Zero = SDValue(
461 CurDAG->getMachineNode(
462 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
463 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
464 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
465 0);
466 }
467
468 // Copy flags to the EFLAGS register and glue it to next node.
469 unsigned Opcode = N->getOpcode();
470 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
471 "Unexpected opcode for SBB materialization");
472 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
473 SDValue EFLAGS =
474 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
475 N->getOperand(FlagOpIndex), SDValue());
476
477 // Create a 64-bit instruction if the result is 64-bits otherwise use the
478 // 32-bit version.
479 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
480 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
481 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
482 return SDValue(
483 CurDAG->getMachineNode(Opc, dl, VTs,
484 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
485 0);
486 }
487
488 // Helper to detect unneeded AND instructions on shift amounts. Called
489 // from PatFrags in tablegen.
490 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
491 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
492 const APInt &Val = N->getConstantOperandAPInt(1);
493
494 if (Val.countr_one() >= Width)
495 return true;
496
497 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
498 return Mask.countr_one() >= Width;
499 }
500
501 /// Return an SDNode that returns the value of the global base register.
502 /// Output instructions required to initialize the global base register,
503 /// if necessary.
504 SDNode *getGlobalBaseReg();
505
506 /// Return a reference to the TargetMachine, casted to the target-specific
507 /// type.
508 const X86TargetMachine &getTargetMachine() const {
509 return static_cast<const X86TargetMachine &>(TM);
510 }
511
512 /// Return a reference to the TargetInstrInfo, casted to the target-specific
513 /// type.
514 const X86InstrInfo *getInstrInfo() const {
515 return Subtarget->getInstrInfo();
516 }
517
518 /// Return a condition code of the given SDNode
519 X86::CondCode getCondFromNode(SDNode *N) const;
520
521 /// Address-mode matching performs shift-of-and to and-of-shift
522 /// reassociation in order to expose more scaled addressing
523 /// opportunities.
524 bool ComplexPatternFuncMutatesDAG() const override {
525 return true;
526 }
527
528 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
529
530 // Indicates we should prefer to use a non-temporal load for this load.
531 bool useNonTemporalLoad(LoadSDNode *N) const {
532 if (!N->isNonTemporal())
533 return false;
534
535 unsigned StoreSize = N->getMemoryVT().getStoreSize();
536
537 if (N->getAlign().value() < StoreSize)
538 return false;
539
540 switch (StoreSize) {
541 default: llvm_unreachable("Unsupported store size");
542 case 4:
543 case 8:
544 return false;
545 case 16:
546 return Subtarget->hasSSE41();
547 case 32:
548 return Subtarget->hasAVX2();
549 case 64:
550 return Subtarget->hasAVX512();
551 }
552 }
553
554 bool foldLoadStoreIntoMemOperand(SDNode *Node);
555 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
556 bool matchBitExtract(SDNode *Node);
557 bool shrinkAndImmediate(SDNode *N);
558 bool isMaskZeroExtended(SDNode *N) const;
559 bool tryShiftAmountMod(SDNode *N);
560 bool tryShrinkShlLogicImm(SDNode *N);
561 bool tryVPTERNLOG(SDNode *N);
562 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
563 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
564 uint8_t Imm);
565 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
566 bool tryMatchBitSelect(SDNode *N);
567
568 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
569 const SDLoc &dl, MVT VT, SDNode *Node);
570 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
571 const SDLoc &dl, MVT VT, SDNode *Node,
572 SDValue &InGlue);
573
574 bool tryOptimizeRem8Extend(SDNode *N);
575
576 bool onlyUsesZeroFlag(SDValue Flags) const;
577 bool hasNoSignFlagUses(SDValue Flags) const;
578 bool hasNoCarryFlagUses(SDValue Flags) const;
579 };
580}
581
582char X86DAGToDAGISel::ID = 0;
583
584INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
585
586// Returns true if this masked compare can be implemented legally with this
587// type.
588static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
589 unsigned Opcode = N->getOpcode();
590 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
591 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
592 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
593 // We can get 256-bit 8 element types here without VLX being enabled. When
594 // this happens we will use 512-bit operations and the mask will not be
595 // zero extended.
596 EVT OpVT = N->getOperand(0).getValueType();
597 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
598 // second operand.
599 if (Opcode == X86ISD::STRICT_CMPM)
600 OpVT = N->getOperand(1).getValueType();
601 if (OpVT.is256BitVector() || OpVT.is128BitVector())
602 return Subtarget->hasVLX();
603
604 return true;
605 }
606 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
607 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
608 Opcode == X86ISD::FSETCCM_SAE)
609 return true;
610
611 return false;
612}
613
614// Returns true if we can assume the writer of the mask has zero extended it
615// for us.
616bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
617 // If this is an AND, check if we have a compare on either side. As long as
618 // one side guarantees the mask is zero extended, the AND will preserve those
619 // zeros.
620 if (N->getOpcode() == ISD::AND)
621 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
622 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
623
624 return isLegalMaskCompare(N, Subtarget);
625}
626
627bool
628X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
629 if (OptLevel == CodeGenOptLevel::None)
630 return false;
631
632 if (!N.hasOneUse())
633 return false;
634
635 if (N.getOpcode() != ISD::LOAD)
636 return true;
637
638 // Don't fold non-temporal loads if we have an instruction for them.
639 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
640 return false;
641
642 // If N is a load, do additional profitability checks.
643 if (U == Root) {
644 switch (U->getOpcode()) {
645 default: break;
646 case X86ISD::ADD:
647 case X86ISD::ADC:
648 case X86ISD::SUB:
649 case X86ISD::SBB:
650 case X86ISD::AND:
651 case X86ISD::XOR:
652 case X86ISD::OR:
653 case ISD::ADD:
654 case ISD::UADDO_CARRY:
655 case ISD::AND:
656 case ISD::OR:
657 case ISD::XOR: {
658 SDValue Op1 = U->getOperand(1);
659
660 // If the other operand is an 8-bit immediate we should fold the immediate
661 // instead. This reduces code size.
662 // e.g.
663 // movl 4(%esp), %eax
664 // addl $4, %eax
665 // vs.
666 // movl $4, %eax
667 // addl 4(%esp), %eax
668 // The former is 2 bytes shorter. In the case where the increment is 1, the
669 // saving can be 4 bytes (by using incl %eax).
670 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
671 if (Imm->getAPIntValue().isSignedIntN(8))
672 return false;
673
674 // If this is a 64-bit AND with an immediate that fits in 32-bits,
675 // prefer using the smaller and over folding the load. This is needed to
676 // make sure immediates created by shrinkAndImmediate are always folded.
677 // Ideally we would narrow the load during DAG combine and get the
678 // best of both worlds.
679 if (U->getOpcode() == ISD::AND &&
680 Imm->getAPIntValue().getBitWidth() == 64 &&
681 Imm->getAPIntValue().isIntN(32))
682 return false;
683
684 // If this is really a zext_inreg that can be represented with a movzx
685 // instruction, prefer that.
686 // TODO: We could shrink the load and fold if it is non-volatile.
687 if (U->getOpcode() == ISD::AND &&
688 (Imm->getAPIntValue() == UINT8_MAX ||
689 Imm->getAPIntValue() == UINT16_MAX ||
690 Imm->getAPIntValue() == UINT32_MAX))
691 return false;
692
693 // ADD/SUB can negate the immediate and use the opposite operation
694 // to fit 128 into a sign-extended 8-bit immediate.
695 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
696 (-Imm->getAPIntValue()).isSignedIntN(8))
697 return false;
698
699 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
700 (-Imm->getAPIntValue()).isSignedIntN(8) &&
701 hasNoCarryFlagUses(SDValue(U, 1)))
702 return false;
703 }
704
705 // If the other operand is a TLS address, we should fold it instead.
706 // This produces
707 // movl %gs:0, %eax
708 // leal i@NTPOFF(%eax), %eax
709 // instead of
710 // movl $i@NTPOFF, %eax
711 // addl %gs:0, %eax
712 // if the block also has an access to a second TLS address this will save
713 // a load.
714 // FIXME: This is probably also true for non-TLS addresses.
715 if (Op1.getOpcode() == X86ISD::Wrapper) {
716 SDValue Val = Op1.getOperand(0);
717 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
718 return false;
719 }
720
721 // Don't fold load if this matches the BTS/BTR/BTC patterns.
722 // BTS: (or X, (shl 1, n))
723 // BTR: (and X, (rotl -2, n))
724 // BTC: (xor X, (shl 1, n))
725 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
726 if (U->getOperand(0).getOpcode() == ISD::SHL &&
727 isOneConstant(U->getOperand(0).getOperand(0)))
728 return false;
729
730 if (U->getOperand(1).getOpcode() == ISD::SHL &&
731 isOneConstant(U->getOperand(1).getOperand(0)))
732 return false;
733 }
734 if (U->getOpcode() == ISD::AND) {
735 SDValue U0 = U->getOperand(0);
736 SDValue U1 = U->getOperand(1);
737 if (U0.getOpcode() == ISD::ROTL) {
738 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
739 if (C && C->getSExtValue() == -2)
740 return false;
741 }
742
743 if (U1.getOpcode() == ISD::ROTL) {
744 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
745 if (C && C->getSExtValue() == -2)
746 return false;
747 }
748 }
749
750 break;
751 }
752 case ISD::SHL:
753 case ISD::SRA:
754 case ISD::SRL:
755 // Don't fold a load into a shift by immediate. The BMI2 instructions
756 // support folding a load, but not an immediate. The legacy instructions
757 // support folding an immediate, but can't fold a load. Folding an
758 // immediate is preferable to folding a load.
759 if (isa<ConstantSDNode>(U->getOperand(1)))
760 return false;
761
762 break;
763 }
764 }
765
766 // Prevent folding a load if this can be implemented with an insert_subreg or
767 // a move that implicitly zeroes.
768 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
769 isNullConstant(Root->getOperand(2)) &&
770 (Root->getOperand(0).isUndef() ||
771 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
772 return false;
773
774 return true;
775}
776
777// Indicates it is profitable to form an AVX512 masked operation. Returning
778 // false will favor a masked register-register move or vblendm, and the
779// operation will be selected separately.
780bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
781 assert(
782 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
783 "Unexpected opcode!");
784
785 // If the operation has additional users, the operation will be duplicated.
786 // Check the use count to prevent that.
787 // FIXME: Are there cheap opcodes we might want to duplicate?
788 return N->getOperand(1).hasOneUse();
789}
790
791/// Replace the original chain operand of the call with
792/// load's chain operand and move load below the call's chain operand.
793static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
794 SDValue Call, SDValue OrigChain) {
795 SmallVector<SDValue, 8> Ops;
796 SDValue Chain = OrigChain.getOperand(0);
797 if (Chain.getNode() == Load.getNode())
798 Ops.push_back(Load.getOperand(0));
799 else {
800 assert(Chain.getOpcode() == ISD::TokenFactor &&
801 "Unexpected chain operand");
802 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
803 if (Chain.getOperand(i).getNode() == Load.getNode())
804 Ops.push_back(Load.getOperand(0));
805 else
806 Ops.push_back(Chain.getOperand(i));
807 SDValue NewChain =
808 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
809 Ops.clear();
810 Ops.push_back(NewChain);
811 }
812 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
813 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
814 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
815 Load.getOperand(1), Load.getOperand(2));
816
817 Ops.clear();
818 Ops.push_back(SDValue(Load.getNode(), 1));
819 Ops.append(Call->op_begin() + 1, Call->op_end());
820 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
821}
822
823/// Return true if call address is a load and it can be
824/// moved below CALLSEQ_START and the chains leading up to the call.
825/// Return the CALLSEQ_START by reference as a second output.
826/// In the case of a tail call, there isn't a callseq node between the call
827/// chain and the load.
828static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
829 // The transformation is somewhat dangerous if the call's chain was glued to
830 // the call. After MoveBelowOrigChain the load is moved between the call and
831 // the chain; this can create a cycle if the load is not folded. So it is
832 // *really* important that we are sure the load will be folded.
833 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
834 return false;
835 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
836 if (!LD ||
837 !LD->isSimple() ||
838 LD->getAddressingMode() != ISD::UNINDEXED ||
839 LD->getExtensionType() != ISD::NON_EXTLOAD)
840 return false;
841
842 // Now let's find the callseq_start.
843 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
844 if (!Chain.hasOneUse())
845 return false;
846 Chain = Chain.getOperand(0);
847 }
848
849 if (!Chain.getNumOperands())
850 return false;
851 // Since we are not checking for AA here, conservatively abort if the chain
852 // writes to memory. It's not safe to move the callee (a load) across a store.
853 if (isa<MemSDNode>(Chain.getNode()) &&
854 cast<MemSDNode>(Chain.getNode())->writeMem())
855 return false;
856 if (Chain.getOperand(0).getNode() == Callee.getNode())
857 return true;
858 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
859 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
860 Callee.getValue(1).hasOneUse())
861 return true;
862 return false;
863}
864
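// Return true if Imm would encode an unintended ENDBR64 pattern: the low
// bytes match 0x0F1EFA and a 0xF3 byte follows after optional legacy prefixes.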
865static bool isEndbrImm64(uint64_t Imm) {
866// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
867// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
868 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
869 return false;
870
871 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
872 0x65, 0x66, 0x67, 0xf0, 0xf2};
873 int i = 24; // The low 24 bits (0x0F1EFA) have already been matched.
874 while (i < 64) {
875 uint8_t Byte = (Imm >> i) & 0xFF;
876 if (Byte == 0xF3)
877 return true;
878 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
879 return false;
880 i += 8;
881 }
882
883 return false;
884}
885
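// These wide 512-bit types are only natively legal with AVX512BW; without
// BWI they are split into two 256-bit halves by the code below.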
886static bool needBWI(MVT VT) {
887 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
888}
889
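// Pre-isel DAG rewrites: canonicalize generic nodes into X86-specific
// equivalents, split constants that would embed an ENDBR pattern, emulate
// BWI-only broadcasts, and move foldable call-address loads next to calls.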
890void X86DAGToDAGISel::PreprocessISelDAG() {
891 bool MadeChange = false;
892 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
893 E = CurDAG->allnodes_end(); I != E; ) {
894 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
895
896 // This is for CET enhancement.
897 //
898 // ENDBR32 and ENDBR64 have specific opcodes:
899 // ENDBR32: F3 0F 1E FB
900 // ENDBR64: F3 0F 1E FA
901 // We don't want attackers to find unintended ENDBR32/64
902 // opcode matches in the binary.
903 // Here’s an example:
904 // If the compiler had to generate asm for the following code:
905 // a = 0xF30F1EFA
906 // it could, for example, generate:
907 // mov 0xF30F1EFA, dword ptr[a]
908 // In such a case, the binary would include a gadget that starts
909 // with a fake ENDBR64 opcode. Therefore, we split such generation
910 // into multiple operations so that it does not show up in the binary.
911 if (N->getOpcode() == ISD::Constant) {
912 MVT VT = N->getSimpleValueType(0);
913 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
914 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
915 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
916 // Check that the cf-protection-branch is enabled.
917 Metadata *CFProtectionBranch =
918 MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
919 if (CFProtectionBranch || IndirectBranchTracking) {
920 SDLoc dl(N);
921 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
922 Complement = CurDAG->getNOT(dl, Complement, VT);
923 --I;
924 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
925 ++I;
926 MadeChange = true;
927 continue;
928 }
929 }
930 }
931
932 // If this is a target specific AND node with no flag usages, turn it back
933 // into ISD::AND to enable test instruction matching.
934 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
935 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
936 N->getOperand(0), N->getOperand(1));
937 --I;
938 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
939 ++I;
940 MadeChange = true;
941 continue;
942 }
943
944 // Convert vector increment or decrement to sub/add with an all-ones
945 // constant:
946 // add X, <1, 1...> --> sub X, <-1, -1...>
947 // sub X, <1, 1...> --> add X, <-1, -1...>
948 // The all-ones vector constant can be materialized using a pcmpeq
949 // instruction that is commonly recognized as an idiom (has no register
950 // dependency), so that's better/smaller than loading a splat 1 constant.
951 //
952 // But don't do this if it would inhibit a potentially profitable load
953 // folding opportunity for the other operand. That only occurs with the
954 // intersection of:
955 // (1) The other operand (op0) is load foldable.
956 // (2) The op is an add (otherwise, we are *creating* an add and can still
957 // load fold the other op).
958 // (3) The target has AVX (otherwise, we have a destructive add and can't
959 // load fold the other op without killing the constant op).
960 // (4) The constant 1 vector has multiple uses (so it is profitable to load
961 // into a register anyway).
962 auto mayPreventLoadFold = [&]() {
963 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
964 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
965 !N->getOperand(1).hasOneUse();
966 };
967 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
968 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
969 APInt SplatVal;
970 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
971 SplatVal.isOne()) {
972 SDLoc DL(N);
973
974 MVT VT = N->getSimpleValueType(0);
975 unsigned NumElts = VT.getSizeInBits() / 32;
976 SDValue AllOnes =
977 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
978 AllOnes = CurDAG->getBitcast(VT, AllOnes);
979
980 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
981 SDValue Res =
982 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
983 --I;
984 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
985 ++I;
986 MadeChange = true;
987 continue;
988 }
989 }
990
991 switch (N->getOpcode()) {
992 case X86ISD::VBROADCAST: {
993 MVT VT = N->getSimpleValueType(0);
994 // Emulate v32i16/v64i8 broadcast without BWI.
995 if (!Subtarget->hasBWI() && needBWI(VT)) {
996 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
997 SDLoc dl(N);
998 SDValue NarrowBCast =
999 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1000 SDValue Res =
1001 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1002 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1003 unsigned Index = NarrowVT.getVectorMinNumElements();
1004 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1005 CurDAG->getIntPtrConstant(Index, dl));
1006
1007 --I;
1008 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1009 ++I;
1010 MadeChange = true;
1011 continue;
1012 }
1013
1014 break;
1015 }
1016 case X86ISD::VBROADCAST_LOAD: {
1017 MVT VT = N->getSimpleValueType(0);
1018 // Emulate v32i16/v64i8 broadcast without BWI.
1019 if (!Subtarget->hasBWI() && needBWI(VT)) {
1020 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1021 auto *MemNode = cast<MemSDNode>(N);
1022 SDLoc dl(N);
1023 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1024 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1025 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1026 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1027 MemNode->getMemOperand());
1028 SDValue Res =
1029 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1030 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1031 unsigned Index = NarrowVT.getVectorMinNumElements();
1032 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1033 CurDAG->getIntPtrConstant(Index, dl));
1034
1035 --I;
1036 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1037 CurDAG->ReplaceAllUsesWith(N, To);
1038 ++I;
1039 MadeChange = true;
1040 continue;
1041 }
1042
1043 break;
1044 }
1045 case ISD::LOAD: {
1046 // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1047 // load, then just extract the lower subvector and avoid the second load.
1048 auto *Ld = cast<LoadSDNode>(N);
1049 MVT VT = N->getSimpleValueType(0);
1050 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1051 !(VT.is128BitVector() || VT.is256BitVector()))
1052 break;
1053
1054 MVT MaxVT = VT;
1055 SDNode *MaxLd = nullptr;
1056 SDValue Ptr = Ld->getBasePtr();
1057 SDValue Chain = Ld->getChain();
1058 for (SDNode *User : Ptr->uses()) {
1059 auto *UserLd = dyn_cast<LoadSDNode>(User);
1060 MVT UserVT = User->getSimpleValueType(0);
1061 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1062 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1063 !User->hasAnyUseOfValue(1) &&
1064 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1065 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1066 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1067 MaxLd = User;
1068 MaxVT = UserVT;
1069 }
1070 }
1071 if (MaxLd) {
1072 SDLoc dl(N);
1073 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1074 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1075 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1076 SDValue(MaxLd, 0),
1077 CurDAG->getIntPtrConstant(0, dl));
1078 SDValue Res = CurDAG->getBitcast(VT, Extract);
1079
1080 --I;
1081 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1082 CurDAG->ReplaceAllUsesWith(N, To);
1083 ++I;
1084 MadeChange = true;
1085 continue;
1086 }
1087 break;
1088 }
1089 case ISD::VSELECT: {
1090 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1091 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1092 if (EleVT == MVT::i1)
1093 break;
1094
1095 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1096 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1097 "We can't replace VSELECT with BLENDV in vXi16!");
1098 SDValue R;
1099 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1100 EleVT.getSizeInBits()) {
1101 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1102 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1103 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1104 } else {
1105 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1106 N->getOperand(0), N->getOperand(1),
1107 N->getOperand(2));
1108 }
1109 --I;
1110 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1111 ++I;
1112 MadeChange = true;
1113 continue;
1114 }
1115 case ISD::FP_ROUND:
1116 case ISD::STRICT_FP_ROUND:
1117 case ISD::FP_TO_SINT:
1118 case ISD::FP_TO_UINT:
1119 case ISD::STRICT_FP_TO_SINT:
1120 case ISD::STRICT_FP_TO_UINT: {
1121 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1122 // don't need 2 sets of patterns.
1123 if (!N->getSimpleValueType(0).isVector())
1124 break;
1125
1126 unsigned NewOpc;
1127 switch (N->getOpcode()) {
1128 default: llvm_unreachable("Unexpected opcode!");
1129 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1130 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1131 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1132 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1133 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1134 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1135 }
1136 SDValue Res;
1137 if (N->isStrictFPOpcode())
1138 Res =
1139 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1140 {N->getOperand(0), N->getOperand(1)});
1141 else
1142 Res =
1143 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1144 N->getOperand(0));
1145 --I;
1146 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1147 ++I;
1148 MadeChange = true;
1149 continue;
1150 }
1151 case ISD::SHL:
1152 case ISD::SRA:
1153 case ISD::SRL: {
1154 // Replace vector shifts with their X86 specific equivalent so we don't
1155 // need 2 sets of patterns.
1156 if (!N->getValueType(0).isVector())
1157 break;
1158
1159 unsigned NewOpc;
1160 switch (N->getOpcode()) {
1161 default: llvm_unreachable("Unexpected opcode!");
1162 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1163 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1164 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1165 }
1166 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1167 N->getOperand(0), N->getOperand(1));
1168 --I;
1169 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1170 ++I;
1171 MadeChange = true;
1172 continue;
1173 }
1174 case ISD::ANY_EXTEND:
1175 case ISD::ANY_EXTEND_VECTOR_INREG: {
1176 // Replace vector any extend with the zero extend equivalents so we don't
1177 // need 2 sets of patterns. Ignore vXi1 extensions.
1178 if (!N->getValueType(0).isVector())
1179 break;
1180
1181 unsigned NewOpc;
1182 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1183 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1184 "Unexpected opcode for mask vector!");
1185 NewOpc = ISD::SIGN_EXTEND;
1186 } else {
1187 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1188 ? ISD::ZERO_EXTEND
1189 : ISD::ZERO_EXTEND_VECTOR_INREG;
1190 }
1191
1192 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1193 N->getOperand(0));
1194 --I;
1195 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1196 ++I;
1197 MadeChange = true;
1198 continue;
1199 }
1200 case ISD::FCEIL:
1201 case ISD::STRICT_FCEIL:
1202 case ISD::FFLOOR:
1203 case ISD::STRICT_FFLOOR:
1204 case ISD::FTRUNC:
1205 case ISD::STRICT_FTRUNC:
1206 case ISD::FROUNDEVEN:
1207 case ISD::STRICT_FROUNDEVEN:
1208 case ISD::FNEARBYINT:
1209 case ISD::STRICT_FNEARBYINT:
1210 case ISD::FRINT:
1211 case ISD::STRICT_FRINT: {
1212 // Replace fp rounding with their X86 specific equivalent so we don't
1213 // need 2 sets of patterns.
1214 unsigned Imm;
1215 switch (N->getOpcode()) {
1216 default: llvm_unreachable("Unexpected opcode!");
1217 case ISD::STRICT_FCEIL:
1218 case ISD::FCEIL: Imm = 0xA; break;
1219 case ISD::STRICT_FFLOOR:
1220 case ISD::FFLOOR: Imm = 0x9; break;
1221 case ISD::STRICT_FTRUNC:
1222 case ISD::FTRUNC: Imm = 0xB; break;
1223 case ISD::STRICT_FROUNDEVEN:
1224 case ISD::FROUNDEVEN: Imm = 0x8; break;
1225 case ISD::STRICT_FNEARBYINT:
1226 case ISD::FNEARBYINT: Imm = 0xC; break;
1227 case ISD::STRICT_FRINT:
1228 case ISD::FRINT: Imm = 0x4; break;
1229 }
1230 SDLoc dl(N);
1231 bool IsStrict = N->isStrictFPOpcode();
1232 SDValue Res;
1233 if (IsStrict)
1234 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1235 {N->getValueType(0), MVT::Other},
1236 {N->getOperand(0), N->getOperand(1),
1237 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1238 else
1239 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1240 N->getOperand(0),
1241 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1242 --I;
1243 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1244 ++I;
1245 MadeChange = true;
1246 continue;
1247 }
1248 case X86ISD::FANDN:
1249 case X86ISD::FAND:
1250 case X86ISD::FOR:
1251 case X86ISD::FXOR: {
1252 // Widen scalar fp logic ops to vector to reduce isel patterns.
1253 // FIXME: Can we do this during lowering/combine.
1254 MVT VT = N->getSimpleValueType(0);
1255 if (VT.isVector() || VT == MVT::f128)
1256 break;
1257
1258 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1259 : VT == MVT::f32 ? MVT::v4f32
1260 : MVT::v8f16;
1261
1262 SDLoc dl(N);
1263 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1264 N->getOperand(0));
1265 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1266 N->getOperand(1));
1267
1268 SDValue Res;
1269 if (Subtarget->hasSSE2()) {
1270 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1271 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1272 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1273 unsigned Opc;
1274 switch (N->getOpcode()) {
1275 default: llvm_unreachable("Unexpected opcode!");
1276 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1277 case X86ISD::FAND: Opc = ISD::AND; break;
1278 case X86ISD::FOR: Opc = ISD::OR; break;
1279 case X86ISD::FXOR: Opc = ISD::XOR; break;
1280 }
1281 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1282 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1283 } else {
1284 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1285 }
1286 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1287 CurDAG->getIntPtrConstant(0, dl));
1288 --I;
1289 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1290 ++I;
1291 MadeChange = true;
1292 continue;
1293 }
1294 }
1295
1296 if (OptLevel != CodeGenOptLevel::None &&
1297 // Only do this when the target can fold the load into the call or
1298 // jmp.
1299 !Subtarget->useIndirectThunkCalls() &&
1300 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1301 (N->getOpcode() == X86ISD::TC_RETURN &&
1302 (Subtarget->is64Bit() ||
1303 !getTargetMachine().isPositionIndependent())))) {
1304 /// Also try moving call address load from outside callseq_start to just
1305 /// before the call to allow it to be folded.
1306 ///
1307 /// [Load chain]
1308 /// ^
1309 /// |
1310 /// [Load]
1311 /// ^ ^
1312 /// | |
1313 /// / \--
1314 /// / |
1315 ///[CALLSEQ_START] |
1316 /// ^ |
1317 /// | |
1318 /// [LOAD/C2Reg] |
1319 /// | |
1320 /// \ /
1321 /// \ /
1322 /// [CALL]
1323 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1324 SDValue Chain = N->getOperand(0);
1325 SDValue Load = N->getOperand(1);
1326 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1327 continue;
1328 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1329 ++NumLoadMoved;
1330 MadeChange = true;
1331 continue;
1332 }
1333
1334 // Lower fpround and fpextend nodes that target the FP stack to a store and
1335 // load through the stack. This is a gross hack. We would like to simply mark
1336 // these as being illegal, but when we do that, legalize produces these when
1337 // it expands calls, then expands these in the same legalize pass. We would
1338 // like dag combine to be able to hack on these between the call expansion
1339 // and the node legalization. As such this pass basically does "really
1340 // late" legalization of these inline with the X86 isel pass.
1341 // FIXME: This should only happen when not compiled with -O0.
1342 switch (N->getOpcode()) {
1343 default: continue;
1344 case ISD::FP_ROUND:
1345 case ISD::FP_EXTEND:
1346 {
1347 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1348 MVT DstVT = N->getSimpleValueType(0);
1349
1350 // If any of the sources are vectors, no fp stack involved.
1351 if (SrcVT.isVector() || DstVT.isVector())
1352 continue;
1353
1354 // If the source and destination are SSE registers, then this is a legal
1355 // conversion that should not be lowered.
1356 const X86TargetLowering *X86Lowering =
1357 static_cast<const X86TargetLowering *>(TLI);
1358 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1359 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1360 if (SrcIsSSE && DstIsSSE)
1361 continue;
1362
1363 if (!SrcIsSSE && !DstIsSSE) {
1364 // If this is an FPStack extension, it is a noop.
1365 if (N->getOpcode() == ISD::FP_EXTEND)
1366 continue;
1367 // If this is a value-preserving FPStack truncation, it is a noop.
1368 if (N->getConstantOperandVal(1))
1369 continue;
1370 }
1371
1372 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1373 // FPStack has extload and truncstore. SSE can fold direct loads into other
1374 // operations. Based on this, decide what we want to do.
1375 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1376 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1377 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1378 MachinePointerInfo MPI =
1379 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1380 SDLoc dl(N);
1381
1382 // FIXME: optimize the case where the src/dest is a load or store?
1383
1384 SDValue Store = CurDAG->getTruncStore(
1385 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1386 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1387 MemTmp, MPI, MemVT);
1388
1389 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1390 // extload we created. This will cause general havoc on the DAG because
1391 // anything below the conversion could be folded into other existing nodes.
1392 // To avoid invalidating 'I', back it up to the convert node.
1393 --I;
1394 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1395 break;
1396 }
1397
1398 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1399 // dealing with the chain differently, as there is already a preexisting chain.
1400 case ISD::STRICT_FP_ROUND:
1401 case ISD::STRICT_FP_EXTEND:
1402 {
1403 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1404 MVT DstVT = N->getSimpleValueType(0);
1405
1406 // If any of the sources are vectors, no fp stack involved.
1407 if (SrcVT.isVector() || DstVT.isVector())
1408 continue;
1409
1410 // If the source and destination are SSE registers, then this is a legal
1411 // conversion that should not be lowered.
1412 const X86TargetLowering *X86Lowering =
1413 static_cast<const X86TargetLowering *>(TLI);
1414 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1415 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1416 if (SrcIsSSE && DstIsSSE)
1417 continue;
1418
1419 if (!SrcIsSSE && !DstIsSSE) {
1420 // If this is an FPStack extension, it is a noop.
1421 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1422 continue;
1423 // If this is a value-preserving FPStack truncation, it is a noop.
1424 if (N->getConstantOperandVal(2))
1425 continue;
1426 }
1427
1428 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1429 // FPStack has extload and truncstore. SSE can fold direct loads into other
1430 // operations. Based on this, decide what we want to do.
1431 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1432 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1433 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1434 MachinePointerInfo MPI =
1435 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1436 SDLoc dl(N);
1437
1438 // FIXME: optimize the case where the src/dest is a load or store?
1439
1440 // Since the operation is StrictFP, use the preexisting chain.
1441 SDValue Store, Result;
1442 if (!SrcIsSSE) {
1443 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1444 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1445 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1446 MPI, /*Align*/ std::nullopt,
1447 MachineMemOperand::MOStore);
1448 if (N->getFlags().hasNoFPExcept()) {
1449 SDNodeFlags Flags = Store->getFlags();
1450 Flags.setNoFPExcept(true);
1451 Store->setFlags(Flags);
1452 }
1453 } else {
1454 assert(SrcVT == MemVT && "Unexpected VT!");
1455 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1456 MPI);
1457 }
1458
1459 if (!DstIsSSE) {
1460 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1461 SDValue Ops[] = {Store, MemTmp};
1462 Result = CurDAG->getMemIntrinsicNode(
1463 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1464 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1465 if (N->getFlags().hasNoFPExcept()) {
1466 SDNodeFlags Flags = Result->getFlags();
1467 Flags.setNoFPExcept(true);
1468 Result->setFlags(Flags);
1469 }
1470 } else {
1471 assert(DstVT == MemVT && "Unexpected VT!");
1472 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1473 }
1474
1475 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1476 // extload we created. This will cause general havoc on the DAG because
1477 // anything below the conversion could be folded into other existing nodes.
1478 // To avoid invalidating 'I', back it up to the convert node.
1479 --I;
1480 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1481 break;
1482 }
1483 }
1484
1485
1486 // Now that we did that, the node is dead. Increment the iterator to the
1487 // next node to process, then delete N.
1488 ++I;
1489 MadeChange = true;
1490 }
1491
1492 // Remove any dead nodes that may have been left behind.
1493 if (MadeChange)
1494 CurDAG->RemoveDeadNodes();
1495}
1496
1497// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1498bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1499 unsigned Opc = N->getMachineOpcode();
1500 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1501 Opc != X86::MOVSX64rr8)
1502 return false;
1503
1504 SDValue N0 = N->getOperand(0);
1505
1506 // We need to be extracting the lower bit of an extend.
1507 if (!N0.isMachineOpcode() ||
1508 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1509 N0.getConstantOperandVal(1) != X86::sub_8bit)
1510 return false;
1511
1512 // We're looking for either a movsx or movzx to match the original opcode.
1513 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1514 : X86::MOVSX32rr8_NOREX;
1515 SDValue N00 = N0.getOperand(0);
1516 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1517 return false;
1518
1519 if (Opc == X86::MOVSX64rr8) {
1520 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1521 // to 64.
1522 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1523 MVT::i64, N00);
1524 ReplaceUses(N, Extend);
1525 } else {
1526 // Ok we can drop this extend and just use the original extend.
1527 ReplaceUses(N, N00.getNode());
1528 }
1529
1530 return true;
1531}
1532
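// Post-isel peepholes over the selected machine-opcode DAG, e.g. folding
// AND+TEST into TEST/CTEST, KAND+KORTEST into KTEST, and dropping register
// moves that only existed to zero the upper vector bits.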
1533void X86DAGToDAGISel::PostprocessISelDAG() {
1534 // Skip peepholes at -O0.
1535 if (TM.getOptLevel() == CodeGenOptLevel::None)
1536 return;
1537
1538 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1539
1540 bool MadeChange = false;
1541 while (Position != CurDAG->allnodes_begin()) {
1542 SDNode *N = &*--Position;
1543 // Skip dead nodes and any non-machine opcodes.
1544 if (N->use_empty() || !N->isMachineOpcode())
1545 continue;
1546
1547 if (tryOptimizeRem8Extend(N)) {
1548 MadeChange = true;
1549 continue;
1550 }
1551
1552 unsigned Opc = N->getMachineOpcode();
1553 switch (Opc) {
1554 default:
1555 continue;
1556 // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1557 case X86::TEST8rr:
1558 case X86::TEST16rr:
1559 case X86::TEST32rr:
1560 case X86::TEST64rr:
1561 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1562 case X86::CTEST8rr:
1563 case X86::CTEST16rr:
1564 case X86::CTEST32rr:
1565 case X86::CTEST64rr: {
1566 auto &Op0 = N->getOperand(0);
1567 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1568 !Op0.isMachineOpcode())
1569 continue;
1570 SDValue And = N->getOperand(0);
1571#define CASE_ND(OP) \
1572 case X86::OP: \
1573 case X86::OP##_ND:
1574 switch (And.getMachineOpcode()) {
1575 default:
1576 continue;
1577 CASE_ND(AND8rr)
1578 CASE_ND(AND16rr)
1579 CASE_ND(AND32rr)
1580 CASE_ND(AND64rr) {
1581 if (And->hasAnyUseOfValue(1))
1582 continue;
1583 SmallVector<SDValue> Ops(N->op_values());
1584 Ops[0] = And.getOperand(0);
1585 Ops[1] = And.getOperand(1);
1586 MachineSDNode *Test =
1587 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1588 ReplaceUses(N, Test);
1589 MadeChange = true;
1590 continue;
1591 }
1592 CASE_ND(AND8rm)
1593 CASE_ND(AND16rm)
1594 CASE_ND(AND32rm)
1595 CASE_ND(AND64rm) {
1596 if (And->hasAnyUseOfValue(1))
1597 continue;
1598 unsigned NewOpc;
1599 bool IsCTESTCC = X86::isCTESTCC(Opc);
1600#define FROM_TO(A, B) \
1601 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1602 break;
1603 switch (And.getMachineOpcode()) {
1604 FROM_TO(AND8rm, TEST8mr);
1605 FROM_TO(AND16rm, TEST16mr);
1606 FROM_TO(AND32rm, TEST32mr);
1607 FROM_TO(AND64rm, TEST64mr);
1608 }
1609#undef FROM_TO
1610#undef CASE_ND
1611 // Need to swap the memory and register operand.
1612 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1613 And.getOperand(3), And.getOperand(4),
1614 And.getOperand(5), And.getOperand(0)};
1615 // CC, Cflags.
1616 if (IsCTESTCC) {
1617 Ops.push_back(N->getOperand(2));
1618 Ops.push_back(N->getOperand(3));
1619 }
1620 // Chain of memory load
1621 Ops.push_back(And.getOperand(6));
1622 // Glue
1623 if (IsCTESTCC)
1624 Ops.push_back(N->getOperand(4));
1625
1626 MachineSDNode *Test = CurDAG->getMachineNode(
1627 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1628 CurDAG->setNodeMemRefs(
1629 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1630 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1631 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1632 MadeChange = true;
1633 continue;
1634 }
1635 }
1636 }
1637 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1638 // used. We're doing this late so we can prefer to fold the AND into masked
1639 // comparisons. Doing that can be better for the live range of the mask
1640 // register.
1641 case X86::KORTESTBrr:
1642 case X86::KORTESTWrr:
1643 case X86::KORTESTDrr:
1644 case X86::KORTESTQrr: {
1645 SDValue Op0 = N->getOperand(0);
1646 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1647 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1648 continue;
1649#define CASE(A) \
1650 case X86::A: \
1651 break;
1652 switch (Op0.getMachineOpcode()) {
1653 default:
1654 continue;
1655 CASE(KANDBrr)
1656 CASE(KANDWrr)
1657 CASE(KANDDrr)
1658 CASE(KANDQrr)
1659 }
1660 unsigned NewOpc;
1661#define FROM_TO(A, B) \
1662 case X86::A: \
1663 NewOpc = X86::B; \
1664 break;
1665 switch (Opc) {
1666 FROM_TO(KORTESTBrr, KTESTBrr)
1667 FROM_TO(KORTESTWrr, KTESTWrr)
1668 FROM_TO(KORTESTDrr, KTESTDrr)
1669 FROM_TO(KORTESTQrr, KTESTQrr)
1670 }
1671 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1672 // KAND instructions and KTEST use the same ISA feature.
1673 if (NewOpc == X86::KTESTWrr && !Subtarget->hasDQI())
1674 continue;
1675#undef FROM_TO
1676 MachineSDNode *KTest = CurDAG->getMachineNode(
1677 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1678 ReplaceUses(N, KTest);
1679 MadeChange = true;
1680 continue;
1681 }
1682 // Attempt to remove vector moves that were inserted to zero upper bits.
1683 case TargetOpcode::SUBREG_TO_REG: {
1684 unsigned SubRegIdx = N->getConstantOperandVal(2);
1685 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1686 continue;
1687
1688 SDValue Move = N->getOperand(1);
1689 if (!Move.isMachineOpcode())
1690 continue;
1691
1692 // Make sure it's one of the move opcodes we recognize.
1693 switch (Move.getMachineOpcode()) {
1694 default:
1695 continue;
1696 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1697 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1698 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1699 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1700 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1701 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1702 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1703 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1704 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1705 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1706 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1707 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1708 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1709 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1710 }
1711#undef CASE
1712
1713 SDValue In = Move.getOperand(0);
1714 if (!In.isMachineOpcode() ||
1715 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1716 continue;
1717
1718 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1719 // the SHA instructions which use a legacy encoding.
1720 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1721 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1722 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1723 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1724 continue;
1725
1726 // Producing instruction is another vector instruction. We can drop the
1727 // move.
1728 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1729 MadeChange = true;
1730 }
1731 }
1732 }
1733
1734 if (MadeChange)
1735 CurDAG->RemoveDeadNodes();
1736}
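// A minimal sketch of the AND+TEST fold above (registers are illustrative,
// not taken from the original source): when the only users of an AND result
// are the two operands of a TEST, TEST computes the same flags directly from
// the AND's inputs, so
//   andl %esi, %edi            testl %esi, %edi
//   testl %edi, %edi    -->    je   .LBB0_2
//   je   .LBB0_2
// and the memory form "andl (%rdx), %edi; testl %edi, %edi" roughly becomes
// "testl %edi, (%rdx)" (the TESTmr form with register and memory operands
// swapped).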
1737
1738
1739/// Emit any code that needs to be executed only in the main function.
1740void X86DAGToDAGISel::emitSpecialCodeForMain() {
1741 if (Subtarget->isTargetCygMing()) {
1742 TargetLowering::ArgListTy Args;
1743 auto &DL = CurDAG->getDataLayout();
1744
1745 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1746 CLI.setChain(CurDAG->getRoot())
1747 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1748 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1749 std::move(Args));
1750 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1751 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1752 CurDAG->setRoot(Result.second);
1753 }
1754}
1755
1756void X86DAGToDAGISel::emitFunctionEntryCode() {
1757 // If this is main, emit special code for main.
1758 const Function &F = MF->getFunction();
1759 if (F.hasExternalLinkage() && F.getName() == "main")
1760 emitSpecialCodeForMain();
1761}
1762
1763static bool isDispSafeForFrameIndex(int64_t Val) {
1764 // On 64-bit platforms, we can run into an issue where a frame index
1765 // includes a displacement that, when added to the explicit displacement,
1766 // will overflow the displacement field. Assuming that the frame index
1767 // displacement fits into a 31-bit integer (which is only slightly more
1768 // aggressive than the current fundamental assumption that it fits into
1769 // a 32-bit integer), a 31-bit disp should always be safe.
1770 return isInt<31>(Val);
1771}
1772
1773bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1774 X86ISelAddressMode &AM) {
1775 // We may have already matched a displacement and the caller just added the
1776 // symbolic displacement. So we still need to do the checks even if Offset
1777 // is zero.
1778
1779 int64_t Val = AM.Disp + Offset;
1780
1781 // Cannot combine ExternalSymbol displacements with integer offsets.
1782 if (Val != 0 && (AM.ES || AM.MCSym))
1783 return true;
1784
1785 CodeModel::Model M = TM.getCodeModel();
1786 if (Subtarget->is64Bit()) {
1787 if (Val != 0 &&
1788 !X86::isOffsetSuitableForCodeModel(Val, M,
1789 AM.hasSymbolicDisplacement()))
1790 return true;
1791 // In addition to the checks required for a register base, check that
1792 // we do not try to use an unsafe Disp with a frame index.
1793 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1794 !isDispSafeForFrameIndex(Val))
1795 return true;
1796 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1797 // 64 bits. Instructions with 32-bit register addresses perform this zero
1798 // extension for us and we can safely ignore the high bits of Offset.
1799 // Instructions with only a 32-bit immediate address do not, though: they
1800 // sign extend instead. This means only the low 2GB of address space
1801 // is directly addressable; we need indirect addressing for the high 2GB of
1802 // address space.
1803 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1804 // implicit zero extension of instructions would cover up any problem.
1805 // However, we have asserts elsewhere that get triggered if we do, so keep
1806 // the checks for now.
1807 // TODO: We would actually be able to accept these, as well as the same
1808 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1809 // to get an address size override to be emitted. However, this
1810 // pseudo-register is not part of any register class and therefore causes
1811 // MIR verification to fail.
1812 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1813 !AM.hasBaseOrIndexReg())
1814 return true;
1815 }
1816 AM.Disp = Val;
1817 return false;
1818}
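// A rough worked example of the displacement limit enforced above (assumed
// codegen, shown for illustration): an int access p[1000] has byte offset
// 4000, which fits the signed 32-bit Disp field and folds into the address:
//   movl 4000(%rdi), %eax
// but p[1000000000] has byte offset 4000000000, which does not fit, so the
// offset must be materialized in a register instead:
//   movl $4000000000, %eax
//   movl (%rdi,%rax), %eax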
1819
1820bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1821 bool AllowSegmentRegForX32) {
1822 SDValue Address = N->getOperand(1);
1823
1824 // load gs:0 -> GS segment register.
1825 // load fs:0 -> FS segment register.
1826 //
1827 // This optimization is generally valid because the GNU TLS model defines that
1828 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1829 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1830 // zero-extended to 64 bits and then added to the base address, which gives
1831 // unwanted results when the register holds a negative value.
1832 // For more information see http://people.redhat.com/drepper/tls.pdf
1833 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1834 !IndirectTlsSegRefs &&
1835 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1836 Subtarget->isTargetFuchsia())) {
1837 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1838 return true;
1839 switch (N->getPointerInfo().getAddrSpace()) {
1840 case X86AS::GS:
1841 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1842 return false;
1843 case X86AS::FS:
1844 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1845 return false;
1846 // Address space X86AS::SS is not handled here, because it is not used to
1847 // address TLS areas.
1848 }
1849 }
1850
1851 return true;
1852}
1853
1854/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1855/// mode. These wrap things that will resolve down into a symbol reference.
1856/// If no match is possible, this returns true, otherwise it returns false.
1857bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1858 // If the addressing mode already has a symbol as the displacement, we can
1859 // never match another symbol.
1860 if (AM.hasSymbolicDisplacement())
1861 return true;
1862
1863 bool IsRIPRelTLS = false;
1864 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1865 if (IsRIPRel) {
1866 SDValue Val = N.getOperand(0);
1867 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1868 IsRIPRelTLS = true;
1869 }
1870
1871 // We can't use an addressing mode in the 64-bit large code model.
1872 // Global TLS addressing is an exception. In the medium code model,
1873 // we can use a mode when RIP wrappers are present.
1874 // That signifies access to globals that are known to be "near",
1875 // such as the GOT itself.
1876 CodeModel::Model M = TM.getCodeModel();
1877 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1878 return true;
1879
1880 // Base and index reg must be 0 in order to use %rip as base.
1881 if (IsRIPRel && AM.hasBaseOrIndexReg())
1882 return true;
1883
1884 // Make a local copy in case we can't do this fold.
1885 X86ISelAddressMode Backup = AM;
1886
1887 int64_t Offset = 0;
1888 SDValue N0 = N.getOperand(0);
1889 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1890 AM.GV = G->getGlobal();
1891 AM.SymbolFlags = G->getTargetFlags();
1892 Offset = G->getOffset();
1893 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1894 AM.CP = CP->getConstVal();
1895 AM.Alignment = CP->getAlign();
1896 AM.SymbolFlags = CP->getTargetFlags();
1897 Offset = CP->getOffset();
1898 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1899 AM.ES = S->getSymbol();
1900 AM.SymbolFlags = S->getTargetFlags();
1901 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1902 AM.MCSym = S->getMCSymbol();
1903 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1904 AM.JT = J->getIndex();
1905 AM.SymbolFlags = J->getTargetFlags();
1906 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1907 AM.BlockAddr = BA->getBlockAddress();
1908 AM.SymbolFlags = BA->getTargetFlags();
1909 Offset = BA->getOffset();
1910 } else
1911 llvm_unreachable("Unhandled symbol reference node.");
1912
1913 // Can't use an addressing mode with large globals.
1914 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1915 TM.isLargeGlobalValue(AM.GV)) {
1916 AM = Backup;
1917 return true;
1918 }
1919
1920 if (foldOffsetIntoAddress(Offset, AM)) {
1921 AM = Backup;
1922 return true;
1923 }
1924
1925 if (IsRIPRel)
1926 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1927
1928 // Commit the changes now that we know this fold is safe.
1929 return false;
1930}
1931
1932/// Add the specified node to the specified addressing mode, returning true if
1933/// it cannot be done. This just pattern matches for the addressing mode.
1934bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1935 if (matchAddressRecursively(N, AM, 0))
1936 return true;
1937
1938 // Post-processing: Make a second attempt to fold a load, if we now know
1939 // that there will not be any other register. This is only performed for
1940 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1941 // any foldable load the first time.
1942 if (Subtarget->isTarget64BitILP32() &&
1943 AM.BaseType == X86ISelAddressMode::RegBase &&
1944 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1945 SDValue Save_Base_Reg = AM.Base_Reg;
1946 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1947 AM.Base_Reg = SDValue();
1948 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1949 AM.Base_Reg = Save_Base_Reg;
1950 }
1951 }
1952
1953 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1954 // a smaller encoding and avoids a scaled-index.
1955 if (AM.Scale == 2 &&
1956 AM.BaseType == X86ISelAddressMode::RegBase &&
1957 AM.Base_Reg.getNode() == nullptr) {
1958 AM.Base_Reg = AM.IndexReg;
1959 AM.Scale = 1;
1960 }
1961
1962 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1963 // because it has a smaller encoding.
1964 if (TM.getCodeModel() != CodeModel::Large &&
1965 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
1966 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1967 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1968 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1969 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1970 }
1971
1972 return false;
1973}
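// For illustration (encoding sizes worked out by hand, not from the original
// source): the scale-2, no-base form needs a SIB byte with no base register,
// which forces a 32-bit displacement, so
//   leal 0(,%rdi,2), %eax     # 7 bytes: 8D 04 7D 00 00 00 00
//   leal (%rdi,%rdi), %eax    # 3 bytes: 8D 04 3F
// which is why the post-processing above rewrites (,%reg,2) as (%reg,%reg).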
1974
1975bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1976 unsigned Depth) {
1977 // Add an artificial use to this node so that we can keep track of
1978 // it if it gets CSE'd with a different node.
1979 HandleSDNode Handle(N);
1980
1981 X86ISelAddressMode Backup = AM;
1982 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1983 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1984 return false;
1985 AM = Backup;
1986
1987 // Try again after commutating the operands.
1988 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1989 Depth + 1) &&
1990 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1991 return false;
1992 AM = Backup;
1993
1994 // If we couldn't fold both operands into the address at the same time,
1995 // see if we can just put each operand into a register and fold at least
1996 // the add.
1997 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1998 !AM.Base_Reg.getNode() &&
1999 !AM.IndexReg.getNode()) {
2000 N = Handle.getValue();
2001 AM.Base_Reg = N.getOperand(0);
2002 AM.IndexReg = N.getOperand(1);
2003 AM.Scale = 1;
2004 return false;
2005 }
2006 N = Handle.getValue();
2007 return true;
2008}
2009
2010// Insert a node into the DAG at least before the Pos node's position. This
2011// will reposition the node as needed, and will assign it a node ID that is <=
2012// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2013// IDs! The selection DAG must no longer depend on their uniqueness when this
2014// is used.
2015static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2016 if (N->getNodeId() == -1 ||
2017 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2018 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2019 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2020 // Mark Node as invalid for pruning as after this it may be a successor to a
2021 // selected node but otherwise be in the same position of Pos.
2022 // Conservatively mark it with the same -abs(Id) to assure node id
2023 // invariant is preserved.
2024 N->setNodeId(Pos->getNodeId());
2025 SelectionDAGISel::InvalidateNodeId(N.getNode());
2026 }
2027}
2028
2029// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2030// safe. This allows us to convert the shift and and into an h-register
2031// extract and a scaled index. Returns false if the simplification is
2032// performed.
2033 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2034 uint64_t Mask,
2035 SDValue Shift, SDValue X,
2036 X86ISelAddressMode &AM) {
2037 if (Shift.getOpcode() != ISD::SRL ||
2038 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2039 !Shift.hasOneUse())
2040 return true;
2041
2042 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2043 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2044 Mask != (0xffu << ScaleLog))
2045 return true;
2046
2047 MVT XVT = X.getSimpleValueType();
2048 MVT VT = N.getSimpleValueType();
2049 SDLoc DL(N);
2050 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2051 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2052 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2053 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2054 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2055 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2056 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2057
2058 // Insert the new nodes into the topological ordering. We must do this in
2059 // a valid topological ordering as nothing is going to go back and re-sort
2060 // these nodes. We continually insert before 'N' in sequence as this is
2061 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2062 // hierarchy left to express.
2063 insertDAGNode(DAG, N, Eight);
2064 insertDAGNode(DAG, N, NewMask);
2065 insertDAGNode(DAG, N, Srl);
2066 insertDAGNode(DAG, N, And);
2067 insertDAGNode(DAG, N, Ext);
2068 insertDAGNode(DAG, N, ShlCount);
2069 insertDAGNode(DAG, N, Shl);
2070 DAG.ReplaceAllUsesWith(N, Shl);
2071 DAG.RemoveDeadNode(N.getNode());
2072 AM.IndexReg = Ext;
2073 AM.Scale = (1 << ScaleLog);
2074 return false;
2075}
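// A hypothetical source-level trigger for the transform above (the function,
// registers, and exact codegen are illustrative assumptions): with C1 == 2,
//   int f(int x, const char *p) { return p[(x >> 6) & 0x3fc]; }
// has index (x >> 6) & 0x3fc == ((x >> 8) & 0xff) << 2, so it can become an
// h-register extract plus a scale-4 address, roughly:
//   movl %edi, %eax
//   movzbl %ah, %eax
//   movsbl (%rsi,%rax,4), %eax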
2076
2077// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2078// allows us to fold the shift into this addressing mode. Returns false if the
2079// transform succeeded.
2080 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2081 X86ISelAddressMode &AM) {
2082 SDValue Shift = N.getOperand(0);
2083
2084 // Use a signed mask so that shifting right will insert sign bits. These
2085 // bits will be removed when we shift the result left so it doesn't matter
2086 // what we use. This might allow a smaller immediate encoding.
2087 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2088
2089 // If we have an any_extend feeding the AND, look through it to see if there
2090 // is a shift behind it. But only if the AND doesn't use the extended bits.
2091 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2092 bool FoundAnyExtend = false;
2093 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2094 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2095 isUInt<32>(Mask)) {
2096 FoundAnyExtend = true;
2097 Shift = Shift.getOperand(0);
2098 }
2099
2100 if (Shift.getOpcode() != ISD::SHL ||
2101 !isa<ConstantSDNode>(Shift.getOperand(1)))
2102 return true;
2103
2104 SDValue X = Shift.getOperand(0);
2105
2106 // Not likely to be profitable if either the AND or SHIFT node has more
2107 // than one use (unless all uses are for address computation). Besides,
2108 // isel mechanism requires their node ids to be reused.
2109 if (!N.hasOneUse() || !Shift.hasOneUse())
2110 return true;
2111
2112 // Verify that the shift amount is something we can fold.
2113 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2114 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2115 return true;
2116
2117 MVT VT = N.getSimpleValueType();
2118 SDLoc DL(N);
2119 if (FoundAnyExtend) {
2120 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2121 insertDAGNode(DAG, N, NewX);
2122 X = NewX;
2123 }
2124
2125 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
2126 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2127 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2128
2129 // Insert the new nodes into the topological ordering. We must do this in
2130 // a valid topological ordering as nothing is going to go back and re-sort
2131 // these nodes. We continually insert before 'N' in sequence as this is
2132 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2133 // hierarchy left to express.
2134 insertDAGNode(DAG, N, NewMask);
2135 insertDAGNode(DAG, N, NewAnd);
2136 insertDAGNode(DAG, N, NewShift);
2137 DAG.ReplaceAllUsesWith(N, NewShift);
2138 DAG.RemoveDeadNode(N.getNode());
2139
2140 AM.Scale = 1 << ShiftAmt;
2141 AM.IndexReg = NewAnd;
2142 return false;
2143}
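// A small illustrative case of the swap above (values and registers chosen
// for the example): for an index such as
//   p[(x << 2) & 0x3fc]      // (x << 2) & 0x3fc == (x & 0xff) << 2
// the AND can be done on the unshifted value and the << 2 absorbed into the
// addressing mode, roughly:
//   andl $255, %edi
//   movsbl (%rsi,%rdi,4), %eax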
2144
2145// Implement some heroics to detect shifts of masked values where the mask can
2146// be replaced by extending the shift and undoing that in the addressing mode
2147// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2148// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2149// the addressing mode. This results in code such as:
2150//
2151// int f(short *y, int *lookup_table) {
2152// ...
2153// return *y + lookup_table[*y >> 11];
2154// }
2155//
2156// Turning into:
2157// movzwl (%rdi), %eax
2158// movl %eax, %ecx
2159// shrl $11, %ecx
2160// addl (%rsi,%rcx,4), %eax
2161//
2162// Instead of:
2163// movzwl (%rdi), %eax
2164// movl %eax, %ecx
2165// shrl $9, %ecx
2166 // andl $124, %ecx
2167// addl (%rsi,%rcx), %eax
2168//
2169// Note that this function assumes the mask is provided as a mask *after* the
2170// value is shifted. The input chain may or may not match that, but computing
2171// such a mask is trivial.
2172 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2173 uint64_t Mask,
2174 SDValue Shift, SDValue X,
2175 X86ISelAddressMode &AM) {
2176 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2177 !isa<ConstantSDNode>(Shift.getOperand(1)))
2178 return true;
2179
2180 // We need to ensure that mask is a continuous run of bits.
2181 unsigned MaskIdx, MaskLen;
2182 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2183 return true;
2184 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2185
2186 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2187
2188 // The amount of shift we're trying to fit into the addressing mode is taken
2189 // from the shifted mask index (number of trailing zeros of the mask).
2190 unsigned AMShiftAmt = MaskIdx;
2191
2192 // There is nothing we can do here unless the mask is removing some bits.
2193 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2194 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2195
2196 // Scale the leading zero count down based on the actual size of the value.
2197 // Also scale it down based on the size of the shift.
2198 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2199 if (MaskLZ < ScaleDown)
2200 return true;
2201 MaskLZ -= ScaleDown;
2202
2203 // The final check is to ensure that any masked out high bits of X are
2204 // already known to be zero. Otherwise, the mask has a semantic impact
2205 // other than masking out a couple of low bits. Unfortunately, because of
2206 // the mask, zero extensions will be removed from operands in some cases.
2207 // This code works extra hard to look through extensions because we can
2208 // replace them with zero extensions cheaply if necessary.
2209 bool ReplacingAnyExtend = false;
2210 if (X.getOpcode() == ISD::ANY_EXTEND) {
2211 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2212 X.getOperand(0).getSimpleValueType().getSizeInBits();
2213 // Assume that we'll replace the any-extend with a zero-extend, and
2214 // narrow the search to the extended value.
2215 X = X.getOperand(0);
2216 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2217 ReplacingAnyExtend = true;
2218 }
2219 APInt MaskedHighBits =
2220 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2221 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2222 return true;
2223
2224 // We've identified a pattern that can be transformed into a single shift
2225 // and an addressing mode. Make it so.
2226 MVT VT = N.getSimpleValueType();
2227 if (ReplacingAnyExtend) {
2228 assert(X.getValueType() != VT);
2229 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2230 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2231 insertDAGNode(DAG, N, NewX);
2232 X = NewX;
2233 }
2234
2235 MVT XVT = X.getSimpleValueType();
2236 SDLoc DL(N);
2237 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2238 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2239 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2240 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2241 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2242
2243 // Insert the new nodes into the topological ordering. We must do this in
2244 // a valid topological ordering as nothing is going to go back and re-sort
2245 // these nodes. We continually insert before 'N' in sequence as this is
2246 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2247 // hierarchy left to express.
2248 insertDAGNode(DAG, N, NewSRLAmt);
2249 insertDAGNode(DAG, N, NewSRL);
2250 insertDAGNode(DAG, N, NewExt);
2251 insertDAGNode(DAG, N, NewSHLAmt);
2252 insertDAGNode(DAG, N, NewSHL);
2253 DAG.ReplaceAllUsesWith(N, NewSHL);
2254 DAG.RemoveDeadNode(N.getNode());
2255
2256 AM.Scale = 1 << AMShiftAmt;
2257 AM.IndexReg = NewExt;
2258 return false;
2259}
2260
2261// Transform "(X >> SHIFT) & (MASK << C1)" to
2262// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2263// matched to a BEXTR later. Returns false if the simplification is performed.
2264 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2265 uint64_t Mask,
2266 SDValue Shift, SDValue X,
2267 X86ISelAddressMode &AM,
2268 const X86Subtarget &Subtarget) {
2269 if (Shift.getOpcode() != ISD::SRL ||
2270 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2271 !Shift.hasOneUse() || !N.hasOneUse())
2272 return true;
2273
2274 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2275 if (!Subtarget.hasTBM() &&
2276 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2277 return true;
2278
2279 // We need to ensure that mask is a continuous run of bits.
2280 unsigned MaskIdx, MaskLen;
2281 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2282 return true;
2283
2284 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2285
2286 // The amount of shift we're trying to fit into the addressing mode is taken
2287 // from the shifted mask index (number of trailing zeros of the mask).
2288 unsigned AMShiftAmt = MaskIdx;
2289
2290 // There is nothing we can do here unless the mask is removing some bits.
2291 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2292 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2293
2294 MVT XVT = X.getSimpleValueType();
2295 MVT VT = N.getSimpleValueType();
2296 SDLoc DL(N);
2297 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2298 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2299 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2300 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2301 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2302 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2303 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2304
2305 // Insert the new nodes into the topological ordering. We must do this in
2306 // a valid topological ordering as nothing is going to go back and re-sort
2307 // these nodes. We continually insert before 'N' in sequence as this is
2308 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2309 // hierarchy left to express.
2310 insertDAGNode(DAG, N, NewSRLAmt);
2311 insertDAGNode(DAG, N, NewSRL);
2312 insertDAGNode(DAG, N, NewMask);
2313 insertDAGNode(DAG, N, NewAnd);
2314 insertDAGNode(DAG, N, NewExt);
2315 insertDAGNode(DAG, N, NewSHLAmt);
2316 insertDAGNode(DAG, N, NewSHL);
2317 DAG.ReplaceAllUsesWith(N, NewSHL);
2318 DAG.RemoveDeadNode(N.getNode());
2319
2320 AM.Scale = 1 << AMShiftAmt;
2321 AM.IndexReg = NewExt;
2322 return false;
2323}
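// Illustrative only (registers and the exact control value are assumptions):
// on a target with TBM or fast BEXTR, an index such as
//   p[(x >> 5) & (0xfff << 2)]    // == ((x >> 7) & 0xfff) << 2
// can select the inner extract as BEXTR (start 7, length 12 -> control
// 0x0c07) and fold the << 2 into a scale-4 address, roughly:
//   movl $0x0c07, %ecx
//   bextrl %ecx, %edi, %ecx
//   movsbl (%rsi,%rcx,4), %eax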
2324
2325// Attempt to peek further into a scaled index register, collecting additional
2326 // extensions / offsets / etc. Returns \p N if we can't peek any further.
2327SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2328 X86ISelAddressMode &AM,
2329 unsigned Depth) {
2330 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2331 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2332 "Illegal index scale");
2333
2334 // Limit recursion.
2335 if (Depth >= SelectionDAG::MaxRecursionDepth)
2336 return N;
2337
2338 EVT VT = N.getValueType();
2339 unsigned Opc = N.getOpcode();
2340
2341 // index: add(x,c) -> index: x, disp + c
2342 if (CurDAG->isBaseWithConstantOffset(N)) {
2343 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2344 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2345 if (!foldOffsetIntoAddress(Offset, AM))
2346 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2347 }
2348
2349 // index: add(x,x) -> index: x, scale * 2
2350 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2351 if (AM.Scale <= 4) {
2352 AM.Scale *= 2;
2353 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2354 }
2355 }
2356
2357 // index: shl(x,i) -> index: x, scale * (1 << i)
2358 if (Opc == X86ISD::VSHLI) {
2359 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2360 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2361 if ((AM.Scale * ScaleAmt) <= 8) {
2362 AM.Scale *= ScaleAmt;
2363 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2364 }
2365 }
2366
2367 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2368 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2369 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2370 SDValue Src = N.getOperand(0);
2371 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2372 Src.hasOneUse()) {
2373 if (CurDAG->isBaseWithConstantOffset(Src)) {
2374 SDValue AddSrc = Src.getOperand(0);
2375 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2376 uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2377 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2378 SDLoc DL(N);
2379 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2380 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2381 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2382 insertDAGNode(*CurDAG, N, ExtSrc);
2383 insertDAGNode(*CurDAG, N, ExtVal);
2384 insertDAGNode(*CurDAG, N, ExtAdd);
2385 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2386 CurDAG->RemoveDeadNode(N.getNode());
2387 return ExtSrc;
2388 }
2389 }
2390 }
2391 }
2392
2393 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2394 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2395 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2396 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2397 SDValue Src = N.getOperand(0);
2398 unsigned SrcOpc = Src.getOpcode();
2399 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2400 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2401 Src.hasOneUse()) {
2402 if (CurDAG->isBaseWithConstantOffset(Src)) {
2403 SDValue AddSrc = Src.getOperand(0);
2404 uint64_t Offset = Src.getConstantOperandVal(1);
2405 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2406 SDLoc DL(N);
2407 SDValue Res;
2408 // If we're also scaling, see if we can use that as well.
2409 if (AddSrc.getOpcode() == ISD::SHL &&
2410 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2411 SDValue ShVal = AddSrc.getOperand(0);
2412 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2413 APInt HiBits =
2414 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2415 uint64_t ScaleAmt = 1ULL << ShAmt;
2416 if ((AM.Scale * ScaleAmt) <= 8 &&
2417 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2418 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2419 AM.Scale *= ScaleAmt;
2420 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2421 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2422 AddSrc.getOperand(1));
2423 insertDAGNode(*CurDAG, N, ExtShVal);
2424 insertDAGNode(*CurDAG, N, ExtShift);
2425 AddSrc = ExtShift;
2426 Res = ExtShVal;
2427 }
2428 }
2429 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2430 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2431 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2432 insertDAGNode(*CurDAG, N, ExtSrc);
2433 insertDAGNode(*CurDAG, N, ExtVal);
2434 insertDAGNode(*CurDAG, N, ExtAdd);
2435 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2436 CurDAG->RemoveDeadNode(N.getNode());
2437 return Res ? Res : ExtSrc;
2438 }
2439 }
2440 }
2441 }
2442
2443 // TODO: Handle extensions, shifted masks etc.
2444 return N;
2445}
2446
2447bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2448 unsigned Depth) {
2449 SDLoc dl(N);
2450 LLVM_DEBUG({
2451 dbgs() << "MatchAddress: ";
2452 AM.dump(CurDAG);
2453 });
2454 // Limit recursion.
2455 if (Depth >= SelectionDAG::MaxRecursionDepth)
2456 return matchAddressBase(N, AM);
2457
2458 // If this is already a %rip relative address, we can only merge immediates
2459 // into it. Instead of handling this in every case, we handle it here.
2460 // RIP relative addressing: %rip + 32-bit displacement!
2461 if (AM.isRIPRelative()) {
2462 // FIXME: JumpTable and ExternalSymbol address currently don't like
2463 // displacements. It isn't very important, but this should be fixed for
2464 // consistency.
2465 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2466 return true;
2467
2468 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2469 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2470 return false;
2471 return true;
2472 }
2473
2474 switch (N.getOpcode()) {
2475 default: break;
2476 case ISD::LOCAL_RECOVER: {
2477 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2478 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2479 // Use the symbol and don't prefix it.
2480 AM.MCSym = ESNode->getMCSymbol();
2481 return false;
2482 }
2483 break;
2484 }
2485 case ISD::Constant: {
2486 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2487 if (!foldOffsetIntoAddress(Val, AM))
2488 return false;
2489 break;
2490 }
2491
2492 case X86ISD::Wrapper:
2493 case X86ISD::WrapperRIP:
2494 if (!matchWrapper(N, AM))
2495 return false;
2496 break;
2497
2498 case ISD::LOAD:
2499 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2500 return false;
2501 break;
2502
2503 case ISD::FrameIndex:
2504 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2505 AM.Base_Reg.getNode() == nullptr &&
2506 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2507 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2508 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2509 return false;
2510 }
2511 break;
2512
2513 case ISD::SHL:
2514 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2515 break;
2516
2517 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2518 unsigned Val = CN->getZExtValue();
2519 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2520 // that the base operand remains free for further matching. If
2521 // the base doesn't end up getting used, a post-processing step
2522 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2523 if (Val == 1 || Val == 2 || Val == 3) {
2524 SDValue ShVal = N.getOperand(0);
2525 AM.Scale = 1 << Val;
2526 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2527 return false;
2528 }
2529 }
2530 break;
2531
2532 case ISD::SRL: {
2533 // Scale must not be used already.
2534 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2535
2536 // We only handle up to 64-bit values here as those are what matter for
2537 // addressing mode optimizations.
2538 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2539 "Unexpected value size!");
2540
2541 SDValue And = N.getOperand(0);
2542 if (And.getOpcode() != ISD::AND) break;
2543 SDValue X = And.getOperand(0);
2544
2545 // The mask used for the transform is expected to be post-shift, but we
2546 // found the shift first so just apply the shift to the mask before passing
2547 // it down.
2548 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2549 !isa<ConstantSDNode>(And.getOperand(1)))
2550 break;
2551 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2552
2553 // Try to fold the mask and shift into the scale, and return false if we
2554 // succeed.
2555 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2556 return false;
2557 break;
2558 }
2559
2560 case ISD::SMUL_LOHI:
2561 case ISD::UMUL_LOHI:
2562 // A mul_lohi where we need the low part can be folded as a plain multiply.
2563 if (N.getResNo() != 0) break;
2564 [[fallthrough]];
2565 case ISD::MUL:
2566 case X86ISD::MUL_IMM:
2567 // X*[3,5,9] -> X+X*[2,4,8]
2568 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2569 AM.Base_Reg.getNode() == nullptr &&
2570 AM.IndexReg.getNode() == nullptr) {
2571 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2572 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2573 CN->getZExtValue() == 9) {
2574 AM.Scale = unsigned(CN->getZExtValue())-1;
2575
2576 SDValue MulVal = N.getOperand(0);
2577 SDValue Reg;
2578
2579 // Okay, we know that we have a scale by now. However, if the scaled
2580 // value is an add of something and a constant, we can fold the
2581 // constant into the disp field here.
2582 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2583 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2584 Reg = MulVal.getOperand(0);
2585 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2586 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2587 if (foldOffsetIntoAddress(Disp, AM))
2588 Reg = N.getOperand(0);
2589 } else {
2590 Reg = N.getOperand(0);
2591 }
2592
2593 AM.IndexReg = AM.Base_Reg = Reg;
2594 return false;
2595 }
2596 }
2597 break;
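// (Illustration of the X*[3,5,9] case handled above; registers are an
// assumption.) A multiply by 5 becomes base + index with scale 4:
//   int f(int x) { return x * 5; }   -->   leal (%rdi,%rdi,4), %eax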
2598
2599 case ISD::SUB: {
2600 // Given A-B, if A can be completely folded into the address, leaving
2601 // the index field unused, use -B as the index.
2602 // This is a win if A has multiple parts that can be folded into
2603 // the address. Also, this saves a mov if the base register has
2604 // other uses, since it avoids a two-address sub instruction; however,
2605 // it costs an additional mov if the index register has other uses.
2606
2607 // Add an artificial use to this node so that we can keep track of
2608 // it if it gets CSE'd with a different node.
2609 HandleSDNode Handle(N);
2610
2611 // Test if the LHS of the sub can be folded.
2612 X86ISelAddressMode Backup = AM;
2613 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2614 N = Handle.getValue();
2615 AM = Backup;
2616 break;
2617 }
2618 N = Handle.getValue();
2619 // Test if the index field is free for use.
2620 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2621 AM = Backup;
2622 break;
2623 }
2624
2625 int Cost = 0;
2626 SDValue RHS = N.getOperand(1);
2627 // If the RHS involves a register with multiple uses, this
2628 // transformation incurs an extra mov, due to the neg instruction
2629 // clobbering its operand.
2630 if (!RHS.getNode()->hasOneUse() ||
2631 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2632 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2633 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2634 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2635 RHS.getOperand(0).getValueType() == MVT::i32))
2636 ++Cost;
2637 // If the base is a register with multiple uses, this
2638 // transformation may save a mov.
2639 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2640 !AM.Base_Reg.getNode()->hasOneUse()) ||
2641 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2642 --Cost;
2643 // If the folded LHS was interesting, this transformation saves
2644 // address arithmetic.
2645 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2646 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2647 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2648 --Cost;
2649 // If it doesn't look like it may be an overall win, don't do it.
2650 if (Cost >= 0) {
2651 AM = Backup;
2652 break;
2653 }
2654
2655 // Ok, the transformation is legal and appears profitable. Go for it.
2656 // Negation will be emitted later to avoid creating dangling nodes if this
2657 // was an unprofitable LEA.
2658 AM.IndexReg = RHS;
2659 AM.NegateIndex = true;
2660 AM.Scale = 1;
2661 return false;
2662 }
2663
2664 case ISD::OR:
2665 case ISD::XOR:
2666 // See if we can treat the OR/XOR node as an ADD node.
2667 if (!CurDAG->isADDLike(N))
2668 break;
2669 [[fallthrough]];
2670 case ISD::ADD:
2671 if (!matchAdd(N, AM, Depth))
2672 return false;
2673 break;
2674
2675 case ISD::AND: {
2676 // Perform some heroic transforms on an and of a constant-count shift
2677 // with a constant to enable use of the scaled offset field.
2678
2679 // Scale must not be used already.
2680 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2681
2682 // We only handle up to 64-bit values here as those are what matter for
2683 // addressing mode optimizations.
2684 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2685 "Unexpected value size!");
2686
2687 if (!isa<ConstantSDNode>(N.getOperand(1)))
2688 break;
2689
2690 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2691 SDValue Shift = N.getOperand(0);
2692 SDValue X = Shift.getOperand(0);
2693
2694 uint64_t Mask = N.getConstantOperandVal(1);
2695
2696 // Try to fold the mask and shift into an extract and scale.
2697 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2698 return false;
2699
2700 // Try to fold the mask and shift directly into the scale.
2701 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2702 return false;
2703
2704 // Try to fold the mask and shift into BEXTR and scale.
2705 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2706 return false;
2707 }
2708
2709 // Try to swap the mask and shift to place shifts which can be done as
2710 // a scale on the outside of the mask.
2711 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2712 return false;
2713
2714 break;
2715 }
2716 case ISD::ZERO_EXTEND: {
2717 // Try to widen a zexted shift left to the same size as its use, so we can
2718 // match the shift as a scale factor.
2719 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2720 break;
2721
2722 SDValue Src = N.getOperand(0);
2723
2724 // See if we can match a zext(addlike(x,c)).
2725 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2726 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2727 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2728 if (Index != N) {
2729 AM.IndexReg = Index;
2730 return false;
2731 }
2732
2733 // Peek through mask: zext(and(shl(x,c1),c2))
2734 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2735 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2736 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2737 Mask = MaskC->getAPIntValue();
2738 Src = Src.getOperand(0);
2739 }
2740
2741 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
2742 // Give up if the shift is not a valid scale factor [1,2,3].
2743 SDValue ShlSrc = Src.getOperand(0);
2744 SDValue ShlAmt = Src.getOperand(1);
2745 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2746 if (!ShAmtC)
2747 break;
2748 unsigned ShAmtV = ShAmtC->getZExtValue();
2749 if (ShAmtV > 3)
2750 break;
2751
2752 // The narrow shift must only shift out zero bits (it must be 'nuw').
2753 // That makes it safe to widen to the destination type.
2754 APInt HighZeros =
2755 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2756 if (!Src->getFlags().hasNoUnsignedWrap() &&
2757 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2758 break;
2759
2760 // zext (shl nuw i8 %x, C1) to i32
2761 // --> shl (zext i8 %x to i32), (zext C1)
2762 // zext (and (shl nuw i8 %x, C1), C2) to i32
2763 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2764 MVT SrcVT = ShlSrc.getSimpleValueType();
2765 MVT VT = N.getSimpleValueType();
2766 SDLoc DL(N);
2767
2768 SDValue Res = ShlSrc;
2769 if (!Mask.isAllOnes()) {
2770 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2771 insertDAGNode(*CurDAG, N, Res);
2772 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2773 insertDAGNode(*CurDAG, N, Res);
2774 }
2775 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2776 insertDAGNode(*CurDAG, N, Zext);
2777 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2778 insertDAGNode(*CurDAG, N, NewShl);
2779 CurDAG->ReplaceAllUsesWith(N, NewShl);
2780 CurDAG->RemoveDeadNode(N.getNode());
2781
2782 // Convert the shift to scale factor.
2783 AM.Scale = 1 << ShAmtV;
2784 // If matchIndexRecursively is not called here, Zext may be replaced
2785 // by other nodes but later still be used to call a builder
2786 // method.
2787 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2788 return false;
2789 }
2790
2791 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2792 // Try to fold the mask and shift into an extract and scale.
2793 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2794 Src.getOperand(0), AM))
2795 return false;
2796
2797 // Try to fold the mask and shift directly into the scale.
2798 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2799 Src.getOperand(0), AM))
2800 return false;
2801
2802 // Try to fold the mask and shift into BEXTR and scale.
2803 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2804 Src.getOperand(0), AM, *Subtarget))
2805 return false;
2806 }
2807
2808 break;
2809 }
2810 }
2811
2812 return matchAddressBase(N, AM);
2813}
2814
2815/// Helper for MatchAddress. Add the specified node to the
2816/// specified addressing mode without any further recursion.
2817bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2818 // Is the base register already occupied?
2819 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2820 // If so, check to see if the scale index register is set.
2821 if (!AM.IndexReg.getNode()) {
2822 AM.IndexReg = N;
2823 AM.Scale = 1;
2824 return false;
2825 }
2826
2827 // Otherwise, we cannot select it.
2828 return true;
2829 }
2830
2831 // Default, generate it as a register.
2832 AM.BaseType = X86ISelAddressMode::RegBase;
2833 AM.Base_Reg = N;
2834 return false;
2835}
2836
2837bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2838 X86ISelAddressMode &AM,
2839 unsigned Depth) {
2840 SDLoc dl(N);
2841 LLVM_DEBUG({
2842 dbgs() << "MatchVectorAddress: ";
2843 AM.dump(CurDAG);
2844 });
2845 // Limit recursion.
2846 if (Depth >= SelectionDAG::MaxRecursionDepth)
2847 return matchAddressBase(N, AM);
2848
2849 // TODO: Support other operations.
2850 switch (N.getOpcode()) {
2851 case ISD::Constant: {
2852 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2853 if (!foldOffsetIntoAddress(Val, AM))
2854 return false;
2855 break;
2856 }
2857 case X86ISD::Wrapper:
2858 if (!matchWrapper(N, AM))
2859 return false;
2860 break;
2861 case ISD::ADD: {
2862 // Add an artificial use to this node so that we can keep track of
2863 // it if it gets CSE'd with a different node.
2864 HandleSDNode Handle(N);
2865
2866 X86ISelAddressMode Backup = AM;
2867 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2868 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2869 Depth + 1))
2870 return false;
2871 AM = Backup;
2872
2873 // Try again after commuting the operands.
2874 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2875 Depth + 1) &&
2876 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2877 Depth + 1))
2878 return false;
2879 AM = Backup;
2880
2881 N = Handle.getValue();
2882 break;
2883 }
2884 }
2885
2886 return matchAddressBase(N, AM);
2887}
2888
2889/// Helper for selectVectorAddr. Handles things that can be folded into a
2890/// gather/scatter address. The index register and scale should have already
2891/// been handled.
2892bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2893 return matchVectorAddressRecursively(N, AM, 0);
2894}
2895
2896bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2897 SDValue IndexOp, SDValue ScaleOp,
2898 SDValue &Base, SDValue &Scale,
2899 SDValue &Index, SDValue &Disp,
2900 SDValue &Segment) {
2901 X86ISelAddressMode AM;
2902 AM.Scale = ScaleOp->getAsZExtVal();
2903
2904 // Attempt to match index patterns, as long as we're not relying on implicit
2905 // sign-extension, which is performed BEFORE scale.
2906 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2907 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2908 else
2909 AM.IndexReg = IndexOp;
2910
2911 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2912 if (AddrSpace == X86AS::GS)
2913 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2914 if (AddrSpace == X86AS::FS)
2915 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2916 if (AddrSpace == X86AS::SS)
2917 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2918
2919 SDLoc DL(BasePtr);
2920 MVT VT = BasePtr.getSimpleValueType();
2921
2922 // Try to match into the base and displacement fields.
2923 if (matchVectorAddress(BasePtr, AM))
2924 return false;
2925
2926 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2927 return true;
2928}
2929
2930/// Returns true if it is able to pattern match an addressing mode.
2931/// It returns the operands which make up the maximal addressing mode it can
2932/// match by reference.
2933///
2934/// Parent is the parent node of the addr operand that is being matched. It
2935/// is always a load, store, atomic node, or null. It is only null when
2936/// checking memory operands for inline asm nodes.
2937bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2938 SDValue &Scale, SDValue &Index,
2939 SDValue &Disp, SDValue &Segment) {
2940 X86ISelAddressMode AM;
2941
2942 if (Parent &&
2943 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2944 // that are not a MemSDNode, and thus don't have proper addrspace info.
2945 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2946 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2947 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2948 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2949 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2950 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2951 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2952 unsigned AddrSpace =
2953 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2954 if (AddrSpace == X86AS::GS)
2955 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2956 if (AddrSpace == X86AS::FS)
2957 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2958 if (AddrSpace == X86AS::SS)
2959 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2960 }
2961
2962 // Save the DL and VT before calling matchAddress, it can invalidate N.
2963 SDLoc DL(N);
2964 MVT VT = N.getSimpleValueType();
2965
2966 if (matchAddress(N, AM))
2967 return false;
2968
2969 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2970 return true;
2971}
2972
2973bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2974 // Cannot use 32 bit constants to reference objects in kernel/large code
2975 // model.
2976 if (TM.getCodeModel() == CodeModel::Kernel ||
2977 TM.getCodeModel() == CodeModel::Large)
2978 return false;
2979
2980 // In static codegen with small code model, we can get the address of a label
2981 // into a register with 'movl'
2982 if (N->getOpcode() != X86ISD::Wrapper)
2983 return false;
2984
2985 N = N.getOperand(0);
2986
2987 // At least GNU as does not accept 'movl' for TPOFF relocations.
2988 // FIXME: We could use 'movl' when we know we are targeting MC.
2989 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2990 return false;
2991
2992 Imm = N;
2993 // Small/medium code model can reference non-TargetGlobalAddress objects with
2994 // 32 bit constants.
2995 if (N->getOpcode() != ISD::TargetGlobalAddress) {
2996 return TM.getCodeModel() == CodeModel::Small ||
2997 TM.getCodeModel() == CodeModel::Medium;
2998 }
2999
3000 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3001 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3002 return CR->getUnsignedMax().ult(1ull << 32);
3003
3004 return !TM.isLargeGlobalValue(GV);
3005}
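// A rough illustration (symbol name is an assumption): in the small or medium
// code model a 64-bit symbol address can be materialized with a 32-bit
// zero-extended immediate,
//   movl $global_var, %eax       # upper 32 bits of %rax become zero
// whereas the large code model, rejected above, needs the full 64-bit form:
//   movabsq $global_var, %rax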
3006
3007bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3008 SDValue &Scale, SDValue &Index,
3009 SDValue &Disp, SDValue &Segment) {
3010 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3011 SDLoc DL(N);
3012
3013 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3014 return false;
3015
3016 auto *RN = dyn_cast<RegisterSDNode>(Base);
3017 if (RN && RN->getReg() == 0)
3018 Base = CurDAG->getRegister(0, MVT::i64);
3019 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3020 // Base could already be %rip, particularly in the x32 ABI.
3021 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3022 MVT::i64), 0);
3023 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3024 Base);
3025 }
3026
3027 RN = dyn_cast<RegisterSDNode>(Index);
3028 if (RN && RN->getReg() == 0)
3029 Index = CurDAG->getRegister(0, MVT::i64);
3030 else {
3031 assert(Index.getValueType() == MVT::i32 &&
3032 "Expect to be extending 32-bit registers for use in LEA");
3033 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3034 MVT::i64), 0);
3035 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3036 Index);
3037 }
3038
3039 return true;
3040}
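// For illustration (assumed C input and registers): the LEA64_32r form
// computes a 32-bit result from 64-bit base/index registers, which is why the
// 32-bit operands above are widened through IMPLICIT_DEF + subreg inserts:
//   unsigned f(unsigned a, unsigned b) { return a + b + 4; }
//   -->  leal 4(%rdi,%rsi), %eax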
3041
3042/// Calls SelectAddr and determines if the maximal addressing
3043/// mode it matches can be cost effectively emitted as an LEA instruction.
3044bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3045 SDValue &Base, SDValue &Scale,
3046 SDValue &Index, SDValue &Disp,
3047 SDValue &Segment) {
3048 X86ISelAddressMode AM;
3049
3050 // Save the DL and VT before calling matchAddress, it can invalidate N.
3051 SDLoc DL(N);
3052 MVT VT = N.getSimpleValueType();
3053
3054 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3055 // segments.
3056 SDValue Copy = AM.Segment;
3057 SDValue T = CurDAG->getRegister(0, MVT::i32);
3058 AM.Segment = T;
3059 if (matchAddress(N, AM))
3060 return false;
3061 assert (T == AM.Segment);
3062 AM.Segment = Copy;
3063
3064 unsigned Complexity = 0;
3065 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3066 Complexity = 1;
3067 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3068 Complexity = 4;
3069
3070 if (AM.IndexReg.getNode())
3071 Complexity++;
3072
3073 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3074 // a simple shift.
3075 if (AM.Scale > 1)
3076 Complexity++;
3077
3078 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3079 // to a LEA. This is determined with some experimentation but is by no means
3080 // optimal (especially for code size consideration). LEA is nice because of
3081 // its three-address nature. Tweak the cost function again when we can run
3082 // convertToThreeAddress() at register allocation time.
3083 if (AM.hasSymbolicDisplacement()) {
3084 // For X86-64, always use LEA to materialize RIP-relative addresses.
3085 if (Subtarget->is64Bit())
3086 Complexity = 4;
3087 else
3088 Complexity += 2;
3089 }
3090
3091 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3092 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3093 // duplicating flag-producing instructions later in the pipeline.
3094 if (N.getOpcode() == ISD::ADD) {
3095 auto isMathWithFlags = [](SDValue V) {
3096 switch (V.getOpcode()) {
3097 case X86ISD::ADD:
3098 case X86ISD::SUB:
3099 case X86ISD::ADC:
3100 case X86ISD::SBB:
3101 case X86ISD::SMUL:
3102 case X86ISD::UMUL:
3103 /* TODO: These opcodes can be added safely, but we may want to justify
3104 their inclusion for different reasons (better for reg-alloc).
3105 case X86ISD::OR:
3106 case X86ISD::XOR:
3107 case X86ISD::AND:
3108 */
3109 // Value 1 is the flag output of the node - verify it's not dead.
3110 return !SDValue(V.getNode(), 1).use_empty();
3111 default:
3112 return false;
3113 }
3114 };
3115 // TODO: We might want to factor in whether there's a load folding
3116 // opportunity for the math op that disappears with LEA.
3117 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3118 Complexity++;
3119 }
3120
3121 if (AM.Disp)
3122 Complexity++;
3123
3124 // If it isn't worth using an LEA, reject it.
3125 if (Complexity <= 2)
3126 return false;
3127
3128 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3129 return true;
3130}
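// A sketch of the flags heuristic above (instructions are illustrative): if an
// operand of the ADD is flag-producing math whose EFLAGS result has other
// users, selecting
//   leal (%rdi,%rsi), %eax       # leaves EFLAGS untouched
// instead of
//   addl %esi, %edi              # clobbers EFLAGS
// avoids having to re-materialize those flags later.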
3131
3132/// This is only run on TargetGlobalTLSAddress nodes.
3133bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3134 SDValue &Scale, SDValue &Index,
3135 SDValue &Disp, SDValue &Segment) {
3136 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3137 N.getOpcode() == ISD::TargetExternalSymbol);
3138
3139 X86ISelAddressMode AM;
3140 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3141 AM.GV = GA->getGlobal();
3142 AM.Disp += GA->getOffset();
3143 AM.SymbolFlags = GA->getTargetFlags();
3144 } else {
3145 auto *SA = cast<ExternalSymbolSDNode>(N);
3146 AM.ES = SA->getSymbol();
3147 AM.SymbolFlags = SA->getTargetFlags();
3148 }
3149
3150 if (Subtarget->is32Bit()) {
3151 AM.Scale = 1;
3152 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3153 }
3154
3155 MVT VT = N.getSimpleValueType();
3156 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3157 return true;
3158}
3159
3160bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3161 // Keep track of the original value type and whether this value was
3162 // truncated. If we see a truncation from pointer type to VT that truncates
3163 // bits that are known to be zero, we can use a narrow reference.
3164 EVT VT = N.getValueType();
3165 bool WasTruncated = false;
3166 if (N.getOpcode() == ISD::TRUNCATE) {
3167 WasTruncated = true;
3168 N = N.getOperand(0);
3169 }
3170
3171 if (N.getOpcode() != X86ISD::Wrapper)
3172 return false;
3173
3174 // We can only use non-GlobalValues as immediates if they were not truncated,
3175 // as we do not have any range information. If we have a GlobalValue and the
3176 // address was not truncated, we can select it as an operand directly.
3177 unsigned Opc = N.getOperand(0)->getOpcode();
3178 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3179 Op = N.getOperand(0);
3180 // We can only select the operand directly if we didn't have to look past a
3181 // truncate.
3182 return !WasTruncated;
3183 }
3184
3185 // Check that the global's range fits into VT.
3186 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3187 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3188 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3189 return false;
3190
3191 // Okay, we can use a narrow reference.
3192 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3193 GA->getOffset(), GA->getTargetFlags());
3194 return true;
3195}
3196
3197bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3198 SDValue &Base, SDValue &Scale,
3199 SDValue &Index, SDValue &Disp,
3200 SDValue &Segment) {
3201 assert(Root && P && "Unknown root/parent nodes");
3202 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3203 !IsProfitableToFold(N, P, Root) ||
3204 !IsLegalToFold(N, P, Root, OptLevel))
3205 return false;
3206
3207 return selectAddr(N.getNode(),
3208 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3209}
3210
3211bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3212 SDValue &Base, SDValue &Scale,
3213 SDValue &Index, SDValue &Disp,
3214 SDValue &Segment) {
3215 assert(Root && P && "Unknown root/parent nodes");
3216 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3217 !IsProfitableToFold(N, P, Root) ||
3218 !IsLegalToFold(N, P, Root, OptLevel))
3219 return false;
3220
3221 return selectAddr(N.getNode(),
3222 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3223}
3224
3225/// Return an SDNode that returns the value of the global base register.
3226/// Output instructions required to initialize the global base register,
3227/// if necessary.
3228SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3229 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3230 auto &DL = MF->getDataLayout();
3231 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3232}
3233
3234bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3235 if (N->getOpcode() == ISD::TRUNCATE)
3236 N = N->getOperand(0).getNode();
3237 if (N->getOpcode() != X86ISD::Wrapper)
3238 return false;
3239
3240 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3241 if (!GA)
3242 return false;
3243
3244 auto *GV = GA->getGlobal();
3245 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3246 if (CR)
3247 return CR->getSignedMin().sge(-1ull << Width) &&
3248 CR->getSignedMax().slt(1ull << Width);
3249 // In the kernel code model, globals are in the negative 2GB of the address
3250 // space, so globals can be a sign extended 32-bit immediate.
3251 // In other code models, small globals are in the low 2GB of the address
3252 // space, so sign extending them is equivalent to zero extending them.
3253 return Width == 32 && !TM.isLargeGlobalValue(GV);
3254}
3255
3256X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3257 assert(N->isMachineOpcode() && "Unexpected node");
3258 unsigned Opc = N->getMachineOpcode();
3259 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3260 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3261 if (CondNo < 0)
3262 return X86::COND_INVALID;
3263
3264 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3265}
3266
3267/// Test whether the given X86ISD::CMP node has any users that use a flag
3268/// other than ZF.
3269bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3270 // Examine each user of the node.
3271 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3272 UI != UE; ++UI) {
3273 // Only check things that use the flags.
3274 if (UI.getUse().getResNo() != Flags.getResNo())
3275 continue;
3276 // Only examine CopyToReg uses that copy to EFLAGS.
3277 if (UI->getOpcode() != ISD::CopyToReg ||
3278 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3279 return false;
3280 // Examine each user of the CopyToReg use.
3281 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3282 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3283 // Only examine the Flag result.
3284 if (FlagUI.getUse().getResNo() != 1) continue;
3285 // Anything unusual: assume conservatively.
3286 if (!FlagUI->isMachineOpcode()) return false;
3287 // Examine the condition code of the user.
3288 X86::CondCode CC = getCondFromNode(*FlagUI);
3289
3290 switch (CC) {
3291 // Comparisons which only use the zero flag.
3292 case X86::COND_E: case X86::COND_NE:
3293 continue;
3294 // Anything else: assume conservatively.
3295 default:
3296 return false;
3297 }
3298 }
3299 }
3300 return true;
3301}
3302
3303/// Test whether the given X86ISD::CMP node has any uses which require the SF
3304/// flag to be accurate.
3305bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3306 // Examine each user of the node.
3307 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3308 UI != UE; ++UI) {
3309 // Only check things that use the flags.
3310 if (UI.getUse().getResNo() != Flags.getResNo())
3311 continue;
3312 // Only examine CopyToReg uses that copy to EFLAGS.
3313 if (UI->getOpcode() != ISD::CopyToReg ||
3314 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3315 return false;
3316 // Examine each user of the CopyToReg use.
3317 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3318 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3319 // Only examine the Flag result.
3320 if (FlagUI.getUse().getResNo() != 1) continue;
3321 // Anything unusual: assume conservatively.
3322 if (!FlagUI->isMachineOpcode()) return false;
3323 // Examine the condition code of the user.
3324 X86::CondCode CC = getCondFromNode(*FlagUI);
3325
3326 switch (CC) {
3327 // Comparisons which don't examine the SF flag.
3328 case X86::COND_A: case X86::COND_AE:
3329 case X86::COND_B: case X86::COND_BE:
3330 case X86::COND_E: case X86::COND_NE:
3331 case X86::COND_O: case X86::COND_NO:
3332 case X86::COND_P: case X86::COND_NP:
3333 continue;
3334 // Anything else: assume conservatively.
3335 default:
3336 return false;
3337 }
3338 }
3339 }
3340 return true;
3341}
3342
3343static bool mayUseCarryFlag(X86::CondCode CC) {
3344 switch (CC) {
3345 // Comparisons which don't examine the CF flag.
3346 case X86::COND_O: case X86::COND_NO:
3347 case X86::COND_E: case X86::COND_NE:
3348 case X86::COND_S: case X86::COND_NS:
3349 case X86::COND_P: case X86::COND_NP:
3350 case X86::COND_L: case X86::COND_GE:
3351 case X86::COND_G: case X86::COND_LE:
3352 return false;
3353 // Anything else: assume conservatively.
3354 default:
3355 return true;
3356 }
3357}
3358
3359/// Test whether the given node which sets flags has any uses which require the
3360/// CF flag to be accurate.
3361bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3362 // Examine each user of the node.
3363 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3364 UI != UE; ++UI) {
3365 // Only check things that use the flags.
3366 if (UI.getUse().getResNo() != Flags.getResNo())
3367 continue;
3368
3369 unsigned UIOpc = UI->getOpcode();
3370
3371 if (UIOpc == ISD::CopyToReg) {
3372 // Only examine CopyToReg uses that copy to EFLAGS.
3373 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3374 return false;
3375 // Examine each user of the CopyToReg use.
3376 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3377 FlagUI != FlagUE; ++FlagUI) {
3378 // Only examine the Flag result.
3379 if (FlagUI.getUse().getResNo() != 1)
3380 continue;
3381 // Anything unusual: assume conservatively.
3382 if (!FlagUI->isMachineOpcode())
3383 return false;
3384 // Examine the condition code of the user.
3385 X86::CondCode CC = getCondFromNode(*FlagUI);
3386
3387 if (mayUseCarryFlag(CC))
3388 return false;
3389 }
3390
3391 // This CopyToReg is ok. Move on to the next user.
3392 continue;
3393 }
3394
3395 // This might be an unselected node. So look for the pre-isel opcodes that
3396 // use flags.
3397 unsigned CCOpNo;
3398 switch (UIOpc) {
3399 default:
3400 // Something unusual. Be conservative.
3401 return false;
3402 case X86ISD::SETCC: CCOpNo = 0; break;
3403 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3404 case X86ISD::CMOV: CCOpNo = 2; break;
3405 case X86ISD::BRCOND: CCOpNo = 2; break;
3406 }
3407
3408 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
3409 if (mayUseCarryFlag(CC))
3410 return false;
3411 }
3412 return true;
3413}
3414
3415/// Check whether or not the chain ending in StoreNode is suitable for folding
3416/// the {load; op; store} sequence into a single read-modify-write instruction.
3417static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3418 SDValue StoredVal, SelectionDAG *CurDAG,
3419 unsigned LoadOpNo,
3420 LoadSDNode *&LoadNode,
3421 SDValue &InputChain) {
3422 // Is the stored value result 0 of the operation?
3423 if (StoredVal.getResNo() != 0) return false;
3424
3425 // Are there other uses of the operation other than the store?
3426 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3427
3428 // Is the store non-extending and non-indexed?
3429 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3430 return false;
3431
3432 SDValue Load = StoredVal->getOperand(LoadOpNo);
3433 // Is the stored value a non-extending and non-indexed load?
3434 if (!ISD::isNormalLoad(Load.getNode())) return false;
3435
3436 // Return LoadNode by reference.
3437 LoadNode = cast<LoadSDNode>(Load);
3438
3439 // Is store the only read of the loaded value?
3440 if (!Load.hasOneUse())
3441 return false;
3442
3443 // Is the address of the store the same as the load?
3444 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3445 LoadNode->getOffset() != StoreNode->getOffset())
3446 return false;
3447
3448 bool FoundLoad = false;
3449 SmallVector<SDValue, 4> ChainOps;
3450 SmallVector<const SDNode *, 4> LoopWorklist;
3451 SmallPtrSet<const SDNode *, 16> Visited;
3452 const unsigned int Max = 1024;
3453
3454 // Visualization of Load-Op-Store fusion:
3455 // -------------------------
3456 // Legend:
3457 // *-lines = Chain operand dependencies.
3458 // |-lines = Normal operand dependencies.
3459 // Dependencies flow down and right. n-suffix references multiple nodes.
3460 //
3461 // C Xn C
3462 // * * *
3463 // * * *
3464 // Xn A-LD Yn TF Yn
3465 // * * \ | * |
3466 // * * \ | * |
3467 // * * \ | => A--LD_OP_ST
3468 // * * \| \
3469 // TF OP \
3470 // * | \ Zn
3471 // * | \
3472 // A-ST Zn
3473 //
3474
3475 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3476 // #2: Yn -> LD
3477 // #3: ST -> Zn
3478
3479 // Ensure the transform is safe by checking for the dual
3480 // dependencies to make sure we do not induce a loop.
3481
3482 // As LD is a predecessor to both OP and ST we can do this by checking:
3483 // a). if LD is a predecessor to a member of Xn or Yn.
3484 // b). if a Zn is a predecessor to ST.
3485
3486 // However, (b) can only occur through being a chain predecessor to
3487 // ST, which is the same as Zn being a member or predecessor of Xn,
3488 // which is a subset of LD being a predecessor of Xn. So it's
3489 // subsumed by check (a).
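  // So it is enough to verify below (via hasPredecessorHelper) that LD is not
  // a predecessor of any node in Xn or Yn.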
3490
3491 SDValue Chain = StoreNode->getChain();
3492
3493 // Gather X elements in ChainOps.
3494 if (Chain == Load.getValue(1)) {
3495 FoundLoad = true;
3496 ChainOps.push_back(Load.getOperand(0));
3497 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3498 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3499 SDValue Op = Chain.getOperand(i);
3500 if (Op == Load.getValue(1)) {
3501 FoundLoad = true;
3502 // Drop Load, but keep its chain. No cycle check necessary.
3503 ChainOps.push_back(Load.getOperand(0));
3504 continue;
3505 }
3506 LoopWorklist.push_back(Op.getNode());
3507 ChainOps.push_back(Op);
3508 }
3509 }
3510
3511 if (!FoundLoad)
3512 return false;
3513
3514 // Worklist is currently Xn. Add Yn to worklist.
3515 for (SDValue Op : StoredVal->ops())
3516 if (Op.getNode() != LoadNode)
3517 LoopWorklist.push_back(Op.getNode());
3518
3519 // Check (a) if Load is a predecessor to Xn + Yn
3520 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3521 true))
3522 return false;
3523
3524 InputChain =
3525 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3526 return true;
3527}
3528
3529// Change a chain of {load; op; store} of the same value into a simple op
3530// through memory of that value, if the uses of the modified value and its
3531// address are suitable.
3532//
3533// The tablegen pattern memory operand pattern is currently not able to match
3534// the case where the EFLAGS on the original operation are used.
3535//
3536// To move this to tablegen, we'll need to improve tablegen to allow flags to
3537// be transferred from a node in the pattern to the result node, probably with
3538// a new keyword. For example, we have this
3539// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3540// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3541// (implicit EFLAGS)]>;
3542// but maybe need something like this
3543// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3544// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3545// (transferrable EFLAGS)]>;
3546//
3547// Until then, we manually fold these and instruction select the operation
3548// here.
3549bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3550 auto *StoreNode = cast<StoreSDNode>(Node);
3551 SDValue StoredVal = StoreNode->getOperand(1);
3552 unsigned Opc = StoredVal->getOpcode();
3553
3554 // Before we try to select anything, make sure this is a memory operand size
3555 // and opcode we can handle. Note that this must match the code below that
3556 // actually lowers the opcodes.
3557 EVT MemVT = StoreNode->getMemoryVT();
3558 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3559 MemVT != MVT::i8)
3560 return false;
3561
3562 bool IsCommutable = false;
3563 bool IsNegate = false;
3564 switch (Opc) {
3565 default:
3566 return false;
3567 case X86ISD::SUB:
3568 IsNegate = isNullConstant(StoredVal.getOperand(0));
3569 break;
3570 case X86ISD::SBB:
3571 break;
3572 case X86ISD::ADD:
3573 case X86ISD::ADC:
3574 case X86ISD::AND:
3575 case X86ISD::OR:
3576 case X86ISD::XOR:
3577 IsCommutable = true;
3578 break;
3579 }
3580
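  // For a negate (X86ISD::SUB with a zero LHS) the candidate load is operand
  // 1 of the operation; otherwise try folding a load from operand 0 first.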
3581 unsigned LoadOpNo = IsNegate ? 1 : 0;
3582 LoadSDNode *LoadNode = nullptr;
3583 SDValue InputChain;
3584 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3585 LoadNode, InputChain)) {
3586 if (!IsCommutable)
3587 return false;
3588
3589 // This operation is commutable, try the other operand.
3590 LoadOpNo = 1;
3591 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3592 LoadNode, InputChain))
3593 return false;
3594 }
3595
3596 SDValue Base, Scale, Index, Disp, Segment;
3597 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3598 Segment))
3599 return false;
3600
3601 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3602 unsigned Opc8) {
3603 switch (MemVT.getSimpleVT().SimpleTy) {
3604 case MVT::i64:
3605 return Opc64;
3606 case MVT::i32:
3607 return Opc32;
3608 case MVT::i16:
3609 return Opc16;
3610 case MVT::i8:
3611 return Opc8;
3612 default:
3613 llvm_unreachable("Invalid size!");
3614 }
3615 };
3616
3616
3617 MachineSDNode *Result;
3618 switch (Opc) {
3619 case X86ISD::SUB:
3620 // Handle negate.
3621 if (IsNegate) {
3622 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3623 X86::NEG8m);
3624 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3625 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3626 MVT::Other, Ops);
3627 break;
3628 }
3629 [[fallthrough]];
3630 case X86ISD::ADD:
3631 // Try to match inc/dec.
3632 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3633 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3634 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3635 // ADD/SUB by 1/-1 can use INC/DEC when the carry flag isn't used.
3636 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3637 unsigned NewOpc =
3638 ((Opc == X86ISD::ADD) == IsOne)
3639 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3640 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3641 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3642 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3643 MVT::Other, Ops);
3644 break;
3645 }
3646 }
3647 [[fallthrough]];
3648 case X86ISD::ADC:
3649 case X86ISD::SBB:
3650 case X86ISD::AND:
3651 case X86ISD::OR:
3652 case X86ISD::XOR: {
3653 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3654 switch (Opc) {
3655 case X86ISD::ADD:
3656 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3657 X86::ADD8mr);
3658 case X86ISD::ADC:
3659 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3660 X86::ADC8mr);
3661 case X86ISD::SUB:
3662 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3663 X86::SUB8mr);
3664 case X86ISD::SBB:
3665 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3666 X86::SBB8mr);
3667 case X86ISD::AND:
3668 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3669 X86::AND8mr);
3670 case X86ISD::OR:
3671 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3672 case X86ISD::XOR:
3673 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3674 X86::XOR8mr);
3675 default:
3676 llvm_unreachable("Invalid opcode!");
3677 }
3678 };
3679 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3680 switch (Opc) {
3681 case X86ISD::ADD:
3682 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3683 X86::ADD8mi);
3684 case X86ISD::ADC:
3685 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3686 X86::ADC8mi);
3687 case X86ISD::SUB:
3688 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3689 X86::SUB8mi);
3690 case X86ISD::SBB:
3691 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3692 X86::SBB8mi);
3693 case X86ISD::AND:
3694 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3695 X86::AND8mi);
3696 case X86ISD::OR:
3697 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3698 X86::OR8mi);
3699 case X86ISD::XOR:
3700 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3701 X86::XOR8mi);
3702 default:
3703 llvm_unreachable("Invalid opcode!");
3704 }
3705 };
3706
3707 unsigned NewOpc = SelectRegOpcode(Opc);
3708 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3709
3710 // See if the operand is a constant that we can fold into an immediate
3711 // operand.
3712 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3713 int64_t OperandV = OperandC->getSExtValue();
3714
3715 // Check if we can shrink the operand enough to fit in an immediate (or
3716 // fit into a smaller immediate) by negating it and switching the
3717 // operation.
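  // For example, 'add $128' does not fit in an 8-bit immediate, but the
  // equivalent 'sub $-128' does.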
3718 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3719 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3720 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3721 isInt<32>(-OperandV))) &&
3722 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3723 OperandV = -OperandV;
3724 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3725 }
3726
3727 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3728 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3729 NewOpc = SelectImmOpcode(Opc);
3730 }
3731 }
3732
3733 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3734 SDValue CopyTo =
3735 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3736 StoredVal.getOperand(2), SDValue());
3737
3738 const SDValue Ops[] = {Base, Scale, Index, Disp,
3739 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3740 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3741 Ops);
3742 } else {
3743 const SDValue Ops[] = {Base, Scale, Index, Disp,
3744 Segment, Operand, InputChain};
3745 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3746 Ops);
3747 }
3748 break;
3749 }
3750 default:
3751 llvm_unreachable("Invalid opcode!");
3752 }
3753
3754 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3755 LoadNode->getMemOperand()};
3756 CurDAG->setNodeMemRefs(Result, MemOps);
3757
3758 // Update Load Chain uses as well.
3759 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3760 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3761 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3762 CurDAG->RemoveDeadNode(Node);
3763 return true;
3764}
3765
3766// See if this is an X & Mask that we can match to BEXTR/BZHI.
3767// Where Mask is one of the following patterns:
3768// a) x & (1 << nbits) - 1
3769// b) x & ~(-1 << nbits)
3770// c) x & (-1 >> (32 - y))
3771// d) x << (32 - y) >> (32 - y)
3772// e) (1 << nbits) - 1
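// In each case the result keeps only the low 'nbits' bits of x (or, for
// pattern e, of an all-ones value), which is what BZHI computes directly and
// what BEXTR computes with a start position of zero.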
3773bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3774 assert(
3775 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3776 Node->getOpcode() == ISD::SRL) &&
3777 "Should be either an and-mask, or right-shift after clearing high bits.");
3778
3779 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3780 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3781 return false;
3782
3783 MVT NVT = Node->getSimpleValueType(0);
3784
3785 // Only supported for 32 and 64 bits.
3786 if (NVT != MVT::i32 && NVT != MVT::i64)
3787 return false;
3788
3789 SDValue NBits;
3790 bool NegateNBits;
3791
3792 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3793 // Else, if we only have BMI1's BEXTR, we require one-use.
3794 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3795 auto checkUses = [AllowExtraUsesByDefault](
3796 SDValue Op, unsigned NUses,
3797 std::optional<bool> AllowExtraUses) {
3798 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3799 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3800 };
3801 auto checkOneUse = [checkUses](SDValue Op,
3802 std::optional<bool> AllowExtraUses =
3803 std::nullopt) {
3804 return checkUses(Op, 1, AllowExtraUses);
3805 };
3806 auto checkTwoUse = [checkUses](SDValue Op,
3807 std::optional<bool> AllowExtraUses =
3808 std::nullopt) {
3809 return checkUses(Op, 2, AllowExtraUses);
3810 };
3811
3812 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3813 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3814 assert(V.getSimpleValueType() == MVT::i32 &&
3815 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3816 "Expected i64 -> i32 truncation");
3817 V = V.getOperand(0);
3818 }
3819 return V;
3820 };
3821
3822 // a) x & ((1 << nbits) + (-1))
3823 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3824 &NegateNBits](SDValue Mask) -> bool {
3825 // Match `add`. Must only have one use!
3826 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3827 return false;
3828 // We should be adding an all-ones constant (i.e. subtracting one).
3829 if (!isAllOnesConstant(Mask->getOperand(1)))
3830 return false;
3831 // Match `1 << nbits`. Might be truncated. Must only have one use!
3832 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3833 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3834 return false;
3835 if (!isOneConstant(M0->getOperand(0)))
3836 return false;
3837 NBits = M0->getOperand(1);
3838 NegateNBits = false;
3839 return true;
3840 };
3841
3842 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3843 V = peekThroughOneUseTruncation(V);
3844 return CurDAG->MaskedValueIsAllOnes(
3845 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3846 NVT.getSizeInBits()));
3847 };
3848
3849 // b) x & ~(-1 << nbits)
3850 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3851 &NBits, &NegateNBits](SDValue Mask) -> bool {
3852 // Match `~()`. Must only have one use!
3853 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3854 return false;
3855 // The -1 only has to be all-ones for the final Node's NVT.
3856 if (!isAllOnes(Mask->getOperand(1)))
3857 return false;
3858 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3859 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3860 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3861 return false;
3862 // The -1 only has to be all-ones for the final Node's NVT.
3863 if (!isAllOnes(M0->getOperand(0)))
3864 return false;
3865 NBits = M0->getOperand(1);
3866 NegateNBits = false;
3867 return true;
3868 };
3869
3870 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3871 // or leave the shift amount as-is, but then we'll have to negate it.
3872 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3873 unsigned Bitwidth) {
3874 NBits = ShiftAmt;
3875 NegateNBits = true;
3876 // Skip over a truncate of the shift amount, if any.
3877 if (NBits.getOpcode() == ISD::TRUNCATE)
3878 NBits = NBits.getOperand(0);
3879 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3880 // If it doesn't match, that's fine, we'll just negate it ourselves.
3881 if (NBits.getOpcode() != ISD::SUB)
3882 return;
3883 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3884 if (!V0 || V0->getZExtValue() != Bitwidth)
3885 return;
3886 NBits = NBits.getOperand(1);
3887 NegateNBits = false;
3888 };
3889
3890 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3891 // or
3892 // c) x & (-1 >> (32 - y))
3893 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3894 canonicalizeShiftAmt](SDValue Mask) -> bool {
3895 // The mask itself may be truncated.
3896 Mask = peekThroughOneUseTruncation(Mask);
3897 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3898 // Match `l>>`. Must only have one use!
3899 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3900 return false;
3901 // We should be shifting a truly all-ones constant.
3902 if (!isAllOnesConstant(Mask.getOperand(0)))
3903 return false;
3904 SDValue M1 = Mask.getOperand(1);
3905 // The shift amount should not be used externally.
3906 if (!checkOneUse(M1))
3907 return false;
3908 canonicalizeShiftAmt(M1, Bitwidth);
3909 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3910 // is no extra use of the mask. Clearly, there was one since we are here.
3911 // But at the same time, if we need to negate the shift amount,
3912 // then we don't want the mask to stick around, else it's unprofitable.
3913 return !NegateNBits;
3914 };
3915
3916 SDValue X;
3917
3918 // d) x << z >> z but then we'll have to subtract z from bitwidth
3919 // or
3920 // d) x << (32 - y) >> (32 - y)
3921 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3922 AllowExtraUsesByDefault, &NegateNBits,
3923 &X](SDNode *Node) -> bool {
3924 if (Node->getOpcode() != ISD::SRL)
3925 return false;
3926 SDValue N0 = Node->getOperand(0);
3927 if (N0->getOpcode() != ISD::SHL)
3928 return false;
3929 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3930 SDValue N1 = Node->getOperand(1);
3931 SDValue N01 = N0->getOperand(1);
3932 // Both of the shifts must be by the exact same value.
3933 if (N1 != N01)
3934 return false;
3935 canonicalizeShiftAmt(N1, Bitwidth);
3936 // There should not be any external uses of the inner shift / shift amount.
3937 // Note that while we are generally okay with external uses given BMI2,
3938 // iff we need to negate the shift amount, we are not okay with extra uses.
3939 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3940 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3941 return false;
3942 X = N0->getOperand(0);
3943 return true;
3944 };
3945
3946 auto matchLowBitMask = [matchPatternA, matchPatternB,
3947 matchPatternC](SDValue Mask) -> bool {
3948 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3949 };
3950
3951 if (Node->getOpcode() == ISD::AND) {
3952 X = Node->getOperand(0);
3953 SDValue Mask = Node->getOperand(1);
3954
3955 if (matchLowBitMask(Mask)) {
3956 // Great.
3957 } else {
3958 std::swap(X, Mask);
3959 if (!matchLowBitMask(Mask))
3960 return false;
3961 }
3962 } else if (matchLowBitMask(SDValue(Node, 0))) {
3963 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
3964 } else if (!matchPatternD(Node))
3965 return false;
3966
3967 // If we need to negate the shift amount, require BMI2 BZHI support.
3968 // It's just too unprofitable for BMI1 BEXTR.
3969 if (NegateNBits && !Subtarget->hasBMI2())
3970 return false;
3971
3972 SDLoc DL(Node);
3973
3974 // Truncate the shift amount.
3975 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3976 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3977
3978 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3979 // All the other bits are undefined, we do not care about them.
3980 SDValue ImplDef = SDValue(
3981 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3982 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3983
3984 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3985 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3986 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3987 MVT::i32, ImplDef, NBits, SRIdxVal),
3988 0);
3989 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3990
3991 // We might have matched the amount of high bits to be cleared,
3992 // but we want the amount of low bits to be kept, so negate it then.
3993 if (NegateNBits) {
3994 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
3995 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
3996
3997 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
3998 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3999 }
4000
4001 if (Subtarget->hasBMI2()) {
4002 // Great, just emit the BZHI.
4003 if (NVT != MVT::i32) {
4004 // But have to place the bit count into the wide-enough register first.
4005 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4006 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4007 }
4008
4009 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4010 ReplaceNode(Node, Extract.getNode());
4011 SelectCode(Extract.getNode());
4012 return true;
4013 }
4014
4015 // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
4016 // *logically* shifted (potentially with a one-use trunc in between),
4017 // and the truncation was the only use of the shift,
4018 // and if so look past one-use truncation.
4019 {
4020 SDValue RealX = peekThroughOneUseTruncation(X);
4021 // FIXME: only if the shift is one-use?
4022 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4023 X = RealX;
4024 }
4025
4026 MVT XVT = X.getSimpleValueType();
4027
4028 // Else, emitting BEXTR requires one more step.
4029 // The 'control' of BEXTR has the pattern of:
4030 // [15...8 bit][ 7...0 bit] location
4031 // [ bit count][ shift] name
4032 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4033
4034 // Shift NBits left by 8 bits, thus producing 'control'.
4035 // This makes the low 8 bits zero.
4036 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4037 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4038 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4039 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4040
4041 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4042 // FIXME: only if the shift is one-use?
4043 if (X.getOpcode() == ISD::SRL) {
4044 SDValue ShiftAmt = X.getOperand(1);
4045 X = X.getOperand(0);
4046
4047 assert(ShiftAmt.getValueType() == MVT::i8 &&
4048 "Expected shift amount to be i8");
4049
4050 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4051 // We could zext to i16 in some form, but we intentionally don't do that.
4052 SDValue OrigShiftAmt = ShiftAmt;
4053 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4054 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4055
4056 // And now 'or' these low 8 bits of shift amount into the 'control'.
4057 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4058 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4059 }
4060
4061 // But have to place the 'control' into the wide-enough register first.
4062 if (XVT != MVT::i32) {
4063 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4064 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4065 }
4066
4067 // And finally, form the BEXTR itself.
4068 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4069
4070 // The 'X' was originally truncated. Do that now.
4071 if (XVT != NVT) {
4072 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4073 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4074 }
4075
4076 ReplaceNode(Node, Extract.getNode());
4077 SelectCode(Extract.getNode());
4078
4079 return true;
4080}
4081
4082// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4083MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4084 MVT NVT = Node->getSimpleValueType(0);
4085 SDLoc dl(Node);
4086
4087 SDValue N0 = Node->getOperand(0);
4088 SDValue N1 = Node->getOperand(1);
4089
4090 // If we have TBM we can use an immediate for the control. If we have BMI
4091 // we should only do this if the BEXTR instruction is implemented well.
4092 // Otherwise moving the control into a register makes this more costly.
4093 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4094 // hoisting the move immediate would make it worthwhile with a less optimal
4095 // BEXTR?
4096 bool PreferBEXTR =
4097 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4098 if (!PreferBEXTR && !Subtarget->hasBMI2())
4099 return nullptr;
4100
4101 // Must have a shift right.
4102 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4103 return nullptr;
4104
4105 // Shift can't have additional users.
4106 if (!N0->hasOneUse())
4107 return nullptr;
4108
4109 // Only supported for 32 and 64 bits.
4110 if (NVT != MVT::i32 && NVT != MVT::i64)
4111 return nullptr;
4112
4113 // Shift amount and RHS of and must be constant.
4114 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4115 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4116 if (!MaskCst || !ShiftCst)
4117 return nullptr;
4118
4119 // And RHS must be a mask.
4120 uint64_t Mask = MaskCst->getZExtValue();
4121 if (!isMask_64(Mask))
4122 return nullptr;
4123
4124 uint64_t Shift = ShiftCst->getZExtValue();
4125 uint64_t MaskSize = llvm::popcount(Mask);
4126
4127 // Don't interfere with something that can be handled by extracting AH.
4128 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4129 if (Shift == 8 && MaskSize == 8)
4130 return nullptr;
4131
4132 // Make sure we are only using bits that were in the original value, not
4133 // shifted in.
4134 if (Shift + MaskSize > NVT.getSizeInBits())
4135 return nullptr;
4136
4137 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4138 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4139 // does not fit into 32 bits. Load folding is not a sufficient reason.
4140 if (!PreferBEXTR && MaskSize <= 32)
4141 return nullptr;
4142
4143 SDValue Control;
4144 unsigned ROpc, MOpc;
4145
4146#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4147 if (!PreferBEXTR) {
4148 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4149 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4150 // Let's perform the mask first, and apply shift later. Note that we need to
4151 // widen the mask to account for the fact that we'll apply shift afterwards!
4152 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4153 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4154 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4155 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4156 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4157 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4158 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4159 } else {
4160 // The 'control' of BEXTR has the pattern of:
4161 // [15...8 bit][ 7...0 bit] location
4162 // [ bit count][ shift] name
4163 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4164 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4165 if (Subtarget->hasTBM()) {
4166 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4167 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4168 } else {
4169 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4170 // BMI requires the immediate to be placed in a register.
4171 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4172 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4173 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4174 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4175 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4176 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4177 }
4178 }
4179
4180 MachineSDNode *NewNode;
4181 SDValue Input = N0->getOperand(0);
4182 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4183 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4184 SDValue Ops[] = {
4185 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4186 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4187 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4188 // Update the chain.
4189 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4190 // Record the mem-refs
4191 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4192 } else {
4193 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4194 }
4195
4196 if (!PreferBEXTR) {
4197 // We still need to apply the shift.
4198 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4199 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4200 : GET_ND_IF_ENABLED(X86::SHR32ri);
4201 NewNode =
4202 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4203 }
4204
4205 return NewNode;
4206}
4207
4208// Emit a PCMISTR(I/M) instruction.
4209MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4210 bool MayFoldLoad, const SDLoc &dl,
4211 MVT VT, SDNode *Node) {
4212 SDValue N0 = Node->getOperand(0);
4213 SDValue N1 = Node->getOperand(1);
4214 SDValue Imm = Node->getOperand(2);
4215 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4216 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4217
4218 // Try to fold a load. No need to check alignment.
4219 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4220 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4221 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4222 N1.getOperand(0) };
4223 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4224 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4225 // Update the chain.
4226 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4227 // Record the mem-refs
4228 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4229 return CNode;
4230 }
4231
4232 SDValue Ops[] = { N0, N1, Imm };
4233 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4234 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4235 return CNode;
4236}
4237
4238// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
4239// to emit a second instruction after this one. This is needed since we have two
4240// copyToReg nodes glued before this and we need to continue that glue through.
4241MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4242 bool MayFoldLoad, const SDLoc &dl,
4243 MVT VT, SDNode *Node,
4244 SDValue &InGlue) {
4245 SDValue N0 = Node->getOperand(0);
4246 SDValue N2 = Node->getOperand(2);
4247 SDValue Imm = Node->getOperand(4);
4248 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4249 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4250
4251 // Try to fold a load. No need to check alignment.
4252 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4253 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4254 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4255 N2.getOperand(0), InGlue };
4256 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4257 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4258 InGlue = SDValue(CNode, 3);
4259 // Update the chain.
4260 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4261 // Record the mem-refs
4262 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4263 return CNode;
4264 }
4265
4266 SDValue Ops[] = { N0, N2, Imm, InGlue };
4267 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4268 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4269 InGlue = SDValue(CNode, 2);
4270 return CNode;
4271}
4272
4273bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4274 EVT VT = N->getValueType(0);
4275
4276 // Only handle scalar shifts.
4277 if (VT.isVector())
4278 return false;
4279
4280 // Narrower shifts only mask to 5 bits in hardware.
4281 unsigned Size = VT == MVT::i64 ? 64 : 32;
4282
4283 SDValue OrigShiftAmt = N->getOperand(1);
4284 SDValue ShiftAmt = OrigShiftAmt;
4285 SDLoc DL(N);
4286
4287 // Skip over a truncate of the shift amount.
4288 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4289 ShiftAmt = ShiftAmt->getOperand(0);
4290
4291 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4292 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4293
4294 SDValue NewShiftAmt;
4295 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4296 ShiftAmt->getOpcode() == ISD::XOR) {
4297 SDValue Add0 = ShiftAmt->getOperand(0);
4298 SDValue Add1 = ShiftAmt->getOperand(1);
4299 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4300 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4301 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4302 // to avoid the ADD/SUB/XOR.
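    // For example, a 32-bit shift by (x + 32) produces the same result as a
    // shift by x, because the hardware masks the shift amount to 5 bits.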
4303 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4304 NewShiftAmt = Add0;
4305
4306 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4307 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4308 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4309 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4310 // we can replace it with a NOT. In the XOR case it may save some code
4311 // size, in the SUB case it also may save a move.
4312 assert(Add0C == nullptr || Add1C == nullptr);
4313
4314 // We can only do N-X, not X-N
4315 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4316 return false;
4317
4318 EVT OpVT = ShiftAmt.getValueType();
4319
4320 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4321 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4322 Add0C == nullptr ? Add0 : Add1, AllOnes);
4323 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4324 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4325 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4326 // -X to generate a NEG instead of a SUB of a constant.
4327 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4328 Add0C->getZExtValue() != 0) {
4329 EVT SubVT = ShiftAmt.getValueType();
4330 SDValue X;
4331 if (Add0C->getZExtValue() % Size == 0)
4332 X = Add1;
4333 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4334 Add0C->getZExtValue() % 32 == 0) {
4335 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4336 // This is mainly beneficial if we already compute (x+n*32).
4337 if (Add1.getOpcode() == ISD::TRUNCATE) {
4338 Add1 = Add1.getOperand(0);
4339 SubVT = Add1.getValueType();
4340 }
4341 if (Add0.getValueType() != SubVT) {
4342 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4343 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4344 }
4345
4346 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4347 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4348 } else
4349 return false;
4350 // Insert a negate op.
4351 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4352 // that uses it that's not a shift.
4353 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4354 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4355 NewShiftAmt = Neg;
4356
4357 // Insert these operands into a valid topological order so they can
4358 // get selected independently.
4359 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4360 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4361 } else
4362 return false;
4363 } else
4364 return false;
4365
4366 if (NewShiftAmt.getValueType() != MVT::i8) {
4367 // Need to truncate the shift amount.
4368 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4369 // Add to a correct topological ordering.
4370 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4371 }
4372
4373 // Insert a new mask to keep the shift amount legal. This should be removed
4374 // by isel patterns.
4375 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4376 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4377 // Place in a correct topological ordering.
4378 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4379
4380 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4381 NewShiftAmt);
4382 if (UpdatedNode != N) {
4383 // If we found an existing node, we should replace ourselves with that node
4384 // and wait for it to be selected after its other users.
4385 ReplaceNode(N, UpdatedNode);
4386 return true;
4387 }
4388
4389 // If the original shift amount is now dead, delete it so that we don't run
4390 // it through isel.
4391 if (OrigShiftAmt.getNode()->use_empty())
4392 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4393
4394 // Now that we've optimized the shift amount, defer to normal isel to get
4395 // load folding and legacy vs BMI2 selection without repeating it here.
4396 SelectCode(N);
4397 return true;
4398}
4399
4400bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4401 MVT NVT = N->getSimpleValueType(0);
4402 unsigned Opcode = N->getOpcode();
4403 SDLoc dl(N);
4404
4405 // For operations of the form (x << C1) op C2, check if we can use a smaller
4406 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
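  // For example, (x << 8) | 0x4000 can be rewritten as ((x | 0x40) << 8),
  // shrinking the OR immediate from a 32-bit to an 8-bit encoding.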
4407 SDValue Shift = N->getOperand(0);
4408 SDValue N1 = N->getOperand(1);
4409
4410 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4411 if (!Cst)
4412 return false;
4413
4414 int64_t Val = Cst->getSExtValue();
4415
4416 // If we have an any_extend feeding the AND, look through it to see if there
4417 // is a shift behind it. But only if the AND doesn't use the extended bits.
4418 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4419 bool FoundAnyExtend = false;
4420 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4421 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4422 isUInt<32>(Val)) {
4423 FoundAnyExtend = true;
4424 Shift = Shift.getOperand(0);
4425 }
4426
4427 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4428 return false;
4429
4430 // i8 is unshrinkable, i16 should be promoted to i32.
4431 if (NVT != MVT::i32 && NVT != MVT::i64)
4432 return false;
4433
4434 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4435 if (!ShlCst)
4436 return false;
4437
4438 uint64_t ShAmt = ShlCst->getZExtValue();
4439
4440 // Make sure that we don't change the operation by removing bits.
4441 // This only matters for OR and XOR, AND is unaffected.
4442 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4443 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4444 return false;
4445
4446 // Check the minimum bitwidth for the new constant.
4447 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4448 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4449 if (Opcode == ISD::AND) {
4450 // AND32ri is the same as AND64ri32 with zext imm.
4451 // Try this before sign extended immediates below.
4452 ShiftedVal = (uint64_t)Val >> ShAmt;
4453 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4454 return true;
4455 // Also swap order when the AND can become MOVZX.
4456 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4457 return true;
4458 }
4459 ShiftedVal = Val >> ShAmt;
4460 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4461 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4462 return true;
4463 if (Opcode != ISD::AND) {
4464 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4465 ShiftedVal = (uint64_t)Val >> ShAmt;
4466 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4467 return true;
4468 }
4469 return false;
4470 };
4471
4472 int64_t ShiftedVal;
4473 if (!CanShrinkImmediate(ShiftedVal))
4474 return false;
4475
4476 // Ok, we can reorder to get a smaller immediate.
4477
4478 // But, it's possible the original immediate allowed an AND to become MOVZX.
4479 // We check this late in order to delay the MaskedValueIsZero call as long
4480 // as possible.
4481 if (Opcode == ISD::AND) {
4482 // Find the smallest zext this could possibly be.
4483 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4484 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4485
4486 // Figure out which bits need to be zero to achieve that mask.
4487 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4488 ZExtWidth);
4489 NeededMask &= ~Cst->getAPIntValue();
4490
4491 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4492 return false;
4493 }
4494
4495 SDValue X = Shift.getOperand(0);
4496 if (FoundAnyExtend) {
4497 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4498 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4499 X = NewX;
4500 }
4501
4502 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
4503 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4504 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4505 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4506 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4507 Shift.getOperand(1));
4508 ReplaceNode(N, NewSHL.getNode());
4509 SelectCode(NewSHL.getNode());
4510 return true;
4511}
4512
4513bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4514 SDNode *ParentB, SDNode *ParentC,
4515 SDValue A, SDValue B, SDValue C,
4516 uint8_t Imm) {
4517 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4518 C.isOperandOf(ParentC) && "Incorrect parent node");
4519
4520 auto tryFoldLoadOrBCast =
4521 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4522 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4523 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4524 return true;
4525
4526 // Not a load, check for broadcast which may be behind a bitcast.
4527 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4528 P = L.getNode();
4529 L = L.getOperand(0);
4530 }
4531
4532 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4533 return false;
4534
4535 // Only 32 and 64 bit broadcasts are supported.
4536 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4537 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4538 if (Size != 32 && Size != 64)
4539 return false;
4540
4541 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4542 };
4543
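  // The VPTERNLOG immediate is an 8-entry truth table: bit k of Imm is the
  // result for the input combination where A, B and C are bits 2, 1 and 0 of
  // k. Only operand C can come from memory, so when A or B is the foldable
  // value it is swapped into C and the truth-table entries are permuted to
  // match (the bit swaps below).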
4544 bool FoldedLoad = false;
4545 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4546 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4547 FoldedLoad = true;
4548 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4549 Tmp4)) {
4550 FoldedLoad = true;
4551 std::swap(A, C);
4552 // Swap bits 1/4 and 3/6.
4553 uint8_t OldImm = Imm;
4554 Imm = OldImm & 0xa5;
4555 if (OldImm & 0x02) Imm |= 0x10;
4556 if (OldImm & 0x10) Imm |= 0x02;
4557 if (OldImm & 0x08) Imm |= 0x40;
4558 if (OldImm & 0x40) Imm |= 0x08;
4559 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4560 Tmp4)) {
4561 FoldedLoad = true;
4562 std::swap(B, C);
4563 // Swap bits 1/2 and 5/6.
4564 uint8_t OldImm = Imm;
4565 Imm = OldImm & 0x99;
4566 if (OldImm & 0x02) Imm |= 0x04;
4567 if (OldImm & 0x04) Imm |= 0x02;
4568 if (OldImm & 0x20) Imm |= 0x40;
4569 if (OldImm & 0x40) Imm |= 0x20;
4570 }
4571
4572 SDLoc DL(Root);
4573
4574 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4575
4576 MVT NVT = Root->getSimpleValueType(0);
4577
4578 MachineSDNode *MNode;
4579 if (FoldedLoad) {
4580 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4581
4582 unsigned Opc;
4583 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4584 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4585 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4586 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4587
4588 bool UseD = EltSize == 32;
4589 if (NVT.is128BitVector())
4590 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4591 else if (NVT.is256BitVector())
4592 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4593 else if (NVT.is512BitVector())
4594 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4595 else
4596 llvm_unreachable("Unexpected vector size!");
4597 } else {
4598 bool UseD = NVT.getVectorElementType() == MVT::i32;
4599 if (NVT.is128BitVector())
4600 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4601 else if (NVT.is256BitVector())
4602 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4603 else if (NVT.is512BitVector())
4604 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4605 else
4606 llvm_unreachable("Unexpected vector size!");
4607 }
4608
4609 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4610 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4611
4612 // Update the chain.
4613 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4614 // Record the mem-refs
4615 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4616 } else {
4617 bool UseD = NVT.getVectorElementType() == MVT::i32;
4618 unsigned Opc;
4619 if (NVT.is128BitVector())
4620 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4621 else if (NVT.is256BitVector())
4622 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4623 else if (NVT.is512BitVector())
4624 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4625 else
4626 llvm_unreachable("Unexpected vector size!");
4627
4628 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4629 }
4630
4631 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4632 CurDAG->RemoveDeadNode(Root);
4633 return true;
4634}
4635
4636// Try to match two logic ops to a VPTERNLOG.
4637// FIXME: Handle more complex patterns that use an operand more than once?
4638bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4639 MVT NVT = N->getSimpleValueType(0);
4640
4641 // Make sure we support VPTERNLOG.
4642 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4643 NVT.getVectorElementType() == MVT::i1)
4644 return false;
4645
4646 // We need VLX for 128/256-bit.
4647 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4648 return false;
4649
4650 SDValue N0 = N->getOperand(0);
4651 SDValue N1 = N->getOperand(1);
4652
4653 auto getFoldableLogicOp = [](SDValue Op) {
4654 // Peek through single use bitcast.
4655 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4656 Op = Op.getOperand(0);
4657
4658 if (!Op.hasOneUse())
4659 return SDValue();
4660
4661 unsigned Opc = Op.getOpcode();
4662 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4663 Opc == X86ISD::ANDNP)
4664 return Op;
4665
4666 return SDValue();
4667 };
4668
4669 SDValue A, FoldableOp;
4670 if ((FoldableOp = getFoldableLogicOp(N1))) {
4671 A = N0;
4672 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4673 A = N1;
4674 } else
4675 return false;
4676
4677 SDValue B = FoldableOp.getOperand(0);
4678 SDValue C = FoldableOp.getOperand(1);
4679 SDNode *ParentA = N;
4680 SDNode *ParentB = FoldableOp.getNode();
4681 SDNode *ParentC = FoldableOp.getNode();
4682
4683 // We can build the appropriate control immediate by performing the logic
4684 // operation we're matching using these constants for A, B, and C.
4685 uint8_t TernlogMagicA = 0xf0;
4686 uint8_t TernlogMagicB = 0xcc;
4687 uint8_t TernlogMagicC = 0xaa;
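  // e.g. for OR(A, AND(B, C)) the immediate becomes
  // (0xcc & 0xaa) | 0xf0 = 0x88 | 0xf0 = 0xf8, i.e. the truth table of
  // A | (B & C) evaluated bitwise on the three magic constants.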
4688
 4689 // Some of the inputs may be inverted; peek through them and invert the
 4690 // magic values accordingly.
4691 // TODO: There may be a bitcast before the xor that we should peek through.
4692 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4693 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4694 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4695 Magic = ~Magic;
4696 Parent = Op.getNode();
4697 Op = Op.getOperand(0);
4698 }
4699 };
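  // e.g. if A is NOT(X), TernlogMagicA becomes ~0xf0 = 0x0f and X is matched
  // in A's place, so the final immediate encodes the function of the
  // un-inverted operand.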
4700
4701 PeekThroughNot(A, ParentA, TernlogMagicA);
4702 PeekThroughNot(B, ParentB, TernlogMagicB);
4703 PeekThroughNot(C, ParentC, TernlogMagicC);
4704
4705 uint8_t Imm;
4706 switch (FoldableOp.getOpcode()) {
4707 default: llvm_unreachable("Unexpected opcode!");
4708 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4709 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4710 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4711 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4712 }
4713
4714 switch (N->getOpcode()) {
4715 default: llvm_unreachable("Unexpected opcode!");
4716 case X86ISD::ANDNP:
4717 if (A == N0)
4718 Imm &= ~TernlogMagicA;
4719 else
4720 Imm = ~(Imm) & TernlogMagicA;
4721 break;
4722 case ISD::AND: Imm &= TernlogMagicA; break;
4723 case ISD::OR: Imm |= TernlogMagicA; break;
4724 case ISD::XOR: Imm ^= TernlogMagicA; break;
4725 }
4726
4727 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4728}
4729
4730/// If the high bits of an 'and' operand are known zero, try setting the
4731/// high bits of an 'and' constant operand to produce a smaller encoding by
4732/// creating a small, sign-extended negative immediate rather than a large
4733/// positive one. This reverses a transform in SimplifyDemandedBits that
4734/// shrinks mask constants by clearing bits. There is also a possibility that
4735/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4736/// case, just replace the 'and'. Return 'true' if the node is replaced.
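/// For example, 'and eax, 0x7ffffff8' needs a 4-byte immediate, but if the
/// sign bit of the other operand is known zero, 'and eax, -8' (0xfffffff8)
/// computes the same result with a 1-byte immediate.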
4737bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4738 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4739 // have immediate operands.
4740 MVT VT = And->getSimpleValueType(0);
4741 if (VT != MVT::i32 && VT != MVT::i64)
4742 return false;
4743
4744 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4745 if (!And1C)
4746 return false;
4747
 4748 // Bail out if the mask constant is already negative; it can't shrink any further.
 4749 // If the upper 32 bits of a 64-bit mask are all zeros, we have special isel
 4750 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
 4751 // implicit zeroing of 32-bit ops. So we also bail out if the lower 32 bits
 4752 // are already negative.
4753 APInt MaskVal = And1C->getAPIntValue();
4754 unsigned MaskLZ = MaskVal.countl_zero();
4755 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4756 return false;
4757
 4758 // Don't extend into the upper 32 bits of a 64-bit mask.
4759 if (VT == MVT::i64 && MaskLZ >= 32) {
4760 MaskLZ -= 32;
4761 MaskVal = MaskVal.trunc(32);
4762 }
4763
4764 SDValue And0 = And->getOperand(0);
4765 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4766 APInt NegMaskVal = MaskVal | HighZeros;
4767
4768 // If a negative constant would not allow a smaller encoding, there's no need
4769 // to continue. Only change the constant when we know it's a win.
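  // Concretely: the new mask must fit in a sign-extended 8-bit immediate, or
  // fit in a sign-extended 32-bit immediate when the original mask did not.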
4770 unsigned MinWidth = NegMaskVal.getSignificantBits();
4771 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4772 return false;
4773
4774 // Extend masks if we truncated above.
4775 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4776 NegMaskVal = NegMaskVal.zext(64);
4777 HighZeros = HighZeros.zext(64);
4778 }
4779
4780 // The variable operand must be all zeros in the top bits to allow using the
4781 // new, negative constant as the mask.
4782 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
4783 return false;
4784
4785 // Check if the mask is -1. In that case, this is an unnecessary instruction
4786 // that escaped earlier analysis.
4787 if (NegMaskVal.isAllOnes()) {
4788 ReplaceNode(And, And0.getNode());
4789 return true;
4790 }
4791
4792 // A negative mask allows a smaller encoding. Create a new 'and' node.
4793 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4794 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4795 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4796 ReplaceNode(And, NewAnd.getNode());
4797 SelectCode(NewAnd.getNode());
4798 return true;
4799}
4800
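// Map the test VT and the folded-load/broadcast/masked flags onto the
// corresponding VPTESTM/VPTESTNM opcode, e.g. (v16i32, IsTestN=false,
// FoldedLoad=true, FoldedBCast=false, Masked=false) selects X86::VPTESTMDZrm.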
4801static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4802 bool FoldedBCast, bool Masked) {
4803#define VPTESTM_CASE(VT, SUFFIX) \
4804case MVT::VT: \
4805 if (Masked) \
4806 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4807 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4808
4809
4810#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4811default: llvm_unreachable("Unexpected VT!"); \
4812VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4813VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4814VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4815VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4816VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4817VPTESTM_CASE(v8i64, QZ##SUFFIX)
4818
4819#define VPTESTM_FULL_CASES(SUFFIX) \
4820VPTESTM_BROADCAST_CASES(SUFFIX) \
4821VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4822VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4823VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4824VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4825VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4826VPTESTM_CASE(v32i16, WZ##SUFFIX)
4827
4828 if (FoldedBCast) {
 4829 switch (TestVT.SimpleTy) {
 4830 VPTESTM_BROADCAST_CASES(rmb)
 4831 }
4832 }
4833
4834 if (FoldedLoad) {
 4835 switch (TestVT.SimpleTy) {
 4836 VPTESTM_FULL_CASES(rm)
 4837 }
4838 }
4839
 4840 switch (TestVT.SimpleTy) {
 4841 VPTESTM_FULL_CASES(rr)
 4842 }
4843
4844#undef VPTESTM_FULL_CASES
4845#undef VPTESTM_BROADCAST_CASES
4846#undef VPTESTM_CASE
4847}
4848
4849// Try to create VPTESTM instruction. If InMask is not null, it will be used
4850// to form a masked operation.
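// e.g. (setcc (and X, Y), 0, ne) becomes VPTESTM X, Y, while the SETEQ form
// becomes VPTESTNM; a non-null InMask selects the write-masked (k) variants.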
4851bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4852 SDValue InMask) {
4853 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4854 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4855 "Unexpected VT!");
4856
4857 // Look for equal and not equal compares.
4858 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4859 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4860 return false;
4861
4862 SDValue SetccOp0 = Setcc.getOperand(0);
4863 SDValue SetccOp1 = Setcc.getOperand(1);
4864
4865 // Canonicalize the all zero vector to the RHS.
4866 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
 4867 std::swap(SetccOp0, SetccOp1);