// X86ISelDAGToDAG.cpp — LLVM 20.0.0git (source listing extracted from the
// doxygen documentation page for this file).
1//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a DAG pattern matching instruction selector for X86,
10// converting from a legalized dag to a X86 dag.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelDAGToDAG.h"
15#include "X86.h"
17#include "X86Subtarget.h"
18#include "X86TargetMachine.h"
19#include "llvm/ADT/Statistic.h"
22#include "llvm/Config/llvm-config.h"
24#include "llvm/IR/Function.h"
26#include "llvm/IR/Intrinsics.h"
27#include "llvm/IR/IntrinsicsX86.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IR/Type.h"
30#include "llvm/Support/Debug.h"
34#include <cstdint>
35
using namespace llvm;

// Identifiers used by the LLVM_DEBUG machinery and by pass registration below.
#define DEBUG_TYPE "x86-isel"
#define PASS_NAME "X86 DAG->DAG Instruction Selection"

// Counts loads hoisted past a TokenFactor by the call-folding preprocessing.
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

43static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
46
48 "x86-promote-anyext-load", cl::init(true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
50
52
53//===----------------------------------------------------------------------===//
54// Pattern Matcher Implementation
55//===----------------------------------------------------------------------===//
56
namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType = RegBase;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex = 0;

    unsigned Scale = 1;
    SDValue IndexReg;
    int32_t Disp = 0;
    SDValue Segment;
    // At most one of the following symbolic-displacement kinds is set; see
    // hasSymbolicDisplacement().
    const GlobalValue *GV = nullptr;
    const Constant *CP = nullptr;
    const BlockAddress *BlockAddr = nullptr;
    const char *ES = nullptr;
    MCSymbol *MCSym = nullptr;
    int JT = -1;
    Align Alignment; // CP alignment.
    unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
    // If true, IndexReg must be negated (a NEG is emitted) before use.
    bool NegateIndex = false;

    X86ISelAddressMode() = default;

    // True if any symbolic operand (global, constant pool, external symbol,
    // MC symbol, jump table, or block address) contributes to the address.
    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    // Debug-only pretty printer for the partially matched address.
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
    }
#endif
  };
}
155
156namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
160 ///
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
165
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
168
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
171
172 public:
173 X86DAGToDAGISel() = delete;
174
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 "indirect-tls-seg-refs");
184
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
188 "OptForMinSize implies OptForSize");
190 }
191
192 void emitFunctionEntryCode() override;
193
194 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
195
196 void PreprocessISelDAG() override;
197 void PostprocessISelDAG() override;
198
199// Include the pieces autogenerated from the target description.
200#include "X86GenDAGISel.inc"
201
202 private:
203 void Select(SDNode *N) override;
204
205 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
206 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
207 bool AllowSegmentRegForX32 = false);
208 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
209 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
210 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
211 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
212 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
217 unsigned Depth);
218 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
219 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
220 SDValue &Scale, SDValue &Index, SDValue &Disp,
221 SDValue &Segment);
222 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
223 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
224 SDValue &Index, SDValue &Disp, SDValue &Segment);
225 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
226 bool selectLEAAddr(SDValue N, SDValue &Base,
227 SDValue &Scale, SDValue &Index, SDValue &Disp,
228 SDValue &Segment);
229 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectRelocImm(SDValue N, SDValue &Op);
236
237 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
238 SDValue &Base, SDValue &Scale,
239 SDValue &Index, SDValue &Disp,
240 SDValue &Segment);
241
242 // Convenience method where P is also root.
243 bool tryFoldLoad(SDNode *P, SDValue N,
244 SDValue &Base, SDValue &Scale,
245 SDValue &Index, SDValue &Disp,
246 SDValue &Segment) {
247 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
248 }
249
250 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
251 SDValue &Base, SDValue &Scale,
252 SDValue &Index, SDValue &Disp,
253 SDValue &Segment);
254
255 bool isProfitableToFormMaskedOp(SDNode *N) const;
256
257 /// Implement addressing mode selection for inline asm expressions.
259 InlineAsm::ConstraintCode ConstraintID,
260 std::vector<SDValue> &OutOps) override;
261
262 void emitSpecialCodeForMain();
263
264 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
265 MVT VT, SDValue &Base, SDValue &Scale,
266 SDValue &Index, SDValue &Disp,
267 SDValue &Segment) {
268 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
269 Base = CurDAG->getTargetFrameIndex(
270 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
271 else if (AM.Base_Reg.getNode())
272 Base = AM.Base_Reg;
273 else
274 Base = CurDAG->getRegister(0, VT);
275
276 Scale = getI8Imm(AM.Scale, DL);
277
278#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
279 // Negate the index if needed.
280 if (AM.NegateIndex) {
281 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
282 : GET_ND_IF_ENABLED(X86::NEG32r);
283 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
284 AM.IndexReg), 0);
285 AM.IndexReg = Neg;
286 }
287
288 if (AM.IndexReg.getNode())
289 Index = AM.IndexReg;
290 else
291 Index = CurDAG->getRegister(0, VT);
292
293 // These are 32-bit even in 64-bit mode since RIP-relative offset
294 // is 32-bit.
295 if (AM.GV)
296 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
297 MVT::i32, AM.Disp,
298 AM.SymbolFlags);
299 else if (AM.CP)
300 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
301 AM.Disp, AM.SymbolFlags);
302 else if (AM.ES) {
303 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
304 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
305 } else if (AM.MCSym) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
307 assert(AM.SymbolFlags == 0 && "oo");
308 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
309 } else if (AM.JT != -1) {
310 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
311 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
312 } else if (AM.BlockAddr)
313 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
314 AM.SymbolFlags);
315 else
316 Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
317
318 if (AM.Segment.getNode())
319 Segment = AM.Segment;
320 else
321 Segment = CurDAG->getRegister(0, MVT::i16);
322 }
323
324 // Utility function to determine whether it is AMX SDNode right after
325 // lowering but before ISEL.
326 bool isAMXSDNode(SDNode *N) const {
327 // Check if N is AMX SDNode:
328 // 1. check specific opcode since these carry MVT::Untyped instead of
329 // x86amx_type;
330 // 2. check result type;
331 // 3. check operand type;
332 switch (N->getOpcode()) {
333 default:
334 break;
335 case X86::PT2RPNTLVWZ0V:
336 case X86::PT2RPNTLVWZ0T1V:
337 case X86::PT2RPNTLVWZ1V:
338 case X86::PT2RPNTLVWZ1T1V:
339 case X86::PT2RPNTLVWZ0RSV:
340 case X86::PT2RPNTLVWZ0RST1V:
341 case X86::PT2RPNTLVWZ1RSV:
342 case X86::PT2RPNTLVWZ1RST1V:
343 return true;
344 }
345 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
346 if (N->getValueType(Idx) == MVT::x86amx)
347 return true;
348 }
349 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
350 SDValue Op = N->getOperand(Idx);
351 if (Op.getValueType() == MVT::x86amx)
352 return true;
353 }
354 return false;
355 }
356
357 // Utility function to determine whether we should avoid selecting
358 // immediate forms of instructions for better code size or not.
359 // At a high level, we'd like to avoid such instructions when
360 // we have similar constants used within the same basic block
361 // that can be kept in a register.
362 //
363 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
364 uint32_t UseCount = 0;
365
366 // Do not want to hoist if we're not optimizing for size.
367 // TODO: We'd like to remove this restriction.
368 // See the comment in X86InstrInfo.td for more info.
369 if (!CurDAG->shouldOptForSize())
370 return false;
371
372 // Walk all the users of the immediate.
373 for (const SDNode *User : N->users()) {
374 if (UseCount >= 2)
375 break;
376
377 // This user is already selected. Count it as a legitimate use and
378 // move on.
379 if (User->isMachineOpcode()) {
380 UseCount++;
381 continue;
382 }
383
384 // We want to count stores of immediates as real uses.
385 if (User->getOpcode() == ISD::STORE &&
386 User->getOperand(1).getNode() == N) {
387 UseCount++;
388 continue;
389 }
390
391 // We don't currently match users that have > 2 operands (except
392 // for stores, which are handled above)
393 // Those instruction won't match in ISEL, for now, and would
394 // be counted incorrectly.
395 // This may change in the future as we add additional instruction
396 // types.
397 if (User->getNumOperands() != 2)
398 continue;
399
400 // If this is a sign-extended 8-bit integer immediate used in an ALU
401 // instruction, there is probably an opcode encoding to save space.
402 auto *C = dyn_cast<ConstantSDNode>(N);
403 if (C && isInt<8>(C->getSExtValue()))
404 continue;
405
406 // Immediates that are used for offsets as part of stack
407 // manipulation should be left alone. These are typically
408 // used to indicate SP offsets for argument passing and
409 // will get pulled into stores/pushes (implicitly).
410 if (User->getOpcode() == X86ISD::ADD ||
411 User->getOpcode() == ISD::ADD ||
412 User->getOpcode() == X86ISD::SUB ||
413 User->getOpcode() == ISD::SUB) {
414
415 // Find the other operand of the add/sub.
416 SDValue OtherOp = User->getOperand(0);
417 if (OtherOp.getNode() == N)
418 OtherOp = User->getOperand(1);
419
420 // Don't count if the other operand is SP.
421 RegisterSDNode *RegNode;
422 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
423 (RegNode = dyn_cast_or_null<RegisterSDNode>(
424 OtherOp->getOperand(1).getNode())))
425 if ((RegNode->getReg() == X86::ESP) ||
426 (RegNode->getReg() == X86::RSP))
427 continue;
428 }
429
430 // ... otherwise, count this and move on.
431 UseCount++;
432 }
433
434 // If we have more than 1 use, then recommend for hoisting.
435 return (UseCount > 1);
436 }
437
438 /// Return a target constant with the specified value of type i8.
439 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
440 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
441 }
442
443 /// Return a target constant with the specified value, of type i32.
444 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
445 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
446 }
447
448 /// Return a target constant with the specified value, of type i64.
449 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
450 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
451 }
452
453 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
454 const SDLoc &DL) {
455 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
456 uint64_t Index = N->getConstantOperandVal(1);
457 MVT VecVT = N->getOperand(0).getSimpleValueType();
458 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
459 }
460
461 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
462 const SDLoc &DL) {
463 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
464 uint64_t Index = N->getConstantOperandVal(2);
465 MVT VecVT = N->getSimpleValueType(0);
466 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
467 }
468
469 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
470 const SDLoc &DL) {
471 assert(VecWidth == 128 && "Unexpected vector width");
472 uint64_t Index = N->getConstantOperandVal(2);
473 MVT VecVT = N->getSimpleValueType(0);
474 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
475 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
476 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
477 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
478 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
479 }
480
481 SDValue getSBBZero(SDNode *N) {
482 SDLoc dl(N);
483 MVT VT = N->getSimpleValueType(0);
484
485 // Create zero.
486 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
487 SDValue Zero =
488 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
489 if (VT == MVT::i64) {
490 Zero = SDValue(
491 CurDAG->getMachineNode(
492 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
493 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
494 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
495 0);
496 }
497
498 // Copy flags to the EFLAGS register and glue it to next node.
499 unsigned Opcode = N->getOpcode();
500 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
501 "Unexpected opcode for SBB materialization");
502 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
503 SDValue EFLAGS =
504 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
505 N->getOperand(FlagOpIndex), SDValue());
506
507 // Create a 64-bit instruction if the result is 64-bits otherwise use the
508 // 32-bit version.
509 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
510 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
511 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
512 return SDValue(
513 CurDAG->getMachineNode(Opc, dl, VTs,
514 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
515 0);
516 }
517
518 // Helper to detect unneeded and instructions on shift amounts. Called
519 // from PatFrags in tablegen.
520 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
521 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
522 const APInt &Val = N->getConstantOperandAPInt(1);
523
524 if (Val.countr_one() >= Width)
525 return true;
526
527 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
528 return Mask.countr_one() >= Width;
529 }
530
531 /// Return an SDNode that returns the value of the global base register.
532 /// Output instructions required to initialize the global base register,
533 /// if necessary.
534 SDNode *getGlobalBaseReg();
535
536 /// Return a reference to the TargetMachine, casted to the target-specific
537 /// type.
538 const X86TargetMachine &getTargetMachine() const {
539 return static_cast<const X86TargetMachine &>(TM);
540 }
541
542 /// Return a reference to the TargetInstrInfo, casted to the target-specific
543 /// type.
544 const X86InstrInfo *getInstrInfo() const {
545 return Subtarget->getInstrInfo();
546 }
547
548 /// Return a condition code of the given SDNode
549 X86::CondCode getCondFromNode(SDNode *N) const;
550
551 /// Address-mode matching performs shift-of-and to and-of-shift
552 /// reassociation in order to expose more scaled addressing
553 /// opportunities.
554 bool ComplexPatternFuncMutatesDAG() const override {
555 return true;
556 }
557
558 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
559
560 // Indicates we should prefer to use a non-temporal load for this load.
561 bool useNonTemporalLoad(LoadSDNode *N) const {
562 if (!N->isNonTemporal())
563 return false;
564
565 unsigned StoreSize = N->getMemoryVT().getStoreSize();
566
567 if (N->getAlign().value() < StoreSize)
568 return false;
569
570 switch (StoreSize) {
571 default: llvm_unreachable("Unsupported store size");
572 case 4:
573 case 8:
574 return false;
575 case 16:
576 return Subtarget->hasSSE41();
577 case 32:
578 return Subtarget->hasAVX2();
579 case 64:
580 return Subtarget->hasAVX512();
581 }
582 }
583
584 bool foldLoadStoreIntoMemOperand(SDNode *Node);
585 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
586 bool matchBitExtract(SDNode *Node);
587 bool shrinkAndImmediate(SDNode *N);
588 bool isMaskZeroExtended(SDNode *N) const;
589 bool tryShiftAmountMod(SDNode *N);
590 bool tryShrinkShlLogicImm(SDNode *N);
591 bool tryVPTERNLOG(SDNode *N);
592 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
593 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
594 uint8_t Imm);
595 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
596 bool tryMatchBitSelect(SDNode *N);
597
598 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
599 const SDLoc &dl, MVT VT, SDNode *Node);
600 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
601 const SDLoc &dl, MVT VT, SDNode *Node,
602 SDValue &InGlue);
603
604 bool tryOptimizeRem8Extend(SDNode *N);
605
606 bool onlyUsesZeroFlag(SDValue Flags) const;
607 bool hasNoSignFlagUses(SDValue Flags) const;
608 bool hasNoCarryFlagUses(SDValue Flags) const;
609 };
610
611 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
612 public:
613 static char ID;
614 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
615 CodeGenOptLevel OptLevel)
617 ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
618 };
619}
620
// Unique identifier for the legacy pass; the legacy pass manager keys passes
// by the address of this variable.
char X86DAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
624
625// Returns true if this masked compare can be implemented legally with this
626// type.
627static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
628 unsigned Opcode = N->getOpcode();
629 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
630 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
631 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
632 // We can get 256-bit 8 element types here without VLX being enabled. When
633 // this happens we will use 512-bit operations and the mask will not be
634 // zero extended.
635 EVT OpVT = N->getOperand(0).getValueType();
636 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
637 // second operand.
638 if (Opcode == X86ISD::STRICT_CMPM)
639 OpVT = N->getOperand(1).getValueType();
640 if (OpVT.is256BitVector() || OpVT.is128BitVector())
641 return Subtarget->hasVLX();
642
643 return true;
644 }
645 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
646 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
647 Opcode == X86ISD::FSETCCM_SAE)
648 return true;
649
650 return false;
651}
652
653// Returns true if we can assume the writer of the mask has zero extended it
654// for us.
655bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
656 // If this is an AND, check if we have a compare on either side. As long as
657 // one side guarantees the mask is zero extended, the AND will preserve those
658 // zeros.
659 if (N->getOpcode() == ISD::AND)
660 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
661 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
662
663 return isLegalMaskCompare(N, Subtarget);
664}
665
666bool
667X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
668 if (OptLevel == CodeGenOptLevel::None)
669 return false;
670
671 if (!N.hasOneUse())
672 return false;
673
674 if (N.getOpcode() != ISD::LOAD)
675 return true;
676
677 // Don't fold non-temporal loads if we have an instruction for them.
678 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
679 return false;
680
681 // If N is a load, do additional profitability checks.
682 if (U == Root) {
683 switch (U->getOpcode()) {
684 default: break;
685 case X86ISD::ADD:
686 case X86ISD::ADC:
687 case X86ISD::SUB:
688 case X86ISD::SBB:
689 case X86ISD::AND:
690 case X86ISD::XOR:
691 case X86ISD::OR:
692 case ISD::ADD:
693 case ISD::UADDO_CARRY:
694 case ISD::AND:
695 case ISD::OR:
696 case ISD::XOR: {
697 SDValue Op1 = U->getOperand(1);
698
699 // If the other operand is a 8-bit immediate we should fold the immediate
700 // instead. This reduces code size.
701 // e.g.
702 // movl 4(%esp), %eax
703 // addl $4, %eax
704 // vs.
705 // movl $4, %eax
706 // addl 4(%esp), %eax
707 // The former is 2 bytes shorter. In case where the increment is 1, then
708 // the saving can be 4 bytes (by using incl %eax).
709 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
710 if (Imm->getAPIntValue().isSignedIntN(8))
711 return false;
712
713 // If this is a 64-bit AND with an immediate that fits in 32-bits,
714 // prefer using the smaller and over folding the load. This is needed to
715 // make sure immediates created by shrinkAndImmediate are always folded.
716 // Ideally we would narrow the load during DAG combine and get the
717 // best of both worlds.
718 if (U->getOpcode() == ISD::AND &&
719 Imm->getAPIntValue().getBitWidth() == 64 &&
720 Imm->getAPIntValue().isIntN(32))
721 return false;
722
723 // If this really a zext_inreg that can be represented with a movzx
724 // instruction, prefer that.
725 // TODO: We could shrink the load and fold if it is non-volatile.
726 if (U->getOpcode() == ISD::AND &&
727 (Imm->getAPIntValue() == UINT8_MAX ||
728 Imm->getAPIntValue() == UINT16_MAX ||
729 Imm->getAPIntValue() == UINT32_MAX))
730 return false;
731
732 // ADD/SUB with can negate the immediate and use the opposite operation
733 // to fit 128 into a sign extended 8 bit immediate.
734 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
735 (-Imm->getAPIntValue()).isSignedIntN(8))
736 return false;
737
738 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
739 (-Imm->getAPIntValue()).isSignedIntN(8) &&
740 hasNoCarryFlagUses(SDValue(U, 1)))
741 return false;
742 }
743
744 // If the other operand is a TLS address, we should fold it instead.
745 // This produces
746 // movl %gs:0, %eax
747 // leal i@NTPOFF(%eax), %eax
748 // instead of
749 // movl $i@NTPOFF, %eax
750 // addl %gs:0, %eax
751 // if the block also has an access to a second TLS address this will save
752 // a load.
753 // FIXME: This is probably also true for non-TLS addresses.
754 if (Op1.getOpcode() == X86ISD::Wrapper) {
755 SDValue Val = Op1.getOperand(0);
757 return false;
758 }
759
760 // Don't fold load if this matches the BTS/BTR/BTC patterns.
761 // BTS: (or X, (shl 1, n))
762 // BTR: (and X, (rotl -2, n))
763 // BTC: (xor X, (shl 1, n))
764 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
765 if (U->getOperand(0).getOpcode() == ISD::SHL &&
766 isOneConstant(U->getOperand(0).getOperand(0)))
767 return false;
768
769 if (U->getOperand(1).getOpcode() == ISD::SHL &&
770 isOneConstant(U->getOperand(1).getOperand(0)))
771 return false;
772 }
773 if (U->getOpcode() == ISD::AND) {
774 SDValue U0 = U->getOperand(0);
775 SDValue U1 = U->getOperand(1);
776 if (U0.getOpcode() == ISD::ROTL) {
777 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
778 if (C && C->getSExtValue() == -2)
779 return false;
780 }
781
782 if (U1.getOpcode() == ISD::ROTL) {
783 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
784 if (C && C->getSExtValue() == -2)
785 return false;
786 }
787 }
788
789 break;
790 }
791 case ISD::SHL:
792 case ISD::SRA:
793 case ISD::SRL:
794 // Don't fold a load into a shift by immediate. The BMI2 instructions
795 // support folding a load, but not an immediate. The legacy instructions
796 // support folding an immediate, but can't fold a load. Folding an
797 // immediate is preferable to folding a load.
798 if (isa<ConstantSDNode>(U->getOperand(1)))
799 return false;
800
801 break;
802 }
803 }
804
805 // Prevent folding a load if this can implemented with an insert_subreg or
806 // a move that implicitly zeroes.
807 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
808 isNullConstant(Root->getOperand(2)) &&
809 (Root->getOperand(0).isUndef() ||
811 return false;
812
813 return true;
814}
815
816// Indicates it is profitable to form an AVX512 masked operation. Returning
817// false will favor a masked register-register masked move or vblendm and the
818// operation will be selected separately.
819bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
820 assert(
821 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
822 "Unexpected opcode!");
823
824 // If the operation has additional users, the operation will be duplicated.
825 // Check the use count to prevent that.
826 // FIXME: Are there cheap opcodes we might want to duplicate?
827 return N->getOperand(1).hasOneUse();
828}
829
830/// Replace the original chain operand of the call with
831/// load's chain operand and move load below the call's chain operand.
832static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
833 SDValue Call, SDValue OrigChain) {
835 SDValue Chain = OrigChain.getOperand(0);
836 if (Chain.getNode() == Load.getNode())
837 Ops.push_back(Load.getOperand(0));
838 else {
839 assert(Chain.getOpcode() == ISD::TokenFactor &&
840 "Unexpected chain operand");
841 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
842 if (Chain.getOperand(i).getNode() == Load.getNode())
843 Ops.push_back(Load.getOperand(0));
844 else
845 Ops.push_back(Chain.getOperand(i));
846 SDValue NewChain =
847 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
848 Ops.clear();
849 Ops.push_back(NewChain);
850 }
851 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
852 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
853 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
854 Load.getOperand(1), Load.getOperand(2));
855
856 Ops.clear();
857 Ops.push_back(SDValue(Load.getNode(), 1));
858 Ops.append(Call->op_begin() + 1, Call->op_end());
859 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
860}
861
862/// Return true if call address is a load and it can be
863/// moved below CALLSEQ_START and the chains leading up to the call.
864/// Return the CALLSEQ_START by reference as a second output.
865/// In the case of a tail call, there isn't a callseq node between the call
866/// chain and the load.
867static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
868 // The transformation is somewhat dangerous if the call's chain was glued to
869 // the call. After MoveBelowOrigChain the load is moved between the call and
870 // the chain, this can create a cycle if the load is not folded. So it is
871 // *really* important that we are sure the load will be folded.
872 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
873 return false;
874 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
875 if (!LD ||
876 !LD->isSimple() ||
877 LD->getAddressingMode() != ISD::UNINDEXED ||
878 LD->getExtensionType() != ISD::NON_EXTLOAD)
879 return false;
880
881 // Now let's find the callseq_start.
882 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
883 if (!Chain.hasOneUse())
884 return false;
885 Chain = Chain.getOperand(0);
886 }
887
888 if (!Chain.getNumOperands())
889 return false;
890 // Since we are not checking for AA here, conservatively abort if the chain
891 // writes to memory. It's not safe to move the callee (a load) across a store.
892 if (isa<MemSDNode>(Chain.getNode()) &&
893 cast<MemSDNode>(Chain.getNode())->writeMem())
894 return false;
895 if (Chain.getOperand(0).getNode() == Callee.getNode())
896 return true;
897 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
898 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
899 Callee.getValue(1).hasOneUse())
900 return true;
901 return false;
902}
903
/// Return true if the 64-bit immediate encodes an (optionally prefixed)
/// ENDBR64 instruction: the low three bytes must be 0x0F1EFA, and every byte
/// above them must be a legal optional prefix until the mandatory 0xF3 is
/// found. e.g. 0xF30F1EFA, 0xF3660F1EFA, 0xF3670F1EFA all match.
static bool isEndbrImm64(uint64_t Imm) {
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  // Scan upward a byte at a time; 24 bits (0x0F1EFA) are already matched.
  for (int Shift = 24; Shift < 64; Shift += 8) {
    uint8_t Byte = (Imm >> Shift) & 0xFF;
    if (Byte == 0xF3)
      return true; // Found the mandatory REP prefix of ENDBR64.
    switch (Byte) {
    case 0x26: case 0x2e: case 0x36: case 0x3e: case 0x64:
    case 0x65: case 0x66: case 0x67: case 0xf0: case 0xf2:
      break; // Legal optional prefix byte; keep scanning.
    default:
      return false; // Anything else breaks the ENDBR64 pattern.
    }
  }

  return false;
}
924
925static bool needBWI(MVT VT) {
926 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
927}
928
929void X86DAGToDAGISel::PreprocessISelDAG() {
930 bool MadeChange = false;
931 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
932 E = CurDAG->allnodes_end(); I != E; ) {
933 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
934
935 // This is for CET enhancement.
936 //
937 // ENDBR32 and ENDBR64 have specific opcodes:
938 // ENDBR32: F3 0F 1E FB
939 // ENDBR64: F3 0F 1E FA
940 // And we want that attackers won’t find unintended ENDBR32/64
941 // opcode matches in the binary
942 // Here’s an example:
943 // If the compiler had to generate asm for the following code:
944 // a = 0xF30F1EFA
945 // it could, for example, generate:
946 // mov 0xF30F1EFA, dword ptr[a]
947 // In such a case, the binary would include a gadget that starts
948 // with a fake ENDBR64 opcode. Therefore, we split such generation
949 // into multiple operations, let it not shows in the binary
950 if (N->getOpcode() == ISD::Constant) {
951 MVT VT = N->getSimpleValueType(0);
952 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
953 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
954 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
955 // Check that the cf-protection-branch is enabled.
956 Metadata *CFProtectionBranch =
958 "cf-protection-branch");
959 if (CFProtectionBranch || IndirectBranchTracking) {
960 SDLoc dl(N);
961 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
962 Complement = CurDAG->getNOT(dl, Complement, VT);
963 --I;
964 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
965 ++I;
966 MadeChange = true;
967 continue;
968 }
969 }
970 }
971
972 // If this is a target specific AND node with no flag usages, turn it back
973 // into ISD::AND to enable test instruction matching.
974 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
975 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
976 N->getOperand(0), N->getOperand(1));
977 --I;
978 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
979 ++I;
980 MadeChange = true;
981 continue;
982 }
983
984 // Convert vector increment or decrement to sub/add with an all-ones
985 // constant:
986 // add X, <1, 1...> --> sub X, <-1, -1...>
987 // sub X, <1, 1...> --> add X, <-1, -1...>
988 // The all-ones vector constant can be materialized using a pcmpeq
989 // instruction that is commonly recognized as an idiom (has no register
990 // dependency), so that's better/smaller than loading a splat 1 constant.
991 //
992 // But don't do this if it would inhibit a potentially profitable load
993 // folding opportunity for the other operand. That only occurs with the
994 // intersection of:
995 // (1) The other operand (op0) is load foldable.
996 // (2) The op is an add (otherwise, we are *creating* an add and can still
997 // load fold the other op).
998 // (3) The target has AVX (otherwise, we have a destructive add and can't
999 // load fold the other op without killing the constant op).
1000 // (4) The constant 1 vector has multiple uses (so it is profitable to load
1001 // into a register anyway).
1002 auto mayPreventLoadFold = [&]() {
1003 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1004 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1005 !N->getOperand(1).hasOneUse();
1006 };
1007 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1008 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1009 APInt SplatVal;
1010 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1011 SplatVal.isOne()) {
1012 SDLoc DL(N);
1013
1014 MVT VT = N->getSimpleValueType(0);
1015 unsigned NumElts = VT.getSizeInBits() / 32;
1017 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1018 AllOnes = CurDAG->getBitcast(VT, AllOnes);
1019
1020 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1021 SDValue Res =
1022 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1023 --I;
1024 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1025 ++I;
1026 MadeChange = true;
1027 continue;
1028 }
1029 }
1030
1031 switch (N->getOpcode()) {
1032 case X86ISD::VBROADCAST: {
1033 MVT VT = N->getSimpleValueType(0);
1034 // Emulate v32i16/v64i8 broadcast without BWI.
1035 if (!Subtarget->hasBWI() && needBWI(VT)) {
1036 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1037 SDLoc dl(N);
1038 SDValue NarrowBCast =
1039 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1040 SDValue Res =
1041 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1042 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1043 unsigned Index = NarrowVT.getVectorMinNumElements();
1044 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1045 CurDAG->getIntPtrConstant(Index, dl));
1046
1047 --I;
1048 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1049 ++I;
1050 MadeChange = true;
1051 continue;
1052 }
1053
1054 break;
1055 }
1057 MVT VT = N->getSimpleValueType(0);
1058 // Emulate v32i16/v64i8 broadcast without BWI.
1059 if (!Subtarget->hasBWI() && needBWI(VT)) {
1060 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1061 auto *MemNode = cast<MemSDNode>(N);
1062 SDLoc dl(N);
1063 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1064 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1065 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1066 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1067 MemNode->getMemOperand());
1068 SDValue Res =
1069 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1070 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1071 unsigned Index = NarrowVT.getVectorMinNumElements();
1072 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1073 CurDAG->getIntPtrConstant(Index, dl));
1074
1075 --I;
1076 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1077 CurDAG->ReplaceAllUsesWith(N, To);
1078 ++I;
1079 MadeChange = true;
1080 continue;
1081 }
1082
1083 break;
1084 }
1085 case ISD::LOAD: {
1086 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1087 // load, then just extract the lower subvector and avoid the second load.
1088 auto *Ld = cast<LoadSDNode>(N);
1089 MVT VT = N->getSimpleValueType(0);
1090 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1091 !(VT.is128BitVector() || VT.is256BitVector()))
1092 break;
1093
1094 MVT MaxVT = VT;
1095 SDNode *MaxLd = nullptr;
1096 SDValue Ptr = Ld->getBasePtr();
1097 SDValue Chain = Ld->getChain();
1098 for (SDNode *User : Ptr->users()) {
1099 auto *UserLd = dyn_cast<LoadSDNode>(User);
1100 MVT UserVT = User->getSimpleValueType(0);
1101 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1102 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1103 !User->hasAnyUseOfValue(1) &&
1104 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1105 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1106 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1107 MaxLd = User;
1108 MaxVT = UserVT;
1109 }
1110 }
1111 if (MaxLd) {
1112 SDLoc dl(N);
1113 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1114 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1115 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1116 SDValue(MaxLd, 0),
1117 CurDAG->getIntPtrConstant(0, dl));
1118 SDValue Res = CurDAG->getBitcast(VT, Extract);
1119
1120 --I;
1121 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1122 CurDAG->ReplaceAllUsesWith(N, To);
1123 ++I;
1124 MadeChange = true;
1125 continue;
1126 }
1127 break;
1128 }
1129 case ISD::VSELECT: {
1130 // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
1131 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1132 if (EleVT == MVT::i1)
1133 break;
1134
1135 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1136 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1137 "We can't replace VSELECT with BLENDV in vXi16!");
1138 SDValue R;
1139 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1140 EleVT.getSizeInBits()) {
1141 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1142 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1143 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1144 } else {
1145 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1146 N->getOperand(0), N->getOperand(1),
1147 N->getOperand(2));
1148 }
1149 --I;
1150 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1151 ++I;
1152 MadeChange = true;
1153 continue;
1154 }
1155 case ISD::FP_ROUND:
1157 case ISD::FP_TO_SINT:
1158 case ISD::FP_TO_UINT:
1161 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1162 // don't need 2 sets of patterns.
1163 if (!N->getSimpleValueType(0).isVector())
1164 break;
1165
1166 unsigned NewOpc;
1167 switch (N->getOpcode()) {
1168 default: llvm_unreachable("Unexpected opcode!");
1169 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1170 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1171 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1172 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1173 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1174 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1175 }
1176 SDValue Res;
1177 if (N->isStrictFPOpcode())
1178 Res =
1179 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1180 {N->getOperand(0), N->getOperand(1)});
1181 else
1182 Res =
1183 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1184 N->getOperand(0));
1185 --I;
1186 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1187 ++I;
1188 MadeChange = true;
1189 continue;
1190 }
1191 case ISD::SHL:
1192 case ISD::SRA:
1193 case ISD::SRL: {
1194 // Replace vector shifts with their X86 specific equivalent so we don't
1195 // need 2 sets of patterns.
1196 if (!N->getValueType(0).isVector())
1197 break;
1198
1199 unsigned NewOpc;
1200 switch (N->getOpcode()) {
1201 default: llvm_unreachable("Unexpected opcode!");
1202 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1203 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1204 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1205 }
1206 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1207 N->getOperand(0), N->getOperand(1));
1208 --I;
1209 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1210 ++I;
1211 MadeChange = true;
1212 continue;
1213 }
1214 case ISD::ANY_EXTEND:
1216 // Replace vector any extend with the zero extend equivalents so we don't
1217 // need 2 sets of patterns. Ignore vXi1 extensions.
1218 if (!N->getValueType(0).isVector())
1219 break;
1220
1221 unsigned NewOpc;
1222 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1223 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1224 "Unexpected opcode for mask vector!");
1225 NewOpc = ISD::SIGN_EXTEND;
1226 } else {
1227 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1230 }
1231
1232 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1233 N->getOperand(0));
1234 --I;
1235 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1236 ++I;
1237 MadeChange = true;
1238 continue;
1239 }
1240 case ISD::FCEIL:
1241 case ISD::STRICT_FCEIL:
1242 case ISD::FFLOOR:
1243 case ISD::STRICT_FFLOOR:
1244 case ISD::FTRUNC:
1245 case ISD::STRICT_FTRUNC:
1246 case ISD::FROUNDEVEN:
1248 case ISD::FNEARBYINT:
1250 case ISD::FRINT:
1251 case ISD::STRICT_FRINT: {
1252 // Replace fp rounding with their X86 specific equivalent so we don't
1253 // need 2 sets of patterns.
1254 unsigned Imm;
1255 switch (N->getOpcode()) {
1256 default: llvm_unreachable("Unexpected opcode!");
1257 case ISD::STRICT_FCEIL:
1258 case ISD::FCEIL: Imm = 0xA; break;
1259 case ISD::STRICT_FFLOOR:
1260 case ISD::FFLOOR: Imm = 0x9; break;
1261 case ISD::STRICT_FTRUNC:
1262 case ISD::FTRUNC: Imm = 0xB; break;
1264 case ISD::FROUNDEVEN: Imm = 0x8; break;
1266 case ISD::FNEARBYINT: Imm = 0xC; break;
1267 case ISD::STRICT_FRINT:
1268 case ISD::FRINT: Imm = 0x4; break;
1269 }
1270 SDLoc dl(N);
1271 bool IsStrict = N->isStrictFPOpcode();
1272 SDValue Res;
1273 if (IsStrict)
1274 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1275 {N->getValueType(0), MVT::Other},
1276 {N->getOperand(0), N->getOperand(1),
1277 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1278 else
1279 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1280 N->getOperand(0),
1281 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1282 --I;
1283 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1284 ++I;
1285 MadeChange = true;
1286 continue;
1287 }
1288 case X86ISD::FANDN:
1289 case X86ISD::FAND:
1290 case X86ISD::FOR:
1291 case X86ISD::FXOR: {
1292 // Widen scalar fp logic ops to vector to reduce isel patterns.
1293 // FIXME: Can we do this during lowering/combine.
1294 MVT VT = N->getSimpleValueType(0);
1295 if (VT.isVector() || VT == MVT::f128)
1296 break;
1297
1298 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1299 : VT == MVT::f32 ? MVT::v4f32
1300 : MVT::v8f16;
1301
1302 SDLoc dl(N);
1303 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1304 N->getOperand(0));
1305 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1306 N->getOperand(1));
1307
1308 SDValue Res;
1309 if (Subtarget->hasSSE2()) {
1310 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1311 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1312 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1313 unsigned Opc;
1314 switch (N->getOpcode()) {
1315 default: llvm_unreachable("Unexpected opcode!");
1316 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1317 case X86ISD::FAND: Opc = ISD::AND; break;
1318 case X86ISD::FOR: Opc = ISD::OR; break;
1319 case X86ISD::FXOR: Opc = ISD::XOR; break;
1320 }
1321 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1322 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1323 } else {
1324 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1325 }
1326 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1327 CurDAG->getIntPtrConstant(0, dl));
1328 --I;
1329 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1330 ++I;
1331 MadeChange = true;
1332 continue;
1333 }
1334 }
1335
1336 if (OptLevel != CodeGenOptLevel::None &&
1337 // Only do this when the target can fold the load into the call or
1338 // jmp.
1339 !Subtarget->useIndirectThunkCalls() &&
1340 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1341 (N->getOpcode() == X86ISD::TC_RETURN &&
1342 (Subtarget->is64Bit() ||
1343 !getTargetMachine().isPositionIndependent())))) {
1344 /// Also try moving call address load from outside callseq_start to just
1345 /// before the call to allow it to be folded.
1346 ///
1347 /// [Load chain]
1348 /// ^
1349 /// |
1350 /// [Load]
1351 /// ^ ^
1352 /// | |
1353 /// / \--
1354 /// / |
1355 ///[CALLSEQ_START] |
1356 /// ^ |
1357 /// | |
1358 /// [LOAD/C2Reg] |
1359 /// | |
1360 /// \ /
1361 /// \ /
1362 /// [CALL]
1363 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1364 SDValue Chain = N->getOperand(0);
1365 SDValue Load = N->getOperand(1);
1366 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1367 continue;
1368 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1369 ++NumLoadMoved;
1370 MadeChange = true;
1371 continue;
1372 }
1373
1374 // Lower fpround and fpextend nodes that target the FP stack to be store and
1375 // load to the stack. This is a gross hack. We would like to simply mark
1376 // these as being illegal, but when we do that, legalize produces these when
1377 // it expands calls, then expands these in the same legalize pass. We would
1378 // like dag combine to be able to hack on these between the call expansion
1379 // and the node legalization. As such this pass basically does "really
1380 // late" legalization of these inline with the X86 isel pass.
1381 // FIXME: This should only happen when not compiled with -O0.
1382 switch (N->getOpcode()) {
1383 default: continue;
1384 case ISD::FP_ROUND:
1385 case ISD::FP_EXTEND:
1386 {
1387 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1388 MVT DstVT = N->getSimpleValueType(0);
1389
1390 // If any of the sources are vectors, no fp stack involved.
1391 if (SrcVT.isVector() || DstVT.isVector())
1392 continue;
1393
1394 // If the source and destination are SSE registers, then this is a legal
1395 // conversion that should not be lowered.
1396 const X86TargetLowering *X86Lowering =
1397 static_cast<const X86TargetLowering *>(TLI);
1398 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1399 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1400 if (SrcIsSSE && DstIsSSE)
1401 continue;
1402
1403 if (!SrcIsSSE && !DstIsSSE) {
1404 // If this is an FPStack extension, it is a noop.
1405 if (N->getOpcode() == ISD::FP_EXTEND)
1406 continue;
1407 // If this is a value-preserving FPStack truncation, it is a noop.
1408 if (N->getConstantOperandVal(1))
1409 continue;
1410 }
1411
1412 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1413 // FPStack has extload and truncstore. SSE can fold direct loads into other
1414 // operations. Based on this, decide what we want to do.
1415 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1416 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1417 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1418 MachinePointerInfo MPI =
1419 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1420 SDLoc dl(N);
1421
1422 // FIXME: optimize the case where the src/dest is a load or store?
1423
1424 SDValue Store = CurDAG->getTruncStore(
1425 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1426 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1427 MemTmp, MPI, MemVT);
1428
1429 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1430 // extload we created. This will cause general havok on the dag because
1431 // anything below the conversion could be folded into other existing nodes.
1432 // To avoid invalidating 'I', back it up to the convert node.
1433 --I;
1434 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1435 break;
1436 }
1437
1438 //The sequence of events for lowering STRICT_FP versions of these nodes requires
1439 //dealing with the chain differently, as there is already a preexisting chain.
1442 {
1443 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1444 MVT DstVT = N->getSimpleValueType(0);
1445
1446 // If any of the sources are vectors, no fp stack involved.
1447 if (SrcVT.isVector() || DstVT.isVector())
1448 continue;
1449
1450 // If the source and destination are SSE registers, then this is a legal
1451 // conversion that should not be lowered.
1452 const X86TargetLowering *X86Lowering =
1453 static_cast<const X86TargetLowering *>(TLI);
1454 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1455 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1456 if (SrcIsSSE && DstIsSSE)
1457 continue;
1458
1459 if (!SrcIsSSE && !DstIsSSE) {
1460 // If this is an FPStack extension, it is a noop.
1461 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1462 continue;
1463 // If this is a value-preserving FPStack truncation, it is a noop.
1464 if (N->getConstantOperandVal(2))
1465 continue;
1466 }
1467
1468 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1469 // FPStack has extload and truncstore. SSE can fold direct loads into other
1470 // operations. Based on this, decide what we want to do.
1471 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1472 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1473 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1474 MachinePointerInfo MPI =
1475 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1476 SDLoc dl(N);
1477
1478 // FIXME: optimize the case where the src/dest is a load or store?
1479
1480 //Since the operation is StrictFP, use the preexisting chain.
1482 if (!SrcIsSSE) {
1483 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1484 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1485 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1486 MPI, /*Align*/ std::nullopt,
1488 if (N->getFlags().hasNoFPExcept()) {
1489 SDNodeFlags Flags = Store->getFlags();
1490 Flags.setNoFPExcept(true);
1491 Store->setFlags(Flags);
1492 }
1493 } else {
1494 assert(SrcVT == MemVT && "Unexpected VT!");
1495 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1496 MPI);
1497 }
1498
1499 if (!DstIsSSE) {
1500 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1501 SDValue Ops[] = {Store, MemTmp};
1502 Result = CurDAG->getMemIntrinsicNode(
1503 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1504 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1505 if (N->getFlags().hasNoFPExcept()) {
1506 SDNodeFlags Flags = Result->getFlags();
1507 Flags.setNoFPExcept(true);
1508 Result->setFlags(Flags);
1509 }
1510 } else {
1511 assert(DstVT == MemVT && "Unexpected VT!");
1512 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1513 }
1514
1515 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1516 // extload we created. This will cause general havok on the dag because
1517 // anything below the conversion could be folded into other existing nodes.
1518 // To avoid invalidating 'I', back it up to the convert node.
1519 --I;
1520 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1521 break;
1522 }
1523 }
1524
1525
1526 // Now that we did that, the node is dead. Increment the iterator to the
1527 // next node to process, then delete N.
1528 ++I;
1529 MadeChange = true;
1530 }
1531
1532 // Remove any dead nodes that may have been left behind.
1533 if (MadeChange)
1534 CurDAG->RemoveDeadNodes();
1535}
1536
1537// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1538bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1539 unsigned Opc = N->getMachineOpcode();
1540 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1541 Opc != X86::MOVSX64rr8)
1542 return false;
1543
1544 SDValue N0 = N->getOperand(0);
1545
1546 // We need to be extracting the lower bit of an extend.
1547 if (!N0.isMachineOpcode() ||
1548 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1549 N0.getConstantOperandVal(1) != X86::sub_8bit)
1550 return false;
1551
1552 // We're looking for either a movsx or movzx to match the original opcode.
1553 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1554 : X86::MOVSX32rr8_NOREX;
1555 SDValue N00 = N0.getOperand(0);
1556 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1557 return false;
1558
1559 if (Opc == X86::MOVSX64rr8) {
1560 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1561 // to 64.
1562 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1563 MVT::i64, N00);
1564 ReplaceUses(N, Extend);
1565 } else {
1566 // Ok we can drop this extend and just use the original extend.
1567 ReplaceUses(N, N00.getNode());
1568 }
1569
1570 return true;
1571}
1572
1573void X86DAGToDAGISel::PostprocessISelDAG() {
1574 // Skip peepholes at -O0.
1575 if (TM.getOptLevel() == CodeGenOptLevel::None)
1576 return;
1577
1578 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1579
1580 bool MadeChange = false;
1581 while (Position != CurDAG->allnodes_begin()) {
1582 SDNode *N = &*--Position;
1583 // Skip dead nodes and any non-machine opcodes.
1584 if (N->use_empty() || !N->isMachineOpcode())
1585 continue;
1586
1587 if (tryOptimizeRem8Extend(N)) {
1588 MadeChange = true;
1589 continue;
1590 }
1591
1592 unsigned Opc = N->getMachineOpcode();
1593 switch (Opc) {
1594 default:
1595 continue;
1596 // ANDrr/rm + TESTrr+ -> TESTrr/TESTmr
1597 case X86::TEST8rr:
1598 case X86::TEST16rr:
1599 case X86::TEST32rr:
1600 case X86::TEST64rr:
1601 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1602 case X86::CTEST8rr:
1603 case X86::CTEST16rr:
1604 case X86::CTEST32rr:
1605 case X86::CTEST64rr: {
1606 auto &Op0 = N->getOperand(0);
1607 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1608 !Op0.isMachineOpcode())
1609 continue;
1610 SDValue And = N->getOperand(0);
1611#define CASE_ND(OP) \
1612 case X86::OP: \
1613 case X86::OP##_ND:
1614 switch (And.getMachineOpcode()) {
1615 default:
1616 continue;
1617 CASE_ND(AND8rr)
1618 CASE_ND(AND16rr)
1619 CASE_ND(AND32rr)
1620 CASE_ND(AND64rr) {
1621 if (And->hasAnyUseOfValue(1))
1622 continue;
1623 SmallVector<SDValue> Ops(N->op_values());
1624 Ops[0] = And.getOperand(0);
1625 Ops[1] = And.getOperand(1);
1627 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1628 ReplaceUses(N, Test);
1629 MadeChange = true;
1630 continue;
1631 }
1632 CASE_ND(AND8rm)
1633 CASE_ND(AND16rm)
1634 CASE_ND(AND32rm)
1635 CASE_ND(AND64rm) {
1636 if (And->hasAnyUseOfValue(1))
1637 continue;
1638 unsigned NewOpc;
1639 bool IsCTESTCC = X86::isCTESTCC(Opc);
1640#define FROM_TO(A, B) \
1641 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1642 break;
1643 switch (And.getMachineOpcode()) {
1644 FROM_TO(AND8rm, TEST8mr);
1645 FROM_TO(AND16rm, TEST16mr);
1646 FROM_TO(AND32rm, TEST32mr);
1647 FROM_TO(AND64rm, TEST64mr);
1648 }
1649#undef FROM_TO
1650#undef CASE_ND
1651 // Need to swap the memory and register operand.
1652 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1653 And.getOperand(3), And.getOperand(4),
1654 And.getOperand(5), And.getOperand(0)};
1655 // CC, Cflags.
1656 if (IsCTESTCC) {
1657 Ops.push_back(N->getOperand(2));
1658 Ops.push_back(N->getOperand(3));
1659 }
1660 // Chain of memory load
1661 Ops.push_back(And.getOperand(6));
1662 // Glue
1663 if (IsCTESTCC)
1664 Ops.push_back(N->getOperand(4));
1665
1666 MachineSDNode *Test = CurDAG->getMachineNode(
1667 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1668 CurDAG->setNodeMemRefs(
1669 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1670 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1671 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1672 MadeChange = true;
1673 continue;
1674 }
1675 }
1676 }
1677 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1678 // used. We're doing this late so we can prefer to fold the AND into masked
1679 // comparisons. Doing that can be better for the live range of the mask
1680 // register.
1681 case X86::KORTESTBkk:
1682 case X86::KORTESTWkk:
1683 case X86::KORTESTDkk:
1684 case X86::KORTESTQkk: {
1685 SDValue Op0 = N->getOperand(0);
1686 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1687 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1688 continue;
1689#define CASE(A) \
1690 case X86::A: \
1691 break;
1692 switch (Op0.getMachineOpcode()) {
1693 default:
1694 continue;
1695 CASE(KANDBkk)
1696 CASE(KANDWkk)
1697 CASE(KANDDkk)
1698 CASE(KANDQkk)
1699 }
1700 unsigned NewOpc;
1701#define FROM_TO(A, B) \
1702 case X86::A: \
1703 NewOpc = X86::B; \
1704 break;
1705 switch (Opc) {
1706 FROM_TO(KORTESTBkk, KTESTBkk)
1707 FROM_TO(KORTESTWkk, KTESTWkk)
1708 FROM_TO(KORTESTDkk, KTESTDkk)
1709 FROM_TO(KORTESTQkk, KTESTQkk)
1710 }
1711 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1712 // KAND instructions and KTEST use the same ISA feature.
1713 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1714 continue;
1715#undef FROM_TO
1716 MachineSDNode *KTest = CurDAG->getMachineNode(
1717 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1718 ReplaceUses(N, KTest);
1719 MadeChange = true;
1720 continue;
1721 }
1722 // Attempt to remove vectors moves that were inserted to zero upper bits.
1723 case TargetOpcode::SUBREG_TO_REG: {
1724 unsigned SubRegIdx = N->getConstantOperandVal(2);
1725 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1726 continue;
1727
1728 SDValue Move = N->getOperand(1);
1729 if (!Move.isMachineOpcode())
1730 continue;
1731
1732 // Make sure its one of the move opcodes we recognize.
1733 switch (Move.getMachineOpcode()) {
1734 default:
1735 continue;
1736 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1737 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1738 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1739 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1740 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1741 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1742 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1743 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1744 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1745 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1746 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1747 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1748 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1749 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1750 }
1751#undef CASE
1752
1753 SDValue In = Move.getOperand(0);
1754 if (!In.isMachineOpcode() ||
1755 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1756 continue;
1757
1758 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1759 // the SHA instructions which use a legacy encoding.
1760 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1761 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1762 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1763 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1764 continue;
1765
1766 // Producing instruction is another vector instruction. We can drop the
1767 // move.
1768 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1769 MadeChange = true;
1770 }
1771 }
1772 }
1773
1774 if (MadeChange)
1775 CurDAG->RemoveDeadNodes();
1776}
1777
1778
1779/// Emit any code that needs to be executed only in the main function.
1780void X86DAGToDAGISel::emitSpecialCodeForMain() {
1781 if (Subtarget->isTargetCygMing()) {
1783 auto &DL = CurDAG->getDataLayout();
1784
1786 CLI.setChain(CurDAG->getRoot())
1787 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1788 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1789 std::move(Args));
1790 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1791 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1792 CurDAG->setRoot(Result.second);
1793 }
1794}
1795
1796void X86DAGToDAGISel::emitFunctionEntryCode() {
1797 // If this is main, emit special code for main.
1798 const Function &F = MF->getFunction();
1799 if (F.hasExternalLinkage() && F.getName() == "main")
1800 emitSpecialCodeForMain();
1801}
1802
// Return true if Val is safe to use as an address displacement that may be
// combined with a frame-index offset.
static bool isDispSafeForFrameIndex(int64_t Val) {
  // On 64-bit platforms a frame index can carry its own displacement which
  // is later added to the explicit one and could overflow the 32-bit
  // displacement field. Restricting the explicit displacement to a signed
  // 31-bit range (only slightly stronger than the fundamental 32-bit
  // assumption) leaves headroom so the combined value stays safe.
  constexpr int64_t Limit = int64_t(1) << 30;
  return Val >= -Limit && Val < Limit;
}
1812
/// Fold an additional integer offset into the in-progress addressing mode.
/// Returns true on failure (the combined displacement would be invalid for
/// the current code model / base kind); on success returns false and
/// commits the new displacement to AM.Disp.
1813bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1814 X86ISelAddressMode &AM) {
1815 // We may have already matched a displacement and the caller just added the
1816 // symbolic displacement. So we still need to do the checks even if Offset
1817 // is zero.
1818
1819 int64_t Val = AM.Disp + Offset;
1820
1821 // Cannot combine ExternalSymbol displacements with integer offsets.
1822 if (Val != 0 && (AM.ES || AM.MCSym))
1823 return true;
1824
1825 CodeModel::Model M = TM.getCodeModel();
1826 if (Subtarget->is64Bit()) {
1827 if (Val != 0 &&
 // NOTE(review): one line of this condition (original line 1828) is
 // missing from this excerpt; upstream it invokes the
 // X86::isOffsetSuitableForCodeModel(Val, M, ...) check -- confirm
 // against the full source before editing.
1829 AM.hasSymbolicDisplacement()))
1830 return true;
1831 // In addition to the checks required for a register base, check that
1832 // we do not try to use an unsafe Disp with a frame index.
1833 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
 // NOTE(review): original line 1834 is missing here; it presumably calls
 // !isDispSafeForFrameIndex(Val) (defined above) -- confirm.
1835 return true;
1836 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1837 // 64 bits. Instructions with 32-bit register addresses perform this zero
1838 // extension for us and we can safely ignore the high bits of Offset.
1839 // Instructions with only a 32-bit immediate address do not, though: they
1840 // sign extend instead. This means only address the low 2GB of address space
1841 // is directly addressable, we need indirect addressing for the high 2GB of
1842 // address space.
1843 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1844 // implicit zero extension of instructions would cover up any problem.
1845 // However, we have asserts elsewhere that get triggered if we do, so keep
1846 // the checks for now.
1847 // TODO: We would actually be able to accept these, as well as the same
1848 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1849 // to get an address size override to be emitted. However, this
1850 // pseudo-register is not part of any register class and therefore causes
1851 // MIR verification to fail.
1852 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1853 !AM.hasBaseOrIndexReg())
1854 return true;
1855 }
 // All checks passed: commit the combined displacement.
1856 AM.Disp = Val;
1857 return false;
1858}
1859
1860bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1861 bool AllowSegmentRegForX32) {
1862 SDValue Address = N->getOperand(1);
1863
1864 // load gs:0 -> GS segment register.
1865 // load fs:0 -> FS segment register.
1866 //
1867 // This optimization is generally valid because the GNU TLS model defines that
1868 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1869 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1870 // zero-extended to 64 bits and then added it to the base address, which gives
1871 // unwanted results when the register holds a negative value.
1872 // For more information see http://people.redhat.com/drepper/tls.pdf
1873 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1874 !IndirectTlsSegRefs &&
1875 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1876 Subtarget->isTargetFuchsia())) {
1877 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1878 return true;
1879 switch (N->getPointerInfo().getAddrSpace()) {
1880 case X86AS::GS:
1881 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1882 return false;
1883 case X86AS::FS:
1884 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1885 return false;
1886 // Address space X86AS::SS is not handled here, because it is not used to
1887 // address TLS areas.
1888 }
1889 }
1890
1891 return true;
1892}
1893
1894/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1895/// mode. These wrap things that will resolve down into a symbol reference.
1896/// If no match is possible, this returns true, otherwise it returns false.
1897bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1898 // If the addressing mode already has a symbol as the displacement, we can
1899 // never match another symbol.
1900 if (AM.hasSymbolicDisplacement())
1901 return true;
1902
1903 bool IsRIPRelTLS = false;
1904 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1905 if (IsRIPRel) {
1906 SDValue Val = N.getOperand(0);
 // NOTE(review): original line 1907 is missing from this excerpt; it is
 // presumably the condition guarding the assignment below (upstream:
 // checking Val for ISD::TargetGlobalTLSAddress) -- confirm against the
 // full source.
1908 IsRIPRelTLS = true;
1909 }
1910
1911 // We can't use an addressing mode in the 64-bit large code model.
1912 // Global TLS addressing is an exception. In the medium code model,
1913 // we use can use a mode when RIP wrappers are present.
1914 // That signifies access to globals that are known to be "near",
1915 // such as the GOT itself.
1916 CodeModel::Model M = TM.getCodeModel();
1917 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1918 return true;
1919
1920 // Base and index reg must be 0 in order to use %rip as base.
1921 if (IsRIPRel && AM.hasBaseOrIndexReg())
1922 return true;
1923
1924 // Make a local copy in case we can't do this fold.
1925 X86ISelAddressMode Backup = AM;
1926
 // Decode the wrapped symbol node into the addressing mode fields; each
 // case records the symbol plus its target flags and (where applicable)
 // a constant offset to be folded below.
1927 int64_t Offset = 0;
1928 SDValue N0 = N.getOperand(0);
1929 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1930 AM.GV = G->getGlobal();
1931 AM.SymbolFlags = G->getTargetFlags();
1932 Offset = G->getOffset();
1933 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1934 AM.CP = CP->getConstVal();
1935 AM.Alignment = CP->getAlign();
1936 AM.SymbolFlags = CP->getTargetFlags();
1937 Offset = CP->getOffset();
1938 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1939 AM.ES = S->getSymbol();
1940 AM.SymbolFlags = S->getTargetFlags();
1941 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1942 AM.MCSym = S->getMCSymbol();
1943 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1944 AM.JT = J->getIndex();
1945 AM.SymbolFlags = J->getTargetFlags();
1946 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1947 AM.BlockAddr = BA->getBlockAddress();
1948 AM.SymbolFlags = BA->getTargetFlags();
1949 Offset = BA->getOffset();
1950 } else
1951 llvm_unreachable("Unhandled symbol reference node.");
1952
1953 // Can't use an addressing mode with large globals.
1954 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1955 TM.isLargeGlobalValue(AM.GV)) {
1956 AM = Backup;
1957 return true;
1958 }
1959
 // If folding the symbol's offset produced an unsafe displacement, undo
 // everything via the backup copy.
1960 if (foldOffsetIntoAddress(Offset, AM)) {
1961 AM = Backup;
1962 return true;
1963 }
1964
1965 if (IsRIPRel)
1966 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1967
1968 // Commit the changes now that we know this fold is safe.
1969 return false;
1970}
1971
1972/// Add the specified node to the specified addressing mode, returning true if
1973/// it cannot be done. This just pattern matches for the addressing mode.
1974bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1975 if (matchAddressRecursively(N, AM, 0))
1976 return true;
1977
1978 // Post-processing: Make a second attempt to fold a load, if we now know
1979 // that there will not be any other register. This is only performed for
1980 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1981 // any foldable load the first time.
1982 if (Subtarget->isTarget64BitILP32() &&
1983 AM.BaseType == X86ISelAddressMode::RegBase &&
1984 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1985 SDValue Save_Base_Reg = AM.Base_Reg;
1986 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1987 AM.Base_Reg = SDValue();
1988 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1989 AM.Base_Reg = Save_Base_Reg;
1990 }
1991 }
1992
1993 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1994 // a smaller encoding and avoids a scaled-index.
1995 if (AM.Scale == 2 &&
1996 AM.BaseType == X86ISelAddressMode::RegBase &&
1997 AM.Base_Reg.getNode() == nullptr) {
1998 AM.Base_Reg = AM.IndexReg;
1999 AM.Scale = 1;
2000 }
2001
2002 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2003 // because it has a smaller encoding.
2004 if (TM.getCodeModel() != CodeModel::Large &&
2005 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2006 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2007 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2008 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2009 // However, when GV is a local function symbol and in the same section as
2010 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2011 // referencing GV+Disp generates a relocation referencing the section symbol
2012 // with an even smaller offset, which might underflow. We should bail out if
2013 // the negative offset is too close to INT32_MIN. Actually, we are more
2014 // conservative here, using a smaller magic number also used by
2015 // isOffsetSuitableForCodeModel.
2016 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2017 return true;
2018
2019 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2020 }
2021
2022 return false;
2023}
2024
2025bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2026 unsigned Depth) {
2027 // Add an artificial use to this node so that we can keep track of
2028 // it if it gets CSE'd with a different node.
2029 HandleSDNode Handle(N);
2030
2031 X86ISelAddressMode Backup = AM;
2032 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2033 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2034 return false;
2035 AM = Backup;
2036
2037 // Try again after commutating the operands.
2038 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2039 Depth + 1) &&
2040 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2041 return false;
2042 AM = Backup;
2043
2044 // If we couldn't fold both operands into the address at the same time,
2045 // see if we can just put each operand into a register and fold at least
2046 // the add.
2047 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2048 !AM.Base_Reg.getNode() &&
2049 !AM.IndexReg.getNode()) {
2050 N = Handle.getValue();
2051 AM.Base_Reg = N.getOperand(0);
2052 AM.IndexReg = N.getOperand(1);
2053 AM.Scale = 1;
2054 return false;
2055 }
2056 N = Handle.getValue();
2057 return true;
2058}
2059
2060// Insert a node into the DAG at least before the Pos node's position. This
2061// will reposition the node as needed, and will assign it a node ID that is <=
2062// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2063// IDs! The selection DAG must no longer depend on their uniqueness when this
2064// is used.
2065static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2066 if (N->getNodeId() == -1 ||
 // NOTE(review): original lines 2067-2068 are missing from this excerpt;
 // they presumably complete this condition by comparing N's and Pos's
 // (uninvalidated) node IDs -- confirm against the full source.
2069 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2070 // Mark Node as invalid for pruning as after this it may be a successor to a
2071 // selected node but otherwise be in the same position of Pos.
2072 // Conservatively mark it with the same -abs(Id) to assure node id
2073 // invariant is preserved.
2074 N->setNodeId(Pos->getNodeId());
 // NOTE(review): original line 2075 is missing from this excerpt;
 // upstream it invalidates N's node ID after the reassignment -- confirm.
2076 }
2077}
2078
2079// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2080// safe. This allows us to convert the shift and and into an h-register
2081// extract and a scaled index. Returns false if the simplification is
2082// performed.
// NOTE(review): the first line of this function's signature (original line
// 2083, "static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue
// N,") is missing from this excerpt -- confirm against the full source.
2084 uint64_t Mask,
2085 SDValue Shift, SDValue X,
2086 X86ISelAddressMode &AM) {
 // Only a single-use SRL by a constant amount can feed this pattern.
2087 if (Shift.getOpcode() != ISD::SRL ||
2088 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2089 !Shift.hasOneUse())
2090 return true;
2091
 // The scale must be 1..3 (addressing-mode scales 2/4/8) and the mask must
 // be exactly 0xff shifted by the scale.
2092 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2093 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2094 Mask != (0xffu << ScaleLog))
2095 return true;
2096
 // Build the replacement: ((X >> 8) & 0xff) << ScaleLog, with the byte
 // extract done in X's type and the shift in N's type.
2097 MVT XVT = X.getSimpleValueType();
2098 MVT VT = N.getSimpleValueType();
2099 SDLoc DL(N);
2100 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2101 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2102 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2103 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2104 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2105 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2106 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2107
2108 // Insert the new nodes into the topological ordering. We must do this in
2109 // a valid topological ordering as nothing is going to go back and re-sort
2110 // these nodes. We continually insert before 'N' in sequence as this is
2111 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2112 // hierarchy left to express.
2113 insertDAGNode(DAG, N, Eight);
2114 insertDAGNode(DAG, N, NewMask);
2115 insertDAGNode(DAG, N, Srl);
2116 insertDAGNode(DAG, N, And);
2117 insertDAGNode(DAG, N, Ext);
2118 insertDAGNode(DAG, N, ShlCount);
2119 insertDAGNode(DAG, N, Shl);
2120 DAG.ReplaceAllUsesWith(N, Shl);
2121 DAG.RemoveDeadNode(N.getNode());
 // The byte extract becomes the index; the left shift becomes the scale.
2122 AM.IndexReg = Ext;
2123 AM.Scale = (1 << ScaleLog);
2124 return false;
2125}
2126
2127// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2128// allows us to fold the shift into this addressing mode. Returns false if the
2129// transform succeeded.
// NOTE(review): the first line of this function's signature (original line
// 2130, "static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue
// N,") is missing from this excerpt -- confirm against the full source.
2131 X86ISelAddressMode &AM) {
2132 SDValue Shift = N.getOperand(0);
2133
2134 // Use a signed mask so that shifting right will insert sign bits. These
2135 // bits will be removed when we shift the result left so it doesn't matter
2136 // what we use. This might allow a smaller immediate encoding.
2137 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2138
2139 // If we have an any_extend feeding the AND, look through it to see if there
2140 // is a shift behind it. But only if the AND doesn't use the extended bits.
2141 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2142 bool FoundAnyExtend = false;
2143 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2144 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2145 isUInt<32>(Mask)) {
2146 FoundAnyExtend = true;
2147 Shift = Shift.getOperand(0);
2148 }
2149
 // The (possibly unwrapped) operand must be a SHL by a constant.
2150 if (Shift.getOpcode() != ISD::SHL ||
2151 !isa<ConstantSDNode>(Shift.getOperand(1)))
2152 return true;
2153
2154 SDValue X = Shift.getOperand(0);
2155
2156 // Not likely to be profitable if either the AND or SHIFT node has more
2157 // than one use (unless all uses are for address computation). Besides,
2158 // isel mechanism requires their node ids to be reused.
2159 if (!N.hasOneUse() || !Shift.hasOneUse())
2160 return true;
2161
2162 // Verify that the shift amount is something we can fold.
 // Only 1, 2 or 3 map to addressing-mode scales 2, 4 or 8.
2163 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2164 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2165 return true;
2166
2167 MVT VT = N.getSimpleValueType();
2168 SDLoc DL(N);
 // If we looked through an any_extend above, materialize it on X so the
 // rebuilt AND/SHL operate in N's type.
2169 if (FoundAnyExtend) {
2170 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2171 insertDAGNode(DAG, N, NewX);
2172 X = NewX;
2173 }
2174
2175 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2176 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2177 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2178
2179 // Insert the new nodes into the topological ordering. We must do this in
2180 // a valid topological ordering as nothing is going to go back and re-sort
2181 // these nodes. We continually insert before 'N' in sequence as this is
2182 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2183 // hierarchy left to express.
2184 insertDAGNode(DAG, N, NewMask);
2185 insertDAGNode(DAG, N, NewAnd);
2186 insertDAGNode(DAG, N, NewShift);
2187 DAG.ReplaceAllUsesWith(N, NewShift);
2188 DAG.RemoveDeadNode(N.getNode());
2189
 // The shift amount becomes the scale; the masked value becomes the index.
2190 AM.Scale = 1 << ShiftAmt;
2191 AM.IndexReg = NewAnd;
2192 return false;
2193}
2194
2195// Implement some heroics to detect shifts of masked values where the mask can
2196// be replaced by extending the shift and undoing that in the addressing mode
2197// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2198// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2199// the addressing mode. This results in code such as:
2200//
2201// int f(short *y, int *lookup_table) {
2202// ...
2203// return *y + lookup_table[*y >> 11];
2204// }
2205//
2206// Turning into:
2207// movzwl (%rdi), %eax
2208// movl %eax, %ecx
2209// shrl $11, %ecx
2210// addl (%rsi,%rcx,4), %eax
2211//
2212// Instead of:
2213// movzwl (%rdi), %eax
2214// movl %eax, %ecx
2215// shrl $9, %ecx
2216// andl $124, %rcx
2217// addl (%rsi,%rcx), %eax
2218//
2219// Note that this function assumes the mask is provided as a mask *after* the
2220// value is shifted. The input chain may or may not match that, but computing
2221// such a mask is trivial.
// NOTE(review): the first line of this function's signature (original line
// 2222, "static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,")
// is missing from this excerpt -- confirm against the full source.
2223 uint64_t Mask,
2224 SDValue Shift, SDValue X,
2225 X86ISelAddressMode &AM) {
 // Only a single-use SRL by a constant can feed this pattern.
2226 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2227 !isa<ConstantSDNode>(Shift.getOperand(1)))
2228 return true;
2229
2230 // We need to ensure that mask is a continuous run of bits.
2231 unsigned MaskIdx, MaskLen;
2232 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2233 return true;
2234 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2235
2236 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2237
2238 // The amount of shift we're trying to fit into the addressing mode is taken
2239 // from the shifted mask index (number of trailing zeros of the mask).
2240 unsigned AMShiftAmt = MaskIdx;
2241
2242 // There is nothing we can do here unless the mask is removing some bits.
2243 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2244 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2245
2246 // Scale the leading zero count down based on the actual size of the value.
2247 // Also scale it down based on the size of the shift.
2248 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2249 if (MaskLZ < ScaleDown)
2250 return true;
2251 MaskLZ -= ScaleDown;
2252
2253 // The final check is to ensure that any masked out high bits of X are
2254 // already known to be zero. Otherwise, the mask has a semantic impact
2255 // other than masking out a couple of low bits. Unfortunately, because of
2256 // the mask, zero extensions will be removed from operands in some cases.
2257 // This code works extra hard to look through extensions because we can
2258 // replace them with zero extensions cheaply if necessary.
2259 bool ReplacingAnyExtend = false;
2260 if (X.getOpcode() == ISD::ANY_EXTEND) {
2261 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2262 X.getOperand(0).getSimpleValueType().getSizeInBits();
2263 // Assume that we'll replace the any-extend with a zero-extend, and
2264 // narrow the search to the extended value.
2265 X = X.getOperand(0);
2266 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2267 ReplacingAnyExtend = true;
2268 }
2269 APInt MaskedHighBits =
2270 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2271 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2272 return true;
2273
2274 // We've identified a pattern that can be transformed into a single shift
2275 // and an addressing mode. Make it so.
2276 MVT VT = N.getSimpleValueType();
2277 if (ReplacingAnyExtend) {
2278 assert(X.getValueType() != VT);
2279 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2280 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2281 insertDAGNode(DAG, N, NewX);
2282 X = NewX;
2283 }
2284
 // Rebuild as (X >> (ShiftAmt + AMShiftAmt)) << AMShiftAmt; the final SHL
 // is absorbed into the addressing-mode scale below.
2285 MVT XVT = X.getSimpleValueType();
2286 SDLoc DL(N);
2287 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2288 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2289 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2290 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2291 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2292
2293 // Insert the new nodes into the topological ordering. We must do this in
2294 // a valid topological ordering as nothing is going to go back and re-sort
2295 // these nodes. We continually insert before 'N' in sequence as this is
2296 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2297 // hierarchy left to express.
2298 insertDAGNode(DAG, N, NewSRLAmt);
2299 insertDAGNode(DAG, N, NewSRL);
2300 insertDAGNode(DAG, N, NewExt);
2301 insertDAGNode(DAG, N, NewSHLAmt);
2302 insertDAGNode(DAG, N, NewSHL);
2303 DAG.ReplaceAllUsesWith(N, NewSHL);
2304 DAG.RemoveDeadNode(N.getNode());
2305
2306 AM.Scale = 1 << AMShiftAmt;
2307 AM.IndexReg = NewExt;
2308 return false;
2309}
2310
2311// Transform "(X >> SHIFT) & (MASK << C1)" to
2312// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2313// matched to a BEXTR later. Returns false if the simplification is performed.
// NOTE(review): the first line of this function's signature (original line
// 2314, "static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,")
// is missing from this excerpt -- confirm against the full source.
2315 uint64_t Mask,
2316 SDValue Shift, SDValue X,
2317 X86ISelAddressMode &AM,
2318 const X86Subtarget &Subtarget) {
 // Both the SRL and the AND must be single-use, and the shift amount must
 // be a constant.
2319 if (Shift.getOpcode() != ISD::SRL ||
2320 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2321 !Shift.hasOneUse() || !N.hasOneUse())
2322 return true;
2323
2324 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2325 if (!Subtarget.hasTBM() &&
2326 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2327 return true;
2328
2329 // We need to ensure that mask is a continuous run of bits.
2330 unsigned MaskIdx, MaskLen;
2331 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2332 return true;
2333
2334 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2335
2336 // The amount of shift we're trying to fit into the addressing mode is taken
2337 // from the shifted mask index (number of trailing zeros of the mask).
2338 unsigned AMShiftAmt = MaskIdx;
2339
2340 // There is nothing we can do here unless the mask is removing some bits.
2341 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2342 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2343
 // Rebuild as ((X >> (ShiftAmt + AMShiftAmt)) & (Mask >> AMShiftAmt))
 // << AMShiftAmt; the SHL is absorbed into the addressing-mode scale.
2344 MVT XVT = X.getSimpleValueType();
2345 MVT VT = N.getSimpleValueType();
2346 SDLoc DL(N);
2347 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2348 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2349 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2350 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2351 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2352 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2353 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2354
2355 // Insert the new nodes into the topological ordering. We must do this in
2356 // a valid topological ordering as nothing is going to go back and re-sort
2357 // these nodes. We continually insert before 'N' in sequence as this is
2358 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2359 // hierarchy left to express.
2360 insertDAGNode(DAG, N, NewSRLAmt);
2361 insertDAGNode(DAG, N, NewSRL);
2362 insertDAGNode(DAG, N, NewMask);
2363 insertDAGNode(DAG, N, NewAnd);
2364 insertDAGNode(DAG, N, NewExt);
2365 insertDAGNode(DAG, N, NewSHLAmt);
2366 insertDAGNode(DAG, N, NewSHL);
2367 DAG.ReplaceAllUsesWith(N, NewSHL);
2368 DAG.RemoveDeadNode(N.getNode());
2369
2370 AM.Scale = 1 << AMShiftAmt;
2371 AM.IndexReg = NewExt;
2372 return false;
2373}
2374
2375// Attempt to peek further into a scaled index register, collecting additional
2376// extensions / offsets / etc. Returns /p N if we can't peek any further.
2377SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2378 X86ISelAddressMode &AM,
2379 unsigned Depth) {
2380 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2381 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2382 "Illegal index scale");
2383
2384 // Limit recursion.
 // NOTE(review): the condition line (original line 2385) is missing from
 // this excerpt; it presumably checks Depth against the recursion limit
 // (upstream: SelectionDAG::MaxRecursionDepth) -- confirm.
2386 return N;
2387
2388 EVT VT = N.getValueType();
2389 unsigned Opc = N.getOpcode();
2390
2391 // index: add(x,c) -> index: x, disp + c
2392 if (CurDAG->isBaseWithConstantOffset(N)) {
2393 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
 // The constant contributes Scale * c to the displacement because the
 // index register is multiplied by Scale in the final address.
2394 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2395 if (!foldOffsetIntoAddress(Offset, AM))
2396 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2397 }
2398
2399 // index: add(x,x) -> index: x, scale * 2
2400 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2401 if (AM.Scale <= 4) {
2402 AM.Scale *= 2;
2403 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2404 }
2405 }
2406
2407 // index: shl(x,i) -> index: x, scale * (1 << i)
2408 if (Opc == X86ISD::VSHLI) {
2409 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2410 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2411 if ((AM.Scale * ScaleAmt) <= 8) {
2412 AM.Scale *= ScaleAmt;
2413 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2414 }
2415 }
2416
2417 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2418 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2419 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2420 SDValue Src = N.getOperand(0);
 // nsw guarantees sext(x + c) == sext(x) + sext(c), so the add can be
 // re-materialized outside the extension.
2421 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2422 Src.hasOneUse()) {
2423 if (CurDAG->isBaseWithConstantOffset(Src)) {
2424 SDValue AddSrc = Src.getOperand(0);
2425 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2426 int64_t Offset = AddVal->getSExtValue();
2427 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2428 SDLoc DL(N);
2429 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2430 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2431 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2432 insertDAGNode(*CurDAG, N, ExtSrc);
2433 insertDAGNode(*CurDAG, N, ExtVal);
2434 insertDAGNode(*CurDAG, N, ExtAdd);
2435 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2436 CurDAG->RemoveDeadNode(N.getNode());
2437 return ExtSrc;
2438 }
2439 }
2440 }
2441 }
2442
2443 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2444 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2445 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2446 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2447 SDValue Src = N.getOperand(0);
2448 unsigned SrcOpc = Src.getOpcode();
2449 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2450 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2451 Src.hasOneUse()) {
2452 if (CurDAG->isBaseWithConstantOffset(Src)) {
2453 SDValue AddSrc = Src.getOperand(0);
2454 uint64_t Offset = Src.getConstantOperandVal(1);
2455 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2456 SDLoc DL(N);
2457 SDValue Res;
2458 // If we're also scaling, see if we can use that as well.
2459 if (AddSrc.getOpcode() == ISD::SHL &&
2460 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2461 SDValue ShVal = AddSrc.getOperand(0);
2462 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2463 APInt HiBits =
 // NOTE(review): the initializer line (original line 2464) is
 // missing from this excerpt; it presumably builds the high-bit
 // mask via APInt::getHighBitsSet(...) -- confirm against the
 // full source.
2465 uint64_t ScaleAmt = 1ULL << ShAmt;
 // Folding the shift into the scale is only safe when the shift
 // cannot overflow (nuw) or the shifted-out high bits are known
 // zero.
2466 if ((AM.Scale * ScaleAmt) <= 8 &&
2467 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2468 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2469 AM.Scale *= ScaleAmt;
2470 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2471 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2472 AddSrc.getOperand(1));
2473 insertDAGNode(*CurDAG, N, ExtShVal);
2474 insertDAGNode(*CurDAG, N, ExtShift);
2475 AddSrc = ExtShift;
2476 Res = ExtShVal;
2477 }
2478 }
2479 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2480 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2481 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2482 insertDAGNode(*CurDAG, N, ExtSrc);
2483 insertDAGNode(*CurDAG, N, ExtVal);
2484 insertDAGNode(*CurDAG, N, ExtAdd);
2485 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2486 CurDAG->RemoveDeadNode(N.getNode());
 // Res is set only when the inner shift was also folded; it then
 // points below the shift, otherwise return the extended source.
2487 return Res ? Res : ExtSrc;
2488 }
2489 }
2490 }
2491 }
2492
2493 // TODO: Handle extensions, shifted masks etc.
2494 return N;
2495}
2496
2497bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2498 unsigned Depth) {
2499 SDLoc dl(N);
2500 LLVM_DEBUG({
2501 dbgs() << "MatchAddress: ";
2502 AM.dump(CurDAG);
2503 });
2504 // Limit recursion.
2506 return matchAddressBase(N, AM);
2507
2508 // If this is already a %rip relative address, we can only merge immediates
2509 // into it. Instead of handling this in every case, we handle it here.
2510 // RIP relative addressing: %rip + 32-bit displacement!
2511 if (AM.isRIPRelative()) {
2512 // FIXME: JumpTable and ExternalSymbol address currently don't like
2513 // displacements. It isn't very important, but this should be fixed for
2514 // consistency.
2515 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2516 return true;
2517
2518 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2519 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2520 return false;
2521 return true;
2522 }
2523
2524 switch (N.getOpcode()) {
2525 default: break;
2526 case ISD::LOCAL_RECOVER: {
2527 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2528 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2529 // Use the symbol and don't prefix it.
2530 AM.MCSym = ESNode->getMCSymbol();
2531 return false;
2532 }
2533 break;
2534 }
2535 case ISD::Constant: {
2536 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2537 if (!foldOffsetIntoAddress(Val, AM))
2538 return false;
2539 break;
2540 }
2541
2542 case X86ISD::Wrapper:
2543 case X86ISD::WrapperRIP:
2544 if (!matchWrapper(N, AM))
2545 return false;
2546 break;
2547
2548 case ISD::LOAD:
2549 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2550 return false;
2551 break;
2552
2553 case ISD::FrameIndex:
2554 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2555 AM.Base_Reg.getNode() == nullptr &&
2556 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2557 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2558 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2559 return false;
2560 }
2561 break;
2562
2563 case ISD::SHL:
2564 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2565 break;
2566
2567 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2568 unsigned Val = CN->getZExtValue();
2569 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2570 // that the base operand remains free for further matching. If
2571 // the base doesn't end up getting used, a post-processing step
2572 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2573 if (Val == 1 || Val == 2 || Val == 3) {
2574 SDValue ShVal = N.getOperand(0);
2575 AM.Scale = 1 << Val;
2576 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2577 return false;
2578 }
2579 }
2580 break;
2581
2582 case ISD::SRL: {
2583 // Scale must not be used already.
2584 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2585
2586 // We only handle up to 64-bit values here as those are what matter for
2587 // addressing mode optimizations.
2588 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2589 "Unexpected value size!");
2590
2591 SDValue And = N.getOperand(0);
2592 if (And.getOpcode() != ISD::AND) break;
2593 SDValue X = And.getOperand(0);
2594
2595 // The mask used for the transform is expected to be post-shift, but we
2596 // found the shift first so just apply the shift to the mask before passing
2597 // it down.
2598 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2599 !isa<ConstantSDNode>(And.getOperand(1)))
2600 break;
2601 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2602
2603 // Try to fold the mask and shift into the scale, and return false if we
2604 // succeed.
2605 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2606 return false;
2607 break;
2608 }
2609
2610 case ISD::SMUL_LOHI:
2611 case ISD::UMUL_LOHI:
2612 // A mul_lohi where we need the low part can be folded as a plain multiply.
2613 if (N.getResNo() != 0) break;
2614 [[fallthrough]];
2615 case ISD::MUL:
2616 case X86ISD::MUL_IMM:
2617 // X*[3,5,9] -> X+X*[2,4,8]
2618 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2619 AM.Base_Reg.getNode() == nullptr &&
2620 AM.IndexReg.getNode() == nullptr) {
2621 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2622 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2623 CN->getZExtValue() == 9) {
2624 AM.Scale = unsigned(CN->getZExtValue())-1;
2625
2626 SDValue MulVal = N.getOperand(0);
2627 SDValue Reg;
2628
2629 // Okay, we know that we have a scale by now. However, if the scaled
2630 // value is an add of something and a constant, we can fold the
2631 // constant into the disp field here.
2632 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2633 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2634 Reg = MulVal.getOperand(0);
2635 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2636 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2637 if (foldOffsetIntoAddress(Disp, AM))
2638 Reg = N.getOperand(0);
2639 } else {
2640 Reg = N.getOperand(0);
2641 }
2642
2643 AM.IndexReg = AM.Base_Reg = Reg;
2644 return false;
2645 }
2646 }
2647 break;
2648
2649 case ISD::SUB: {
2650 // Given A-B, if A can be completely folded into the address and
2651 // the index field with the index field unused, use -B as the index.
2652 // This is a win if a has multiple parts that can be folded into
2653 // the address. Also, this saves a mov if the base register has
2654 // other uses, since it avoids a two-address sub instruction, however
2655 // it costs an additional mov if the index register has other uses.
2656
2657 // Add an artificial use to this node so that we can keep track of
2658 // it if it gets CSE'd with a different node.
2659 HandleSDNode Handle(N);
2660
2661 // Test if the LHS of the sub can be folded.
2662 X86ISelAddressMode Backup = AM;
2663 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2664 N = Handle.getValue();
2665 AM = Backup;
2666 break;
2667 }
2668 N = Handle.getValue();
2669 // Test if the index field is free for use.
2670 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2671 AM = Backup;
2672 break;
2673 }
2674
2675 int Cost = 0;
2676 SDValue RHS = N.getOperand(1);
2677 // If the RHS involves a register with multiple uses, this
2678 // transformation incurs an extra mov, due to the neg instruction
2679 // clobbering its operand.
2680 if (!RHS.getNode()->hasOneUse() ||
2681 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2682 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2683 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2684 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2685 RHS.getOperand(0).getValueType() == MVT::i32))
2686 ++Cost;
2687 // If the base is a register with multiple uses, this
2688 // transformation may save a mov.
2689 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2690 !AM.Base_Reg.getNode()->hasOneUse()) ||
2691 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2692 --Cost;
2693 // If the folded LHS was interesting, this transformation saves
2694 // address arithmetic.
2695 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2696 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2697 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2698 --Cost;
2699 // If it doesn't look like it may be an overall win, don't do it.
2700 if (Cost >= 0) {
2701 AM = Backup;
2702 break;
2703 }
2704
2705 // Ok, the transformation is legal and appears profitable. Go for it.
2706 // Negation will be emitted later to avoid creating dangling nodes if this
2707 // was an unprofitable LEA.
2708 AM.IndexReg = RHS;
2709 AM.NegateIndex = true;
2710 AM.Scale = 1;
2711 return false;
2712 }
2713
2714 case ISD::OR:
2715 case ISD::XOR:
2716 // See if we can treat the OR/XOR node as an ADD node.
2717 if (!CurDAG->isADDLike(N))
2718 break;
2719 [[fallthrough]];
2720 case ISD::ADD:
2721 if (!matchAdd(N, AM, Depth))
2722 return false;
2723 break;
2724
2725 case ISD::AND: {
2726 // Perform some heroic transforms on an and of a constant-count shift
2727 // with a constant to enable use of the scaled offset field.
2728
2729 // Scale must not be used already.
2730 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2731
2732 // We only handle up to 64-bit values here as those are what matter for
2733 // addressing mode optimizations.
2734 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2735 "Unexpected value size!");
2736
2737 if (!isa<ConstantSDNode>(N.getOperand(1)))
2738 break;
2739
2740 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2741 SDValue Shift = N.getOperand(0);
2742 SDValue X = Shift.getOperand(0);
2743
2744 uint64_t Mask = N.getConstantOperandVal(1);
2745
2746 // Try to fold the mask and shift into an extract and scale.
2747 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2748 return false;
2749
2750 // Try to fold the mask and shift directly into the scale.
2751 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2752 return false;
2753
2754 // Try to fold the mask and shift into BEXTR and scale.
2755 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2756 return false;
2757 }
2758
2759 // Try to swap the mask and shift to place shifts which can be done as
2760 // a scale on the outside of the mask.
2761 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2762 return false;
2763
2764 break;
2765 }
2766 case ISD::ZERO_EXTEND: {
2767 // Try to widen a zexted shift left to the same size as its use, so we can
2768 // match the shift as a scale factor.
2769 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2770 break;
2771
2772 SDValue Src = N.getOperand(0);
2773
2774 // See if we can match a zext(addlike(x,c)).
2775 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2776 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2777 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2778 if (Index != N) {
2779 AM.IndexReg = Index;
2780 return false;
2781 }
2782
2783 // Peek through mask: zext(and(shl(x,c1),c2))
2784 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2785 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2786 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2787 Mask = MaskC->getAPIntValue();
2788 Src = Src.getOperand(0);
2789 }
2790
2791 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2792 // Give up if the shift is not a valid scale factor [1,2,3].
2793 SDValue ShlSrc = Src.getOperand(0);
2794 SDValue ShlAmt = Src.getOperand(1);
2795 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2796 if (!ShAmtC)
2797 break;
2798 unsigned ShAmtV = ShAmtC->getZExtValue();
2799 if (ShAmtV > 3)
2800 break;
2801
2802 // The narrow shift must only shift out zero bits (it must be 'nuw').
2803 // That makes it safe to widen to the destination type.
2804 APInt HighZeros =
2805 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2806 if (!Src->getFlags().hasNoUnsignedWrap() &&
2807 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2808 break;
2809
2810 // zext (shl nuw i8 %x, C1) to i32
2811 // --> shl (zext i8 %x to i32), (zext C1)
2812 // zext (and (shl nuw i8 %x, C1), C2) to i32
2813 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2814 MVT SrcVT = ShlSrc.getSimpleValueType();
2815 MVT VT = N.getSimpleValueType();
2816 SDLoc DL(N);
2817
2818 SDValue Res = ShlSrc;
2819 if (!Mask.isAllOnes()) {
2820 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2821 insertDAGNode(*CurDAG, N, Res);
2822 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2823 insertDAGNode(*CurDAG, N, Res);
2824 }
2825 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2826 insertDAGNode(*CurDAG, N, Zext);
2827 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2828 insertDAGNode(*CurDAG, N, NewShl);
2829 CurDAG->ReplaceAllUsesWith(N, NewShl);
2830 CurDAG->RemoveDeadNode(N.getNode());
2831
2832 // Convert the shift to scale factor.
2833 AM.Scale = 1 << ShAmtV;
2834 // If matchIndexRecursively is not called here,
2835 // Zext may be replaced by other nodes but later used to call a builder
2836 // method
2837 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2838 return false;
2839 }
2840
2841 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2842 // Try to fold the mask and shift into an extract and scale.
2843 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2844 Src.getOperand(0), AM))
2845 return false;
2846
2847 // Try to fold the mask and shift directly into the scale.
2848 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2849 Src.getOperand(0), AM))
2850 return false;
2851
2852 // Try to fold the mask and shift into BEXTR and scale.
2853 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2854 Src.getOperand(0), AM, *Subtarget))
2855 return false;
2856 }
2857
2858 break;
2859 }
2860 }
2861
2862 return matchAddressBase(N, AM);
2863}
2864
2865/// Helper for MatchAddress. Add the specified node to the
2866/// specified addressing mode without any further recursion.
2867bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2868 // Is the base register already occupied?
2869 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2870 // If so, check to see if the scale index register is set.
2871 if (!AM.IndexReg.getNode()) {
2872 AM.IndexReg = N;
2873 AM.Scale = 1;
2874 return false;
2875 }
2876
2877 // Otherwise, we cannot select it.
2878 return true;
2879 }
2880
2881 // Default, generate it as a register.
2882 AM.BaseType = X86ISelAddressMode::RegBase;
2883 AM.Base_Reg = N;
2884 return false;
2885}
2886
2887bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2888 X86ISelAddressMode &AM,
2889 unsigned Depth) {
2890 SDLoc dl(N);
2891 LLVM_DEBUG({
2892 dbgs() << "MatchVectorAddress: ";
2893 AM.dump(CurDAG);
2894 });
2895 // Limit recursion.
2897 return matchAddressBase(N, AM);
2898
2899 // TODO: Support other operations.
2900 switch (N.getOpcode()) {
2901 case ISD::Constant: {
2902 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2903 if (!foldOffsetIntoAddress(Val, AM))
2904 return false;
2905 break;
2906 }
2907 case X86ISD::Wrapper:
2908 if (!matchWrapper(N, AM))
2909 return false;
2910 break;
2911 case ISD::ADD: {
2912 // Add an artificial use to this node so that we can keep track of
2913 // it if it gets CSE'd with a different node.
2914 HandleSDNode Handle(N);
2915
2916 X86ISelAddressMode Backup = AM;
2917 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2918 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2919 Depth + 1))
2920 return false;
2921 AM = Backup;
2922
2923 // Try again after commuting the operands.
2924 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2925 Depth + 1) &&
2926 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2927 Depth + 1))
2928 return false;
2929 AM = Backup;
2930
2931 N = Handle.getValue();
2932 break;
2933 }
2934 }
2935
2936 return matchAddressBase(N, AM);
2937}
2938
2939/// Helper for selectVectorAddr. Handles things that can be folded into a
2940/// gather/scatter address. The index register and scale should have already
2941/// been handled.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  // Kick off the recursive walk at depth 0. Like the other match* helpers,
  // this returns true on failure and false once N has been folded into AM.
  return matchVectorAddressRecursively(N, AM, 0);
}
2945
2946bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2947 SDValue IndexOp, SDValue ScaleOp,
2948 SDValue &Base, SDValue &Scale,
2949 SDValue &Index, SDValue &Disp,
2950 SDValue &Segment) {
2951 X86ISelAddressMode AM;
2952 AM.Scale = ScaleOp->getAsZExtVal();
2953
2954 // Attempt to match index patterns, as long as we're not relying on implicit
2955 // sign-extension, which is performed BEFORE scale.
2956 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2957 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2958 else
2959 AM.IndexReg = IndexOp;
2960
2961 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2962 if (AddrSpace == X86AS::GS)
2963 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2964 if (AddrSpace == X86AS::FS)
2965 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2966 if (AddrSpace == X86AS::SS)
2967 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2968
2969 SDLoc DL(BasePtr);
2970 MVT VT = BasePtr.getSimpleValueType();
2971
2972 // Try to match into the base and displacement fields.
2973 if (matchVectorAddress(BasePtr, AM))
2974 return false;
2975
2976 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2977 return true;
2978}
2979
2980/// Returns true if it is able to pattern match an addressing mode.
2981/// It returns the operands which make up the maximal addressing mode it can
2982/// match by reference.
2983///
2984/// Parent is the parent node of the addr operand that is being matched. It
2985/// is always a load, store, atomic node, or null. It is only null when
2986/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index,
                                 SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;

  if (Parent &&
      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
      // that are not a MemSDNode, and thus don't have proper addrspace info.
      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
    // Safe to cast to MemSDNode here: map the GS/FS/SS address spaces onto
    // the corresponding segment-override registers.
    unsigned AddrSpace =
      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
    if (AddrSpace == X86AS::GS)
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    if (AddrSpace == X86AS::FS)
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    if (AddrSpace == X86AS::SS)
      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
  }

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // matchAddress returns true on failure, in which case this selector rejects.
  if (matchAddress(N, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3022
3023bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3024 // Cannot use 32 bit constants to reference objects in kernel/large code
3025 // model.
3026 if (TM.getCodeModel() == CodeModel::Kernel ||
3027 TM.getCodeModel() == CodeModel::Large)
3028 return false;
3029
3030 // In static codegen with small code model, we can get the address of a label
3031 // into a register with 'movl'
3032 if (N->getOpcode() != X86ISD::Wrapper)
3033 return false;
3034
3035 N = N.getOperand(0);
3036
3037 // At least GNU as does not accept 'movl' for TPOFF relocations.
3038 // FIXME: We could use 'movl' when we know we are targeting MC.
3039 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3040 return false;
3041
3042 Imm = N;
3043 // Small/medium code model can reference non-TargetGlobalAddress objects with
3044 // 32 bit constants.
3045 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3046 return TM.getCodeModel() == CodeModel::Small ||
3047 TM.getCodeModel() == CodeModel::Medium;
3048 }
3049
3050 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3051 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3052 return CR->getUnsignedMax().ult(1ull << 32);
3053
3054 return !TM.isLargeGlobalValue(GV);
3055}
3056
3057bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3058 SDValue &Scale, SDValue &Index,
3059 SDValue &Disp, SDValue &Segment) {
3060 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3061 SDLoc DL(N);
3062
3063 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3064 return false;
3065
3066 auto *RN = dyn_cast<RegisterSDNode>(Base);
3067 if (RN && RN->getReg() == 0)
3068 Base = CurDAG->getRegister(0, MVT::i64);
3069 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3070 // Base could already be %rip, particularly in the x32 ABI.
3071 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3072 MVT::i64), 0);
3073 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3074 Base);
3075 }
3076
3077 RN = dyn_cast<RegisterSDNode>(Index);
3078 if (RN && RN->getReg() == 0)
3079 Index = CurDAG->getRegister(0, MVT::i64);
3080 else {
3081 assert(Index.getValueType() == MVT::i32 &&
3082 "Expect to be extending 32-bit registers for use in LEA");
3083 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3084 MVT::i64), 0);
3085 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3086 Index);
3087 }
3088
3089 return true;
3090}
3091
3092/// Calls SelectAddr and determines if the maximal addressing
3093/// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(0, MVT::i32);
  AM.Segment = T;
  if (matchAddress(N, AM))
    return false;
  assert (T == AM.Segment);
  AM.Segment = Copy;

  // Score how much of the address an LEA would cover; the LEA is emitted only
  // if the accumulated Complexity exceeds the threshold at the bottom.
  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
    Complexity = 1;
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  // duplicating flag-producing instructions later in the pipeline.
  if (N.getOpcode() == ISD::ADD) {
    auto isMathWithFlags = [](SDValue V) {
      switch (V.getOpcode()) {
      case X86ISD::ADD:
      case X86ISD::SUB:
      case X86ISD::ADC:
      case X86ISD::SBB:
      case X86ISD::SMUL:
      case X86ISD::UMUL:
      /* TODO: These opcodes can be added safely, but we may want to justify
               their inclusion for different reasons (better for reg-alloc).
      case X86ISD::OR:
      case X86ISD::XOR:
      case X86ISD::AND:
      */
        // Value 1 is the flag output of the node - verify it's not dead.
        return !SDValue(V.getNode(), 1).use_empty();
      default:
        return false;
      }
    };
    // TODO: We might want to factor in whether there's a load folding
    // opportunity for the math op that disappears with LEA.
    if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
      Complexity++;
  }

  // A non-zero displacement also counts toward LEA coverage.
  if (AM.Disp)
    Complexity++;

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3181
3182/// This is only run on TargetGlobalTLSAddress nodes.
3183bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3184 SDValue &Scale, SDValue &Index,
3185 SDValue &Disp, SDValue &Segment) {
3186 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3187 N.getOpcode() == ISD::TargetExternalSymbol);
3188
3189 X86ISelAddressMode AM;
3190 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3191 AM.GV = GA->getGlobal();
3192 AM.Disp += GA->getOffset();
3193 AM.SymbolFlags = GA->getTargetFlags();
3194 } else {
3195 auto *SA = cast<ExternalSymbolSDNode>(N);
3196 AM.ES = SA->getSymbol();
3197 AM.SymbolFlags = SA->getTargetFlags();
3198 }
3199
3200 if (Subtarget->is32Bit()) {
3201 AM.Scale = 1;
3202 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3203 }
3204
3205 MVT VT = N.getSimpleValueType();
3206 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3207 return true;
3208}
3209
3210bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3211 // Keep track of the original value type and whether this value was
3212 // truncated. If we see a truncation from pointer type to VT that truncates
3213 // bits that are known to be zero, we can use a narrow reference.
3214 EVT VT = N.getValueType();
3215 bool WasTruncated = false;
3216 if (N.getOpcode() == ISD::TRUNCATE) {
3217 WasTruncated = true;
3218 N = N.getOperand(0);
3219 }
3220
3221 if (N.getOpcode() != X86ISD::Wrapper)
3222 return false;
3223
3224 // We can only use non-GlobalValues as immediates if they were not truncated,
3225 // as we do not have any range information. If we have a GlobalValue and the
3226 // address was not truncated, we can select it as an operand directly.
3227 unsigned Opc = N.getOperand(0)->getOpcode();
3228 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3229 Op = N.getOperand(0);
3230 // We can only select the operand directly if we didn't have to look past a
3231 // truncate.
3232 return !WasTruncated;
3233 }
3234
3235 // Check that the global's range fits into VT.
3236 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3237 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3238 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3239 return false;
3240
3241 // Okay, we can use a narrow reference.
3242 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3243 GA->getOffset(), GA->getTargetFlags());
3244 return true;
3245}
3246
3247bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3248 SDValue &Base, SDValue &Scale,
3249 SDValue &Index, SDValue &Disp,
3250 SDValue &Segment) {
3251 assert(Root && P && "Unknown root/parent nodes");
3252 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3253 !IsProfitableToFold(N, P, Root) ||
3254 !IsLegalToFold(N, P, Root, OptLevel))
3255 return false;
3256
3257 return selectAddr(N.getNode(),
3258 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3259}
3260
3261bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3262 SDValue &Base, SDValue &Scale,
3263 SDValue &Index, SDValue &Disp,
3264 SDValue &Segment) {
3265 assert(Root && P && "Unknown root/parent nodes");
3266 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3267 !IsProfitableToFold(N, P, Root) ||
3268 !IsLegalToFold(N, P, Root, OptLevel))
3269 return false;
3270
3271 return selectAddr(N.getNode(),
3272 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3273}
3274
3275/// Return an SDNode that returns the value of the global base register.
3276/// Output instructions required to initialize the global base register,
3277/// if necessary.
3278SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3279 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3280 auto &DL = MF->getDataLayout();
3281 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3282}
3283
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  // Look through a truncate to the wrapped symbol underneath, if any.
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
  if (!GA)
    return false;

  auto *GV = GA->getGlobal();
  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
  if (CR)
    // Known absolute symbol: require min >= -(1 << Width) and max < (1 <<
    // Width). Note -1ull << Width deliberately converts to the negative
    // int64_t bound expected by APInt::sge.
    return CR->getSignedMin().sge(-1ull << Width) &&
           CR->getSignedMax().slt(1ull << Width);
  // In the kernel code model, globals are in the negative 2GB of the address
  // space, so globals can be a sign extended 32-bit immediate.
  // In other code models, small globals are in the low 2GB of the address
  // space, so sign extending them is equivalent to zero extending them.
  return Width == 32 && !TM.isLargeGlobalValue(GV);
}
3305
3306X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3307 assert(N->isMachineOpcode() && "Unexpected node");
3308 unsigned Opc = N->getMachineOpcode();
3309 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3310 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3311 if (CondNo < 0)
3312 return X86::COND_INVALID;
3313
3314 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3315}
3316
3317/// Test whether the given X86ISD::CMP node has any users that use a flag
3318/// other than ZF.
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
  // Walk every consumer of the flag result; any consumer we do not recognize
  // is conservatively treated as a non-ZF use (return false).
  // Examine each user of the node.
  for (SDUse &Use : Flags->uses()) {
    // Only check things that use the flags.
    if (Use.getResNo() != Flags.getResNo())
      continue;
    SDNode *User = Use.getUser();
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (User->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDUse &FlagUse : User->uses()) {
      // Only examine the Flag result.
      if (FlagUse.getResNo() != 1)
        continue;
      // Anything unusual: assume conservatively.
      if (!FlagUse.getUser()->isMachineOpcode())
        return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromNode(FlagUse.getUser());

      switch (CC) {
      // Comparisons which only use the zero flag.
      case X86::COND_E: case X86::COND_NE:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  // Every reachable consumer tested only ZF (COND_E / COND_NE).
  return true;
}
3353
3354/// Test whether the given X86ISD::CMP node has any uses which require the SF
3355/// flag to be accurate.
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
  // Walk every consumer of the flag result; any consumer we do not recognize
  // is conservatively treated as a possible SF use (return false).
  // Examine each user of the node.
  for (SDUse &Use : Flags->uses()) {
    // Only check things that use the flags.
    if (Use.getResNo() != Flags.getResNo())
      continue;
    SDNode *User = Use.getUser();
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (User->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDUse &FlagUse : User->uses()) {
      // Only examine the Flag result.
      if (FlagUse.getResNo() != 1)
        continue;
      // Anything unusual: assume conservatively.
      if (!FlagUse.getUser()->isMachineOpcode())
        return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromNode(FlagUse.getUser());

      switch (CC) {
      // Comparisons which don't examine the SF flag.
      case X86::COND_A: case X86::COND_AE:
      case X86::COND_B: case X86::COND_BE:
      case X86::COND_E: case X86::COND_NE:
      case X86::COND_O: case X86::COND_NO:
      case X86::COND_P: case X86::COND_NP:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  // No reachable consumer inspects SF.
  return true;
}
3394
3396 switch (CC) {
3397 // Comparisons which don't examine the CF flag.
3398 case X86::COND_O: case X86::COND_NO:
3399 case X86::COND_E: case X86::COND_NE:
3400 case X86::COND_S: case X86::COND_NS:
3401 case X86::COND_P: case X86::COND_NP:
3402 case X86::COND_L: case X86::COND_GE:
3403 case X86::COND_G: case X86::COND_LE:
3404 return false;
3405 // Anything else: assume conservatively.
3406 default:
3407 return true;
3408 }
3409}
3410
3411/// Test whether the given node which sets flags has any uses which require the
3412/// CF flag to be accurate.
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
  // Walk every consumer of the flag result. Unlike the ZF/SF walks above,
  // this also understands still-unselected pre-isel flag consumers.
  // Examine each user of the node.
  for (SDUse &Use : Flags->uses()) {
    // Only check things that use the flags.
    if (Use.getResNo() != Flags.getResNo())
      continue;

    SDNode *User = Use.getUser();
    unsigned UserOpc = User->getOpcode();

    if (UserOpc == ISD::CopyToReg) {
      // Only examine CopyToReg uses that copy to EFLAGS.
      if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
        return false;
      // Examine each user of the CopyToReg use.
      for (SDUse &FlagUse : User->uses()) {
        // Only examine the Flag result.
        if (FlagUse.getResNo() != 1)
          continue;
        // Anything unusual: assume conservatively.
        if (!FlagUse.getUser()->isMachineOpcode())
          return false;
        // Examine the condition code of the user.
        X86::CondCode CC = getCondFromNode(FlagUse.getUser());

        if (mayUseCarryFlag(CC))
          return false;
      }

      // This CopyToReg is ok. Move on to the next user.
      continue;
    }

    // This might be an unselected node. So look for the pre-isel opcodes that
    // use flags.
    unsigned CCOpNo;  // Operand index of the condition code on this consumer.
    switch (UserOpc) {
    default:
      // Something unusual. Be conservative.
      return false;
    case X86ISD::SETCC: CCOpNo = 0; break;
    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
    case X86ISD::CMOV: CCOpNo = 2; break;
    case X86ISD::BRCOND: CCOpNo = 2; break;
    }

    X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
    if (mayUseCarryFlag(CC))
      return false;
  }
  // No reachable consumer inspects CF.
  return true;
}
3465
3466/// Check whether or not the chain ending in StoreNode is suitable for doing
3467/// the {load; op; store} to modify transformation.
3469 SDValue StoredVal, SelectionDAG *CurDAG,
3470 unsigned LoadOpNo,
3471 LoadSDNode *&LoadNode,
3472 SDValue &InputChain) {
3473 // Is the stored value result 0 of the operation?
3474 if (StoredVal.getResNo() != 0) return false;
3475
3476 // Are there other uses of the operation other than the store?
3477 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3478
3479 // Is the store non-extending and non-indexed?
3480 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3481 return false;
3482
3483 SDValue Load = StoredVal->getOperand(LoadOpNo);
3484 // Is the stored value a non-extending and non-indexed load?
3485 if (!ISD::isNormalLoad(Load.getNode())) return false;
3486
3487 // Return LoadNode by reference.
3488 LoadNode = cast<LoadSDNode>(Load);
3489
3490 // Is store the only read of the loaded value?
3491 if (!Load.hasOneUse())
3492 return false;
3493
3494 // Is the address of the store the same as the load?
3495 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3496 LoadNode->getOffset() != StoreNode->getOffset())
3497 return false;
3498
3499 bool FoundLoad = false;
3500 SmallVector<SDValue, 4> ChainOps;
3501 SmallVector<const SDNode *, 4> LoopWorklist;
3503 const unsigned int Max = 1024;
3504
3505 // Visualization of Load-Op-Store fusion:
3506 // -------------------------
3507 // Legend:
3508 // *-lines = Chain operand dependencies.
3509 // |-lines = Normal operand dependencies.
3510 // Dependencies flow down and right. n-suffix references multiple nodes.
3511 //
3512 // C Xn C
3513 // * * *
3514 // * * *
3515 // Xn A-LD Yn TF Yn
3516 // * * \ | * |
3517 // * * \ | * |
3518 // * * \ | => A--LD_OP_ST
3519 // * * \| \
3520 // TF OP \
3521 // * | \ Zn
3522 // * | \
3523 // A-ST Zn
3524 //
3525
3526 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3527 // #2: Yn -> LD
3528 // #3: ST -> Zn
3529
3530 // Ensure the transform is safe by checking for the dual
3531 // dependencies to make sure we do not induce a loop.
3532
3533 // As LD is a predecessor to both OP and ST we can do this by checking:
3534 // a). if LD is a predecessor to a member of Xn or Yn.
3535 // b). if a Zn is a predecessor to ST.
3536
3537 // However, (b) can only occur through being a chain predecessor to
3538 // ST, which is the same as Zn being a member or predecessor of Xn,
3539 // which is a subset of LD being a predecessor of Xn. So it's
3540 // subsumed by check (a).
3541
3542 SDValue Chain = StoreNode->getChain();
3543
3544 // Gather X elements in ChainOps.
3545 if (Chain == Load.getValue(1)) {
3546 FoundLoad = true;
3547 ChainOps.push_back(Load.getOperand(0));
3548 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3549 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3550 SDValue Op = Chain.getOperand(i);
3551 if (Op == Load.getValue(1)) {
3552 FoundLoad = true;
3553 // Drop Load, but keep its chain. No cycle check necessary.
3554 ChainOps.push_back(Load.getOperand(0));
3555 continue;
3556 }
3557 LoopWorklist.push_back(Op.getNode());
3558 ChainOps.push_back(Op);
3559 }
3560 }
3561
3562 if (!FoundLoad)
3563 return false;
3564
3565 // Worklist is currently Xn. Add Yn to worklist.
3566 for (SDValue Op : StoredVal->ops())
3567 if (Op.getNode() != LoadNode)
3568 LoopWorklist.push_back(Op.getNode());
3569
3570 // Check (a) if Load is a predecessor to Xn + Yn
3571 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3572 true))
3573 return false;
3574
3575 InputChain =
3576 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3577 return true;
3578}
3579
3580// Change a chain of {load; op; store} of the same value into a simple op
3581// through memory of that value, if the uses of the modified value and its
3582// address are suitable.
3583//
3584// The tablegen pattern memory operand pattern is currently not able to match
3585// the case where the EFLAGS on the original operation are used.
3586//
3587// To move this to tablegen, we'll need to improve tablegen to allow flags to
3588// be transferred from a node in the pattern to the result node, probably with
3589// a new keyword. For example, we have this
3590// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3591// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3592// but maybe need something like this
3593// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3594// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3595// (transferrable EFLAGS)]>;
3596//
3597// Until then, we manually fold these and instruction select the operation
3598// here.
3599bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3600 auto *StoreNode = cast<StoreSDNode>(Node);
3601 SDValue StoredVal = StoreNode->getOperand(1);
3602 unsigned Opc = StoredVal->getOpcode();
3603
3604 // Before we try to select anything, make sure this is memory operand size
3605 // and opcode we can handle. Note that this must match the code below that
3606 // actually lowers the opcodes.
3607 EVT MemVT = StoreNode->getMemoryVT();
3608 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3609 MemVT != MVT::i8)
3610 return false;
3611
3612 bool IsCommutable = false;
3613 bool IsNegate = false;
3614 switch (Opc) {
3615 default:
3616 return false;
3617 case X86ISD::SUB:
3618 IsNegate = isNullConstant(StoredVal.getOperand(0));
3619 break;
3620 case X86ISD::SBB:
3621 break;
3622 case X86ISD::ADD:
3623 case X86ISD::ADC:
3624 case X86ISD::AND:
3625 case X86ISD::OR:
3626 case X86ISD::XOR:
3627 IsCommutable = true;
3628 break;
3629 }
3630
3631 unsigned LoadOpNo = IsNegate ? 1 : 0;
3632 LoadSDNode *LoadNode = nullptr;
3633 SDValue InputChain;
3634 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3635 LoadNode, InputChain)) {
3636 if (!IsCommutable)
3637 return false;
3638
3639 // This operation is commutable, try the other operand.
3640 LoadOpNo = 1;
3641 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3642 LoadNode, InputChain))
3643 return false;
3644 }
3645
3646 SDValue Base, Scale, Index, Disp, Segment;
3647 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3648 Segment))
3649 return false;
3650
3651 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3652 unsigned Opc8) {
3653 switch (MemVT.getSimpleVT().SimpleTy) {
3654 case MVT::i64:
3655 return Opc64;
3656 case MVT::i32:
3657 return Opc32;
3658 case MVT::i16:
3659 return Opc16;
3660 case MVT::i8:
3661 return Opc8;
3662 default:
3663 llvm_unreachable("Invalid size!");
3664 }
3665 };
3666
3668 switch (Opc) {
3669 case X86ISD::SUB:
3670 // Handle negate.
3671 if (IsNegate) {
3672 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3673 X86::NEG8m);
3674 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3675 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3676 MVT::Other, Ops);
3677 break;
3678 }
3679 [[fallthrough]];
3680 case X86ISD::ADD:
3681 // Try to match inc/dec.
3682 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3683 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3684 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3685 // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
3686 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3687 unsigned NewOpc =
3688 ((Opc == X86ISD::ADD) == IsOne)
3689 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3690 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3691 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3692 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3693 MVT::Other, Ops);
3694 break;
3695 }
3696 }
3697 [[fallthrough]];
3698 case X86ISD::ADC:
3699 case X86ISD::SBB:
3700 case X86ISD::AND:
3701 case X86ISD::OR:
3702 case X86ISD::XOR: {
3703 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3704 switch (Opc) {
3705 case X86ISD::ADD:
3706 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3707 X86::ADD8mr);
3708 case X86ISD::ADC:
3709 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3710 X86::ADC8mr);
3711 case X86ISD::SUB:
3712 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3713 X86::SUB8mr);
3714 case X86ISD::SBB:
3715 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3716 X86::SBB8mr);
3717 case X86ISD::AND:
3718 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3719 X86::AND8mr);
3720 case X86ISD::OR:
3721 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3722 case X86ISD::XOR:
3723 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3724 X86::XOR8mr);
3725 default:
3726 llvm_unreachable("Invalid opcode!");
3727 }
3728 };
3729 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3730 switch (Opc) {
3731 case X86ISD::ADD:
3732 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3733 X86::ADD8mi);
3734 case X86ISD::ADC:
3735 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3736 X86::ADC8mi);
3737 case X86ISD::SUB:
3738 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3739 X86::SUB8mi);
3740 case X86ISD::SBB:
3741 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3742 X86::SBB8mi);
3743 case X86ISD::AND:
3744 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3745 X86::AND8mi);
3746 case X86ISD::OR:
3747 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3748 X86::OR8mi);
3749 case X86ISD::XOR:
3750 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3751 X86::XOR8mi);
3752 default:
3753 llvm_unreachable("Invalid opcode!");
3754 }
3755 };
3756
3757 unsigned NewOpc = SelectRegOpcode(Opc);
3758 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3759
3760 // See if the operand is a constant that we can fold into an immediate
3761 // operand.
3762 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3763 int64_t OperandV = OperandC->getSExtValue();
3764
3765 // Check if we can shrink the operand enough to fit in an immediate (or
3766 // fit into a smaller immediate) by negating it and switching the
3767 // operation.
3768 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3769 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3770 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3771 isInt<32>(-OperandV))) &&
3772 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3773 OperandV = -OperandV;
3774 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3775 }
3776
3777 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3778 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3779 NewOpc = SelectImmOpcode(Opc);
3780 }
3781 }
3782
3783 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3784 SDValue CopyTo =
3785 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3786 StoredVal.getOperand(2), SDValue());
3787
3788 const SDValue Ops[] = {Base, Scale, Index, Disp,
3789 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3790 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3791 Ops);
3792 } else {
3793 const SDValue Ops[] = {Base, Scale, Index, Disp,
3794 Segment, Operand, InputChain};
3795 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3796 Ops);
3797 }
3798 break;
3799 }
3800 default:
3801 llvm_unreachable("Invalid opcode!");
3802 }
3803
3804 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3805 LoadNode->getMemOperand()};
3806 CurDAG->setNodeMemRefs(Result, MemOps);
3807
3808 // Update Load Chain uses as well.
3809 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3810 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3811 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3812 CurDAG->RemoveDeadNode(Node);
3813 return true;
3814}
3815
// See if this is an X & Mask that we can match to BEXTR/BZHI.
// Where Mask is one of the following patterns:
// a) x & (1 << nbits) - 1
// b) x & ~(-1 << nbits)
// c) x & (-1 >> (32 - y))
// d) x << (32 - y) >> (32 - y)
// e) (1 << nbits) - 1
//
// On success, replaces \p Node with a BZHI (BMI2) or BEXTR (BMI1) node and
// re-runs selection on it; returns false if no pattern matched.
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
  assert(
      (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
       Node->getOpcode() == ISD::SRL) &&
      "Should be either an and-mask, or right-shift after clearing high bits.");

  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
    return false;

  MVT NVT = Node->getSimpleValueType(0);

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  // Set by the pattern matchers below: NBits is the bit count operand.
  // If NegateNBits is set, NBits holds the count of *cleared high* bits and
  // must be subtracted from the bitwidth before emitting BZHI.
  SDValue NBits;
  bool NegateNBits;

  // If we have BMI2's BZHI, we are ok with multi-use patterns.
  // Else, if we only have BMI1's BEXTR, we require one-use.
  const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
  auto checkUses = [AllowExtraUsesByDefault](
                       SDValue Op, unsigned NUses,
                       std::optional<bool> AllowExtraUses) {
    return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
  };
  auto checkOneUse = [checkUses](SDValue Op,
                                 std::optional<bool> AllowExtraUses =
                                     std::nullopt) {
    return checkUses(Op, 1, AllowExtraUses);
  };
  auto checkTwoUse = [checkUses](SDValue Op,
                                 std::optional<bool> AllowExtraUses =
                                     std::nullopt) {
    return checkUses(Op, 2, AllowExtraUses);
  };

  // Look through a single-use i64 -> i32 truncate, returning its source.
  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
    if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
      assert(V.getSimpleValueType() == MVT::i32 &&
             V.getOperand(0).getSimpleValueType() == MVT::i64 &&
             "Expected i64 -> i32 truncation");
      V = V.getOperand(0);
    }
    return V;
  };

  // a) x & ((1 << nbits) + (-1))
  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
                        &NegateNBits](SDValue Mask) -> bool {
    // Match `add`. Must only have one use!
    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
      return false;
    // We should be adding all-ones constant (i.e. subtracting one.)
    if (!isAllOnesConstant(Mask->getOperand(1)))
      return false;
    // Match `1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    if (!isOneConstant(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    NegateNBits = false;
    return true;
  };

  // True if V is all-ones in the low NVT-sized bits (the only bits that
  // matter for the final result).
  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
    V = peekThroughOneUseTruncation(V);
    return CurDAG->MaskedValueIsAllOnes(
        V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
                                NVT.getSizeInBits()));
  };

  // b) x & ~(-1 << nbits)
  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
                        &NBits, &NegateNBits](SDValue Mask) -> bool {
    // Match `~()`. Must only have one use!
    if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(Mask->getOperand(1)))
      return false;
    // Match `-1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    NegateNBits = false;
    return true;
  };

  // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
  // or leave the shift amount as-is, but then we'll have to negate it.
  auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
                                                     unsigned Bitwidth) {
    NBits = ShiftAmt;
    NegateNBits = true;
    // Skip over a truncate of the shift amount, if any.
    if (NBits.getOpcode() == ISD::TRUNCATE)
      NBits = NBits.getOperand(0);
    // Try to match the shift amount as (bitwidth - y). It should go away, too.
    // If it doesn't match, that's fine, we'll just negate it ourselves.
    if (NBits.getOpcode() != ISD::SUB)
      return;
    auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
    if (!V0 || V0->getZExtValue() != Bitwidth)
      return;
    NBits = NBits.getOperand(1);
    NegateNBits = false;
  };

  // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
  //    or
  // c) x & (-1 >> (32 - y))
  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
                        canonicalizeShiftAmt](SDValue Mask) -> bool {
    // The mask itself may be truncated.
    Mask = peekThroughOneUseTruncation(Mask);
    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
    // Match `l>>`. Must only have one use!
    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
      return false;
    // We should be shifting truly all-ones constant.
    if (!isAllOnesConstant(Mask.getOperand(0)))
      return false;
    SDValue M1 = Mask.getOperand(1);
    // The shift amount should not be used externally.
    if (!checkOneUse(M1))
      return false;
    canonicalizeShiftAmt(M1, Bitwidth);
    // Pattern c. is non-canonical, and is expanded into pattern d. iff there
    // is no extra use of the mask. Clearly, there was one since we are here.
    // But at the same time, if we need to negate the shift amount,
    // then we don't want the mask to stick around, else it's unprofitable.
    return !NegateNBits;
  };

  // The value whose low bits are being extracted.
  SDValue X;

  // d) x << z >> z  but then we'll have to subtract z from bitwidth
  //    or
  // d) x << (32 - y) >> (32 - y)
  auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
                        AllowExtraUsesByDefault, &NegateNBits,
                        &X](SDNode *Node) -> bool {
    if (Node->getOpcode() != ISD::SRL)
      return false;
    SDValue N0 = Node->getOperand(0);
    if (N0->getOpcode() != ISD::SHL)
      return false;
    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
    SDValue N1 = Node->getOperand(1);
    SDValue N01 = N0->getOperand(1);
    // Both of the shifts must be by the exact same value.
    if (N1 != N01)
      return false;
    canonicalizeShiftAmt(N1, Bitwidth);
    // There should not be any external uses of the inner shift / shift amount.
    // Note that while we are generally okay with external uses given BMI2,
    // iff we need to negate the shift amount, we are not okay with extra uses.
    const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
    if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
      return false;
    X = N0->getOperand(0);
    return true;
  };

  auto matchLowBitMask = [matchPatternA, matchPatternB,
                          matchPatternC](SDValue Mask) -> bool {
    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
  };

  // Dispatch on the root node shape: and-of-mask (patterns a/b/c, either
  // operand order), a standalone mask (pattern e), or the shift pair (d).
  if (Node->getOpcode() == ISD::AND) {
    X = Node->getOperand(0);
    SDValue Mask = Node->getOperand(1);

    if (matchLowBitMask(Mask)) {
      // Great.
    } else {
      std::swap(X, Mask);
      if (!matchLowBitMask(Mask))
        return false;
    }
  } else if (matchLowBitMask(SDValue(Node, 0))) {
    // Pattern e: the node itself *is* the mask; extracting low bits of -1.
    X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
  } else if (!matchPatternD(Node))
    return false;

  // If we need to negate the shift amount, require BMI2 BZHI support.
  // It's just too unprofitable for BMI1 BEXTR.
  if (NegateNBits && !Subtarget->hasBMI2())
    return false;

  SDLoc DL(Node);

  // Truncate the shift amount.
  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
  // All the other bits are undefined, we do not care about them.
  SDValue ImplDef = SDValue(
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);

  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
  NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
                                         MVT::i32, ImplDef, NBits, SRIdxVal),
                  0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // We might have matched the amount of high bits to be cleared,
  // but we want the amount of low bits to be kept, so negate it then.
  if (NegateNBits) {
    SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
    insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);

    NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
    insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
  }

  if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI..
    if (NVT != MVT::i32) {
      // But have to place the bit count into the wide-enough register first.
      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
    }

    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
    ReplaceNode(Node, Extract.getNode());
    SelectCode(Extract.getNode());
    return true;
  }

  // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is
  // *logically* shifted (potentially with one-use trunc inbetween),
  // and the truncation was the only use of the shift,
  // and if so look past one-use truncation.
  {
    SDValue RealX = peekThroughOneUseTruncation(X);
    // FIXME: only if the shift is one-use?
    if (RealX != X && RealX.getOpcode() == ISD::SRL)
      X = RealX;
  }

  MVT XVT = X.getSimpleValueType();

  // Else, emitting BEXTR requires one more step.
  // The 'control' of BEXTR has the pattern of:
  // [15...8 bit][ 7...0 bit] location
  // [ bit count][     shift] name
  // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11

  // Shift NBits left by 8 bits, thus producing 'control'.
  // This makes the low 8 bits to be zero.
  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);

  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
  // FIXME: only if the shift is one-use?
  if (X.getOpcode() == ISD::SRL) {
    SDValue ShiftAmt = X.getOperand(1);
    X = X.getOperand(0);

    assert(ShiftAmt.getValueType() == MVT::i8 &&
           "Expected shift amount to be i8");

    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
    // We could zext to i16 in some form, but we intentionally don't do that.
    SDValue OrigShiftAmt = ShiftAmt;
    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);

    // And now 'or' these low 8 bits of shift amount into the 'control'.
    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // But have to place the 'control' into the wide-enough register first.
  if (XVT != MVT::i32) {
    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // And finally, form the BEXTR itself.
  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);

  // The 'X' was originally truncated. Do that now.
  if (XVT != NVT) {
    insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
    Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
  }

  ReplaceNode(Node, Extract.getNode());
  SelectCode(Extract.getNode());

  return true;
}
4131
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
// Returns the newly created machine node (so the caller can finish
// replacement), or nullptr if the pattern is not matched/profitable.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
  auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = llvm::popcount(Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  SDValue Control;
  unsigned ROpc, MOpc;  // Register-operand and memory-operand (folded-load)
                        // opcode variants.

// Use the EVEX-encoded form when extended GPRs (APX) are available.
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
    Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
    ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
    MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][     shift] name
    // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
    Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
    if (Subtarget->hasTBM()) {
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to be placed in a register.
      ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
      MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
    unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
                                      : GET_ND_IF_ENABLED(X86::SHR32ri);
    NewNode =
        CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
  }

  return NewNode;
}
4257
// Emit a PCMPISTR(I/M) instruction.
// \p ROpc / \p MOpc are the register-operand and memory-operand (folded-load)
// machine opcodes. Results are (VT, i32 flags[, chain]).
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);
  SDValue Imm = Node->getOperand(2);
  // Re-wrap the constant as a target constant so it encodes as an immediate.
  auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(0) };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  return CNode;
}
4287
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
// \p InGlue is updated in place to the new node's glue output.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InGlue) {
  SDValue N0 = Node->getOperand(0);
  SDValue N2 = Node->getOperand(2);
  SDValue Imm = Node->getOperand(4);
  // Re-wrap the constant as a target constant so it encodes as an immediate.
  auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(0), InGlue };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    InGlue = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N2, Imm, InGlue };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  InGlue = SDValue(CNode, 2);
  return CNode;
}
4322
4323bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4324 EVT VT = N->getValueType(0);
4325
4326 // Only handle scalar shifts.
4327 if (VT.isVector())
4328 return false;
4329
4330 // Narrower shifts only mask to 5 bits in hardware.
4331 unsigned Size = VT == MVT::i64 ? 64 : 32;
4332
4333 SDValue OrigShiftAmt = N->getOperand(1);
4334 SDValue ShiftAmt = OrigShiftAmt;
4335 SDLoc DL(N);
4336
4337 // Skip over a truncate of the shift amount.
4338 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4339 ShiftAmt = ShiftAmt->getOperand(0);
4340
4341 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4342 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4343
4344 SDValue NewShiftAmt;
4345 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4346 ShiftAmt->getOpcode() == ISD::XOR) {
4347 SDValue Add0 = ShiftAmt->getOperand(0);
4348 SDValue Add1 = ShiftAmt->getOperand(1);
4349 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4350 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4351 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4352 // to avoid the ADD/SUB/XOR.
4353 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4354 NewShiftAmt = Add0;
4355
4356 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4357 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4358 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4359 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4360 // we can replace it with a NOT. In the XOR case it may save some code
4361 // size, in the SUB case it also may save a move.
4362 assert(Add0C == nullptr || Add1C == nullptr);
4363
4364 // We can only do N-X, not X-N
4365 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4366 return false;
4367
4368 EVT OpVT = ShiftAmt.getValueType();
4369
4370 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4371 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4372 Add0C == nullptr ? Add0 : Add1, AllOnes);
4373 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4374 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4375 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4376 // -X to generate a NEG instead of a SUB of a constant.
4377 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4378 Add0C->getZExtValue() != 0) {
4379 EVT SubVT = ShiftAmt.getValueType();
4380 SDValue X;
4381 if (Add0C->getZExtValue() % Size == 0)
4382 X = Add1;
4383 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4384 Add0C->getZExtValue() % 32 == 0) {
4385 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4386 // This is mainly beneficial if we already compute (x+n*32).
4387 if (Add1.getOpcode() == ISD::TRUNCATE) {
4388 Add1 = Add1.getOperand(0);
4389 SubVT = Add1.getValueType();
4390 }
4391 if (Add0.getValueType() != SubVT) {
4392 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4393 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4394 }
4395
4396 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4397 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4398 } else
4399 return false;
4400 // Insert a negate op.
4401 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4402 // that uses it that's not a shift.
4403 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4404 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4405 NewShiftAmt = Neg;
4406
4407 // Insert these operands into a valid topological order so they can
4408 // get selected independently.
4409 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4410 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4411 } else
4412 return false;
4413 } else
4414 return false;
4415
4416 if (NewShiftAmt.getValueType() != MVT::i8) {
4417 // Need to truncate the shift amount.
4418 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4419 // Add to a correct topological ordering.
4420 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4421 }
4422
4423 // Insert a new mask to keep the shift amount legal. This should be removed
4424 // by isel patterns.
4425 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4426 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4427 // Place in a correct topological ordering.
4428 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4429
4430 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4431 NewShiftAmt);
4432 if (UpdatedNode != N) {
4433 // If we found an existing node, we should replace ourselves with that node
4434 // and wait for it to be selected after its other users.
4435 ReplaceNode(N, UpdatedNode);
4436 return true;
4437 }
4438
4439 // If the original shift amount is now dead, delete it so that we don't run
4440 // it through isel.
4441 if (OrigShiftAmt.getNode()->use_empty())
4442 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4443
4444 // Now that we've optimized the shift amount, defer to normal isel to get
4445 // load folding and legacy vs BMI2 selection without repeating it here.
4446 SelectCode(N);
4447 return true;
4448}
4449
/// For (x << C1) op C2 (op = AND/OR/XOR), try to re-associate into
/// ((x op (C2 >> C1)) << C1) when the shifted constant C2 >> C1 has a
/// smaller immediate encoding than C2. Returns true if N was replaced.
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto *Cst = dyn_cast<ConstantSDNode>(N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    // Arithmetic shift: check whether the shifted value fits a smaller
    // sign-extended immediate class (imm8 or imm32) than the original.
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But it's possible the original immediate allowed an AND to become MOVZX.
  // Done here to defer the MaskedValueIsZero call as late as possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
                                            ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    // If those bits are already known zero, the AND behaves like a
    // zero-extend; keep the original immediate so isel can form MOVZX.
    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(0);
  if (FoundAnyExtend) {
    // Re-materialize the any_extend we looked through, now on the shift input.
    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
    X = NewX;
  }

  // Build (x op (C2 >> C1)) << C1, inserting each new node in topological
  // order so it can be selected independently.
  SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
                                   Shift.getOperand(1));
  ReplaceNode(N, NewSHL.getNode());
  SelectCode(NewSHL.getNode());
  return true;
}
4562
4563bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4564 SDNode *ParentB, SDNode *ParentC,
4566 uint8_t Imm) {
4567 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4568 C.isOperandOf(ParentC) && "Incorrect parent node");
4569
4570 auto tryFoldLoadOrBCast =
4571 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4572 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4573 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4574 return true;
4575
4576 // Not a load, check for broadcast which may be behind a bitcast.
4577 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4578 P = L.getNode();
4579 L = L.getOperand(0);
4580 }
4581
4582 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4583 return false;
4584
4585 // Only 32 and 64 bit broadcasts are supported.
4586 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4587 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4588 if (Size != 32 && Size != 64)
4589 return false;
4590
4591 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4592 };
4593
4594 bool FoldedLoad = false;
4595 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4596 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4597 FoldedLoad = true;
4598 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4599 Tmp4)) {
4600 FoldedLoad = true;
4601 std::swap(A, C);
4602 // Swap bits 1/4 and 3/6.
4603 uint8_t OldImm = Imm;
4604 Imm = OldImm & 0xa5;
4605 if (OldImm & 0x02) Imm |= 0x10;
4606 if (OldImm & 0x10) Imm |= 0x02;
4607 if (OldImm & 0x08) Imm |= 0x40;
4608 if (OldImm & 0x40) Imm |= 0x08;
4609 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4610 Tmp4)) {
4611 FoldedLoad = true;
4612 std::swap(B, C);
4613 // Swap bits 1/2 and 5/6.
4614 uint8_t OldImm = Imm;
4615 Imm = OldImm & 0x99;
4616 if (OldImm & 0x02) Imm |= 0x04;
4617 if (OldImm & 0x04) Imm |= 0x02;
4618 if (OldImm & 0x20) Imm |= 0x40;
4619 if (OldImm & 0x40) Imm |= 0x20;
4620 }
4621
4622 SDLoc DL(Root);
4623
4624 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4625
4626 MVT NVT = Root->getSimpleValueType(0);
4627
4628 MachineSDNode *MNode;
4629 if (FoldedLoad) {
4630 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4631
4632 unsigned Opc;
4633 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4634 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4635 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4636 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4637
4638 bool UseD = EltSize == 32;
4639 if (NVT.is128BitVector())
4640 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4641 else if (NVT.is256BitVector())
4642 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4643 else if (NVT.is512BitVector())
4644 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4645 else
4646 llvm_unreachable("Unexpected vector size!");
4647 } else {
4648 bool UseD = NVT.getVectorElementType() == MVT::i32;
4649 if (NVT.is128BitVector())
4650 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4651 else if (NVT.is256BitVector())
4652 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4653 else if (NVT.is512BitVector())
4654 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4655 else
4656 llvm_unreachable("Unexpected vector size!");
4657 }
4658
4659 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4660 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4661
4662 // Update the chain.
4663 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4664 // Record the mem-refs
4665 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4666 } else {
4667 bool UseD = NVT.getVectorElementType() == MVT::i32;
4668 unsigned Opc;
4669 if (NVT.is128BitVector())
4670 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4671 else if (NVT.is256BitVector())
4672 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4673 else if (NVT.is512BitVector())
4674 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4675 else
4676 llvm_unreachable("Unexpected vector size!");
4677
4678 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4679 }
4680
4681 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4682 CurDAG->RemoveDeadNode(Root);
4683 return true;
4684}
4685
4686// Try to match two logic ops to a VPTERNLOG.
4687// FIXME: Handle more complex patterns that use an operand more than once?
4688bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4689 MVT NVT = N->getSimpleValueType(0);
4690
4691 // Make sure we support VPTERNLOG.
4692 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4693 NVT.getVectorElementType() == MVT::i1)
4694 return false;
4695
4696 // We need VLX for 128/256-bit.
4697 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4698 return false;
4699
4700 SDValue N0 = N->getOperand(0);
4701 SDValue N1 = N->getOperand(1);
4702
4703 auto getFoldableLogicOp = [](SDValue Op) {
4704 // Peek through single use bitcast.
4705 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4706 Op = Op.getOperand(0);
4707
4708 if (!Op.hasOneUse())
4709 return SDValue();
4710
4711 unsigned Opc = Op.getOpcode();
4712 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4713 Opc == X86ISD::ANDNP)
4714 return Op;
4715
4716 return SDValue();
4717 };
4718
4719 SDValue A, FoldableOp;
4720 if ((FoldableOp = getFoldableLogicOp(N1))) {
4721 A = N0;
4722 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4723 A = N1;
4724 } else
4725 return false;
4726
4727 SDValue B = FoldableOp.getOperand(0);
4728 SDValue C = FoldableOp.getOperand(1);
4729 SDNode *ParentA = N;
4730 SDNode *ParentB = FoldableOp.getNode();
4731 SDNode *ParentC = FoldableOp.getNode();
4732
4733 // We can build the appropriate control immediate by performing the logic
4734 // operation we're matching using these constants for A, B, and C.
4735 uint8_t TernlogMagicA = 0xf0;
4736 uint8_t TernlogMagicB = 0xcc;
4737 uint8_t TernlogMagicC = 0xaa;
4738
4739 // Some of the inputs may be inverted, peek through them and invert the
4740 // magic values accordingly.
4741 // TODO: There may be a bitcast before the xor that we should peek through.
4742 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4743 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4744 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4745 Magic = ~Magic;
4746 Parent = Op.getNode();
4747 Op = Op.getOperand(0);
4748 }
4749 };
4750
4751 PeekThroughNot(A, ParentA, TernlogMagicA);
4752 PeekThroughNot(B, ParentB, TernlogMagicB);
4753 PeekThroughNot(C, ParentC, TernlogMagicC);
4754
4755 uint8_t Imm;
4756 switch (FoldableOp.getOpcode()) {
4757 default: llvm_unreachable("Unexpected opcode!");
4758 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4759 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4760 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4761 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4762 }
4763
4764 switch (N->getOpcode()) {
4765 default: llvm_unreachable("Unexpected opcode!");
4766 case X86ISD::ANDNP:
4767 if (A == N0)
4768 Imm &= ~TernlogMagicA;
4769 else
4770 Imm = ~(Imm) & TernlogMagicA;
4771 break;
4772 case ISD::AND: Imm &= TernlogMagicA; break;
4773 case ISD::OR: Imm |= TernlogMagicA; break;
4774 case ISD::XOR: Imm ^= TernlogMagicA; break;
4775 }
4776
4777 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4778}
4779
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  // Candidate new mask: the original with all its leading zeros set to one.
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  // TODO: Handle constant folding?
  KnownBits Known0 = CurDAG->computeKnownBits(And0);
  if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}
4852
4853static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4854 bool FoldedBCast, bool Masked) {
4855#define VPTESTM_CASE(VT, SUFFIX) \
4856case MVT::VT: \
4857 if (Masked) \
4858 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4859 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4860
4861
4862#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4863default: llvm_unreachable("Unexpected VT!"); \
4864VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4865VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4866VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4867VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4868VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4869VPTESTM_CASE(v8i64, QZ##SUFFIX)
4870
4871#define VPTESTM_FULL_CASES(SUFFIX) \
4872VPTESTM_BROADCAST_CASES(SUFFIX) \