//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Module.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCInst.h"
43#include "llvm/Support/Debug.h"
47#include <atomic>
48#include <optional>
49
using namespace llvm;

#define DEBUG_TYPE "x86-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "X86GenInstrInfo.inc"

static cl::opt<bool>
    NoFusing("disable-spill-fusing",
             cl::desc("Disable fusing of spill code into instructions"),
             cl::Hidden);
static cl::opt<bool>
    PrintFailedFusing("print-failed-fuse-candidates",
                      cl::desc("Print instructions that the allocator wants to"
                               " fuse, but the X86 backend currently can't"),
                      cl::Hidden);
static cl::opt<bool>
    ReMatPICStubLoad("remat-pic-stub-load",
                     cl::desc("Re-materialize load from stub in PIC mode"),
                     cl::init(false), cl::Hidden);
static cl::opt<unsigned>
    PartialRegUpdateClearance("partial-reg-update-clearance",
                              cl::desc("Clearance between two register writes "
                                       "for inserting XOR to avoid partial "
                                       "register update"),
                              cl::init(64), cl::Hidden);
static cl::opt<unsigned> UndefRegClearance(
    "undef-reg-clearance",
    cl::desc("How many idle instructions we would like before "
             "certain undef register reads"),
    cl::init(128), cl::Hidden);
83
84// Pin the vtable to this file.
85void X86InstrInfo::anchor() {}
86
88 : X86GenInstrInfo(STI, RI,
89 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
90 : X86::ADJCALLSTACKDOWN32),
91 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
92 : X86::ADJCALLSTACKUP32),
93 X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
94 Subtarget(STI), RI(STI.getTargetTriple()) {}

const TargetRegisterClass *
X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum) const {
  auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum);
  // If the target does not have EGPR, then r16-r31 will be reserved for all
  // instructions.
  if (!RC || !Subtarget.hasEGPR())
    return RC;

  if (X86II::canUseApxExtendedReg(MCID))
    return RC;

  const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
  return RI->constrainRegClassToNonRex2(RC);
}
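// For example, on a target without EGPR, a GR64 operand's class is constrained
// here so that the allocator never assigns one of the APX registers R16-R31
// to it.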

bool X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                         Register &SrcReg, Register &DstReg,
                                         unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    break;
  case X86::MOVSX16rr8:
  case X86::MOVZX16rr8:
  case X86::MOVSX32rr8:
  case X86::MOVZX32rr8:
  case X86::MOVSX64rr8:
    if (!Subtarget.is64Bit())
      // It's not always legal to reference the low 8-bit of the larger
      // register in 32-bit mode.
      return false;
    [[fallthrough]];
  case X86::MOVSX32rr16:
  case X86::MOVZX32rr16:
  case X86::MOVSX64rr16:
  case X86::MOVSX64rr32: {
    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
      // Be conservative.
      return false;
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("Unreachable!");
    case X86::MOVSX16rr8:
    case X86::MOVZX16rr8:
    case X86::MOVSX32rr8:
    case X86::MOVZX32rr8:
    case X86::MOVSX64rr8:
      SubIdx = X86::sub_8bit;
      break;
    case X86::MOVSX32rr16:
    case X86::MOVZX32rr16:
    case X86::MOVSX64rr16:
      SubIdx = X86::sub_16bit;
      break;
    case X86::MOVSX64rr32:
      SubIdx = X86::sub_32bit;
      break;
    }
    return true;
  }
  }
  return false;
}
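// Illustrative example (hypothetical vregs): for "%1:gr32 = MOVZX32rr16 %0",
// this returns SrcReg = %0, DstReg = %1, SubIdx = X86::sub_16bit, telling the
// coalescer that %0 corresponds to %1's 16-bit sub-register.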

bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
  if (MI.mayLoad() || MI.mayStore())
    return false;

  // Some target-independent operations that trivially lower to data-invariant
  // instructions.
  if (MI.isCopyLike() || MI.isInsertSubreg())
    return true;

  unsigned Opcode = MI.getOpcode();
  using namespace X86;
  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
  // However, imul sets flags and is perhaps the most surprisingly
  // constant-time operation, so we call it out here separately.
  if (isIMUL(Opcode))
    return true;
  // Bit scanning and counting instructions that are somewhat surprisingly
  // constant time as they scan across bits and do other fairly complex
  // operations like popcnt, but are believed to be constant time on x86.
  // However, these set flags.
  if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
      isTZCNT(Opcode))
    return true;
  // Bit manipulation instructions are effectively combinations of basic
  // arithmetic ops, and should still execute in constant time. These also
  // set flags.
  if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
      isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
      isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
      isTZMSK(Opcode))
    return true;
  // Bit extracting and clearing instructions should execute in constant time,
  // and set flags.
  if (isBEXTR(Opcode) || isBZHI(Opcode))
    return true;
  // Shift and rotate.
  if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
      isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
    return true;
  // Basic arithmetic is constant time on the input but does set flags.
  if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
      isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
    return true;
  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
  if (isANDN(Opcode))
    return true;
  // Unary arithmetic operations.
  if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
    return true;
  // Unlike other arithmetic, NOT doesn't set EFLAGS.
  if (isNOT(Opcode))
    return true;
  // Various move instructions used to zero or sign extend things. Note that we
  // intentionally don't support the _NOREX variants as we can't handle that
  // register constraint anyways.
  if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
    return true;
  // Arithmetic instructions that are both constant time and don't set flags.
  if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
    return true;
  // LEA doesn't actually access memory, and its arithmetic is constant time.
  if (isLEA(Opcode))
    return true;
  // By default, assume that the instruction is not data invariant.
  return false;
}
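// For example, ADD32rr is data-invariant (it is caught by the basic-arithmetic
// check above), while DIV32r matches none of the checks and falls through to
// "return false" -- division latency can vary with the operand values.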

bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    // By default, assume that the load will immediately leak.
    return false;

  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
  // However, imul sets flags and is perhaps the most surprisingly
  // constant-time operation, so we call it out here separately.
  case X86::IMUL16rm:
  case X86::IMUL16rmi:
  case X86::IMUL32rm:
  case X86::IMUL32rmi:
  case X86::IMUL64rm:
  case X86::IMUL64rmi32:

  // Bit scanning and counting instructions that are somewhat surprisingly
  // constant time as they scan across bits and do other fairly complex
  // operations like popcnt, but are believed to be constant time on x86.
  // However, these set flags.
  case X86::BSF16rm:
  case X86::BSF32rm:
  case X86::BSF64rm:
  case X86::BSR16rm:
  case X86::BSR32rm:
  case X86::BSR64rm:
  case X86::LZCNT16rm:
  case X86::LZCNT32rm:
  case X86::LZCNT64rm:
  case X86::POPCNT16rm:
  case X86::POPCNT32rm:
  case X86::POPCNT64rm:
  case X86::TZCNT16rm:
  case X86::TZCNT32rm:
  case X86::TZCNT64rm:

  // Bit manipulation instructions are effectively combinations of basic
  // arithmetic ops, and should still execute in constant time. These also
  // set flags.
  case X86::BLCFILL32rm:
  case X86::BLCFILL64rm:
  case X86::BLCI32rm:
  case X86::BLCI64rm:
  case X86::BLCIC32rm:
  case X86::BLCIC64rm:
  case X86::BLCMSK32rm:
  case X86::BLCMSK64rm:
  case X86::BLCS32rm:
  case X86::BLCS64rm:
  case X86::BLSFILL32rm:
  case X86::BLSFILL64rm:
  case X86::BLSI32rm:
  case X86::BLSI64rm:
  case X86::BLSIC32rm:
  case X86::BLSIC64rm:
  case X86::BLSMSK32rm:
  case X86::BLSMSK64rm:
  case X86::BLSR32rm:
  case X86::BLSR64rm:
  case X86::TZMSK32rm:
  case X86::TZMSK64rm:

  // Bit extracting and clearing instructions should execute in constant time,
  // and set flags.
  case X86::BEXTR32rm:
  case X86::BEXTR64rm:
  case X86::BEXTRI32mi:
  case X86::BEXTRI64mi:
  case X86::BZHI32rm:
  case X86::BZHI64rm:

  // Basic arithmetic is constant time on the input but does set flags.
  case X86::ADC8rm:
  case X86::ADC16rm:
  case X86::ADC32rm:
  case X86::ADC64rm:
  case X86::ADD8rm:
  case X86::ADD16rm:
  case X86::ADD32rm:
  case X86::ADD64rm:
  case X86::AND8rm:
  case X86::AND16rm:
  case X86::AND32rm:
  case X86::AND64rm:
  case X86::ANDN32rm:
  case X86::ANDN64rm:
  case X86::OR8rm:
  case X86::OR16rm:
  case X86::OR32rm:
  case X86::OR64rm:
  case X86::SBB8rm:
  case X86::SBB16rm:
  case X86::SBB32rm:
  case X86::SBB64rm:
  case X86::SUB8rm:
  case X86::SUB16rm:
  case X86::SUB32rm:
  case X86::SUB64rm:
  case X86::XOR8rm:
  case X86::XOR16rm:
  case X86::XOR32rm:
  case X86::XOR64rm:

  // Integer multiply w/o affecting flags is still believed to be constant
  // time on x86. Called out separately as this is among the most surprising
  // instructions to exhibit that behavior.
  case X86::MULX32rm:
  case X86::MULX64rm:

  // Arithmetic instructions that are both constant time and don't set flags.
  case X86::RORX32mi:
  case X86::RORX64mi:
  case X86::SARX32rm:
  case X86::SARX64rm:
  case X86::SHLX32rm:
  case X86::SHLX64rm:
  case X86::SHRX32rm:
  case X86::SHRX64rm:

  // Conversions are believed to be constant time and don't set flags.
  case X86::CVTTSD2SI64rm:
  case X86::VCVTTSD2SI64rm:
  case X86::VCVTTSD2SI64Zrm:
  case X86::CVTTSD2SIrm:
  case X86::VCVTTSD2SIrm:
  case X86::VCVTTSD2SIZrm:
  case X86::CVTTSS2SI64rm:
  case X86::VCVTTSS2SI64rm:
  case X86::VCVTTSS2SI64Zrm:
  case X86::CVTTSS2SIrm:
  case X86::VCVTTSS2SIrm:
  case X86::VCVTTSS2SIZrm:
  case X86::CVTSI2SDrm:
  case X86::VCVTSI2SDrm:
  case X86::VCVTSI2SDZrm:
  case X86::CVTSI2SSrm:
  case X86::VCVTSI2SSrm:
  case X86::VCVTSI2SSZrm:
  case X86::CVTSI642SDrm:
  case X86::VCVTSI642SDrm:
  case X86::VCVTSI642SDZrm:
  case X86::CVTSI642SSrm:
  case X86::VCVTSI642SSrm:
  case X86::VCVTSI642SSZrm:
  case X86::CVTSS2SDrm:
  case X86::VCVTSS2SDrm:
  case X86::VCVTSS2SDZrm:
  case X86::CVTSD2SSrm:
  case X86::VCVTSD2SSrm:
  case X86::VCVTSD2SSZrm:
  // AVX512 added unsigned integer conversions.
  case X86::VCVTTSD2USI64Zrm:
  case X86::VCVTTSD2USIZrm:
  case X86::VCVTTSS2USI64Zrm:
  case X86::VCVTTSS2USIZrm:
  case X86::VCVTUSI2SDZrm:
  case X86::VCVTUSI642SDZrm:
  case X86::VCVTUSI2SSZrm:
  case X86::VCVTUSI642SSZrm:

  // Loads to register don't set flags.
  case X86::MOV8rm:
  case X86::MOV8rm_NOREX:
  case X86::MOV16rm:
  case X86::MOV32rm:
  case X86::MOV64rm:
  case X86::MOVSX16rm8:
  case X86::MOVSX32rm16:
  case X86::MOVSX32rm8:
  case X86::MOVSX32rm8_NOREX:
  case X86::MOVSX64rm16:
  case X86::MOVSX64rm32:
  case X86::MOVSX64rm8:
  case X86::MOVZX16rm8:
  case X86::MOVZX32rm16:
  case X86::MOVZX32rm8:
  case X86::MOVZX32rm8_NOREX:
  case X86::MOVZX64rm16:
  case X86::MOVZX64rm8:
    return true;
  }
}

int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
  const MachineFunction *MF = MI.getParent()->getParent();
  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();

  if (isFrameInstr(MI)) {
    int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
    SPAdj -= getFrameAdjustment(MI);
    if (!isFrameSetup(MI))
      SPAdj = -SPAdj;
    return SPAdj;
  }

  // To know whether a call adjusts the stack, we need information
  // that is bound to the following ADJCALLSTACKUP pseudo.
  // Look for the next ADJCALLSTACKUP that follows the call.
  if (MI.isCall()) {
    const MachineBasicBlock *MBB = MI.getParent();
    auto I = ++MachineBasicBlock::const_iterator(MI);
    for (auto E = MBB->end(); I != E; ++I) {
      if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
        break;
    }

    // If we could not find a frame destroy opcode, then it has already
    // been simplified, so we don't care.
    if (I->getOpcode() != getCallFrameDestroyOpcode())
      return 0;

    return -(I->getOperand(1).getImm());
  }

  // Currently we handle only the PUSHes we can reasonably expect to see
  // in call sequences.
  switch (MI.getOpcode()) {
  default:
    return 0;
  case X86::PUSH32r:
  case X86::PUSH32rmm:
  case X86::PUSH32rmr:
  case X86::PUSH32i:
    return 4;
  case X86::PUSH64r:
  case X86::PUSH64rmm:
  case X86::PUSH64rmr:
  case X86::PUSH64i32:
    return 8;
  }
}
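// Illustrative sketch of the cases above on a hypothetical 64-bit call
// sequence (not a snippet from this file):
//   ADJCALLSTACKDOWN64 16, 0, 0, ...  ; frame setup: reports the frame size,
//                                     ; rounded up to the stack alignment
//   PUSH64r $rbx                      ; reports 8
//   CALL64pcrel32 @callee             ; scans forward to the ADJCALLSTACKUP64
//                                     ; and reports minus its second immediate
//   ADJCALLSTACKUP64 16, 0, ...       ; frame destroy: reports the negated,
//                                     ; aligned frame size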

/// Return true and the FrameIndex if the specified
/// operand and following operands form a reference to the stack frame.
bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
                                  int &FrameIndex) const {
  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
      MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
      MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
      MI.getOperand(Op + X86::AddrDisp).isImm() &&
      MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
      MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
      MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
    FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
    return true;
  }
  return false;
}
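// Illustrative example (hypothetical MIR, not from this file): the memory
// reference in
//   %0 = MOV32rm %stack.0, 1, $noreg, 0, $noreg
// has a frame-index base, scale 1, no index register, and displacement 0, so
// isFrameOperand() succeeds and reports frame index 0.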

static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
  switch (Opcode) {
  default:
    return false;
  case X86::MOV8rm:
  case X86::KMOVBkm:
  case X86::KMOVBkm_EVEX:
    MemBytes = TypeSize::getFixed(1);
    return true;
  case X86::MOV16rm:
  case X86::KMOVWkm:
  case X86::KMOVWkm_EVEX:
  case X86::VMOVSHZrm:
  case X86::VMOVSHZrm_alt:
    MemBytes = TypeSize::getFixed(2);
    return true;
  case X86::MOV32rm:
  case X86::MOVSSrm:
  case X86::MOVSSrm_alt:
  case X86::VMOVSSrm:
  case X86::VMOVSSrm_alt:
  case X86::VMOVSSZrm:
  case X86::VMOVSSZrm_alt:
  case X86::KMOVDkm:
  case X86::KMOVDkm_EVEX:
    MemBytes = TypeSize::getFixed(4);
    return true;
  case X86::MOV64rm:
  case X86::LD_Fp64m:
  case X86::MOVSDrm:
  case X86::MOVSDrm_alt:
  case X86::VMOVSDrm:
  case X86::VMOVSDrm_alt:
  case X86::VMOVSDZrm:
  case X86::VMOVSDZrm_alt:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
  case X86::KMOVQkm:
  case X86::KMOVQkm_EVEX:
    MemBytes = TypeSize::getFixed(8);
    return true;
  case X86::MOVAPSrm:
  case X86::MOVUPSrm:
  case X86::MOVAPDrm:
  case X86::MOVUPDrm:
  case X86::MOVDQArm:
  case X86::MOVDQUrm:
  case X86::VMOVAPSrm:
  case X86::VMOVUPSrm:
  case X86::VMOVAPDrm:
  case X86::VMOVUPDrm:
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
  case X86::VMOVAPSZ128rm:
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm_NOVLX:
  case X86::VMOVUPSZ128rm_NOVLX:
  case X86::VMOVAPDZ128rm:
  case X86::VMOVUPDZ128rm:
  case X86::VMOVDQU8Z128rm:
  case X86::VMOVDQU16Z128rm:
  case X86::VMOVDQA32Z128rm:
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA64Z128rm:
  case X86::VMOVDQU64Z128rm:
    MemBytes = TypeSize::getFixed(16);
    return true;
  case X86::VMOVAPSYrm:
  case X86::VMOVUPSYrm:
  case X86::VMOVAPDYrm:
  case X86::VMOVUPDYrm:
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
  case X86::VMOVAPSZ256rm:
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm_NOVLX:
  case X86::VMOVUPSZ256rm_NOVLX:
  case X86::VMOVAPDZ256rm:
  case X86::VMOVUPDZ256rm:
  case X86::VMOVDQU8Z256rm:
  case X86::VMOVDQU16Z256rm:
  case X86::VMOVDQA32Z256rm:
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA64Z256rm:
  case X86::VMOVDQU64Z256rm:
    MemBytes = TypeSize::getFixed(32);
    return true;
  case X86::VMOVAPSZrm:
  case X86::VMOVUPSZrm:
  case X86::VMOVAPDZrm:
  case X86::VMOVUPDZrm:
  case X86::VMOVDQU8Zrm:
  case X86::VMOVDQU16Zrm:
  case X86::VMOVDQA32Zrm:
  case X86::VMOVDQU32Zrm:
  case X86::VMOVDQA64Zrm:
  case X86::VMOVDQU64Zrm:
    MemBytes = TypeSize::getFixed(64);
    return true;
  }
}

static bool isFrameStoreOpcode(int Opcode, TypeSize &MemBytes) {
  switch (Opcode) {
  default:
    return false;
  case X86::MOV8mr:
  case X86::KMOVBmk:
  case X86::KMOVBmk_EVEX:
    MemBytes = TypeSize::getFixed(1);
    return true;
  case X86::MOV16mr:
  case X86::KMOVWmk:
  case X86::KMOVWmk_EVEX:
  case X86::VMOVSHZmr:
    MemBytes = TypeSize::getFixed(2);
    return true;
  case X86::MOV32mr:
  case X86::MOVSSmr:
  case X86::VMOVSSmr:
  case X86::VMOVSSZmr:
  case X86::KMOVDmk:
  case X86::KMOVDmk_EVEX:
    MemBytes = TypeSize::getFixed(4);
    return true;
  case X86::MOV64mr:
  case X86::ST_FpP64m:
  case X86::MOVSDmr:
  case X86::VMOVSDmr:
  case X86::VMOVSDZmr:
  case X86::MMX_MOVD64mr:
  case X86::MMX_MOVQ64mr:
  case X86::MMX_MOVNTQmr:
  case X86::KMOVQmk:
  case X86::KMOVQmk_EVEX:
    MemBytes = TypeSize::getFixed(8);
    return true;
  case X86::MOVAPSmr:
  case X86::MOVUPSmr:
  case X86::MOVAPDmr:
  case X86::MOVUPDmr:
  case X86::MOVDQAmr:
  case X86::MOVDQUmr:
  case X86::VMOVAPSmr:
  case X86::VMOVUPSmr:
  case X86::VMOVAPDmr:
  case X86::VMOVUPDmr:
  case X86::VMOVDQAmr:
  case X86::VMOVDQUmr:
  case X86::VMOVUPSZ128mr:
  case X86::VMOVAPSZ128mr:
  case X86::VMOVUPSZ128mr_NOVLX:
  case X86::VMOVAPSZ128mr_NOVLX:
  case X86::VMOVUPDZ128mr:
  case X86::VMOVAPDZ128mr:
  case X86::VMOVDQA32Z128mr:
  case X86::VMOVDQU32Z128mr:
  case X86::VMOVDQA64Z128mr:
  case X86::VMOVDQU64Z128mr:
  case X86::VMOVDQU8Z128mr:
  case X86::VMOVDQU16Z128mr:
    MemBytes = TypeSize::getFixed(16);
    return true;
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
  case X86::VMOVUPSZ256mr_NOVLX:
  case X86::VMOVAPSZ256mr_NOVLX:
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
  case X86::VMOVDQU8Z256mr:
  case X86::VMOVDQU16Z256mr:
  case X86::VMOVDQA32Z256mr:
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA64Z256mr:
  case X86::VMOVDQU64Z256mr:
    MemBytes = TypeSize::getFixed(32);
    return true;
  case X86::VMOVUPSZmr:
  case X86::VMOVAPSZmr:
  case X86::VMOVUPDZmr:
  case X86::VMOVAPDZmr:
  case X86::VMOVDQU8Zmr:
  case X86::VMOVDQU16Zmr:
  case X86::VMOVDQA32Zmr:
  case X86::VMOVDQU32Zmr:
  case X86::VMOVDQA64Zmr:
  case X86::VMOVDQU64Zmr:
    MemBytes = TypeSize::getFixed(64);
    return true;
  }
  return false;
}

Register X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                           int &FrameIndex) const {
  TypeSize Dummy = TypeSize::getZero();
  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
}

Register X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                           int &FrameIndex,
                                           TypeSize &MemBytes) const {
  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
    if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
      return MI.getOperand(0).getReg();
  return Register();
}

Register X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                                 int &FrameIndex) const {
  TypeSize Dummy = TypeSize::getZero();
  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
    if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
      return Reg;
    // Check for post-frame index elimination operations.
    SmallVector<const MachineMemOperand *, 1> Accesses;
    if (hasLoadFromStackSlot(MI, Accesses)) {
      FrameIndex =
          cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
              ->getFrameIndex();
      return MI.getOperand(0).getReg();
    }
  }
  return Register();
}

Register X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  TypeSize Dummy = TypeSize::getZero();
  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
}

Register X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                          int &FrameIndex,
                                          TypeSize &MemBytes) const {
  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
    if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
        isFrameOperand(MI, 0, FrameIndex))
      return MI.getOperand(X86::AddrNumOperands).getReg();
  return Register();
}

Register X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
                                                int &FrameIndex) const {
  TypeSize Dummy = TypeSize::getZero();
  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
    if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
      return Reg;
    // Check for post-frame index elimination operations.
    SmallVector<const MachineMemOperand *, 1> Accesses;
    if (hasStoreToStackSlot(MI, Accesses)) {
      FrameIndex =
          cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
              ->getFrameIndex();
      return MI.getOperand(X86::AddrNumOperands).getReg();
    }
  }
  return Register();
}

/// Return true if the register is a PIC base, i.e. defined by X86::MOVPC32r.
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
  // Don't waste compile time scanning use-def chains of physregs.
  if (!BaseReg.isVirtual())
    return false;
  bool isPICBase = false;
  for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
    if (DefMI.getOpcode() != X86::MOVPC32r)
      return false;
    assert(!isPICBase && "More than one PIC base?");
    isPICBase = true;
  }
  return isPICBase;
}

bool X86InstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    // This function should only be called for opcodes with the
    // ReMaterializable flag set.
    llvm_unreachable("Unknown rematerializable operation!");
    break;
  case X86::IMPLICIT_DEF:
    // Defer to generic logic.
    break;
  case X86::LOAD_STACK_GUARD:
  case X86::LD_Fp032:
  case X86::LD_Fp064:
  case X86::LD_Fp080:
  case X86::LD_Fp132:
  case X86::LD_Fp164:
  case X86::LD_Fp180:
  case X86::AVX1_SETALLONES:
  case X86::AVX2_SETALLONES:
  case X86::AVX512_128_SET0:
  case X86::AVX512_256_SET0:
  case X86::AVX512_512_SET0:
  case X86::AVX512_128_SETALLONES:
  case X86::AVX512_256_SETALLONES:
  case X86::AVX512_512_SETALLONES:
  case X86::AVX512_FsFLD0SD:
  case X86::AVX512_FsFLD0SH:
  case X86::AVX512_FsFLD0SS:
  case X86::AVX512_FsFLD0F128:
  case X86::AVX_SET0:
  case X86::FsFLD0SD:
  case X86::FsFLD0SS:
  case X86::FsFLD0SH:
  case X86::FsFLD0F128:
  case X86::KSET0B:
  case X86::KSET0D:
  case X86::KSET0Q:
  case X86::KSET0W:
  case X86::KSET1B:
  case X86::KSET1D:
  case X86::KSET1Q:
  case X86::KSET1W:
  case X86::MMX_SET0:
  case X86::MOV32ImmSExti8:
  case X86::MOV32r0:
  case X86::MOV32r1:
  case X86::MOV32r_1:
  case X86::MOV32ri64:
  case X86::MOV64ImmSExti8:
  case X86::V_SET0:
  case X86::V_SETALLONES:
  case X86::MOV16ri:
  case X86::MOV32ri:
  case X86::MOV64ri:
  case X86::MOV64ri32:
  case X86::MOV8ri:
  case X86::PTILEZEROV:
    return true;

  case X86::MOV8rm:
  case X86::MOV8rm_NOREX:
  case X86::MOV16rm:
  case X86::MOV32rm:
  case X86::MOV64rm:
  case X86::MOVSSrm:
  case X86::MOVSSrm_alt:
  case X86::MOVSDrm:
  case X86::MOVSDrm_alt:
  case X86::MOVAPSrm:
  case X86::MOVUPSrm:
  case X86::MOVAPDrm:
  case X86::MOVUPDrm:
  case X86::MOVDQArm:
  case X86::MOVDQUrm:
  case X86::VMOVSSrm:
  case X86::VMOVSSrm_alt:
  case X86::VMOVSDrm:
  case X86::VMOVSDrm_alt:
  case X86::VMOVAPSrm:
  case X86::VMOVUPSrm:
  case X86::VMOVAPDrm:
  case X86::VMOVUPDrm:
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
  case X86::VMOVAPSYrm:
  case X86::VMOVUPSYrm:
  case X86::VMOVAPDYrm:
  case X86::VMOVUPDYrm:
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
  case X86::VBROADCASTSSrm:
  case X86::VBROADCASTSSYrm:
  case X86::VBROADCASTSDYrm:
  // AVX-512
  case X86::VPBROADCASTBZ128rm:
  case X86::VPBROADCASTBZ256rm:
  case X86::VPBROADCASTBZrm:
  case X86::VBROADCASTF32X2Z256rm:
  case X86::VBROADCASTF32X2Zrm:
  case X86::VBROADCASTI32X2Z128rm:
  case X86::VBROADCASTI32X2Z256rm:
  case X86::VBROADCASTI32X2Zrm:
  case X86::VPBROADCASTWZ128rm:
  case X86::VPBROADCASTWZ256rm:
  case X86::VPBROADCASTWZrm:
  case X86::VPBROADCASTDZ128rm:
  case X86::VPBROADCASTDZ256rm:
  case X86::VPBROADCASTDZrm:
  case X86::VBROADCASTSSZ128rm:
  case X86::VBROADCASTSSZ256rm:
  case X86::VBROADCASTSSZrm:
  case X86::VPBROADCASTQZ128rm:
  case X86::VPBROADCASTQZ256rm:
  case X86::VPBROADCASTQZrm:
  case X86::VBROADCASTSDZ256rm:
  case X86::VBROADCASTSDZrm:
  case X86::VMOVSSZrm:
  case X86::VMOVSSZrm_alt:
  case X86::VMOVSDZrm:
  case X86::VMOVSDZrm_alt:
  case X86::VMOVSHZrm:
  case X86::VMOVSHZrm_alt:
  case X86::VMOVAPDZ128rm:
  case X86::VMOVAPDZ256rm:
  case X86::VMOVAPDZrm:
  case X86::VMOVAPSZ128rm:
  case X86::VMOVAPSZ256rm:
  case X86::VMOVAPSZ128rm_NOVLX:
  case X86::VMOVAPSZ256rm_NOVLX:
  case X86::VMOVAPSZrm:
  case X86::VMOVDQA32Z128rm:
  case X86::VMOVDQA32Z256rm:
  case X86::VMOVDQA32Zrm:
  case X86::VMOVDQA64Z128rm:
  case X86::VMOVDQA64Z256rm:
  case X86::VMOVDQA64Zrm:
  case X86::VMOVDQU16Z128rm:
  case X86::VMOVDQU16Z256rm:
  case X86::VMOVDQU16Zrm:
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQU32Zrm:
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQU64Zrm:
  case X86::VMOVDQU8Z128rm:
  case X86::VMOVDQU8Z256rm:
  case X86::VMOVDQU8Zrm:
  case X86::VMOVUPDZ128rm:
  case X86::VMOVUPDZ256rm:
  case X86::VMOVUPDZrm:
  case X86::VMOVUPSZ128rm:
  case X86::VMOVUPSZ256rm:
  case X86::VMOVUPSZ128rm_NOVLX:
  case X86::VMOVUPSZ256rm_NOVLX:
  case X86::VMOVUPSZrm: {
    // Loads from constant pools are trivially rematerializable.
    if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
        MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
        MI.isDereferenceableInvariantLoad()) {
      Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
      if (BaseReg == 0 || BaseReg == X86::RIP)
        return true;
      // Allow re-materialization of PIC load.
      if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
        const MachineFunction &MF = *MI.getParent()->getParent();
        const MachineRegisterInfo &MRI = MF.getRegInfo();
        if (regIsPICBase(BaseReg, MRI))
          return true;
      }
    }
    break;
  }

  case X86::LEA32r:
  case X86::LEA64r: {
    if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
        !MI.getOperand(1 + X86::AddrDisp).isReg()) {
      // lea fi#, lea GV, etc. are all rematerializable.
      if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
        return true;
      Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
      if (BaseReg == 0)
        return true;
      // Allow re-materialization of lea PICBase + x.
      const MachineFunction &MF = *MI.getParent()->getParent();
      const MachineRegisterInfo &MRI = MF.getRegInfo();
      if (regIsPICBase(BaseReg, MRI))
        return true;
    }
    break;
  }
  }
  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}

void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 Register DestReg, unsigned SubIdx,
                                 const MachineInstr &Orig) const {
  bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
  if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
                            MachineBasicBlock::LQR_Dead) {
    // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
    // effects.
    int Value;
    switch (Orig.getOpcode()) {
    case X86::MOV32r0:
      Value = 0;
      break;
    case X86::MOV32r1:
      Value = 1;
      break;
    case X86::MOV32r_1:
      Value = -1;
      break;
    default:
      llvm_unreachable("Unexpected instruction!");
    }

    const DebugLoc &DL = Orig.getDebugLoc();
    BuildMI(MBB, I, DL, get(X86::MOV32ri))
        .add(Orig.getOperand(0))
        .addImm(Value);
  } else {
    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
    MBB.insert(I, MI);
  }

  MachineInstr &NewMI = *std::prev(I);
  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
}

/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
static bool hasLiveCondCodeDef(MachineInstr &MI) {
  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
        !MO.isDead()) {
      return true;
    }
  }
  return false;
}

/// Return the shift count for a machine operand, truncated to the width the
/// hardware actually honors.
inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
                                              unsigned ShiftAmtOperandIdx) {
  // The shift count is six bits with the REX.W prefix and five bits without.
  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
  return Imm & ShiftCountMask;
}
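// For example, "shlq $65, %reg" truncates to a count of 65 & 63 == 1, while
// "shll $33, %reg" truncates to 33 & 31 == 1.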

/// Check whether the given shift count can be represented by a LEA
/// instruction.
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
  // Left shift instructions can be transformed into load-effective-address
  // instructions if we can encode them appropriately.
  // A LEA instruction utilizes a SIB byte to encode its scale factor.
  // The SIB.scale field is two bits wide, which means that we can encode any
  // shift amount less than 4.
  return ShAmt < 4 && ShAmt > 0;
}
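// For example, "shll $3, %eax" can become "leal (,%rax,8), %eax": the scale
// 8 == 1 << 3 still fits in SIB.scale, whereas a shift amount of 4 (scale 16)
// does not.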

static bool
findRedundantFlagInstr(MachineInstr &CmpInstr, MachineInstr &CmpValDefInstr,
                       const MachineRegisterInfo *MRI, MachineInstr **AndInstr,
                       const TargetRegisterInfo *TRI, const X86Subtarget &ST,
                       bool &NoSignFlag, bool &ClearsOverflowFlag) {
  if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
        CmpInstr.getOpcode() == X86::TEST64rr) &&
      !(CmpValDefInstr.getOpcode() == X86::COPY &&
        CmpInstr.getOpcode() == X86::TEST16rr))
    return false;

  // CmpInstr is a TEST16rr/TEST64rr instruction, and
  // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two
  // registers are identical.
  assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
         "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
         "`X86InstrInfo::analyzeCompare` requires two reg operands are the "
         "same.");

  // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
  // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
  // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
  // redundant.
  assert(
      (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
      "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
      "is a user of COPY sub16bit.");
  MachineInstr *VregDefInstr = nullptr;
  if (CmpInstr.getOpcode() == X86::TEST16rr) {
    if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
      return false;
    VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
    if (!VregDefInstr)
      return false;
    // We can only remove the TEST when the defining instruction is an AND32ri
    // or AND64ri32 whose immediate fits in 16 bits; other 32/64-bit ops would
    // test higher bits that TEST16rr does not look at.
    if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
           VregDefInstr->getOpcode() == X86::AND64ri32) &&
          isUInt<16>(VregDefInstr->getOperand(2).getImm())))
      return false;
  }

  if (CmpInstr.getOpcode() == X86::TEST64rr) {
    // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
    // sub_32bit or sub_xmm.
    if (CmpValDefInstr.getOperand(2).getImm() != X86::sub_32bit)
      return false;

    VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
  }

  assert(VregDefInstr && "Must have a definition (SSA)");

  // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
  // to simplify the subsequent analysis.
  //
  // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
  // `CmpValDefInstr.getParent()`, this could be handled.
  if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
    return false;

  if (X86::isAND(VregDefInstr->getOpcode()) &&
      (!ST.hasNF() || VregDefInstr->modifiesRegister(X86::EFLAGS, TRI))) {
    // Get a sequence of instructions like
    //   %reg = and* ...                    // Set EFLAGS
    //   ...                                // EFLAGS not changed
    //   %extended_reg = subreg_to_reg %reg, %subreg.sub_32bit
    //   test64rr %extended_reg, %extended_reg, implicit-def $eflags
    // or
    //   %reg = and32* ...
    //   ...                                // EFLAGS not changed.
    //   %src_reg = copy %reg.sub_16bit:gr32
    //   test16rr %src_reg, %src_reg, implicit-def $eflags
    //
    // If subsequent readers use a subset of bits that don't change
    // after `and*` instructions, it's likely that the test64rr could
    // be optimized away.
    for (const MachineInstr &Instr :
         make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
                    MachineBasicBlock::iterator(CmpValDefInstr))) {
      // Bail out if an instruction between 'VregDefInstr' and
      // 'CmpValDefInstr' modifies EFLAGS.
      if (Instr.modifiesRegister(X86::EFLAGS, TRI))
        return false;
    }

    *AndInstr = VregDefInstr;

    // The AND instruction will essentially update SF and clear OF, so
    // NoSignFlag should be false in the sense that SF is modified by `AND`.
    //
    // However, the implementation artificially sets `NoSignFlag` to true
    // to poison the SF bit; that is to say, if SF is looked at later, the
    // optimization (to erase TEST64rr) will be disabled.
    //
    // The reason to poison the SF bit is that its value could differ between
    // the `AND` and `TEST` operations; the sign bit is not known for `AND`,
    // and is known to be 0 as a result of `TEST64rr`.
    //
    // FIXME: As opposed to poisoning the SF bit directly, consider peeking
    // into the AND instruction and using the static information to guide
    // peephole optimization if possible. For example, it's possible to fold a
    // conditional move into a copy if the relevant EFLAGS bits could be
    // deduced from an immediate operand of the AND operation.
    NoSignFlag = true;
    // ClearsOverflowFlag is true for the AND operation (no surprise).
    ClearsOverflowFlag = true;
    return true;
  }
  return false;
}

bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
                                  unsigned Opc, bool AllowSP, Register &NewSrc,
                                  unsigned &NewSrcSubReg, bool &isKill,
                                  MachineOperand &ImplicitOp, LiveVariables *LV,
                                  LiveIntervals *LIS) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  const TargetRegisterClass *RC;
  if (AllowSP) {
    RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
  } else {
    RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
  }
  Register SrcReg = Src.getReg();
  unsigned SubReg = Src.getSubReg();
  isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr);

  NewSrcSubReg = X86::NoSubRegister;

  // For both LEA64 and LEA32 the register already has essentially the right
  // type (32-bit or 64-bit); we may just need to forbid SP.
  if (Opc != X86::LEA64_32r) {
    NewSrc = SrcReg;
    NewSrcSubReg = SubReg;
    assert(!Src.isUndef() && "Undef op doesn't need optimization");

    if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
      return false;

    return true;
  }

  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
  // another we need to add 64-bit registers to the final MI.
  if (SrcReg.isPhysical()) {
    ImplicitOp = Src;
    ImplicitOp.setImplicit();

    NewSrc = getX86SubSuperRegister(SrcReg, 64);
    assert(!SubReg && "no superregister for source");
    assert(NewSrc.isValid() && "Invalid Operand");
    assert(!Src.isUndef() && "Undef op doesn't need optimization");
  } else {
    // Virtual register of the wrong class; we have to create a temporary
    // 64-bit vreg to feed into the LEA.
    NewSrc = MF.getRegInfo().createVirtualRegister(RC);
    NewSrcSubReg = X86::NoSubRegister;
    MachineInstr *Copy =
        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
            .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
            .addReg(SrcReg, getKillRegState(isKill), SubReg);

    // Which is obviously going to be dead after we're done with it.
    isKill = true;

    if (LV)
      LV->replaceKillInstruction(SrcReg, MI, *Copy);

    if (LIS) {
      SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
      SlotIndex Idx = LIS->getInstructionIndex(MI);
      LiveInterval &LI = LIS->getInterval(SrcReg);
      LiveRange::Segment *S = LI.getSegmentContaining(Idx);
      if (S->end.getBaseIndex() == Idx)
        S->end = CopyIdx.getRegSlot();
    }
  }

  // We've set all the parameters without issue.
  return true;
}
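// Illustrative example for the LEA64_32r path above (hypothetical vregs): a
// 32-bit virtual source %0 is widened by emitting
//   undef %1.sub_32bit:gr64 = COPY %0
// so that the 64-bit %1 can legally feed the LEA; %1 is then marked killed
// by it.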

MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                                         MachineInstr &MI,
                                                         LiveVariables *LV,
                                                         LiveIntervals *LIS,
                                                         bool Is8BitOp) const {
  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
  assert((Is8BitOp ||
          RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
              *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
         "Unexpected type for LEA transform");

  // TODO: For a 32-bit target, we need to adjust the LEA variables with
  // something like this:
  //   Opcode = X86::LEA32r;
  //   InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
  //   OutRegLEA =
  //       Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
  //                : RegInfo.createVirtualRegister(&X86::GR32RegClass);
  if (!Subtarget.is64Bit())
    return nullptr;

  unsigned Opcode = X86::LEA64_32r;
  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
  Register InRegLEA2;

  // Build and insert into an implicit UNDEF value. This is OK because
  // we will be shifting and then extracting the lower 8/16-bits.
  // This has the potential to cause a partial register stall, e.g.
  //   movw (%rbp,%rcx,2), %dx
  //   leal -65(%rdx), %esi
  // But testing has shown this *does* help performance in 64-bit mode (at
  // least on modern x86 machines).
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  Register Dest = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned SrcSubReg = MI.getOperand(1).getSubReg();
  Register Src2;
  unsigned Src2SubReg;
  bool IsDead = MI.getOperand(0).isDead();
  bool IsKill = MI.getOperand(1).isKill();
  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
  MachineInstr *ImpDef =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
  MachineInstr *InsMI =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
          .addReg(InRegLEA, RegState::Define, SubReg)
          .addReg(Src, getKillRegState(IsKill), SrcSubReg);
  MachineInstr *ImpDef2 = nullptr;
  MachineInstr *InsMI2 = nullptr;

  MachineInstrBuilder MIB =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
#define CASE_NF(OP)                                                            \
  case X86::OP:                                                                \
  case X86::OP##_NF:
  switch (MIOpc) {
  default:
    llvm_unreachable("Unreachable!");
  CASE_NF(SHL8ri)
  CASE_NF(SHL16ri) {
    unsigned ShAmt = MI.getOperand(2).getImm();
    MIB.addReg(0)
        .addImm(1LL << ShAmt)
        .addReg(InRegLEA, RegState::Kill)
        .addImm(0)
        .addReg(0);
    break;
  }
  CASE_NF(INC8r)
  CASE_NF(INC16r)
    addRegOffset(MIB, InRegLEA, true, 1);
    break;
  CASE_NF(DEC8r)
  CASE_NF(DEC16r)
    addRegOffset(MIB, InRegLEA, true, -1);
    break;
  CASE_NF(ADD8ri)
  CASE_NF(ADD16ri)
  case X86::ADD8ri_DB:
  case X86::ADD16ri_DB:
    addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
    break;
  CASE_NF(ADD8rr)
  CASE_NF(ADD16rr)
  case X86::ADD8rr_DB:
  case X86::ADD16rr_DB: {
    Src2 = MI.getOperand(2).getReg();
    Src2SubReg = MI.getOperand(2).getSubReg();
    bool IsKill2 = MI.getOperand(2).isKill();
    assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
    if (Src == Src2) {
      // ADD8rr/ADD16rr killed %reg1028, %reg1028
      // just a single insert_subreg.
      addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA, false,
                X86::NoSubRegister);
    } else {
      if (Subtarget.is64Bit())
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
      else
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
      // Build and insert into an implicit UNDEF value. This is OK because
      // we will be shifting and then extracting the lower 8/16-bits.
      ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
                        InRegLEA2);
      InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
                   .addReg(InRegLEA2, RegState::Define, SubReg)
                   .addReg(Src2, getKillRegState(IsKill2), Src2SubReg);
      addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA2, true,
                X86::NoSubRegister);
    }
    if (LV && IsKill2 && InsMI2)
      LV->replaceKillInstruction(Src2, MI, *InsMI2);
    break;
  }
  }

  MachineInstr *NewMI = MIB;
  MachineInstr *ExtMI =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
          .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
          .addReg(OutRegLEA, RegState::Kill, SubReg);

  if (LV) {
    // Update live variables.
    LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
    if (InRegLEA2)
      LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
    LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
    if (IsKill)
      LV->replaceKillInstruction(Src, MI, *InsMI);
    if (IsDead)
      LV->replaceKillInstruction(Dest, MI, *ExtMI);
  }

  if (LIS) {
    LIS->InsertMachineInstrInMaps(*ImpDef);
    SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
    if (ImpDef2)
      LIS->InsertMachineInstrInMaps(*ImpDef2);
    SlotIndex Ins2Idx;
    if (InsMI2)
      Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
    SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
    SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
    LIS->getInterval(InRegLEA);
    LIS->getInterval(OutRegLEA);
    if (InRegLEA2)
      LIS->getInterval(InRegLEA2);

    // Move the use of Src up to InsMI.
    LiveInterval &SrcLI = LIS->getInterval(Src);
    LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
    if (SrcSeg->end == NewIdx.getRegSlot())
      SrcSeg->end = InsIdx.getRegSlot();

    if (InsMI2) {
      // Move the use of Src2 up to InsMI2.
      LiveInterval &Src2LI = LIS->getInterval(Src2);
      LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
      if (Src2Seg->end == NewIdx.getRegSlot())
        Src2Seg->end = Ins2Idx.getRegSlot();
    }

    // Move the definition of Dest down to ExtMI.
    LiveInterval &DestLI = LIS->getInterval(Dest);
    LiveRange::Segment *DestSeg =
        DestLI.getSegmentContaining(NewIdx.getRegSlot());
    assert(DestSeg->start == NewIdx.getRegSlot() &&
           DestSeg->valno->def == NewIdx.getRegSlot());
    DestSeg->start = ExtIdx.getRegSlot();
    DestSeg->valno->def = ExtIdx.getRegSlot();
  }

  return ExtMI;
}
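// Illustrative example of the transform above (hypothetical vregs): a 16-bit
// add such as
//   %2:gr16 = ADD16rr %0:gr16, %1:gr16
// becomes roughly
//   %3:gr64_nosp = IMPLICIT_DEF
//   %3.sub_16bit = COPY %0
//   %4:gr64_nosp = IMPLICIT_DEF          ; second input, likewise widened
//   %4.sub_16bit = COPY %1
//   %5:gr32 = LEA64_32r %3, 1, %4, 0, $noreg
//   %2 = COPY %5.sub_16bit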

/// This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
/// may be able to convert a two-address instruction into a true
/// three-address instruction on demand. This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
                                                  LiveVariables *LV,
                                                  LiveIntervals *LIS) const {
  // The following opcodes also set the condition code register(s). Only
  // convert them to an equivalent LEA if their condition code register defs
  // are dead!
  if (hasLiveCondCodeDef(MI))
    return nullptr;

  MachineFunction &MF = *MI.getParent()->getParent();
  // All instructions input are two-addr instructions. Get the known operands.
  const MachineOperand &Dest = MI.getOperand(0);
  const MachineOperand &Src = MI.getOperand(1);

  // Ideally, operations with undef should be folded before we get here, but we
  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
  // Without this, we have to forward undef state to new register operands to
  // avoid machine verifier errors.
  if (Src.isUndef())
    return nullptr;
  if (MI.getNumOperands() > 2)
    if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
      return nullptr;

  MachineInstr *NewMI = nullptr;
  Register SrcReg, SrcReg2;
  unsigned SrcSubReg, SrcSubReg2;
  bool Is64Bit = Subtarget.is64Bit();

  bool Is8BitOp = false;
  unsigned NumRegOperands = 2;
  unsigned MIOpc = MI.getOpcode();
  switch (MIOpc) {
  default:
    llvm_unreachable("Unreachable!");
  CASE_NF(SHL64ri) {
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt))
      return nullptr;

    // LEA can't handle RSP.
    if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
                                        Src.getReg(), &X86::GR64_NOSPRegClass))
      return nullptr;

    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
                .add(Dest)
                .addReg(0)
                .addImm(1LL << ShAmt)
                .add(Src)
                .addImm(0)
                .addReg(0);
    break;
  }
  CASE_NF(SHL32ri) {
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt))
      return nullptr;

    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    // LEA can't handle ESP.
    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
                        isKill, ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB =
        BuildMI(MF, MI.getDebugLoc(), get(Opc))
            .add(Dest)
            .addReg(0)
            .addImm(1LL << ShAmt)
            .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
            .addImm(0)
            .addReg(0);
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);
    NewMI = MIB;

    // Add kills if classifyLEAReg created a new register.
    if (LV && SrcReg != Src.getReg())
      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
    break;
  }
  CASE_NF(SHL8ri)
    Is8BitOp = true;
    [[fallthrough]];
  CASE_NF(SHL16ri) {
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt))
      return nullptr;
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
  }
  CASE_NF(INC64r)
  CASE_NF(INC32r) {
    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
    unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
                       ? X86::LEA64r
                       : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
                        isKill, ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                  .add(Dest)
                                  .addReg(SrcReg, getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, 1);

    // Add kills if classifyLEAReg created a new register.
    if (LV && SrcReg != Src.getReg())
      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
    break;
  }
  CASE_NF(DEC64r)
  CASE_NF(DEC32r) {
    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
    unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
                       ? X86::LEA64r
                       : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
                        isKill, ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                  .add(Dest)
                                  .addReg(SrcReg, getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, -1);

    // Add kills if classifyLEAReg created a new register.
    if (LV && SrcReg != Src.getReg())
      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
    break;
  }
  CASE_NF(DEC8r)
  CASE_NF(INC8r)
    Is8BitOp = true;
    [[fallthrough]];
  CASE_NF(DEC16r)
  CASE_NF(INC16r)
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
  CASE_NF(ADD64rr)
  CASE_NF(ADD32rr)
  case X86::ADD64rr_DB:
  case X86::ADD32rr_DB: {
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc;
    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_NF ||
        MIOpc == X86::ADD64rr_DB)
      Opc = X86::LEA64r;
    else
      Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    const MachineOperand &Src2 = MI.getOperand(2);
    bool isKill2;
    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, SrcSubReg2,
                        isKill2, ImplicitOp2, LV, LIS))
      return nullptr;

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (Src.getReg() == Src2.getReg()) {
      // Don't call classifyLEAReg a second time on the same register, in case
      // the first call inserted a COPY from Src2 and marked it as killed.
      isKill = isKill2;
      SrcReg = SrcReg2;
      SrcSubReg = SrcSubReg2;
    } else {
      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
                          isKill, ImplicitOp, LV, LIS))
        return nullptr;
    }

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);
    if (ImplicitOp2.getReg() != 0)
      MIB.add(ImplicitOp2);

    NewMI =
        addRegReg(MIB, SrcReg, isKill, SrcSubReg, SrcReg2, isKill2, SrcSubReg2);

    // Add kills if classifyLEAReg created a new register.
    if (LV) {
      if (SrcReg2 != Src2.getReg())
        LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
      if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
    }
    NumRegOperands = 3;
    break;
  }
  CASE_NF(ADD8rr)
  case X86::ADD8rr_DB:
    Is8BitOp = true;
    [[fallthrough]];
  CASE_NF(ADD16rr)
  case X86::ADD16rr_DB:
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
  CASE_NF(ADD64ri32)
  case X86::ADD64ri32_DB:
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    NewMI = addOffset(
        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
        MI.getOperand(2));
    break;
  CASE_NF(ADD32ri)
  case X86::ADD32ri_DB: {
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
                        isKill, ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB =
        BuildMI(MF, MI.getDebugLoc(), get(Opc))
            .add(Dest)
            .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, MI.getOperand(2));

    // Add kills if classifyLEAReg created a new register.
    if (LV && SrcReg != Src.getReg())
      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
    break;
  }
  CASE_NF(ADD8ri)
  case X86::ADD8ri_DB:
    Is8BitOp = true;
    [[fallthrough]];
  CASE_NF(ADD16ri)
  case X86::ADD16ri_DB:
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
  CASE_NF(SUB8ri)
  CASE_NF(SUB16ri)
    /// FIXME: Support these similar to ADD8ri/ADD16ri*.
    return nullptr;
  CASE_NF(SUB32ri) {
    if (!MI.getOperand(2).isImm())
      return nullptr;
    int64_t Imm = MI.getOperand(2).getImm();
    if (!isInt<32>(-Imm))
      return nullptr;

    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
                        isKill, ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB =
        BuildMI(MF, MI.getDebugLoc(), get(Opc))
            .add(Dest)
            .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, -Imm);

    // Add kills if classifyLEAReg created a new register.
    if (LV && SrcReg != Src.getReg())
      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
    break;
  }

  CASE_NF(SUB64ri32) {
    if (!MI.getOperand(2).isImm())
      return nullptr;
    int64_t Imm = MI.getOperand(2).getImm();
    if (!isInt<32>(-Imm))
      return nullptr;

    assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");

    MachineInstrBuilder MIB =
        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
    NewMI = addOffset(MIB, -Imm);
    break;
  }

  case X86::VMOVDQU8Z128rmk:
  case X86::VMOVDQU8Z256rmk:
  case X86::VMOVDQU8Zrmk:
  case X86::VMOVDQU16Z128rmk:
  case X86::VMOVDQU16Z256rmk:
  case X86::VMOVDQU16Zrmk:
  case X86::VMOVDQU32Z128rmk:
  case X86::VMOVDQA32Z128rmk:
  case X86::VMOVDQU32Z256rmk:
  case X86::VMOVDQA32Z256rmk:
  case X86::VMOVDQU32Zrmk:
  case X86::VMOVDQA32Zrmk:
  case X86::VMOVDQU64Z128rmk:
  case X86::VMOVDQA64Z128rmk:
  case X86::VMOVDQU64Z256rmk:
  case X86::VMOVDQA64Z256rmk:
  case X86::VMOVDQU64Zrmk:
  case X86::VMOVDQA64Zrmk:
  case X86::VMOVUPDZ128rmk:
  case X86::VMOVAPDZ128rmk:
  case X86::VMOVUPDZ256rmk:
  case X86::VMOVAPDZ256rmk:
  case X86::VMOVUPDZrmk:
  case X86::VMOVAPDZrmk:
  case X86::VMOVUPSZ128rmk:
  case X86::VMOVAPSZ128rmk:
  case X86::VMOVUPSZ256rmk:
  case X86::VMOVAPSZ256rmk:
  case X86::VMOVUPSZrmk:
  case X86::VMOVAPSZrmk:
  case X86::VBROADCASTSDZ256rmk:
  case X86::VBROADCASTSDZrmk:
  case X86::VBROADCASTSSZ128rmk:
  case X86::VBROADCASTSSZ256rmk:
  case X86::VBROADCASTSSZrmk:
  case X86::VPBROADCASTDZ128rmk:
  case X86::VPBROADCASTDZ256rmk:
  case X86::VPBROADCASTDZrmk:
  case X86::VPBROADCASTQZ128rmk:
  case X86::VPBROADCASTQZ256rmk:
  case X86::VPBROADCASTQZrmk: {
    unsigned Opc;
    switch (MIOpc) {
    default:
      llvm_unreachable("Unreachable!");
    case X86::VMOVDQU8Z128rmk:
      Opc = X86::VPBLENDMBZ128rmk;
      break;
    case X86::VMOVDQU8Z256rmk:
      Opc = X86::VPBLENDMBZ256rmk;
      break;
    case X86::VMOVDQU8Zrmk:
      Opc = X86::VPBLENDMBZrmk;
      break;
    case X86::VMOVDQU16Z128rmk:
      Opc = X86::VPBLENDMWZ128rmk;
      break;
    case X86::VMOVDQU16Z256rmk:
      Opc = X86::VPBLENDMWZ256rmk;
      break;
    case X86::VMOVDQU16Zrmk:
      Opc = X86::VPBLENDMWZrmk;
      break;
    case X86::VMOVDQU32Z128rmk:
      Opc = X86::VPBLENDMDZ128rmk;
      break;
    case X86::VMOVDQU32Z256rmk:
      Opc = X86::VPBLENDMDZ256rmk;
      break;
    case X86::VMOVDQU32Zrmk:
      Opc = X86::VPBLENDMDZrmk;
      break;
    case X86::VMOVDQU64Z128rmk:
      Opc = X86::VPBLENDMQZ128rmk;
      break;
    case X86::VMOVDQU64Z256rmk:
      Opc = X86::VPBLENDMQZ256rmk;
      break;
    case X86::VMOVDQU64Zrmk:
      Opc = X86::VPBLENDMQZrmk;
      break;
    case X86::VMOVUPDZ128rmk:
      Opc = X86::VBLENDMPDZ128rmk;
      break;
    case X86::VMOVUPDZ256rmk:
      Opc = X86::VBLENDMPDZ256rmk;
      break;
    case X86::VMOVUPDZrmk:
      Opc = X86::VBLENDMPDZrmk;
      break;
    case X86::VMOVUPSZ128rmk:
      Opc = X86::VBLENDMPSZ128rmk;
      break;
    case X86::VMOVUPSZ256rmk:
      Opc = X86::VBLENDMPSZ256rmk;
      break;
    case X86::VMOVUPSZrmk:
      Opc = X86::VBLENDMPSZrmk;
      break;
    case X86::VMOVDQA32Z128rmk:
      Opc = X86::VPBLENDMDZ128rmk;
      break;
    case X86::VMOVDQA32Z256rmk:
      Opc = X86::VPBLENDMDZ256rmk;
      break;
    case X86::VMOVDQA32Zrmk:
      Opc = X86::VPBLENDMDZrmk;
      break;
    case X86::VMOVDQA64Z128rmk:
      Opc = X86::VPBLENDMQZ128rmk;
      break;
    case X86::VMOVDQA64Z256rmk:
      Opc = X86::VPBLENDMQZ256rmk;
      break;
    case X86::VMOVDQA64Zrmk:
      Opc = X86::VPBLENDMQZrmk;
      break;
    case X86::VMOVAPDZ128rmk:
      Opc = X86::VBLENDMPDZ128rmk;
      break;
    case X86::VMOVAPDZ256rmk:
      Opc = X86::VBLENDMPDZ256rmk;
      break;
    case X86::VMOVAPDZrmk:
      Opc = X86::VBLENDMPDZrmk;
      break;
    case X86::VMOVAPSZ128rmk:
      Opc = X86::VBLENDMPSZ128rmk;
      break;
    case X86::VMOVAPSZ256rmk:
      Opc = X86::VBLENDMPSZ256rmk;
      break;
    case X86::VMOVAPSZrmk:
      Opc = X86::VBLENDMPSZrmk;
      break;
    case X86::VBROADCASTSDZ256rmk:
      Opc = X86::VBLENDMPDZ256rmbk;
      break;
    case X86::VBROADCASTSDZrmk:
      Opc = X86::VBLENDMPDZrmbk;
      break;
    case X86::VBROADCASTSSZ128rmk:
      Opc = X86::VBLENDMPSZ128rmbk;
      break;
    case X86::VBROADCASTSSZ256rmk:
      Opc = X86::VBLENDMPSZ256rmbk;
      break;
    case X86::VBROADCASTSSZrmk:
      Opc = X86::VBLENDMPSZrmbk;
      break;
    case X86::VPBROADCASTDZ128rmk:
      Opc = X86::VPBLENDMDZ128rmbk;
      break;
    case X86::VPBROADCASTDZ256rmk:
      Opc = X86::VPBLENDMDZ256rmbk;
      break;
    case X86::VPBROADCASTDZrmk:
      Opc = X86::VPBLENDMDZrmbk;
      break;
    case X86::VPBROADCASTQZ128rmk:
      Opc = X86::VPBLENDMQZ128rmbk;
      break;
    case X86::VPBROADCASTQZ256rmk:
      Opc = X86::VPBLENDMQZ256rmbk;
      break;
    case X86::VPBROADCASTQZrmk:
      Opc = X86::VPBLENDMQZrmbk;
      break;
    }

    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                .add(Dest)
                .add(MI.getOperand(2))
                .add(Src)
                .add(MI.getOperand(3))
                .add(MI.getOperand(4))
                .add(MI.getOperand(5))
                .add(MI.getOperand(6))
                .add(MI.getOperand(7));
    NumRegOperands = 4;
    break;
  }

  case X86::VMOVDQU8Z128rrk:
  case X86::VMOVDQU8Z256rrk:
  case X86::VMOVDQU8Zrrk:
  case X86::VMOVDQU16Z128rrk:
  case X86::VMOVDQU16Z256rrk:
  case X86::VMOVDQU16Zrrk:
  case X86::VMOVDQU32Z128rrk:
  case X86::VMOVDQA32Z128rrk:
  case X86::VMOVDQU32Z256rrk:
  case X86::VMOVDQA32Z256rrk:
  case X86::VMOVDQU32Zrrk:
  case X86::VMOVDQA32Zrrk:
  case X86::VMOVDQU64Z128rrk:
  case X86::VMOVDQA64Z128rrk:
  case X86::VMOVDQU64Z256rrk:
  case X86::VMOVDQA64Z256rrk:
  case X86::VMOVDQU64Zrrk:
  case X86::VMOVDQA64Zrrk:
  case X86::VMOVUPDZ128rrk:
  case X86::VMOVAPDZ128rrk:
  case X86::VMOVUPDZ256rrk:
  case X86::VMOVAPDZ256rrk:
  case X86::VMOVUPDZrrk:
  case X86::VMOVAPDZrrk:
1914 case X86::VMOVUPSZ128rrk:
1915 case X86::VMOVAPSZ128rrk:
1916 case X86::VMOVUPSZ256rrk:
1917 case X86::VMOVAPSZ256rrk:
1918 case X86::VMOVUPSZrrk:
1919 case X86::VMOVAPSZrrk: {
1920 unsigned Opc;
1921 switch (MIOpc) {
1922 default:
1923 llvm_unreachable("Unreachable!");
1924 case X86::VMOVDQU8Z128rrk:
1925 Opc = X86::VPBLENDMBZ128rrk;
1926 break;
1927 case X86::VMOVDQU8Z256rrk:
1928 Opc = X86::VPBLENDMBZ256rrk;
1929 break;
1930 case X86::VMOVDQU8Zrrk:
1931 Opc = X86::VPBLENDMBZrrk;
1932 break;
1933 case X86::VMOVDQU16Z128rrk:
1934 Opc = X86::VPBLENDMWZ128rrk;
1935 break;
1936 case X86::VMOVDQU16Z256rrk:
1937 Opc = X86::VPBLENDMWZ256rrk;
1938 break;
1939 case X86::VMOVDQU16Zrrk:
1940 Opc = X86::VPBLENDMWZrrk;
1941 break;
1942 case X86::VMOVDQU32Z128rrk:
1943 Opc = X86::VPBLENDMDZ128rrk;
1944 break;
1945 case X86::VMOVDQU32Z256rrk:
1946 Opc = X86::VPBLENDMDZ256rrk;
1947 break;
1948 case X86::VMOVDQU32Zrrk:
1949 Opc = X86::VPBLENDMDZrrk;
1950 break;
1951 case X86::VMOVDQU64Z128rrk:
1952 Opc = X86::VPBLENDMQZ128rrk;
1953 break;
1954 case X86::VMOVDQU64Z256rrk:
1955 Opc = X86::VPBLENDMQZ256rrk;
1956 break;
1957 case X86::VMOVDQU64Zrrk:
1958 Opc = X86::VPBLENDMQZrrk;
1959 break;
1960 case X86::VMOVUPDZ128rrk:
1961 Opc = X86::VBLENDMPDZ128rrk;
1962 break;
1963 case X86::VMOVUPDZ256rrk:
1964 Opc = X86::VBLENDMPDZ256rrk;
1965 break;
1966 case X86::VMOVUPDZrrk:
1967 Opc = X86::VBLENDMPDZrrk;
1968 break;
1969 case X86::VMOVUPSZ128rrk:
1970 Opc = X86::VBLENDMPSZ128rrk;
1971 break;
1972 case X86::VMOVUPSZ256rrk:
1973 Opc = X86::VBLENDMPSZ256rrk;
1974 break;
1975 case X86::VMOVUPSZrrk:
1976 Opc = X86::VBLENDMPSZrrk;
1977 break;
1978 case X86::VMOVDQA32Z128rrk:
1979 Opc = X86::VPBLENDMDZ128rrk;
1980 break;
1981 case X86::VMOVDQA32Z256rrk:
1982 Opc = X86::VPBLENDMDZ256rrk;
1983 break;
1984 case X86::VMOVDQA32Zrrk:
1985 Opc = X86::VPBLENDMDZrrk;
1986 break;
1987 case X86::VMOVDQA64Z128rrk:
1988 Opc = X86::VPBLENDMQZ128rrk;
1989 break;
1990 case X86::VMOVDQA64Z256rrk:
1991 Opc = X86::VPBLENDMQZ256rrk;
1992 break;
1993 case X86::VMOVDQA64Zrrk:
1994 Opc = X86::VPBLENDMQZrrk;
1995 break;
1996 case X86::VMOVAPDZ128rrk:
1997 Opc = X86::VBLENDMPDZ128rrk;
1998 break;
1999 case X86::VMOVAPDZ256rrk:
2000 Opc = X86::VBLENDMPDZ256rrk;
2001 break;
2002 case X86::VMOVAPDZrrk:
2003 Opc = X86::VBLENDMPDZrrk;
2004 break;
2005 case X86::VMOVAPSZ128rrk:
2006 Opc = X86::VBLENDMPSZ128rrk;
2007 break;
2008 case X86::VMOVAPSZ256rrk:
2009 Opc = X86::VBLENDMPSZ256rrk;
2010 break;
2011 case X86::VMOVAPSZrrk:
2012 Opc = X86::VBLENDMPSZrrk;
2013 break;
2014 }
2015
2016 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
2017 .add(Dest)
2018 .add(MI.getOperand(2))
2019 .add(Src)
2020 .add(MI.getOperand(3));
2021 NumRegOperands = 4;
2022 break;
2023 }
2024 }
2025#undef CASE_NF
2026
2027 if (!NewMI)
2028 return nullptr;
2029
2030 if (LV) { // Update live variables
2031 for (unsigned I = 0; I < NumRegOperands; ++I) {
2032 MachineOperand &Op = MI.getOperand(I);
2033 if (Op.isReg() && (Op.isDead() || Op.isKill()))
2034 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
2035 }
2036 }
2037
2038 MachineBasicBlock &MBB = *MI.getParent();
2039 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
2040
2041 if (LIS) {
2042 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
2043 if (SrcReg)
2044 LIS->getInterval(SrcReg);
2045 if (SrcReg2)
2046 LIS->getInterval(SrcReg2);
2047 }
2048
2049 return NewMI;
2050}
2051
2052 /// This determines which of the three possible cases of a three-source
2053 /// commute the source indexes correspond to, taking into account any mask
2054 /// operands. Commuting a passthru operand is never allowed; the function
2055 /// asserts if the requested commute isn't possible.
2056/// Case 0 - Possible to commute the first and second operands.
2057/// Case 1 - Possible to commute the first and third operands.
2058/// Case 2 - Possible to commute the second and third operands.
2059static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2060 unsigned SrcOpIdx2) {
2061 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
2062 if (SrcOpIdx1 > SrcOpIdx2)
2063 std::swap(SrcOpIdx1, SrcOpIdx2);
2064
2065 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2066 if (X86II::isKMasked(TSFlags)) {
2067 Op2++;
2068 Op3++;
2069 }
2070
2071 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2072 return 0;
2073 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2074 return 1;
2075 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2076 return 2;
2077 llvm_unreachable("Unknown three src commute case.");
2078}
2079
2080 unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
2081 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
2082 const X86InstrFMA3Group &FMA3Group) const {
2083
2084 unsigned Opc = MI.getOpcode();
2085
2086 // TODO: Commuting the 1st operand of FMA*_Int requires some additional
2087 // analysis. The commute optimization is legal only if all users of FMA*_Int
2088 // use only the lowest element of the FMA*_Int instruction. Such analysis is
2089 // not implemented yet, so just return 0 in that case.
2090 // When such analysis becomes available, this will be the right place to
2091 // call it.
2092 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
2093 "Intrinsic instructions can't commute operand 1");
2094
2095 // Determine which case this commute is or if it can't be done.
2096 unsigned Case =
2097 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2098 assert(Case < 3 && "Unexpected case number!");
2099
2100 // Define the FMA forms mapping array that helps to map input FMA form
2101 // to output FMA form to preserve the operation semantics after
2102 // commuting the operands.
2103 const unsigned Form132Index = 0;
2104 const unsigned Form213Index = 1;
2105 const unsigned Form231Index = 2;
2106 static const unsigned FormMapping[][3] = {
2107 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
2108 // FMA132 A, C, b; ==> FMA231 C, A, b;
2109 // FMA213 B, A, c; ==> FMA213 A, B, c;
2110 // FMA231 C, A, b; ==> FMA132 A, C, b;
2111 {Form231Index, Form213Index, Form132Index},
2112 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
2113 // FMA132 A, c, B; ==> FMA132 B, c, A;
2114 // FMA213 B, a, C; ==> FMA231 C, a, B;
2115 // FMA231 C, a, B; ==> FMA213 B, a, C;
2116 {Form132Index, Form231Index, Form213Index},
2117 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
2118 // FMA132 a, C, B; ==> FMA213 a, B, C;
2119 // FMA213 b, A, C; ==> FMA132 b, C, A;
2120 // FMA231 c, A, B; ==> FMA231 c, B, A;
2121 {Form213Index, Form132Index, Form231Index}};
2122
2123 unsigned FMAForms[3];
2124 FMAForms[0] = FMA3Group.get132Opcode();
2125 FMAForms[1] = FMA3Group.get213Opcode();
2126 FMAForms[2] = FMA3Group.get231Opcode();
2127
2128 // Everything is ready, just adjust the FMA opcode and return it.
2129 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
2130 if (Opc == FMAForms[FormIndex])
2131 return FMAForms[FormMapping[Case][FormIndex]];
2132
2133 llvm_unreachable("Illegal FMA3 format");
2134}
2135
2136static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2137 unsigned SrcOpIdx2) {
2138 // Determine which case this commute is or if it can't be done.
2139 unsigned Case =
2140 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2141 assert(Case < 3 && "Unexpected case value!");
2142
2143 // For each case we need to swap two pairs of bits in the final immediate.
2144 static const uint8_t SwapMasks[3][4] = {
2145 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2146 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2147 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2148 };
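// For example, commuting the A and B sources of a ternlog with imm 0xCA (the
// truth table for "A ? B : C") is Case 0: clearing bits {2,3,4,5} of 0xCA
// gives 0xC2, and of those only bit 3 was set, so its partner bit 5 is set
// instead. The result, 0xE2, is exactly the truth table for "B ? A : C".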
2149
2150 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2151 // Clear out the bits we are swapping.
2152 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2153 SwapMasks[Case][2] | SwapMasks[Case][3]);
2154 // If the immediate had a bit of the pair set, then set the opposite bit.
2155 if (Imm & SwapMasks[Case][0])
2156 NewImm |= SwapMasks[Case][1];
2157 if (Imm & SwapMasks[Case][1])
2158 NewImm |= SwapMasks[Case][0];
2159 if (Imm & SwapMasks[Case][2])
2160 NewImm |= SwapMasks[Case][3];
2161 if (Imm & SwapMasks[Case][3])
2162 NewImm |= SwapMasks[Case][2];
2163 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2164}
2165
2166// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
2167// commuted.
2168static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
2169#define VPERM_CASES(Suffix) \
2170 case X86::VPERMI2##Suffix##Z128rr: \
2171 case X86::VPERMT2##Suffix##Z128rr: \
2172 case X86::VPERMI2##Suffix##Z256rr: \
2173 case X86::VPERMT2##Suffix##Z256rr: \
2174 case X86::VPERMI2##Suffix##Zrr: \
2175 case X86::VPERMT2##Suffix##Zrr: \
2176 case X86::VPERMI2##Suffix##Z128rm: \
2177 case X86::VPERMT2##Suffix##Z128rm: \
2178 case X86::VPERMI2##Suffix##Z256rm: \
2179 case X86::VPERMT2##Suffix##Z256rm: \
2180 case X86::VPERMI2##Suffix##Zrm: \
2181 case X86::VPERMT2##Suffix##Zrm: \
2182 case X86::VPERMI2##Suffix##Z128rrkz: \
2183 case X86::VPERMT2##Suffix##Z128rrkz: \
2184 case X86::VPERMI2##Suffix##Z256rrkz: \
2185 case X86::VPERMT2##Suffix##Z256rrkz: \
2186 case X86::VPERMI2##Suffix##Zrrkz: \
2187 case X86::VPERMT2##Suffix##Zrrkz: \
2188 case X86::VPERMI2##Suffix##Z128rmkz: \
2189 case X86::VPERMT2##Suffix##Z128rmkz: \
2190 case X86::VPERMI2##Suffix##Z256rmkz: \
2191 case X86::VPERMT2##Suffix##Z256rmkz: \
2192 case X86::VPERMI2##Suffix##Zrmkz: \
2193 case X86::VPERMT2##Suffix##Zrmkz:
2194
2195#define VPERM_CASES_BROADCAST(Suffix) \
2196 VPERM_CASES(Suffix) \
2197 case X86::VPERMI2##Suffix##Z128rmb: \
2198 case X86::VPERMT2##Suffix##Z128rmb: \
2199 case X86::VPERMI2##Suffix##Z256rmb: \
2200 case X86::VPERMT2##Suffix##Z256rmb: \
2201 case X86::VPERMI2##Suffix##Zrmb: \
2202 case X86::VPERMT2##Suffix##Zrmb: \
2203 case X86::VPERMI2##Suffix##Z128rmbkz: \
2204 case X86::VPERMT2##Suffix##Z128rmbkz: \
2205 case X86::VPERMI2##Suffix##Z256rmbkz: \
2206 case X86::VPERMT2##Suffix##Z256rmbkz: \
2207 case X86::VPERMI2##Suffix##Zrmbkz: \
2208 case X86::VPERMT2##Suffix##Zrmbkz:
2209
2210 switch (Opcode) {
2211 default:
2212 return false;
2213 VPERM_CASES(B)
2214 VPERM_CASES_BROADCAST(D)
2215 VPERM_CASES_BROADCAST(PD)
2216 VPERM_CASES_BROADCAST(PS)
2217 VPERM_CASES_BROADCAST(Q)
2218 VPERM_CASES(W)
2219 return true;
2220 }
2221#undef VPERM_CASES_BROADCAST
2222#undef VPERM_CASES
2223}
2224
2225// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
2226// from the I opcode to the T opcode and vice versa.
2227static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2228#define VPERM_CASES(Orig, New) \
2229 case X86::Orig##Z128rr: \
2230 return X86::New##Z128rr; \
2231 case X86::Orig##Z128rrkz: \
2232 return X86::New##Z128rrkz; \
2233 case X86::Orig##Z128rm: \
2234 return X86::New##Z128rm; \
2235 case X86::Orig##Z128rmkz: \
2236 return X86::New##Z128rmkz; \
2237 case X86::Orig##Z256rr: \
2238 return X86::New##Z256rr; \
2239 case X86::Orig##Z256rrkz: \
2240 return X86::New##Z256rrkz; \
2241 case X86::Orig##Z256rm: \
2242 return X86::New##Z256rm; \
2243 case X86::Orig##Z256rmkz: \
2244 return X86::New##Z256rmkz; \
2245 case X86::Orig##Zrr: \
2246 return X86::New##Zrr; \
2247 case X86::Orig##Zrrkz: \
2248 return X86::New##Zrrkz; \
2249 case X86::Orig##Zrm: \
2250 return X86::New##Zrm; \
2251 case X86::Orig##Zrmkz: \
2252 return X86::New##Zrmkz;
2253
2254#define VPERM_CASES_BROADCAST(Orig, New) \
2255 VPERM_CASES(Orig, New) \
2256 case X86::Orig##Z128rmb: \
2257 return X86::New##Z128rmb; \
2258 case X86::Orig##Z128rmbkz: \
2259 return X86::New##Z128rmbkz; \
2260 case X86::Orig##Z256rmb: \
2261 return X86::New##Z256rmb; \
2262 case X86::Orig##Z256rmbkz: \
2263 return X86::New##Z256rmbkz; \
2264 case X86::Orig##Zrmb: \
2265 return X86::New##Zrmb; \
2266 case X86::Orig##Zrmbkz: \
2267 return X86::New##Zrmbkz;
2268
2269 switch (Opcode) {
2270 VPERM_CASES(VPERMI2B, VPERMT2B)
2271 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2272 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2273 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2274 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2275 VPERM_CASES(VPERMI2W, VPERMT2W)
2276 VPERM_CASES(VPERMT2B, VPERMI2B)
2277 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2278 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2279 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2280 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2281 VPERM_CASES(VPERMT2W, VPERMI2W)
2282 }
2283
2284 llvm_unreachable("Unreachable!");
2285#undef VPERM_CASES_BROADCAST
2286#undef VPERM_CASES
2287}
2288
2289 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2290 unsigned OpIdx1,
2291 unsigned OpIdx2) const {
2292 auto CloneIfNew = [&](MachineInstr &MI) {
2293 return std::exchange(NewMI, false)
2294 ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
2295 : &MI;
2296 };
2297 MachineInstr *WorkingMI = nullptr;
2298 unsigned Opc = MI.getOpcode();
2299
2300#define CASE_ND(OP) \
2301 case X86::OP: \
2302 case X86::OP##_ND:
2303
2304 switch (Opc) {
2305 // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
2306 CASE_ND(SHRD16rri8)
2307 CASE_ND(SHLD16rri8)
2308 CASE_ND(SHRD32rri8)
2309 CASE_ND(SHLD32rri8)
2310 CASE_ND(SHRD64rri8)
2311 CASE_ND(SHLD64rri8) {
2312 unsigned Size;
2313 switch (Opc) {
2314 default:
2315 llvm_unreachable("Unreachable!");
2316#define FROM_TO_SIZE(A, B, S) \
2317 case X86::A: \
2318 Opc = X86::B; \
2319 Size = S; \
2320 break; \
2321 case X86::A##_ND: \
2322 Opc = X86::B##_ND; \
2323 Size = S; \
2324 break; \
2325 case X86::B: \
2326 Opc = X86::A; \
2327 Size = S; \
2328 break; \
2329 case X86::B##_ND: \
2330 Opc = X86::A##_ND; \
2331 Size = S; \
2332 break;
2333
2334 FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
2335 FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
2336 FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
2337#undef FROM_TO_SIZE
2338 }
2339 WorkingMI = CloneIfNew(MI);
2340 WorkingMI->setDesc(get(Opc));
2341 WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
2342 break;
2343 }
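// For example, with 16-bit operands SHLD a, b, 4 computes (a << 4) | (b >> 12)
// and SHRD b, a, 12 computes (b >> 12) | (a << 4), the same value; commuting
// the register operands therefore only needs the opposite opcode and the
// shift amount replaced by BitWidth - I.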
2344 case X86::PFSUBrr:
2345 case X86::PFSUBRrr:
2346 // PFSUB x, y: x = x - y
2347 // PFSUBR x, y: x = y - x
2348 WorkingMI = CloneIfNew(MI);
2349 WorkingMI->setDesc(
2350 get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
2351 break;
2352 case X86::BLENDPDrri:
2353 case X86::BLENDPSrri:
2354 case X86::PBLENDWrri:
2355 case X86::VBLENDPDrri:
2356 case X86::VBLENDPSrri:
2357 case X86::VBLENDPDYrri:
2358 case X86::VBLENDPSYrri:
2359 case X86::VPBLENDDrri:
2360 case X86::VPBLENDWrri:
2361 case X86::VPBLENDDYrri:
2362 case X86::VPBLENDWYrri: {
2363 int8_t Mask;
2364 switch (Opc) {
2365 default:
2366 llvm_unreachable("Unreachable!");
2367 case X86::BLENDPDrri:
2368 Mask = (int8_t)0x03;
2369 break;
2370 case X86::BLENDPSrri:
2371 Mask = (int8_t)0x0F;
2372 break;
2373 case X86::PBLENDWrri:
2374 Mask = (int8_t)0xFF;
2375 break;
2376 case X86::VBLENDPDrri:
2377 Mask = (int8_t)0x03;
2378 break;
2379 case X86::VBLENDPSrri:
2380 Mask = (int8_t)0x0F;
2381 break;
2382 case X86::VBLENDPDYrri:
2383 Mask = (int8_t)0x0F;
2384 break;
2385 case X86::VBLENDPSYrri:
2386 Mask = (int8_t)0xFF;
2387 break;
2388 case X86::VPBLENDDrri:
2389 Mask = (int8_t)0x0F;
2390 break;
2391 case X86::VPBLENDWrri:
2392 Mask = (int8_t)0xFF;
2393 break;
2394 case X86::VPBLENDDYrri:
2395 Mask = (int8_t)0xFF;
2396 break;
2397 case X86::VPBLENDWYrri:
2398 Mask = (int8_t)0xFF;
2399 break;
2400 }
2401 // Only the least significant bits of Imm are used.
2402 // Using int8_t to ensure it will be sign extended to the int64_t that
2403 // setImm takes in order to match isel behavior.
2404 int8_t Imm = MI.getOperand(3).getImm() & Mask;
2405 WorkingMI = CloneIfNew(MI);
2406 WorkingMI->getOperand(3).setImm(Mask ^ Imm);
2407 break;
2408 }
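// For example, BLENDPSrri with imm 0x05 takes elements 0 and 2 from the
// second source and elements 1 and 3 from the first; after the sources are
// commuted the same result requires the complemented mask, 0x0F ^ 0x05 = 0x0A.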
2409 case X86::INSERTPSrri:
2410 case X86::VINSERTPSrri:
2411 case X86::VINSERTPSZrri: {
2412 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2413 unsigned ZMask = Imm & 15;
2414 unsigned DstIdx = (Imm >> 4) & 3;
2415 unsigned SrcIdx = (Imm >> 6) & 3;
2416
2417 // We can commute insertps if we zero 2 of the elements, the insertion is
2418 // "inline" and we don't override the insertion with a zero.
2419 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2420 llvm::popcount(ZMask) == 2) {
2421 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2422 assert(AltIdx < 4 && "Illegal insertion index");
2423 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2424 WorkingMI = CloneIfNew(MI);
2425 WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2426 break;
2427 }
2428 return nullptr;
2429 }
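// For example, imm 0x55 (SrcIdx = DstIdx = 1, ZMask = 0b0101) writes src[1]
// into lane 1 and zeroes lanes 0 and 2. With the sources exchanged, the
// rewritten imm 0xF5 re-inserts the surviving old-dst lane 3 inline
// (AltIdx = 3) and keeps lanes 0 and 2 zeroed, so lane 1 now simply flows
// through from what is now the first source.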
2430 case X86::MOVSDrr:
2431 case X86::MOVSSrr:
2432 case X86::VMOVSDrr:
2433 case X86::VMOVSSrr: {
2434 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2435 if (Subtarget.hasSSE41()) {
2436 unsigned Mask;
2437 switch (Opc) {
2438 default:
2439 llvm_unreachable("Unreachable!");
2440 case X86::MOVSDrr:
2441 Opc = X86::BLENDPDrri;
2442 Mask = 0x02;
2443 break;
2444 case X86::MOVSSrr:
2445 Opc = X86::BLENDPSrri;
2446 Mask = 0x0E;
2447 break;
2448 case X86::VMOVSDrr:
2449 Opc = X86::VBLENDPDrri;
2450 Mask = 0x02;
2451 break;
2452 case X86::VMOVSSrr:
2453 Opc = X86::VBLENDPSrri;
2454 Mask = 0x0E;
2455 break;
2456 }
2457
2458 WorkingMI = CloneIfNew(MI);
2459 WorkingMI->setDesc(get(Opc));
2460 WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
2461 break;
2462 }
2463
2464 assert(Opc == X86::MOVSDrr && "Only MOVSD can commute to SHUFPD");
2465 WorkingMI = CloneIfNew(MI);
2466 WorkingMI->setDesc(get(X86::SHUFPDrri));
2467 WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
2468 break;
2469 }
2470 case X86::SHUFPDrri: {
2471 // Commute to MOVSD.
2472 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2473 WorkingMI = CloneIfNew(MI);
2474 WorkingMI->setDesc(get(X86::MOVSDrr));
2475 WorkingMI->removeOperand(3);
2476 break;
2477 }
2478 case X86::PCLMULQDQrri:
2479 case X86::VPCLMULQDQrri:
2480 case X86::VPCLMULQDQYrri:
2481 case X86::VPCLMULQDQZrri:
2482 case X86::VPCLMULQDQZ128rri:
2483 case X86::VPCLMULQDQZ256rri: {
2484 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2485 // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2486 unsigned Imm = MI.getOperand(3).getImm();
2487 unsigned Src1Hi = Imm & 0x01;
2488 unsigned Src2Hi = Imm & 0x10;
2489 WorkingMI = CloneIfNew(MI);
2490 WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2491 break;
2492 }
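// For example, imm 0x01 multiplies SRC1[127:64] by SRC2[63:0]; after the
// sources are swapped the same product is requested by imm 0x10, which
// selects SRC1[63:0] and SRC2[127:64]. Carry-less multiplication commutes,
// so the result is unchanged.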
2493 case X86::VPCMPBZ128rri:
2494 case X86::VPCMPUBZ128rri:
2495 case X86::VPCMPBZ256rri:
2496 case X86::VPCMPUBZ256rri:
2497 case X86::VPCMPBZrri:
2498 case X86::VPCMPUBZrri:
2499 case X86::VPCMPDZ128rri:
2500 case X86::VPCMPUDZ128rri:
2501 case X86::VPCMPDZ256rri:
2502 case X86::VPCMPUDZ256rri:
2503 case X86::VPCMPDZrri:
2504 case X86::VPCMPUDZrri:
2505 case X86::VPCMPQZ128rri:
2506 case X86::VPCMPUQZ128rri:
2507 case X86::VPCMPQZ256rri:
2508 case X86::VPCMPUQZ256rri:
2509 case X86::VPCMPQZrri:
2510 case X86::VPCMPUQZrri:
2511 case X86::VPCMPWZ128rri:
2512 case X86::VPCMPUWZ128rri:
2513 case X86::VPCMPWZ256rri:
2514 case X86::VPCMPUWZ256rri:
2515 case X86::VPCMPWZrri:
2516 case X86::VPCMPUWZrri:
2517 case X86::VPCMPBZ128rrik:
2518 case X86::VPCMPUBZ128rrik:
2519 case X86::VPCMPBZ256rrik:
2520 case X86::VPCMPUBZ256rrik:
2521 case X86::VPCMPBZrrik:
2522 case X86::VPCMPUBZrrik:
2523 case X86::VPCMPDZ128rrik:
2524 case X86::VPCMPUDZ128rrik:
2525 case X86::VPCMPDZ256rrik:
2526 case X86::VPCMPUDZ256rrik:
2527 case X86::VPCMPDZrrik:
2528 case X86::VPCMPUDZrrik:
2529 case X86::VPCMPQZ128rrik:
2530 case X86::VPCMPUQZ128rrik:
2531 case X86::VPCMPQZ256rrik:
2532 case X86::VPCMPUQZ256rrik:
2533 case X86::VPCMPQZrrik:
2534 case X86::VPCMPUQZrrik:
2535 case X86::VPCMPWZ128rrik:
2536 case X86::VPCMPUWZ128rrik:
2537 case X86::VPCMPWZ256rrik:
2538 case X86::VPCMPUWZ256rrik:
2539 case X86::VPCMPWZrrik:
2540 case X86::VPCMPUWZrrik:
2541 WorkingMI = CloneIfNew(MI);
2542 // Flip comparison mode immediate (if necessary).
2543 WorkingMI->getOperand(MI.getNumOperands() - 1)
2544 .setImm(X86::getSwappedVPCMPImm(
2545 MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
2546 break;
2547 case X86::VPCOMBri:
2548 case X86::VPCOMUBri:
2549 case X86::VPCOMDri:
2550 case X86::VPCOMUDri:
2551 case X86::VPCOMQri:
2552 case X86::VPCOMUQri:
2553 case X86::VPCOMWri:
2554 case X86::VPCOMUWri:
2555 WorkingMI = CloneIfNew(MI);
2556 // Flip comparison mode immediate (if necessary).
2557 WorkingMI->getOperand(3).setImm(
2558 X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
2559 break;
2560 case X86::VCMPSDZrri:
2561 case X86::VCMPSSZrri:
2562 case X86::VCMPPDZrri:
2563 case X86::VCMPPSZrri:
2564 case X86::VCMPSHZrri:
2565 case X86::VCMPPHZrri:
2566 case X86::VCMPPHZ128rri:
2567 case X86::VCMPPHZ256rri:
2568 case X86::VCMPPDZ128rri:
2569 case X86::VCMPPSZ128rri:
2570 case X86::VCMPPDZ256rri:
2571 case X86::VCMPPSZ256rri:
2572 case X86::VCMPPDZrrik:
2573 case X86::VCMPPSZrrik:
2574 case X86::VCMPPHZrrik:
2575 case X86::VCMPPDZ128rrik:
2576 case X86::VCMPPSZ128rrik:
2577 case X86::VCMPPHZ128rrik:
2578 case X86::VCMPPDZ256rrik:
2579 case X86::VCMPPSZ256rrik:
2580 case X86::VCMPPHZ256rrik:
2581 WorkingMI = CloneIfNew(MI);
2582 WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
2583 .setImm(X86::getSwappedVCMPImm(
2584 MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
2585 break;
2586 case X86::VPERM2F128rri:
2587 case X86::VPERM2I128rri:
2588 // Flip permute source immediate.
2589 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2590 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2591 WorkingMI = CloneIfNew(MI);
2592 WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
2593 break;
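// For example, imm 0x31 selects result.lo = Op0.hi and result.hi = Op1.hi;
// after the sources are swapped, 0x31 ^ 0x22 = 0x13 reads result.lo from the
// new Op1 (the old Op0) and result.hi from the new Op0 (the old Op1), so the
// same two halves are produced.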
2594 case X86::MOVHLPSrr:
2595 case X86::UNPCKHPDrr:
2596 case X86::VMOVHLPSrr:
2597 case X86::VUNPCKHPDrr:
2598 case X86::VMOVHLPSZrr:
2599 case X86::VUNPCKHPDZ128rr:
2600 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2601
2602 switch (Opc) {
2603 default:
2604 llvm_unreachable("Unreachable!");
2605 case X86::MOVHLPSrr:
2606 Opc = X86::UNPCKHPDrr;
2607 break;
2608 case X86::UNPCKHPDrr:
2609 Opc = X86::MOVHLPSrr;
2610 break;
2611 case X86::VMOVHLPSrr:
2612 Opc = X86::VUNPCKHPDrr;
2613 break;
2614 case X86::VUNPCKHPDrr:
2615 Opc = X86::VMOVHLPSrr;
2616 break;
2617 case X86::VMOVHLPSZrr:
2618 Opc = X86::VUNPCKHPDZ128rr;
2619 break;
2620 case X86::VUNPCKHPDZ128rr:
2621 Opc = X86::VMOVHLPSZrr;
2622 break;
2623 }
2624 WorkingMI = CloneIfNew(MI);
2625 WorkingMI->setDesc(get(Opc));
2626 break;
2627 CASE_ND(CMOV16rr)
2628 CASE_ND(CMOV32rr)
2629 CASE_ND(CMOV64rr) {
2630 WorkingMI = CloneIfNew(MI);
2631 unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2632 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2633 WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2634 break;
2635 }
2636 case X86::VPTERNLOGDZrri:
2637 case X86::VPTERNLOGDZrmi:
2638 case X86::VPTERNLOGDZ128rri:
2639 case X86::VPTERNLOGDZ128rmi:
2640 case X86::VPTERNLOGDZ256rri:
2641 case X86::VPTERNLOGDZ256rmi:
2642 case X86::VPTERNLOGQZrri:
2643 case X86::VPTERNLOGQZrmi:
2644 case X86::VPTERNLOGQZ128rri:
2645 case X86::VPTERNLOGQZ128rmi:
2646 case X86::VPTERNLOGQZ256rri:
2647 case X86::VPTERNLOGQZ256rmi:
2648 case X86::VPTERNLOGDZrrik:
2649 case X86::VPTERNLOGDZ128rrik:
2650 case X86::VPTERNLOGDZ256rrik:
2651 case X86::VPTERNLOGQZrrik:
2652 case X86::VPTERNLOGQZ128rrik:
2653 case X86::VPTERNLOGQZ256rrik:
2654 case X86::VPTERNLOGDZrrikz:
2655 case X86::VPTERNLOGDZrmikz:
2656 case X86::VPTERNLOGDZ128rrikz:
2657 case X86::VPTERNLOGDZ128rmikz:
2658 case X86::VPTERNLOGDZ256rrikz:
2659 case X86::VPTERNLOGDZ256rmikz:
2660 case X86::VPTERNLOGQZrrikz:
2661 case X86::VPTERNLOGQZrmikz:
2662 case X86::VPTERNLOGQZ128rrikz:
2663 case X86::VPTERNLOGQZ128rmikz:
2664 case X86::VPTERNLOGQZ256rrikz:
2665 case X86::VPTERNLOGQZ256rmikz:
2666 case X86::VPTERNLOGDZ128rmbi:
2667 case X86::VPTERNLOGDZ256rmbi:
2668 case X86::VPTERNLOGDZrmbi:
2669 case X86::VPTERNLOGQZ128rmbi:
2670 case X86::VPTERNLOGQZ256rmbi:
2671 case X86::VPTERNLOGQZrmbi:
2672 case X86::VPTERNLOGDZ128rmbikz:
2673 case X86::VPTERNLOGDZ256rmbikz:
2674 case X86::VPTERNLOGDZrmbikz:
2675 case X86::VPTERNLOGQZ128rmbikz:
2676 case X86::VPTERNLOGQZ256rmbikz:
2677 case X86::VPTERNLOGQZrmbikz: {
2678 WorkingMI = CloneIfNew(MI);
2679 commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
2680 break;
2681 }
2682 default:
2683 if (isCommutableVPERMV3Instruction(Opc)) {
2684 WorkingMI = CloneIfNew(MI);
2685 WorkingMI->setDesc(get(getCommutedVPERMV3Opcode(Opc)));
2686 break;
2687 }
2688
2689 if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
2690 WorkingMI = CloneIfNew(MI);
2691 WorkingMI->setDesc(
2692 get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
2693 break;
2694 }
2695 }
2696 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2697}
2698
2699bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2700 unsigned &SrcOpIdx1,
2701 unsigned &SrcOpIdx2,
2702 bool IsIntrinsic) const {
2703 uint64_t TSFlags = MI.getDesc().TSFlags;
2704
2705 unsigned FirstCommutableVecOp = 1;
2706 unsigned LastCommutableVecOp = 3;
2707 unsigned KMaskOp = -1U;
2708 if (X86II::isKMasked(TSFlags)) {
2709 // For k-zero-masked operations it is OK to commute the first vector
2710 // operand, unless this is an intrinsic instruction.
2711 // For regular k-masked operations a conservative choice is done as the
2712 // elements of the first vector operand, for which the corresponding bit
2713 // in the k-mask operand is set to 0, are copied to the result of the
2714 // instruction.
2715 // TODO/FIXME: The commute still may be legal if it is known that the
2716 // k-mask operand is set to either all ones or all zeroes.
2717 // It is also Ok to commute the 1st operand if all users of MI use only
2718 // the elements enabled by the k-mask operand. For example,
2719 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2720 // : v1[i];
2721 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2722 // // Ok, to commute v1 in FMADD213PSZrk.
2723
2724 // The k-mask operand has index = 2 for masked and zero-masked operations.
2725 KMaskOp = 2;
2726
2727 // The operand with index = 1 is used as a source for those elements for
2728 // which the corresponding bit in the k-mask is set to 0.
2729 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2730 FirstCommutableVecOp = 3;
2731
2732 LastCommutableVecOp++;
2733 } else if (IsIntrinsic) {
2734 // Commuting the first operand of an intrinsic instruction isn't possible
2735 // unless we can prove that only the lowest element of the result is used.
2736 FirstCommutableVecOp = 2;
2737 }
2738
2739 if (isMem(MI, LastCommutableVecOp))
2740 LastCommutableVecOp--;
2741
2742 // Only the first RegOpsNum operands are commutable.
2743 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2744 // that the operand is not specified/fixed.
2745 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2746 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2747 SrcOpIdx1 == KMaskOp))
2748 return false;
2749 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2750 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2751 SrcOpIdx2 == KMaskOp))
2752 return false;
2753
2754 // Look for two different register operands assumed to be commutable
2755 // regardless of the FMA opcode. The FMA opcode is adjusted later.
2756 if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2757 SrcOpIdx2 == CommuteAnyOperandIndex) {
2758 unsigned CommutableOpIdx2 = SrcOpIdx2;
2759
2760 // At least one of the operands to be commuted is not specified, and
2761 // this method is free to choose appropriate commutable operands.
2762 if (SrcOpIdx1 == SrcOpIdx2)
2763 // Neither operand is fixed. By default, set one of the commutable
2764 // operands to the last register operand of the instruction.
2765 CommutableOpIdx2 = LastCommutableVecOp;
2766 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2767 // Only one of operands is not fixed.
2768 CommutableOpIdx2 = SrcOpIdx1;
2769
2770 // CommutableOpIdx2 is well defined now. Let's choose another commutable
2771 // operand and assign its index to CommutableOpIdx1.
2772 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2773
2774 unsigned CommutableOpIdx1;
2775 for (CommutableOpIdx1 = LastCommutableVecOp;
2776 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2777 // Just ignore and skip the k-mask operand.
2778 if (CommutableOpIdx1 == KMaskOp)
2779 continue;
2780
2781 // The commuted operands must have different registers.
2782 // Otherwise, the commute transformation does not change anything and
2783 // is useless.
2784 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2785 break;
2786 }
2787
2788 // No appropriate commutable operands were found.
2789 if (CommutableOpIdx1 < FirstCommutableVecOp)
2790 return false;
2791
2792 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
2793 // to return those values.
2794 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
2795 CommutableOpIdx2))
2796 return false;
2797 }
2798
2799 return true;
2800}
2801
2802 bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2803 unsigned &SrcOpIdx1,
2804 unsigned &SrcOpIdx2) const {
2805 const MCInstrDesc &Desc = MI.getDesc();
2806 if (!Desc.isCommutable())
2807 return false;
2808
2809 switch (MI.getOpcode()) {
2810 case X86::CMPSDrri:
2811 case X86::CMPSSrri:
2812 case X86::CMPPDrri:
2813 case X86::CMPPSrri:
2814 case X86::VCMPSDrri:
2815 case X86::VCMPSSrri:
2816 case X86::VCMPPDrri:
2817 case X86::VCMPPSrri:
2818 case X86::VCMPPDYrri:
2819 case X86::VCMPPSYrri:
2820 case X86::VCMPSDZrri:
2821 case X86::VCMPSSZrri:
2822 case X86::VCMPPDZrri:
2823 case X86::VCMPPSZrri:
2824 case X86::VCMPSHZrri:
2825 case X86::VCMPPHZrri:
2826 case X86::VCMPPHZ128rri:
2827 case X86::VCMPPHZ256rri:
2828 case X86::VCMPPDZ128rri:
2829 case X86::VCMPPSZ128rri:
2830 case X86::VCMPPDZ256rri:
2831 case X86::VCMPPSZ256rri:
2832 case X86::VCMPPDZrrik:
2833 case X86::VCMPPSZrrik:
2834 case X86::VCMPPHZrrik:
2835 case X86::VCMPPDZ128rrik:
2836 case X86::VCMPPSZ128rrik:
2837 case X86::VCMPPHZ128rrik:
2838 case X86::VCMPPDZ256rrik:
2839 case X86::VCMPPSZ256rrik:
2840 case X86::VCMPPHZ256rrik: {
2841 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2842
2843 // Float comparison can be safely commuted for
2844 // Ordered/Unordered/Equal/NotEqual tests
2845 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2846 switch (Imm) {
2847 default:
2848 // EVEX versions can be commuted.
2849 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2850 break;
2851 return false;
2852 case 0x00: // EQUAL
2853 case 0x03: // UNORDERED
2854 case 0x04: // NOT EQUAL
2855 case 0x07: // ORDERED
2856 break;
2857 }
2858
2859 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2860 // when masked).
2861 // Assign them to the returned operand indices here.
2862 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2863 2 + OpOffset);
2864 }
2865 case X86::MOVSSrr:
2866 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2867 // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2868 // AVX implies sse4.1.
2869 if (Subtarget.hasSSE41())
2870 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2871 return false;
2872 case X86::SHUFPDrri:
2873 // We can commute this to MOVSD.
2874 if (MI.getOperand(3).getImm() == 0x02)
2875 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2876 return false;
2877 case X86::MOVHLPSrr:
2878 case X86::UNPCKHPDrr:
2879 case X86::VMOVHLPSrr:
2880 case X86::VUNPCKHPDrr:
2881 case X86::VMOVHLPSZrr:
2882 case X86::VUNPCKHPDZ128rr:
2883 if (Subtarget.hasSSE2())
2884 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2885 return false;
2886 case X86::VPTERNLOGDZrri:
2887 case X86::VPTERNLOGDZrmi:
2888 case X86::VPTERNLOGDZ128rri:
2889 case X86::VPTERNLOGDZ128rmi:
2890 case X86::VPTERNLOGDZ256rri:
2891 case X86::VPTERNLOGDZ256rmi:
2892 case X86::VPTERNLOGQZrri:
2893 case X86::VPTERNLOGQZrmi:
2894 case X86::VPTERNLOGQZ128rri:
2895 case X86::VPTERNLOGQZ128rmi:
2896 case X86::VPTERNLOGQZ256rri:
2897 case X86::VPTERNLOGQZ256rmi:
2898 case X86::VPTERNLOGDZrrik:
2899 case X86::VPTERNLOGDZ128rrik:
2900 case X86::VPTERNLOGDZ256rrik:
2901 case X86::VPTERNLOGQZrrik:
2902 case X86::VPTERNLOGQZ128rrik:
2903 case X86::VPTERNLOGQZ256rrik:
2904 case X86::VPTERNLOGDZrrikz:
2905 case X86::VPTERNLOGDZrmikz:
2906 case X86::VPTERNLOGDZ128rrikz:
2907 case X86::VPTERNLOGDZ128rmikz:
2908 case X86::VPTERNLOGDZ256rrikz:
2909 case X86::VPTERNLOGDZ256rmikz:
2910 case X86::VPTERNLOGQZrrikz:
2911 case X86::VPTERNLOGQZrmikz:
2912 case X86::VPTERNLOGQZ128rrikz:
2913 case X86::VPTERNLOGQZ128rmikz:
2914 case X86::VPTERNLOGQZ256rrikz:
2915 case X86::VPTERNLOGQZ256rmikz:
2916 case X86::VPTERNLOGDZ128rmbi:
2917 case X86::VPTERNLOGDZ256rmbi:
2918 case X86::VPTERNLOGDZrmbi:
2919 case X86::VPTERNLOGQZ128rmbi:
2920 case X86::VPTERNLOGQZ256rmbi:
2921 case X86::VPTERNLOGQZrmbi:
2922 case X86::VPTERNLOGDZ128rmbikz:
2923 case X86::VPTERNLOGDZ256rmbikz:
2924 case X86::VPTERNLOGDZrmbikz:
2925 case X86::VPTERNLOGQZ128rmbikz:
2926 case X86::VPTERNLOGQZ256rmbikz:
2927 case X86::VPTERNLOGQZrmbikz:
2928 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2929 case X86::VPDPWSSDYrr:
2930 case X86::VPDPWSSDrr:
2931 case X86::VPDPWSSDSYrr:
2932 case X86::VPDPWSSDSrr:
2933 case X86::VPDPWUUDrr:
2934 case X86::VPDPWUUDYrr:
2935 case X86::VPDPWUUDSrr:
2936 case X86::VPDPWUUDSYrr:
2937 case X86::VPDPBSSDSrr:
2938 case X86::VPDPBSSDSYrr:
2939 case X86::VPDPBSSDrr:
2940 case X86::VPDPBSSDYrr:
2941 case X86::VPDPBUUDSrr:
2942 case X86::VPDPBUUDSYrr:
2943 case X86::VPDPBUUDrr:
2944 case X86::VPDPBUUDYrr:
2945 case X86::VPDPBSSDSZ128rr:
2946 case X86::VPDPBSSDSZ128rrk:
2947 case X86::VPDPBSSDSZ128rrkz:
2948 case X86::VPDPBSSDSZ256rr:
2949 case X86::VPDPBSSDSZ256rrk:
2950 case X86::VPDPBSSDSZ256rrkz:
2951 case X86::VPDPBSSDSZrr:
2952 case X86::VPDPBSSDSZrrk:
2953 case X86::VPDPBSSDSZrrkz:
2954 case X86::VPDPBSSDZ128rr:
2955 case X86::VPDPBSSDZ128rrk:
2956 case X86::VPDPBSSDZ128rrkz:
2957 case X86::VPDPBSSDZ256rr:
2958 case X86::VPDPBSSDZ256rrk:
2959 case X86::VPDPBSSDZ256rrkz:
2960 case X86::VPDPBSSDZrr:
2961 case X86::VPDPBSSDZrrk:
2962 case X86::VPDPBSSDZrrkz:
2963 case X86::VPDPBUUDSZ128rr:
2964 case X86::VPDPBUUDSZ128rrk:
2965 case X86::VPDPBUUDSZ128rrkz:
2966 case X86::VPDPBUUDSZ256rr:
2967 case X86::VPDPBUUDSZ256rrk:
2968 case X86::VPDPBUUDSZ256rrkz:
2969 case X86::VPDPBUUDSZrr:
2970 case X86::VPDPBUUDSZrrk:
2971 case X86::VPDPBUUDSZrrkz:
2972 case X86::VPDPBUUDZ128rr:
2973 case X86::VPDPBUUDZ128rrk:
2974 case X86::VPDPBUUDZ128rrkz:
2975 case X86::VPDPBUUDZ256rr:
2976 case X86::VPDPBUUDZ256rrk:
2977 case X86::VPDPBUUDZ256rrkz:
2978 case X86::VPDPBUUDZrr:
2979 case X86::VPDPBUUDZrrk:
2980 case X86::VPDPBUUDZrrkz:
2981 case X86::VPDPWSSDZ128rr:
2982 case X86::VPDPWSSDZ128rrk:
2983 case X86::VPDPWSSDZ128rrkz:
2984 case X86::VPDPWSSDZ256rr:
2985 case X86::VPDPWSSDZ256rrk:
2986 case X86::VPDPWSSDZ256rrkz:
2987 case X86::VPDPWSSDZrr:
2988 case X86::VPDPWSSDZrrk:
2989 case X86::VPDPWSSDZrrkz:
2990 case X86::VPDPWSSDSZ128rr:
2991 case X86::VPDPWSSDSZ128rrk:
2992 case X86::VPDPWSSDSZ128rrkz:
2993 case X86::VPDPWSSDSZ256rr:
2994 case X86::VPDPWSSDSZ256rrk:
2995 case X86::VPDPWSSDSZ256rrkz:
2996 case X86::VPDPWSSDSZrr:
2997 case X86::VPDPWSSDSZrrk:
2998 case X86::VPDPWSSDSZrrkz:
2999 case X86::VPDPWUUDZ128rr:
3000 case X86::VPDPWUUDZ128rrk:
3001 case X86::VPDPWUUDZ128rrkz:
3002 case X86::VPDPWUUDZ256rr:
3003 case X86::VPDPWUUDZ256rrk:
3004 case X86::VPDPWUUDZ256rrkz:
3005 case X86::VPDPWUUDZrr:
3006 case X86::VPDPWUUDZrrk:
3007 case X86::VPDPWUUDZrrkz:
3008 case X86::VPDPWUUDSZ128rr:
3009 case X86::VPDPWUUDSZ128rrk:
3010 case X86::VPDPWUUDSZ128rrkz:
3011 case X86::VPDPWUUDSZ256rr:
3012 case X86::VPDPWUUDSZ256rrk:
3013 case X86::VPDPWUUDSZ256rrkz:
3014 case X86::VPDPWUUDSZrr:
3015 case X86::VPDPWUUDSZrrk:
3016 case X86::VPDPWUUDSZrrkz:
3017 case X86::VPMADD52HUQrr:
3018 case X86::VPMADD52HUQYrr:
3019 case X86::VPMADD52HUQZ128r:
3020 case X86::VPMADD52HUQZ128rk:
3021 case X86::VPMADD52HUQZ128rkz:
3022 case X86::VPMADD52HUQZ256r:
3023 case X86::VPMADD52HUQZ256rk:
3024 case X86::VPMADD52HUQZ256rkz:
3025 case X86::VPMADD52HUQZr:
3026 case X86::VPMADD52HUQZrk:
3027 case X86::VPMADD52HUQZrkz:
3028 case X86::VPMADD52LUQrr:
3029 case X86::VPMADD52LUQYrr:
3030 case X86::VPMADD52LUQZ128r:
3031 case X86::VPMADD52LUQZ128rk:
3032 case X86::VPMADD52LUQZ128rkz:
3033 case X86::VPMADD52LUQZ256r:
3034 case X86::VPMADD52LUQZ256rk:
3035 case X86::VPMADD52LUQZ256rkz:
3036 case X86::VPMADD52LUQZr:
3037 case X86::VPMADD52LUQZrk:
3038 case X86::VPMADD52LUQZrkz:
3039 case X86::VFMADDCPHZr:
3040 case X86::VFMADDCPHZrk:
3041 case X86::VFMADDCPHZrkz:
3042 case X86::VFMADDCPHZ128r:
3043 case X86::VFMADDCPHZ128rk:
3044 case X86::VFMADDCPHZ128rkz:
3045 case X86::VFMADDCPHZ256r:
3046 case X86::VFMADDCPHZ256rk:
3047 case X86::VFMADDCPHZ256rkz:
3048 case X86::VFMADDCSHZr:
3049 case X86::VFMADDCSHZrk:
3050 case X86::VFMADDCSHZrkz: {
3051 unsigned CommutableOpIdx1 = 2;
3052 unsigned CommutableOpIdx2 = 3;
3053 if (X86II::isKMasked(Desc.TSFlags)) {
3054 // Skip the mask register.
3055 ++CommutableOpIdx1;
3056 ++CommutableOpIdx2;
3057 }
3058 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3059 CommutableOpIdx2))
3060 return false;
3061 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3062 // No idea.
3063 return false;
3064 return true;
3065 }
3066
3067 default:
3068 const X86InstrFMA3Group *FMA3Group =
3069 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3070 if (FMA3Group)
3071 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3072 FMA3Group->isIntrinsic());
3073
3074 // Handle masked instructions since we need to skip over the mask input
3075 // and the preserved input.
3076 if (X86II::isKMasked(Desc.TSFlags)) {
3077 // First assume that the first input is the mask operand and skip past it.
3078 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3079 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
3080 // Check if the first input is tied. If there isn't one then we only
3081 // need to skip the mask operand which we did above.
3082 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3083 MCOI::TIED_TO) != -1)) {
3084 // If this is zero masking instruction with a tied operand, we need to
3085 // move the first index back to the first input since this must
3086 // be a 3 input instruction and we want the first two non-mask inputs.
3087 // Otherwise this is a 2 input instruction with a preserved input and
3088 // mask, so we need to move the indices to skip one more input.
3089 if (X86II::isKMergeMasked(Desc.TSFlags)) {
3090 ++CommutableOpIdx1;
3091 ++CommutableOpIdx2;
3092 } else {
3093 --CommutableOpIdx1;
3094 }
3095 }
3096
3097 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3098 CommutableOpIdx2))
3099 return false;
3100
3101 if (!MI.getOperand(SrcOpIdx1).isReg() ||
3102 !MI.getOperand(SrcOpIdx2).isReg())
3103 // No idea.
3104 return false;
3105 return true;
3106 }
3107
3108 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3109 }
3110 return false;
3111}
3112
3113 static bool isConvertibleLEA(MachineInstr *MI) {
3114 unsigned Opcode = MI->getOpcode();
3115 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3116 Opcode != X86::LEA64_32r)
3117 return false;
3118
3119 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3120 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3121 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3122
3123 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3124 Scale.getImm() > 1)
3125 return false;
3126
3127 return true;
3128}
3129
3130 bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
3131 // Currently we're interested in the following sequence only.
3132 // r3 = lea r1, r2
3133 // r5 = add r3, r4
3134 // Both r3 and r4 are killed in the add; we hope the add instruction has the
3135 // operand order
3136 // r5 = add r4, r3
3137 // so that later in X86FixupLEAs the lea instruction can be rewritten as an add.
3138 unsigned Opcode = MI.getOpcode();
3139 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3140 return false;
3141
3142 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3143 Register Reg1 = MI.getOperand(1).getReg();
3144 Register Reg2 = MI.getOperand(2).getReg();
3145
3146 // Check if Reg1 comes from LEA in the same MBB.
3147 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3148 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3149 Commute = true;
3150 return true;
3151 }
3152 }
3153
3154 // Check if Reg2 comes from LEA in the same MBB.
3155 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3156 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3157 Commute = false;
3158 return true;
3159 }
3160 }
3161
3162 return false;
3163}
3164
3165 int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
3166 unsigned Opcode = MCID.getOpcode();
3167 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
3168 X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
3169 X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
3170 return -1;
3171 // Assume that the condition code is always the last use operand.
3172 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3173 return NumUses - 1;
3174}
3175
3176 X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
3177 const MCInstrDesc &MCID = MI.getDesc();
3178 int CondNo = getCondSrcNoFromDesc(MCID);
3179 if (CondNo < 0)
3180 return X86::COND_INVALID;
3181 CondNo += MCID.getNumDefs();
3182 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3183}
3184
3185 X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
3186 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3187 : X86::COND_INVALID;
3188}
3189
3190 X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
3191 return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
3192 ? X86::getCondFromMI(MI)
3193 : X86::COND_INVALID;
3194}
3195
3196 X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
3197 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3198 : X86::COND_INVALID;
3199}
3200
3201 X86::CondCode X86::getCondFromCFCMov(const MachineInstr &MI) {
3202 return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3203 : X86::COND_INVALID;
3204}
3205
3206 X86::CondCode X86::getCondFromCCMP(const MachineInstr &MI) {
3207 return X86::isCCMPCC(MI.getOpcode()) || X86::isCTESTCC(MI.getOpcode())
3208 ? X86::getCondFromMI(MI)
3209 : X86::COND_INVALID;
3210}
3211
3212 int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) {
3213 // CCMP/CTEST has two conditional operands:
3214 // - SCC: source condition code (same as CMOV)
3215 // - DCF: destination conditional flags, which has 4 valid bits
3216 //
3217 // +----+----+----+----+
3218 // | OF | SF | ZF | CF |
3219 // +----+----+----+----+
3220 //
3221 // If SCC (the source condition code) evaluates to false, CCMP/CTEST updates
3222 // the conditional flags as follows:
3223 //
3224 // OF = DCF.OF
3225 // SF = DCF.SF
3226 // ZF = DCF.ZF
3227 // CF = DCF.CF
3228 // PF = DCF.CF
3229 // AF = 0 (Auxiliary Carry Flag)
3230 //
3231 // Otherwise, the CMP or TEST is executed and it updates the
3232 // CSPAZO flags normally.
3233 //
3234 // NOTE:
3235 // If SCC = P, then SCC evaluates to true regardless of the CSPAZO value.
3236 // If SCC = NP, then SCC evaluates to false regardless of the CSPAZO value.
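// For example, for CC = COND_E this function returns ZF: if the SCC of a
// CCMP/CTEST evaluates to false, a DCF with ZF set makes a following JE
// behave as if the suppressed comparison had found its operands equal.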
3237
3238 enum { CF = 1, ZF = 2, SF = 4, OF = 8, PF = CF };
3239
3240 switch (CC) {
3241 default:
3242 llvm_unreachable("Illegal condition code!");
3243 case X86::COND_NO:
3244 case X86::COND_NE:
3245 case X86::COND_GE:
3246 case X86::COND_G:
3247 case X86::COND_AE:
3248 case X86::COND_A:
3249 case X86::COND_NS:
3250 case X86::COND_NP:
3251 return 0;
3252 case X86::COND_O:
3253 return OF;
3254 case X86::COND_B:
3255 case X86::COND_BE:
3256 return CF;
3257 break;
3258 case X86::COND_E:
3259 case X86::COND_LE:
3260 return ZF;
3261 case X86::COND_S:
3262 case X86::COND_L:
3263 return SF;
3264 case X86::COND_P:
3265 return PF;
3266 }
3267}
3268
3269#define GET_X86_NF_TRANSFORM_TABLE
3270#define GET_X86_ND2NONND_TABLE
3271#include "X86GenInstrMapping.inc"
3272
3273 static unsigned getNewOpcFromTable(ArrayRef<X86TableEntry> Table,
3274 unsigned Opc) {
3275 const auto I = llvm::lower_bound(Table, Opc);
3276 return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
3277}
3278unsigned X86::getNFVariant(unsigned Opc) {
3279#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3280 // Make sure the tables are sorted.
3281 static std::atomic<bool> NFTableChecked(false);
3282 if (!NFTableChecked.load(std::memory_order_relaxed)) {
3283 assert(llvm::is_sorted(X86NFTransformTable) &&
3284 "X86NFTransformTable is not sorted!");
3285 NFTableChecked.store(true, std::memory_order_relaxed);
3286 }
3287#endif
3288 return getNewOpcFromTable(X86NFTransformTable, Opc);
3289}
3290
3291unsigned X86::getNonNDVariant(unsigned Opc) {
3292#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3293 // Make sure the tables are sorted.
3294 static std::atomic<bool> NDTableChecked(false);
3295 if (!NDTableChecked.load(std::memory_order_relaxed)) {
3296 assert(llvm::is_sorted(X86ND2NonNDTable) &&
3297 "X86ND2NonNDTableis not sorted!");
3298 NDTableChecked.store(true, std::memory_order_relaxed);
3299 }
3300#endif
3301 return getNewOpcFromTable(X86ND2NonNDTable, Opc);
3302}
3303
3304/// Return the inverse of the specified condition,
3305/// e.g. turning COND_E to COND_NE.
3306 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
3307 switch (CC) {
3308 default:
3309 llvm_unreachable("Illegal condition code!");
3310 case X86::COND_E:
3311 return X86::COND_NE;
3312 case X86::COND_NE:
3313 return X86::COND_E;
3314 case X86::COND_L:
3315 return X86::COND_GE;
3316 case X86::COND_LE:
3317 return X86::COND_G;
3318 case X86::COND_G:
3319 return X86::COND_LE;
3320 case X86::COND_GE:
3321 return X86::COND_L;
3322 case X86::COND_B:
3323 return X86::COND_AE;
3324 case X86::COND_BE:
3325 return X86::COND_A;
3326 case X86::COND_A:
3327 return X86::COND_BE;
3328 case X86::COND_AE:
3329 return X86::COND_B;
3330 case X86::COND_S:
3331 return X86::COND_NS;
3332 case X86::COND_NS:
3333 return X86::COND_S;
3334 case X86::COND_P:
3335 return X86::COND_NP;
3336 case X86::COND_NP:
3337 return X86::COND_P;
3338 case X86::COND_O:
3339 return X86::COND_NO;
3340 case X86::COND_NO:
3341 return X86::COND_O;
3342 case X86::COND_NE_OR_P:
3343 return X86::COND_E_AND_NP;
3344 case X86::COND_E_AND_NP:
3345 return X86::COND_NE_OR_P;
3346 }
3347}
3348
3349/// Assuming the flags are set by MI(a,b), return the condition code if we
3350/// modify the instructions such that flags are set by MI(b,a).
3351 X86::CondCode X86::GetSwappedCondition(X86::CondCode CC) {
3352 switch (CC) {
3353 default:
3354 return X86::COND_INVALID;
3355 case X86::COND_E:
3356 return X86::COND_E;
3357 case X86::COND_NE:
3358 return X86::COND_NE;
3359 case X86::COND_L:
3360 return X86::COND_G;
3361 case X86::COND_LE:
3362 return X86::COND_GE;
3363 case X86::COND_G:
3364 return X86::COND_L;
3365 case X86::COND_GE:
3366 return X86::COND_LE;
3367 case X86::COND_B:
3368 return X86::COND_A;
3369 case X86::COND_BE:
3370 return X86::COND_AE;
3371 case X86::COND_A:
3372 return X86::COND_B;
3373 case X86::COND_AE:
3374 return X86::COND_BE;
3375 }
3376}
3377
3378std::pair<X86::CondCode, bool>
3379 X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
3380 X86::CondCode CC = X86::COND_INVALID;
3381 bool NeedSwap = false;
3382 switch (Predicate) {
3383 default:
3384 break;
3385 // Floating-point Predicates
3386 case CmpInst::FCMP_UEQ:
3387 CC = X86::COND_E;
3388 break;
3389 case CmpInst::FCMP_OLT:
3390 NeedSwap = true;
3391 [[fallthrough]];
3392 case CmpInst::FCMP_OGT:
3393 CC = X86::COND_A;
3394 break;
3395 case CmpInst::FCMP_OLE:
3396 NeedSwap = true;
3397 [[fallthrough]];
3398 case CmpInst::FCMP_OGE:
3399 CC = X86::COND_AE;
3400 break;
3401 case CmpInst::FCMP_UGT:
3402 NeedSwap = true;
3403 [[fallthrough]];
3404 case CmpInst::FCMP_ULT:
3405 CC = X86::COND_B;
3406 break;
3407 case CmpInst::FCMP_UGE:
3408 NeedSwap = true;
3409 [[fallthrough]];
3410 case CmpInst::FCMP_ULE:
3411 CC = X86::COND_BE;
3412 break;
3413 case CmpInst::FCMP_ONE:
3414 CC = X86::COND_NE;
3415 break;
3416 case CmpInst::FCMP_UNO:
3417 CC = X86::COND_P;
3418 break;
3419 case CmpInst::FCMP_ORD:
3420 CC = X86::COND_NP;
3421 break;
3422 case CmpInst::FCMP_OEQ:
3423 [[fallthrough]];
3424 case CmpInst::FCMP_UNE:
3425 CC = X86::COND_INVALID;
3426 break;
3427
3428 // Integer Predicates
3429 case CmpInst::ICMP_EQ:
3430 CC = X86::COND_E;
3431 break;
3432 case CmpInst::ICMP_NE:
3433 CC = X86::COND_NE;
3434 break;
3435 case CmpInst::ICMP_UGT:
3436 CC = X86::COND_A;
3437 break;
3438 case CmpInst::ICMP_UGE:
3439 CC = X86::COND_AE;
3440 break;
3441 case CmpInst::ICMP_ULT:
3442 CC = X86::COND_B;
3443 break;
3444 case CmpInst::ICMP_ULE:
3445 CC = X86::COND_BE;
3446 break;
3447 case CmpInst::ICMP_SGT:
3448 CC = X86::COND_G;
3449 break;
3450 case CmpInst::ICMP_SGE:
3451 CC = X86::COND_GE;
3452 break;
3453 case CmpInst::ICMP_SLT:
3454 CC = X86::COND_L;
3455 break;
3456 case CmpInst::ICMP_SLE:
3457 CC = X86::COND_LE;
3458 break;
3459 }
3460
3461 return std::make_pair(CC, NeedSwap);
3462}
3463
3464/// Return a cmov opcode for the given register size in bytes, and operand type.
3465unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
3466 bool HasNDD) {
3467 switch (RegBytes) {
3468 default:
3469 llvm_unreachable("Illegal register size!");
3470#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
3471 case 2:
3472 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
3473 : GET_ND_IF_ENABLED(X86::CMOV16rr);
3474 case 4:
3475 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
3476 : GET_ND_IF_ENABLED(X86::CMOV32rr);
3477 case 8:
3478 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
3479 : GET_ND_IF_ENABLED(X86::CMOV64rr);
3480 }
3481}
3482
3483/// Get the VPCMP immediate for the given condition.
3484 unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
3485 switch (CC) {
3486 default:
3487 llvm_unreachable("Unexpected SETCC condition");
3488 case ISD::SETNE:
3489 return 4;
3490 case ISD::SETEQ:
3491 return 0;
3492 case ISD::SETULT:
3493 case ISD::SETLT:
3494 return 1;
3495 case ISD::SETUGT:
3496 case ISD::SETGT:
3497 return 6;
3498 case ISD::SETUGE:
3499 case ISD::SETGE:
3500 return 5;
3501 case ISD::SETULE:
3502 case ISD::SETLE:
3503 return 2;
3504 }
3505}
3506
3507/// Get the VPCMP immediate if the operands are swapped.
3508unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3509 switch (Imm) {
3510 default:
3511 llvm_unreachable("Unreachable!");
3512 case 0x01:
3513 Imm = 0x06;
3514 break; // LT -> NLE
3515 case 0x02:
3516 Imm = 0x05;
3517 break; // LE -> NLT
3518 case 0x05:
3519 Imm = 0x02;
3520 break; // NLT -> LE
3521 case 0x06:
3522 Imm = 0x01;
3523 break; // NLE -> LT
3524 case 0x00: // EQ
3525 case 0x03: // FALSE
3526 case 0x04: // NE
3527 case 0x07: // TRUE
3528 break;
3529 }
3530
3531 return Imm;
3532}
3533
3534/// Get the VPCOM immediate if the operands are swapped.
3535unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3536 switch (Imm) {
3537 default:
3538 llvm_unreachable("Unreachable!");
3539 case 0x00:
3540 Imm = 0x02;
3541 break; // LT -> GT
3542 case 0x01:
3543 Imm = 0x03;
3544 break; // LE -> GE
3545 case 0x02:
3546 Imm = 0x00;
3547 break; // GT -> LT
3548 case 0x03:
3549 Imm = 0x01;
3550 break; // GE -> LE
3551 case 0x04: // EQ
3552 case 0x05: // NE
3553 case 0x06: // FALSE
3554 case 0x07: // TRUE
3555 break;
3556 }
3557
3558 return Imm;
3559}
3560
3561/// Get the VCMP immediate if the operands are swapped.
3562unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3563 // Only need the lower 2 bits to distinguish.
3564 switch (Imm & 0x3) {
3565 default:
3566 llvm_unreachable("Unreachable!");
3567 case 0x00:
3568 case 0x03:
3569 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3570 break;
3571 case 0x01:
3572 case 0x02:
3573 // Need to toggle bits 3:0. Bit 4 stays the same.
3574 Imm ^= 0xf;
3575 break;
3576 }
3577
3578 return Imm;
3579}
3580
3581 unsigned X86::getVectorRegisterWidth(const MCOperandInfo &Info) {
3582 if (Info.RegClass == X86::VR128RegClassID ||
3583 Info.RegClass == X86::VR128XRegClassID)
3584 return 128;
3585 if (Info.RegClass == X86::VR256RegClassID ||
3586 Info.RegClass == X86::VR256XRegClassID)
3587 return 256;
3588 if (Info.RegClass == X86::VR512RegClassID)
3589 return 512;
3590 llvm_unreachable("Unknown register class!");
3591}
3592
3593/// Return true if the Reg is X87 register.
3594static bool isX87Reg(Register Reg) {
3595 return (Reg == X86::FPCW || Reg == X86::FPSW ||
3596 (Reg >= X86::ST0 && Reg <= X86::ST7));
3597}
3598
3599 /// Check if the instruction is an X87 instruction.
3600 bool X86::isX87Instruction(MachineInstr &MI) {
3601 // Calls and inline asm def X87 registers, so we special-case them here
3602 // because otherwise they would incorrectly be flagged as x87
3603 // instructions.
3604 if (MI.isCall() || MI.isInlineAsm())
3605 return false;
3606 for (const MachineOperand &MO : MI.operands()) {
3607 if (!MO.isReg())
3608 continue;
3609 if (isX87Reg(MO.getReg()))
3610 return true;
3611 }
3612 return false;
3613}
3614
3615 int X86::getFirstAddrOperandIdx(const MachineInstr &MI) {
3616 auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3617 return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3618 };
3619
3620 const MCInstrDesc &Desc = MI.getDesc();
3621
3622 // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3623 // instructions (fast case).
3624 if (!X86II::isPseudo(Desc.TSFlags)) {
3625 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
3626 if (MemRefIdx >= 0)
3627 return MemRefIdx + X86II::getOperandBias(Desc);
3628#ifdef EXPENSIVE_CHECKS
3629 assert(none_of(Desc.operands(), IsMemOp) &&
3630 "Got false negative from X86II::getMemoryOperandNo()!");
3631#endif
3632 return -1;
3633 }
3634
3635 // Otherwise, handle pseudo instructions by examining the type of their
3636 // operands (slow case). An instruction cannot have a memory reference if it
3637 // has fewer than AddrNumOperands (= 5) explicit operands.
3638 unsigned NumOps = Desc.getNumOperands();
3639 if (NumOps < X86::AddrNumOperands) {
3640#ifdef EXPENSIVE_CHECKS
3641 assert(none_of(Desc.operands(), IsMemOp) &&
3642 "Expected no operands to have OPERAND_MEMORY type!");
3643#endif
3644 return -1;
3645 }
3646
3647 // The first operand with type OPERAND_MEMORY indicates the start of a memory
3648 // reference. We expect the following AddrNumOperands-1 operands to also have
3649 // OPERAND_MEMORY type.
3650 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3651 if (IsMemOp(Desc.operands()[I])) {
3652#ifdef EXPENSIVE_CHECKS
3653 assert(std::all_of(Desc.operands().begin() + I,
3654 Desc.operands().begin() + I + X86::AddrNumOperands,
3655 IsMemOp) &&
3656 "Expected all five operands in the memory reference to have "
3657 "OPERAND_MEMORY type!");
3658#endif
3659 return I;
3660 }
3661 }
3662
3663 return -1;
3664}
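// Illustrative note (editorial, not from the original source): an X86 memory
// reference always consists of X86::AddrNumOperands (5) consecutive operands
// in the order base, scale, index, displacement, segment. For a hypothetical
// MIR load of [rdi + 8],
//   $eax = MOV32rm $rdi, 1, $noreg, 8, $noreg
// getFirstAddrOperandIdx() returns 1, the index of the base register.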
3665
3666const Constant *X86::getConstantFromPool(const MachineInstr &MI,
3667 unsigned OpNo) {
3668 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3669 "Unexpected number of operands!");
3670
3671 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3672 if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3673 return nullptr;
3674
3675 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3676 if (!Disp.isCPI() || Disp.getOffset() != 0)
3677 return nullptr;
3678
3679 ArrayRef<MachineConstantPoolEntry> Constants =
3680 MI.getParent()->getParent()->getConstantPool()->getConstants();
3681 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3682
3683 // Bail if this is a machine constant pool entry; we won't be able to dig
3684 // out anything useful.
3685 if (ConstantEntry.isMachineConstantPoolEntry())
3686 return nullptr;
3687
3688 return ConstantEntry.Val.ConstVal;
3689}
3690
3691bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
3692 switch (MI.getOpcode()) {
3693 case X86::TCRETURNdi:
3694 case X86::TCRETURNri:
3695 case X86::TCRETURNmi:
3696 case X86::TCRETURNdi64:
3697 case X86::TCRETURNri64:
3698 case X86::TCRETURNri64_ImpCall:
3699 case X86::TCRETURNmi64:
3700 return true;
3701 default:
3702 return false;
3703 }
3704}
3705
3706bool X86InstrInfo::canMakeTailCallConditional(
3707 SmallVectorImpl<MachineOperand> &BranchCond,
3708 const MachineInstr &TailCall) const {
3709
3710 const MachineFunction *MF = TailCall.getMF();
3711
3712 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3713 // The kernel patches thunk calls at runtime; these should never be conditional.
3714 const MachineOperand &Target = TailCall.getOperand(0);
3715 if (Target.isSymbol()) {
3716 StringRef Symbol(Target.getSymbolName());
3717 // This is currently only relevant to the r11/kernel indirect thunk.
3718 if (Symbol == "__x86_indirect_thunk_r11")
3719 return false;
3720 }
3721 }
3722
3723 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3724 TailCall.getOpcode() != X86::TCRETURNdi64) {
3725 // Only direct calls can be done with a conditional branch.
3726 return false;
3727 }
3728
3729 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3730 // Conditional tail calls confuse the Win64 unwinder.
3731 return false;
3732 }
3733
3734 assert(BranchCond.size() == 1);
3735 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3736 // Can't make a conditional tail call with this condition.
3737 return false;
3738 }
3739
3740 const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
3741 if (X86FI->getTCReturnAddrDelta() != 0 ||
3742 TailCall.getOperand(1).getImm() != 0) {
3743 // A conditional tail call cannot do any stack adjustment.
3744 return false;
3745 }
3746
3747 return true;
3748}
3749
3750void X86InstrInfo::replaceBranchWithTailCall(
3751 MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
3752 const MachineInstr &TailCall) const {
3753 assert(canMakeTailCallConditional(BranchCond, TailCall));
3755 MachineBasicBlock::iterator I = MBB.end();
3756 while (I != MBB.begin()) {
3757 --I;
3758 if (I->isDebugInstr())
3759 continue;
3760 if (!I->isBranch())
3761 assert(0 && "Can't find the branch to replace!");
3762
3763 X86::CondCode CC = X86::getCondFromBranch(*I);
3764 assert(BranchCond.size() == 1);
3765 if (CC != BranchCond[0].getImm())
3766 continue;
3767
3768 break;
3769 }
3770
3771 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3772 : X86::TCRETURNdi64cc;
3773
3774 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3775 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3776 MIB.addImm(0); // Stack offset (not used).
3777 MIB->addOperand(BranchCond[0]); // Condition.
3778 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3779
3780 // Add implicit uses and defs of all live regs potentially clobbered by the
3781 // call. This way they still appear live across the call.
3782 LivePhysRegs LiveRegs(getRegisterInfo());
3783 LiveRegs.addLiveOuts(MBB);
3784 SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
3785 LiveRegs.stepForward(*MIB, Clobbers);
3786 for (const auto &C : Clobbers) {
3787 MIB.addReg(C.first, RegState::Implicit);
3788 MIB->getOperand(MIB->getNumOperands() - 1).setIsDead();
3789 }
3790
3791 I->eraseFromParent();
3792}
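// Illustrative example (editorial, not from the original source): if a block
// ends in
//   JCC_1 %bb.tail, 4 /*COND_E*/
// and %bb.tail performs "TCRETURNdi @callee, 0", this hook rewrites the
// branch itself into "TCRETURNdicc @callee, 0, 4 /*COND_E*/", folding the
// conditional branch and the tail call into a single terminator.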
3793
3794// Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3795// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3796// fallthrough MBB cannot be identified.
3799 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3800 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3801 // and fallthrough MBB. If we find more than one, we cannot identify the
3802 // fallthrough MBB and should return nullptr.
3803 MachineBasicBlock *FallthroughBB = nullptr;
3804 for (MachineBasicBlock *Succ : MBB->successors()) {
3805 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3806 continue;
3807 // Return a nullptr if we found more than one fallthrough successor.
3808 if (FallthroughBB && FallthroughBB != TBB)
3809 return nullptr;
3810 FallthroughBB = Succ;
3811 }
3812 return FallthroughBB;
3813}
3814
3815bool X86InstrInfo::analyzeBranchImpl(
3816 MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
3817 SmallVectorImpl<MachineOperand> &Cond,
3818 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3819
3820 // Start from the bottom of the block and work up, examining the
3821 // terminator instructions.
3822 MachineBasicBlock::iterator I = MBB.end();
3823 MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3824 while (I != MBB.begin()) {
3825 --I;
3826 if (I->isDebugInstr())
3827 continue;
3828
3829 // Working from the bottom, when we see a non-terminator instruction, we're
3830 // done.
3831 if (!isUnpredicatedTerminator(*I))
3832 break;
3833
3834 // A terminator that isn't a branch can't easily be handled by this
3835 // analysis.
3836 if (!I->isBranch())
3837 return true;
3838
3839 // Handle unconditional branches.
3840 if (I->getOpcode() == X86::JMP_1) {
3841 UnCondBrIter = I;
3842
3843 if (!AllowModify) {
3844 TBB = I->getOperand(0).getMBB();
3845 continue;
3846 }
3847
3848 // If the block has any instructions after a JMP, delete them.
3849 MBB.erase(std::next(I), MBB.end());
3850
3851 Cond.clear();
3852 FBB = nullptr;
3853
3854 // Delete the JMP if it's equivalent to a fall-through.
3855 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3856 TBB = nullptr;
3857 I->eraseFromParent();
3858 I = MBB.end();
3859 UnCondBrIter = MBB.end();
3860 continue;
3861 }
3862
3863 // TBB is used to indicate the unconditional destination.
3864 TBB = I->getOperand(0).getMBB();
3865 continue;
3866 }
3867
3868 // Handle conditional branches.
3869 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3870 if (BranchCode == X86::COND_INVALID)
3871 return true; // Can't handle indirect branch.
3872
3873 // In practice we should never have an undef EFLAGS operand; if we do,
3874 // abort here, as we are not prepared to preserve the flags.
3875 if (I->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->isUndef())
3876 return true;
3877
3878 // Working from the bottom, handle the first conditional branch.
3879 if (Cond.empty()) {
3880 FBB = TBB;
3881 TBB = I->getOperand(0).getMBB();
3882 Cond.push_back(MachineOperand::CreateImm(BranchCode));
3883 CondBranches.push_back(&*I);
3884 continue;
3885 }
3886
3887 // Handle subsequent conditional branches. Only handle the case where all
3888 // conditional branches branch to the same destination and their condition
3889 // opcodes fit one of the special multi-branch idioms.
3890 assert(Cond.size() == 1);
3891 assert(TBB);
3892
3893 // If the conditions are the same, we can leave them alone.
3894 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3895 auto NewTBB = I->getOperand(0).getMBB();
3896 if (OldBranchCode == BranchCode && TBB == NewTBB)
3897 continue;
3898
3899 // If they differ, see if they fit one of the known patterns. Theoretically,
3900 // we could handle more patterns here, but we shouldn't expect to see them
3901 // if instruction selection has done a reasonable job.
3902 if (TBB == NewTBB &&
3903 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3904 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3905 BranchCode = X86::COND_NE_OR_P;
3906 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3907 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3908 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3909 return true;
3910
3911 // X86::COND_E_AND_NP usually has two different branch destinations.
3912 //
3913 // JP B1
3914 // JE B2
3915 // JMP B1
3916 // B1:
3917 // B2:
3918 //
3919 // Here this condition branches to B2 only if NP && E. It has another
3920 // equivalent form:
3921 //
3922 // JNE B1
3923 // JNP B2
3924 // JMP B1
3925 // B1:
3926 // B2:
3927 //
3928 // Similarly it branches to B2 only if E && NP. That is why this condition
3929 // is named COND_E_AND_NP.
3930 BranchCode = X86::COND_E_AND_NP;
3931 } else
3932 return true;
3933
3934 // Update the MachineOperand.
3935 Cond[0].setImm(BranchCode);
3936 CondBranches.push_back(&*I);
3937 }
3938
3939 return false;
3940}
3941
3942bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
3943 MachineBasicBlock *&TBB,
3944 MachineBasicBlock *&FBB,
3945 SmallVectorImpl<MachineOperand> &Cond,
3946 bool AllowModify) const {
3947 SmallVector<MachineInstr *, 4> CondBranches;
3948 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3949}
3950
3951static int getJumpTableIndexFromAddr(const MachineInstr &MI) {
3952 const MCInstrDesc &Desc = MI.getDesc();
3953 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3954 assert(MemRefBegin >= 0 && "instr should have memory operand");
3955 MemRefBegin += X86II::getOperandBias(Desc);
3956
3957 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3958 if (!MO.isJTI())
3959 return -1;
3960
3961 return MO.getIndex();
3962}
3963
3964static int getJumpTableIndexFromReg(const MachineRegisterInfo &MRI,
3965 Register Reg) {
3966 if (!Reg.isVirtual())
3967 return -1;
3968 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
3969 if (MI == nullptr)
3970 return -1;
3971 unsigned Opcode = MI->getOpcode();
3972 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3973 return -1;
3974 return getJumpTableIndexFromAddr(*MI);
3975}
3976
3977int X86InstrInfo::getJumpTableIndex(const MachineInstr &MI) const {
3978 unsigned Opcode = MI.getOpcode();
3979 // Switch-jump pattern for non-PIC code looks like:
3980 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3981 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3982 return getJumpTableIndexFromAddr(MI);
3983 }
3984 // The pattern for PIC code looks like:
3985 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3986 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
3987 // %2 = ADD64rr %1, %0
3988 // JMP64r %2
3989 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
3990 Register Reg = MI.getOperand(0).getReg();
3991 if (!Reg.isVirtual())
3992 return -1;
3993 const MachineFunction &MF = *MI.getParent()->getParent();
3994 const MachineRegisterInfo &MRI = MF.getRegInfo();
3995 MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
3996 if (Add == nullptr)
3997 return -1;
3998 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
3999 return -1;
4000 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
4001 if (JTI1 >= 0)
4002 return JTI1;
4003 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
4004 if (JTI2 >= 0)
4005 return JTI2;
4006 }
4007 return -1;
4008}
4009
4010bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
4011 MachineBranchPredicate &MBP,
4012 bool AllowModify) const {
4013 using namespace std::placeholders;
4014
4015 SmallVector<MachineOperand, 4> Cond;
4016 SmallVector<MachineInstr *, 4> CondBranches;
4017 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
4018 AllowModify))
4019 return true;
4020
4021 if (Cond.size() != 1)
4022 return true;
4023
4024 assert(MBP.TrueDest && "expected!");
4025
4026 if (!MBP.FalseDest)
4027 MBP.FalseDest = MBB.getNextNode();
4029 const TargetRegisterInfo *TRI = &getRegisterInfo();
4030
4031 MachineInstr *ConditionDef = nullptr;
4032 bool SingleUseCondition = true;
4033
4034 for (MachineInstr &MI : reverse(MBB)) {
4035 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
4036 ConditionDef = &MI;
4037 break;
4038 }
4039
4040 if (MI.readsRegister(X86::EFLAGS, TRI))
4041 SingleUseCondition = false;
4042 }
4043
4044 if (!ConditionDef)
4045 return true;
4046
4047 if (SingleUseCondition) {
4048 for (auto *Succ : MBB.successors())
4049 if (Succ->isLiveIn(X86::EFLAGS))
4050 SingleUseCondition = false;
4051 }
4052
4053 MBP.ConditionDef = ConditionDef;
4054 MBP.SingleUseCondition = SingleUseCondition;
4055
4056 // Currently we only recognize the simple pattern:
4057 //
4058 // test %reg, %reg
4059 // je %label
4060 //
4061 const unsigned TestOpcode =
4062 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
4063
4064 if (ConditionDef->getOpcode() == TestOpcode &&
4065 ConditionDef->getNumOperands() == 3 &&
4066 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
4067 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
4068 MBP.LHS = ConditionDef->getOperand(0);
4069 MBP.RHS = MachineOperand::CreateImm(0);
4070 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
4071 ? MachineBranchPredicate::PRED_NE
4072 : MachineBranchPredicate::PRED_EQ;
4073 return false;
4074 }
4075
4076 return true;
4077}
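// Illustrative example (editorial, not from the original source): for a
// block ending in
//   TEST64rr %0, %0, implicit-def $eflags
//   JCC_1 %bb.target, 4 /*COND_E*/
// this returns MBP.LHS = %0, MBP.RHS = immediate 0 and PRED_EQ, with
// MBP.ConditionDef pointing at the TEST instruction.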
4078
4079unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
4080 int *BytesRemoved) const {
4081 assert(!BytesRemoved && "code size not handled");
4083 MachineBasicBlock::iterator I = MBB.end();
4084 unsigned Count = 0;
4085
4086 while (I != MBB.begin()) {
4087 --I;
4088 if (I->isDebugInstr())
4089 continue;
4090 if (I->getOpcode() != X86::JMP_1 &&
4091 X86::getCondFromBranch(*I) == X86::COND_INVALID)
4092 break;
4093 // Remove the branch.
4094 I->eraseFromParent();
4095 I = MBB.end();
4096 ++Count;
4097 }
4098
4099 return Count;
4100}
4101
4102unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
4103 MachineBasicBlock *TBB,
4104 MachineBasicBlock *FBB,
4105 ArrayRef<MachineOperand> Cond,
4106 const DebugLoc &DL, int *BytesAdded) const {
4107 // Shouldn't be a fall through.
4108 assert(TBB && "insertBranch must not be told to insert a fallthrough");
4109 assert((Cond.size() == 1 || Cond.size() == 0) &&
4110 "X86 branch conditions have one component!");
4111 assert(!BytesAdded && "code size not handled");
4112
4113 if (Cond.empty()) {
4114 // Unconditional branch?
4115 assert(!FBB && "Unconditional branch with multiple successors!");
4116 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
4117 return 1;
4118 }
4119
4120 // If FBB is null, it is implied to be a fall-through block.
4121 bool FallThru = FBB == nullptr;
4122
4123 // Conditional branch.
4124 unsigned Count = 0;
4125 X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
4126 switch (CC) {
4127 case X86::COND_NE_OR_P:
4128 // Synthesize NE_OR_P with two branches.
4129 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
4130 ++Count;
4131 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
4132 ++Count;
4133 break;
4134 case X86::COND_E_AND_NP:
4135 // Use the next block of MBB as FBB if it is null.
4136 if (FBB == nullptr) {
4137 FBB = getFallThroughMBB(&MBB, TBB);
4138 assert(FBB && "MBB cannot be the last block in function when the false "
4139 "body is a fall-through.");
4140 }
4141 // Synthesize COND_E_AND_NP with two branches.
4142 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
4143 ++Count;
4144 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
4145 ++Count;
4146 break;
4147 default: {
4148 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
4149 ++Count;
4150 }
4151 }
4152 if (!FallThru) {
4153 // Two-way conditional branch. Insert the second branch.
4154 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
4155 ++Count;
4156 }
4157 return Count;
4158}
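// Illustrative example (editorial, not from the original source): inserting
// a branch on the composite condition COND_NE_OR_P to %bb.target emits two
// JCC_1 instructions,
//   JCC_1 %bb.target, 5 /*COND_NE*/
//   JCC_1 %bb.target, 10 /*COND_P*/
// plus a trailing JMP_1 to the false block when it is not a fall-through,
// so the returned count can be up to 3.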
4159
4160bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
4161 ArrayRef<MachineOperand> Cond,
4162 Register DstReg, Register TrueReg,
4163 Register FalseReg, int &CondCycles,
4164 int &TrueCycles, int &FalseCycles) const {
4165 // Not all subtargets have cmov instructions.
4166 if (!Subtarget.canUseCMOV())
4167 return false;
4168 if (Cond.size() != 1)
4169 return false;
4170 // We cannot do the composite conditions, at least not in SSA form.
4171 if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
4172 return false;
4173
4174 // Check register classes.
4175 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4176 const TargetRegisterClass *RC =
4177 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4178 if (!RC)
4179 return false;
4180
4181 // We have cmov instructions for 16-, 32-, and 64-bit general-purpose registers.
4182 if (X86::GR16RegClass.hasSubClassEq(RC) ||
4183 X86::GR32RegClass.hasSubClassEq(RC) ||
4184 X86::GR64RegClass.hasSubClassEq(RC)) {
4185 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4186 // Bridge. Probably Ivy Bridge as well.
4187 CondCycles = 2;
4188 TrueCycles = 2;
4189 FalseCycles = 2;
4190 return true;
4191 }
4192
4193 // Can't do vectors.
4194 return false;
4195}
4196
4197void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
4198 MachineBasicBlock::iterator I,
4199 const DebugLoc &DL, Register DstReg,
4200 ArrayRef<MachineOperand> Cond, Register TrueReg,
4201 Register FalseReg) const {
4202 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4203 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
4204 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4205 assert(Cond.size() == 1 && "Invalid Cond array");
4206 unsigned Opc =
4207 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4208 false /*HasMemoryOperand*/, Subtarget.hasNDD());
4209 BuildMI(MBB, I, DL, get(Opc), DstReg)
4210 .addReg(FalseReg)
4211 .addReg(TrueReg)
4212 .addImm(Cond[0].getImm());
4213}
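// Illustrative example (editorial, not from the original source): selecting
// between two 32-bit GPRs on COND_E emits
//   $dst = CMOV32rr $false_reg, $true_reg, 4 /*COND_E*/
// i.e. the first source is the value used when the condition is false and
// the second when it is true, matching the operand order built above.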
4214
4215/// Test if the given register is a physical h register.
4216static bool isHReg(Register Reg) {
4217 return X86::GR8_ABCD_HRegClass.contains(Reg);
4218}
4219
4220// Try to copy between VR128/VR64 and GR64 registers.
4221static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg,
4222 const X86Subtarget &Subtarget) {
4223 bool HasAVX = Subtarget.hasAVX();
4224 bool HasAVX512 = Subtarget.hasAVX512();
4225 bool HasEGPR = Subtarget.hasEGPR();
4226
4227 // SrcReg(MaskReg) -> DestReg(GR64)
4228 // SrcReg(MaskReg) -> DestReg(GR32)
4229
4230 // All KMASK register classes hold the same k registers, so we can test
4231 // against any one of them.
4232 if (X86::VK16RegClass.contains(SrcReg)) {
4233 if (X86::GR64RegClass.contains(DestReg)) {
4234 assert(Subtarget.hasBWI());
4235 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4236 }
4237 if (X86::GR32RegClass.contains(DestReg))
4238 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4239 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4240 }
4241
4242 // SrcReg(GR64) -> DestReg(MaskReg)
4243 // SrcReg(GR32) -> DestReg(MaskReg)
4244
4245 // All KMASK register classes hold the same k registers, so we can test
4246 // against any one of them.
4247 if (X86::VK16RegClass.contains(DestReg)) {
4248 if (X86::GR64RegClass.contains(SrcReg)) {
4249 assert(Subtarget.hasBWI());
4250 return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4251 }
4252 if (X86::GR32RegClass.contains(SrcReg))
4253 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4254 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4255 }
4256
4257 // SrcReg(VR128) -> DestReg(GR64)
4258 // SrcReg(VR64) -> DestReg(GR64)
4259 // SrcReg(GR64) -> DestReg(VR128)
4260 // SrcReg(GR64) -> DestReg(VR64)
4261
4262 if (X86::GR64RegClass.contains(DestReg)) {
4263 if (X86::VR128XRegClass.contains(SrcReg))
4264 // Copy from a VR128 register to a GR64 register.
4265 return HasAVX512 ? X86::VMOVPQIto64Zrr
4266 : HasAVX ? X86::VMOVPQIto64rr
4267 : X86::MOVPQIto64rr;
4268 if (X86::VR64RegClass.contains(SrcReg))
4269 // Copy from a VR64 register to a GR64 register.
4270 return X86::MMX_MOVD64from64rr;
4271 } else if (X86::GR64RegClass.contains(SrcReg)) {
4272 // Copy from a GR64 register to a VR128 register.
4273 if (X86::VR128XRegClass.contains(DestReg))
4274 return HasAVX512 ? X86::VMOV64toPQIZrr
4275 : HasAVX ? X86::VMOV64toPQIrr
4276 : X86::MOV64toPQIrr;
4277 // Copy from a GR64 register to a VR64 register.
4278 if (X86::VR64RegClass.contains(DestReg))
4279 return X86::MMX_MOVD64to64rr;
4280 }
4281
4282 // SrcReg(VR128) -> DestReg(GR32)
4283 // SrcReg(GR32) -> DestReg(VR128)
4284
4285 if (X86::GR32RegClass.contains(DestReg) &&
4286 X86::VR128XRegClass.contains(SrcReg))
4287 // Copy from a VR128 register to a GR32 register.
4288 return HasAVX512 ? X86::VMOVPDI2DIZrr
4289 : HasAVX ? X86::VMOVPDI2DIrr
4290 : X86::MOVPDI2DIrr;
4291
4292 if (X86::VR128XRegClass.contains(DestReg) &&
4293 X86::GR32RegClass.contains(SrcReg))
4294 // Copy from a GR32 register to a VR128 register.
4295 return HasAVX512 ? X86::VMOVDI2PDIZrr
4296 : HasAVX ? X86::VMOVDI2PDIrr
4297 : X86::MOVDI2PDIrr;
4298
4299 return 0;
4300}
4301
4302void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4303 MachineBasicBlock::iterator MI,
4304 const DebugLoc &DL, Register DestReg,
4305 Register SrcReg, bool KillSrc,
4306 bool RenamableDest, bool RenamableSrc) const {
4307 // First deal with the normal symmetric copies.
4308 bool HasAVX = Subtarget.hasAVX();
4309 bool HasVLX = Subtarget.hasVLX();
4310 bool HasEGPR = Subtarget.hasEGPR();
4311 unsigned Opc = 0;
4312 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4313 Opc = X86::MOV64rr;
4314 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4315 Opc = X86::MOV32rr;
4316 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4317 Opc = X86::MOV16rr;
4318 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4319 // Copying to or from a physical H register on x86-64 requires a NOREX
4320 // move. Otherwise use a normal move.
4321 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4322 Opc = X86::MOV8rr_NOREX;
4323 // Both operands must be encodable without an REX prefix.
4324 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4325 "8-bit H register can not be copied outside GR8_NOREX");
4326 } else
4327 Opc = X86::MOV8rr;
4328 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4329 Opc = X86::MMX_MOVQ64rr;
4330 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4331 if (HasVLX)
4332 Opc = X86::VMOVAPSZ128rr;
4333 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4334 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4335 else {
4336 // If this is an extended register and we don't have VLX we need to use a
4337 // 512-bit move.
4338 Opc = X86::VMOVAPSZrr;
4339 const TargetRegisterInfo *TRI = &getRegisterInfo();
4340 DestReg =
4341 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4342 SrcReg =
4343 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4344 }
4345 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4346 if (HasVLX)
4347 Opc = X86::VMOVAPSZ256rr;
4348 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4349 Opc = X86::VMOVAPSYrr;
4350 else {
4351 // If this is an extended register and we don't have VLX we need to use a
4352 // 512-bit move.
4353 Opc = X86::VMOVAPSZrr;
4354 const TargetRegisterInfo *TRI = &getRegisterInfo();
4355 DestReg =
4356 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4357 SrcReg =
4358 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4359 }
4360 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4361 Opc = X86::VMOVAPSZrr;
4362 // All KMASK register classes hold the same k registers, so we can test
4363 // against any one of them.
4364 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
4365 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4366 : (HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk);
4367
4368 if (!Opc)
4369 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4370
4371 if (Opc) {
4372 BuildMI(MBB, MI, DL, get(Opc), DestReg)
4373 .addReg(SrcReg, getKillRegState(KillSrc));
4374 return;
4375 }
4376
4377 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4378 // FIXME: We use a fatal error here because historically LLVM has tried
4379 // to lower some of these physreg copies and we want to ensure we get
4380 // reasonable bug reports if someone encounters a case no other testing
4381 // found. This path should be removed after the LLVM 7 release.
4382 report_fatal_error("Unable to copy EFLAGS physical register!");
4383 }
4384
4385 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4386 << RI.getName(DestReg) << '\n');
4387 report_fatal_error("Cannot emit physreg copy instruction");
4388}
4389
4390std::optional<DestSourcePair>
4392 if (MI.isMoveReg()) {
4393 // FIXME: Dirty hack for apparent invariant that doesn't hold when
4394 // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4395 // were asserted as 0 are now undef.
4396 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4397 return std::nullopt;
4398
4399 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4400 }
4401 return std::nullopt;
4402}
4403
4404static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4405 if (STI.hasFP16())
4406 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4407 if (Load)
4408 return X86::MOVSHPrm;
4409 return X86::MOVSHPmr;
4410}
4411
4412static unsigned getLoadStoreRegOpcode(Register Reg,
4413 const TargetRegisterClass *RC,
4414 bool IsStackAligned,
4415 const X86Subtarget &STI, bool Load) {
4416 bool HasAVX = STI.hasAVX();
4417 bool HasAVX512 = STI.hasAVX512();
4418 bool HasVLX = STI.hasVLX();
4419 bool HasEGPR = STI.hasEGPR();
4420
4421 assert(RC != nullptr && "Invalid target register class");
4422 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4423 default:
4424 llvm_unreachable("Unknown spill size");
4425 case 1:
4426 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
4427 if (STI.is64Bit())
4428 // Copying to or from a physical H register on x86-64 requires a NOREX
4429 // move. Otherwise use a normal move.
4430 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4431 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4432 return Load ? X86::MOV8rm : X86::MOV8mr;
4433 case 2:
4434 if (X86::VK16RegClass.hasSubClassEq(RC))
4435 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4436 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4437 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4438 return Load ? X86::MOV16rm : X86::MOV16mr;
4439 case 4:
4440 if (X86::GR32RegClass.hasSubClassEq(RC))
4441 return Load ? X86::MOV32rm : X86::MOV32mr;
4442 if (X86::FR32XRegClass.hasSubClassEq(RC))
4443 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4444 : HasAVX ? X86::VMOVSSrm_alt
4445 : X86::MOVSSrm_alt)
4446 : (HasAVX512 ? X86::VMOVSSZmr
4447 : HasAVX ? X86::VMOVSSmr
4448 : X86::MOVSSmr);
4449 if (X86::RFP32RegClass.hasSubClassEq(RC))
4450 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4451 if (X86::VK32RegClass.hasSubClassEq(RC)) {
4452 assert(STI.hasBWI() && "KMOVD requires BWI");
4453 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4454 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4455 }
4456 // All of these mask pair classes have the same spill size, so the same
4457 // kind of kmov instructions can be used with all of them.
4458 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4459 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4460 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4461 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4462 X86::VK16PAIRRegClass.hasSubClassEq(RC))
4463 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
4464 if (X86::FR16RegClass.hasSubClassEq(RC) ||
4465 X86::FR16XRegClass.hasSubClassEq(RC))
4466 return getLoadStoreOpcodeForFP16(Load, STI);
4467 llvm_unreachable("Unknown 4-byte regclass");
4468 case 8:
4469 if (X86::GR64RegClass.hasSubClassEq(RC))
4470 return Load ? X86::MOV64rm : X86::MOV64mr;
4471 if (X86::FR64XRegClass.hasSubClassEq(RC))
4472 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4473 : HasAVX ? X86::VMOVSDrm_alt
4474 : X86::MOVSDrm_alt)
4475 : (HasAVX512 ? X86::VMOVSDZmr
4476 : HasAVX ? X86::VMOVSDmr
4477 : X86::MOVSDmr);
4478 if (X86::VR64RegClass.hasSubClassEq(RC))
4479 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4480 if (X86::RFP64RegClass.hasSubClassEq(RC))
4481 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4482 if (X86::VK64RegClass.hasSubClassEq(RC)) {
4483 assert(STI.hasBWI() && "KMOVQ requires BWI");
4484 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4485 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4486 }
4487 llvm_unreachable("Unknown 8-byte regclass");
4488 case 10:
4489 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4490 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4491 case 16: {
4492 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4493 // If the stack is realigned, we can use aligned stores.
4494 if (IsStackAligned)
4495 return Load ? (HasVLX ? X86::VMOVAPSZ128rm
4496 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4497 : HasAVX ? X86::VMOVAPSrm
4498 : X86::MOVAPSrm)
4499 : (HasVLX ? X86::VMOVAPSZ128mr
4500 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4501 : HasAVX ? X86::VMOVAPSmr
4502 : X86::MOVAPSmr);
4503 else
4504 return Load ? (HasVLX ? X86::VMOVUPSZ128rm
4505 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4506 : HasAVX ? X86::VMOVUPSrm
4507 : X86::MOVUPSrm)
4508 : (HasVLX ? X86::VMOVUPSZ128mr
4509 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4510 : HasAVX ? X86::VMOVUPSmr
4511 : X86::MOVUPSmr);
4512 }
4513 llvm_unreachable("Unknown 16-byte regclass");
4514 }
4515 case 32:
4516 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4517 // If the stack is realigned, we can use aligned stores.
4518 if (IsStackAligned)
4519 return Load ? (HasVLX ? X86::VMOVAPSZ256rm
4520 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4521 : X86::VMOVAPSYrm)
4522 : (HasVLX ? X86::VMOVAPSZ256mr
4523 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4524 : X86::VMOVAPSYmr);
4525 else
4526 return Load ? (HasVLX ? X86::VMOVUPSZ256rm
4527 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4528 : X86::VMOVUPSYrm)
4529 : (HasVLX ? X86::VMOVUPSZ256mr
4530 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4531 : X86::VMOVUPSYmr);
4532 case 64:
4533 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4534 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4535 if (IsStackAligned)
4536 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4537 else
4538 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4539 case 1024:
4540 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4541 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
4542#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4543 return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4544 : GET_EGPR_IF_ENABLED(X86::TILESTORED);
4545#undef GET_EGPR_IF_ENABLED
4546 }
4547}
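// Illustrative example (editorial, not from the original source): spilling a
// 32-byte YMM register to an unaligned slot on an AVX-only subtarget selects
// X86::VMOVUPSYmr, while the same spill with VLX available selects
// X86::VMOVUPSZ256mr, per the 32-byte case above.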
4548
4549std::optional<ExtAddrMode>
4550X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
4551 const TargetRegisterInfo *TRI) const {
4552 const MCInstrDesc &Desc = MemI.getDesc();
4553 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4554 if (MemRefBegin < 0)
4555 return std::nullopt;
4556
4557 MemRefBegin += X86II::getOperandBias(Desc);
4558
4559 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4560 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4561 return std::nullopt;
4562
4563 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4564 // Displacement can be symbolic
4565 if (!DispMO.isImm())
4566 return std::nullopt;
4567
4568 ExtAddrMode AM;
4569 AM.BaseReg = BaseOp.getReg();
4570 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4571 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4572 AM.Displacement = DispMO.getImm();
4573 return AM;
4574}
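// Worked example (editorial, not from the original source): for a
// hypothetical
//   $eax = MOV32rm $rdi, 4, $rsi, 20, $noreg
// this returns BaseReg = $rdi, Scale = 4, ScaledReg = $rsi and
// Displacement = 20, i.e. the effective address $rdi + 4 * $rsi + 20.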
4575
4576bool X86InstrInfo::verifyInstruction(const MachineInstr &MI,
4577 StringRef &ErrInfo) const {
4578 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
4579 if (!AMOrNone)
4580 return true;
4581
4582 ExtAddrMode AM = *AMOrNone;
4584 if (AM.ScaledReg != X86::NoRegister) {
4585 switch (AM.Scale) {
4586 case 1:
4587 case 2:
4588 case 4:
4589 case 8:
4590 break;
4591 default:
4592 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4593 return false;
4594 }
4595 }
4596 if (!isInt<32>(AM.Displacement)) {
4597 ErrInfo = "Displacement in address must fit into 32-bit signed "
4598 "integer";
4599 return false;
4600 }
4601
4602 return true;
4603}
4604
4605bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
4606 const Register Reg,
4607 int64_t &ImmVal) const {
4608 Register MovReg = Reg;
4609 const MachineInstr *MovMI = &MI;
4610
4611 // Follow the use-def chain for SUBREG_TO_REG to find the real move
4612 // immediate instruction. It is quite common for x86-64.
4613 if (MI.isSubregToReg()) {
4614 // We use the following pattern to set up a 64-bit immediate.
4615 // %8:gr32 = MOV32r0 implicit-def dead $eflags
4616 // %6:gr64 = SUBREG_TO_REG killed %8:gr32, %subreg.sub_32bit
4617 unsigned SubIdx = MI.getOperand(2).getImm();
4618 MovReg = MI.getOperand(1).getReg();
4619 if (SubIdx != X86::sub_32bit)
4620 return false;
4621 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4622 MovMI = MRI.getUniqueVRegDef(MovReg);
4623 if (!MovMI)
4624 return false;
4625 }
4626
4627 if (MovMI->getOpcode() == X86::MOV32r0 &&
4628 MovMI->getOperand(0).getReg() == MovReg) {
4629 ImmVal = 0;
4630 return true;
4631 }
4632
4633 if (MovMI->getOpcode() != X86::MOV32ri &&
4634 MovMI->getOpcode() != X86::MOV64ri &&
4635 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4636 return false;
4637 // The MOV source can be a global address rather than an immediate.
4638 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4639 return false;
4640 ImmVal = MovMI->getOperand(1).getImm();
4641 return true;
4642}
4643
4644bool X86InstrInfo::preservesZeroValueInReg(
4645 const MachineInstr *MI, const Register NullValueReg,
4646 const TargetRegisterInfo *TRI) const {
4647 if (!MI->modifiesRegister(NullValueReg, TRI))
4648 return true;
4649 switch (MI->getOpcode()) {
4650 // Shifting a null value right/left onto itself is still a null, i.e.
4651 // rax = shl rax, X.
4652 case X86::SHR64ri:
4653 case X86::SHR32ri:
4654 case X86::SHL64ri:
4655 case X86::SHL32ri:
4656 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4657 "expected for shift opcode!");
4658 return MI->getOperand(0).getReg() == NullValueReg &&
4659 MI->getOperand(1).getReg() == NullValueReg;
4660 // Zero extend of a sub-reg of NullValueReg into itself does not change the
4661 // null value.
4662 case X86::MOV32rr:
4663 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4664 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4665 });
4666 default:
4667 return false;
4668 }
4669 llvm_unreachable("Should be handled above!");
4670}
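// Illustrative example (editorial, not from the original source): if $rax is
// known to hold zero, then
//   $rax = SHL64ri $rax, 3, implicit-def dead $eflags
// preserves the zero value (0 << 3 == 0), as does a self zero-extension
//   $eax = MOV32rr $eax
// so callers may keep treating $rax as null across such instructions.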
4671
4672bool X86InstrInfo::getMemOperandsWithOffsetWidth(
4673 const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
4674 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
4675 const TargetRegisterInfo *TRI) const {
4676 const MCInstrDesc &Desc = MemOp.getDesc();
4677 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4678 if (MemRefBegin < 0)
4679 return false;
4680
4681 MemRefBegin += X86II::getOperandBias(Desc);
4682
4683 const MachineOperand *BaseOp =
4684 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4685 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4686 return false;
4687
4688 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4689 return false;
4690
4691 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4692 X86::NoRegister)
4693 return false;
4694
4695 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4696
4697 // Displacement can be symbolic
4698 if (!DispMO.isImm())
4699 return false;
4700
4701 Offset = DispMO.getImm();
4702
4703 if (!BaseOp->isReg())
4704 return false;
4705
4706 OffsetIsScalable = false;
4707 // FIXME: Relying on memoperands() may not be the right thing to do here.
4708 // Check with the X86 maintainers, and fix it accordingly. For now, it is
4709 // OK, since `Width` is not used by the X86 back-end at the moment.
4710 Width = !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize()
4711 : LocationSize::precise(0);
4712 BaseOps.push_back(BaseOp);
4713 return true;
4714}
4715
4716static unsigned getStoreRegOpcode(Register SrcReg,
4717 const TargetRegisterClass *RC,
4718 bool IsStackAligned,
4719 const X86Subtarget &STI) {
4720 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4721}
4722
4723static unsigned getLoadRegOpcode(Register DestReg,
4724 const TargetRegisterClass *RC,
4725 bool IsStackAligned, const X86Subtarget &STI) {
4726 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4727}
4728
4729static bool isAMXOpcode(unsigned Opc) {
4730 switch (Opc) {
4731 default:
4732 return false;
4733 case X86::TILELOADD:
4734 case X86::TILESTORED:
4735 case X86::TILELOADD_EVEX:
4736 case X86::TILESTORED_EVEX:
4737 return true;
4738 }
4739}
4740
4741void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
4742 MachineBasicBlock::iterator MI,
4743 unsigned Opc, Register Reg, int FrameIdx,
4744 bool isKill) const {
4745 switch (Opc) {
4746 default:
4747 llvm_unreachable("Unexpected special opcode!");
4748 case X86::TILESTORED:
4749 case X86::TILESTORED_EVEX: {
4750 // tilestored %tmm, (%sp, %idx)
4751 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4752 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4753 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4754 MachineInstr *NewMI =
4755 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4756 .addReg(Reg, getKillRegState(isKill));
4757 MachineOperand &MO = NewMI->getOperand(X86::AddrIndexReg);
4758 MO.setReg(VirtReg);
4759 MO.setIsKill(true);
4760 break;
4761 }
4762 case X86::TILELOADD:
4763 case X86::TILELOADD_EVEX: {
4764 // tileloadd (%sp, %idx), %tmm
4765 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4766 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4767 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4768 MachineInstr *NewMI = addFrameReference(
4769 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
4770 MachineOperand &MO = NewMI->getOperand(1 + X86::AddrIndexReg);
4771 MO.setReg(VirtReg);
4772 MO.setIsKill(true);
4773 break;
4774 }
4775 }
4776}
4777
4777void X86InstrInfo::storeRegToStackSlot(
4778 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
4779 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4781
4782 Register VReg, MachineInstr::MIFlag Flags) const {
4783 const MachineFunction &MF = *MBB.getParent();
4784 const MachineFrameInfo &MFI = MF.getFrameInfo();
4785 assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
4786 "Stack slot too small for store");
4787
4788 unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
4789 bool isAligned =
4790 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4791 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4792
4793 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
4794 if (isAMXOpcode(Opc))
4795 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4796 else
4797 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4798 .addReg(SrcReg, getKillRegState(isKill))
4799 .setMIFlag(Flags);
4800}
4801
4802void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
4803 MachineBasicBlock::iterator MI,
4804 Register DestReg, int FrameIdx,
4805 const TargetRegisterClass *RC,
4806 Register VReg, unsigned SubReg,
4807 MachineInstr::MIFlag Flags) const {
4808 const MachineFunction &MF = *MBB.getParent();
4809 const MachineFrameInfo &MFI = MF.getFrameInfo();
4810 assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
4811 "Load size exceeds stack slot");
4812 unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
4813 bool isAligned =
4814 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4815 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4816
4817 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
4818 if (isAMXOpcode(Opc))
4819 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4820 else
4821 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx)
4822 .setMIFlag(Flags);
4823}
4824
4825bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
4826 Register &SrcReg2, int64_t &CmpMask,
4827 int64_t &CmpValue) const {
4828 switch (MI.getOpcode()) {
4829 default:
4830 break;
4831 case X86::CMP64ri32:
4832 case X86::CMP32ri:
4833 case X86::CMP16ri:
4834 case X86::CMP8ri:
4835 SrcReg = MI.getOperand(0).getReg();
4836 SrcReg2 = 0;
4837 if (MI.getOperand(1).isImm()) {
4838 CmpMask = ~0;
4839 CmpValue = MI.getOperand(1).getImm();
4840 } else {
4841 CmpMask = CmpValue = 0;
4842 }
4843 return true;
4844 // A SUB can be used to perform a comparison.
4845 CASE_ND(SUB64rm)
4846 CASE_ND(SUB32rm)
4847 CASE_ND(SUB16rm)
4848 CASE_ND(SUB8rm)
4849 SrcReg = MI.getOperand(1).getReg();
4850 SrcReg2 = 0;
4851 CmpMask = 0;
4852 CmpValue = 0;
4853 return true;
4854 CASE_ND(SUB64rr)
4855 CASE_ND(SUB32rr)
4856 CASE_ND(SUB16rr)
4857 CASE_ND(SUB8rr)
4858 SrcReg = MI.getOperand(1).getReg();
4859 SrcReg2 = MI.getOperand(2).getReg();
4860 CmpMask = 0;
4861 CmpValue = 0;
4862 return true;
4863 CASE_ND(SUB64ri32)
4864 CASE_ND(SUB32ri)
4865 CASE_ND(SUB16ri)
4866 CASE_ND(SUB8ri)
4867 SrcReg = MI.getOperand(1).getReg();
4868 SrcReg2 = 0;
4869 if (MI.getOperand(2).isImm()) {
4870 CmpMask = ~0;
4871 CmpValue = MI.getOperand(2).getImm();
4872 } else {
4873 CmpMask = CmpValue = 0;
4874 }
4875 return true;
4876 case X86::CMP64rr:
4877 case X86::CMP32rr:
4878 case X86::CMP16rr:
4879 case X86::CMP8rr:
4880 SrcReg = MI.getOperand(0).getReg();
4881 SrcReg2 = MI.getOperand(1).getReg();
4882 CmpMask = 0;
4883 CmpValue = 0;
4884 return true;
4885 case X86::TEST8rr:
4886 case X86::TEST16rr:
4887 case X86::TEST32rr:
4888 case X86::TEST64rr:
4889 SrcReg = MI.getOperand(0).getReg();
4890 if (MI.getOperand(1).getReg() != SrcReg)
4891 return false;
4892 // Compare against zero.
4893 SrcReg2 = 0;
4894 CmpMask = ~0;
4895 CmpValue = 0;
4896 return true;
4897 case X86::TEST64ri32:
4898 case X86::TEST32ri:
4899 case X86::TEST16ri:
4900 case X86::TEST8ri:
4901 SrcReg = MI.getOperand(0).getReg();
4902 SrcReg2 = 0;
4903 // Force identical compare.
4904 CmpMask = 0;
4905 CmpValue = 0;
4906 return true;
4907 }
4908 return false;
4909}
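// Illustrative example (editorial, not from the original source): for
//   CMP32ri $eax, 42, implicit-def $eflags
// this returns SrcReg = $eax, SrcReg2 = 0, CmpMask = ~0 and CmpValue = 42;
// for CMP32rr both source registers are returned and CmpMask/CmpValue stay 0.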
4910
4911bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4912 Register SrcReg, Register SrcReg2,
4913 int64_t ImmMask, int64_t ImmValue,
4914 const MachineInstr &OI, bool *IsSwapped,
4915 int64_t *ImmDelta) const {
4916 switch (OI.getOpcode()) {
4917 case X86::CMP64rr:
4918 case X86::CMP32rr:
4919 case X86::CMP16rr:
4920 case X86::CMP8rr:
4921 CASE_ND(SUB64rr)
4922 CASE_ND(SUB32rr)
4923 CASE_ND(SUB16rr)
4924 CASE_ND(SUB8rr) {
4925 Register OISrcReg;
4926 Register OISrcReg2;
4927 int64_t OIMask;
4928 int64_t OIValue;
4929 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4930 OIMask != ImmMask || OIValue != ImmValue)
4931 return false;
4932 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4933 *IsSwapped = false;
4934 return true;
4935 }
4936 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4937 *IsSwapped = true;
4938 return true;
4939 }
4940 return false;
4941 }
4942 case X86::CMP64ri32:
4943 case X86::CMP32ri:
4944 case X86::CMP16ri:
4945 case X86::CMP8ri:
4946 case X86::TEST64ri32:
4947 case X86::TEST32ri:
4948 case X86::TEST16ri:
4949 case X86::TEST8ri:
4950 CASE_ND(SUB64ri32)
4951 CASE_ND(SUB32ri)
4952 CASE_ND(SUB16ri)
4953 CASE_ND(SUB8ri)
4954 case X86::TEST64rr:
4955 case X86::TEST32rr:
4956 case X86::TEST16rr:
4957 case X86::TEST8rr: {
4958 if (ImmMask != 0) {
4959 Register OISrcReg;
4960 Register OISrcReg2;
4961 int64_t OIMask;
4962 int64_t OIValue;
4963 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4964 SrcReg == OISrcReg && ImmMask == OIMask) {
4965 if (OIValue == ImmValue) {
4966 *ImmDelta = 0;
4967 return true;
4968 } else if (static_cast<uint64_t>(ImmValue) ==
4969 static_cast<uint64_t>(OIValue) - 1) {
4970 *ImmDelta = -1;
4971 return true;
4972 } else if (static_cast<uint64_t>(ImmValue) ==
4973 static_cast<uint64_t>(OIValue) + 1) {
4974 *ImmDelta = 1;
4975 return true;
4976 } else {
4977 return false;
4978 }
4979 }
4980 }
4981 return FlagI.isIdenticalTo(OI);
4982 }
4983 default:
4984 return false;
4985 }
4986}
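// Illustrative example (editorial, not from the original source): given
//   %2 = SUB32rr %0, %1, implicit-def $eflags
//   CMP32rr %1, %0, implicit-def $eflags
// the CMP computes the flags of the swapped subtraction, so this returns
// true with *IsSwapped == true, and the CMP's flag users are later rewritten
// with the swapped condition codes.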
4987
4988/// Check whether the definition can be converted
4989/// to remove a comparison against zero.
4990inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4991 bool &ClearsOverflowFlag) {
4992 NoSignFlag = false;
4993 ClearsOverflowFlag = false;
4994
4995 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
4996 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
4997 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
4998 // on the EFLAGS modification of ADD actually happening in the final binary.
4999 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
5000 unsigned Flags = MI.getOperand(5).getTargetFlags();
5001 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
5002 Flags == X86II::MO_GOTNTPOFF)
5003 return false;
5004 }
5005
5006 switch (MI.getOpcode()) {
5007 default:
5008 return false;
5009
5010 // The shift instructions only modify ZF if their shift count is non-zero.
5011 // N.B.: The processor truncates the shift count depending on the encoding.
5012 CASE_ND(SAR8ri)
5013 CASE_ND(SAR16ri)
5014 CASE_ND(SAR32ri)
5015 CASE_ND(SAR64ri)
5016 CASE_ND(SHR8ri)
5017 CASE_ND(SHR16ri)
5018 CASE_ND(SHR32ri)
5019 CASE_ND(SHR64ri)
5020 return getTruncatedShiftCount(MI, 2) != 0;
5021
5022 // Some left shift instructions can be turned into LEA instructions but only
5023 // if their flags aren't used. Avoid transforming such instructions.
5024 CASE_ND(SHL8ri)
5025 CASE_ND(SHL16ri)
5026 CASE_ND(SHL32ri)
5027 CASE_ND(SHL64ri) {
5028 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
5029 if (isTruncatedShiftCountForLEA(ShAmt))
5030 return false;
5031 return ShAmt != 0;
5032 }
5033
5034 CASE_ND(SHRD16rri8)
5035 CASE_ND(SHRD32rri8)
5036 CASE_ND(SHRD64rri8)
5037 CASE_ND(SHLD16rri8)
5038 CASE_ND(SHLD32rri8)
5039 CASE_ND(SHLD64rri8)
5040 return getTruncatedShiftCount(MI, 3) != 0;
5041
5042 CASE_ND(SUB64ri32)
5043 CASE_ND(SUB32ri)
5044 CASE_ND(SUB16ri)
5045 CASE_ND(SUB8ri)
5046 CASE_ND(SUB64rr)
5047 CASE_ND(SUB32rr)
5048 CASE_ND(SUB16rr)
5049 CASE_ND(SUB8rr)
5050 CASE_ND(SUB64rm)
5051 CASE_ND(SUB32rm)
5052 CASE_ND(SUB16rm)
5053 CASE_ND(SUB8rm)
5054 CASE_ND(DEC64r)
5055 CASE_ND(DEC32r)
5056 CASE_ND(DEC16r)
5057 CASE_ND(DEC8r)
5058 CASE_ND(ADD64ri32)
5059 CASE_ND(ADD32ri)
5060 CASE_ND(ADD16ri)
5061 CASE_ND(ADD8ri)
5062 CASE_ND(ADD64rr)
5063 CASE_ND(ADD32rr)
5064 CASE_ND(ADD16rr)
5065 CASE_ND(ADD8rr)
5066 CASE_ND(ADD64rm)
5067 CASE_ND(ADD32rm)
5068 CASE_ND(ADD16rm)
5069 CASE_ND(ADD8rm)
5070 CASE_ND(INC64r)
5071 CASE_ND(INC32r)
5072 CASE_ND(INC16r)
5073 CASE_ND(INC8r)
5074 CASE_ND(ADC64ri32)
5075 CASE_ND(ADC32ri)
5076 CASE_ND(ADC16ri)
5077 CASE_ND(ADC8ri)
5078 CASE_ND(ADC64rr)
5079 CASE_ND(ADC32rr)
5080 CASE_ND(ADC16rr)
5081 CASE_ND(ADC8rr)
5082 CASE_ND(ADC64rm)
5083 CASE_ND(ADC32rm)
5084 CASE_ND(ADC16rm)
5085 CASE_ND(ADC8rm)
5086 CASE_ND(SBB64ri32)
5087 CASE_ND(SBB32ri)
5088 CASE_ND(SBB16ri)
5089 CASE_ND(SBB8ri)
5090 CASE_ND(SBB64rr)
5091 CASE_ND(SBB32rr)
5092 CASE_ND(SBB16rr)
5093 CASE_ND(SBB8rr)
5094 CASE_ND(SBB64rm)
5095 CASE_ND(SBB32rm)
5096 CASE_ND(SBB16rm)
5097 CASE_ND(SBB8rm)
5098 CASE_ND(NEG8r)
5099 CASE_ND(NEG16r)
5100 CASE_ND(NEG32r)
5101 CASE_ND(NEG64r)
5102 case X86::LZCNT16rr:
5103 case X86::LZCNT16rm:
5104 case X86::LZCNT32rr:
5105 case X86::LZCNT32rm:
5106 case X86::LZCNT64rr:
5107 case X86::LZCNT64rm:
5108 case X86::POPCNT16rr:
5109 case X86::POPCNT16rm:
5110 case X86::POPCNT32rr:
5111 case X86::POPCNT32rm:
5112 case X86::POPCNT64rr:
5113 case X86::POPCNT64rm:
5114 case X86::TZCNT16rr:
5115 case X86::TZCNT16rm:
5116 case X86::TZCNT32rr:
5117 case X86::TZCNT32rm:
5118 case X86::TZCNT64rr:
5119 case X86::TZCNT64rm:
5120 return true;
5121 CASE_ND(AND64ri32)
5122 CASE_ND(AND32ri)
5123 CASE_ND(AND16ri)
5124 CASE_ND(AND8ri)
5125 CASE_ND(AND64rr)
5126 CASE_ND(AND32rr)
5127 CASE_ND(AND16rr)
5128 CASE_ND(AND8rr)
5129 CASE_ND(AND64rm)
5130 CASE_ND(AND32rm)
5131 CASE_ND(AND16rm)
5132 CASE_ND(AND8rm)
5133 CASE_ND(XOR64ri32)
5134 CASE_ND(XOR32ri)
5135 CASE_ND(XOR16ri)
5136 CASE_ND(XOR8ri)
5137 CASE_ND(XOR64rr)
5138 CASE_ND(XOR32rr)
5139 CASE_ND(XOR16rr)
5140 CASE_ND(XOR8rr)
5141 CASE_ND(XOR64rm)
5142 CASE_ND(XOR32rm)
5143 CASE_ND(XOR16rm)
5144 CASE_ND(XOR8rm)
5145 CASE_ND(OR64ri32)
5146 CASE_ND(OR32ri)
5147 CASE_ND(OR16ri)
5148 CASE_ND(OR8ri)
5149 CASE_ND(OR64rr)
5150 CASE_ND(OR32rr)
5151 CASE_ND(OR16rr)
5152 CASE_ND(OR8rr)
5153 CASE_ND(OR64rm)
5154 CASE_ND(OR32rm)
5155 CASE_ND(OR16rm)
5156 CASE_ND(OR8rm)
5157 case X86::ANDN32rr:
5158 case X86::ANDN32rm:
5159 case X86::ANDN64rr:
5160 case X86::ANDN64rm:
5161 case X86::BLSI32rr:
5162 case X86::BLSI32rm:
5163 case X86::BLSI64rr:
5164 case X86::BLSI64rm:
5165 case X86::BLSMSK32rr:
5166 case X86::BLSMSK32rm:
5167 case X86::BLSMSK64rr:
5168 case X86::BLSMSK64rm:
5169 case X86::BLSR32rr:
5170 case X86::BLSR32rm:
5171 case X86::BLSR64rr:
5172 case X86::BLSR64rm:
5173 case X86::BLCFILL32rr:
5174 case X86::BLCFILL32rm:
5175 case X86::BLCFILL64rr:
5176 case X86::BLCFILL64rm:
5177 case X86::BLCI32rr:
5178 case X86::BLCI32rm:
5179 case X86::BLCI64rr:
5180 case X86::BLCI64rm:
5181 case X86::BLCIC32rr:
5182 case X86::BLCIC32rm:
5183 case X86::BLCIC64rr:
5184 case X86::BLCIC64rm:
5185 case X86::BLCMSK32rr:
5186 case X86::BLCMSK32rm:
5187 case X86::BLCMSK64rr:
5188 case X86::BLCMSK64rm:
5189 case X86::BLCS32rr:
5190 case X86::BLCS32rm:
5191 case X86::BLCS64rr:
5192 case X86::BLCS64rm:
5193 case X86::BLSFILL32rr:
5194 case X86::BLSFILL32rm:
5195 case X86::BLSFILL64rr:
5196 case X86::BLSFILL64rm:
5197 case X86::BLSIC32rr:
5198 case X86::BLSIC32rm:
5199 case X86::BLSIC64rr:
5200 case X86::BLSIC64rm:
5201 case X86::BZHI32rr:
5202 case X86::BZHI32rm:
5203 case X86::BZHI64rr:
5204 case X86::BZHI64rm:
5205 case X86::T1MSKC32rr:
5206 case X86::T1MSKC32rm:
5207 case X86::T1MSKC64rr:
5208 case X86::T1MSKC64rm:
5209 case X86::TZMSK32rr:
5210 case X86::TZMSK32rm:
5211 case X86::TZMSK64rr:
5212 case X86::TZMSK64rm:
5213 // These instructions clear the overflow flag just like TEST.
5214 // FIXME: These are not the only instructions in this switch that clear the
5215 // overflow flag.
5216 ClearsOverflowFlag = true;
5217 return true;
5218 case X86::BEXTR32rr:
5219 case X86::BEXTR64rr:
5220 case X86::BEXTR32rm:
5221 case X86::BEXTR64rm:
5222 case X86::BEXTRI32ri:
5223 case X86::BEXTRI32mi:
5224 case X86::BEXTRI64ri:
5225 case X86::BEXTRI64mi:
5226 // BEXTR doesn't update the sign flag so we can't use it. It does clear
5227 // the overflow flag, but that's not useful without the sign flag.
5228 NoSignFlag = true;
5229 return true;
5230 }
5231}
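// Illustrative example (editorial, not from the original source): in
//   %0 = ADD32rr %1, %2, implicit-def $eflags
//   TEST32rr %0, %0, implicit-def $eflags
//   JCC_1 %bb.target, 4 /*COND_E*/
// the TEST is removable because the ADD already set ZF (and SF) according
// to its result, which is exactly what isDefConvertible() reports.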
5232
5233/// Check whether the use can be converted to remove a comparison against zero.
5234/// Returns the EFLAGS condition and the index of the operand being compared against zero.
5235static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
5236 switch (MI.getOpcode()) {
5237 default:
5238 return std::make_pair(X86::COND_INVALID, ~0U);
5239 CASE_ND(NEG8r)
5240 CASE_ND(NEG16r)
5241 CASE_ND(NEG32r)
5242 CASE_ND(NEG64r)
5243 return std::make_pair(X86::COND_AE, 1U);
5244 case X86::LZCNT16rr:
5245 case X86::LZCNT32rr:
5246 case X86::LZCNT64rr:
5247 return std::make_pair(X86::COND_B, 1U);
5248 case X86::POPCNT16rr:
5249 case X86::POPCNT32rr:
5250 case X86::POPCNT64rr:
5251 return std::make_pair(X86::COND_E, 1U);
5252 case X86::TZCNT16rr:
5253 case X86::TZCNT32rr:
5254 case X86::TZCNT64rr:
5255 return std::make_pair(X86::COND_B, 1U);
5256 case X86::BSF16rr:
5257 case X86::BSF32rr:
5258 case X86::BSF64rr:
5259 case X86::BSR16rr:
5260 case X86::BSR32rr:
5261 case X86::BSR64rr:
5262 return std::make_pair(X86::COND_E, 2U);
5263 case X86::BLSI32rr:
5264 case X86::BLSI64rr:
5265 return std::make_pair(X86::COND_AE, 1U);
5266 case X86::BLSR32rr:
5267 case X86::BLSR64rr:
5268 case X86::BLSMSK32rr:
5269 case X86::BLSMSK64rr:
5270 return std::make_pair(X86::COND_B, 1U);
5271 // TODO: TBM instructions.
5272 }
5273}
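// Illustrative example (editorial, not from the original source): NEG sets
// CF to 0 exactly when its source was zero, so in
//   %1 = NEG32r %0, implicit-def $eflags
//   TEST32rr %0, %0, implicit-def $eflags
// a "je" user of the TEST can instead use "jae" (CF == 0) on the NEG's
// flags; that is the (COND_AE, 1) pair returned above, where 1 is the index
// of the NEG's source operand.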
5274
5275/// Check if there exists an earlier instruction that
5276/// operates on the same source operands and sets flags in the same way as
5277/// Compare; remove Compare if possible.
5278bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
5279 Register SrcReg2, int64_t CmpMask,
5280 int64_t CmpValue,
5281 const MachineRegisterInfo *MRI) const {
5282 // Check whether we can replace SUB with CMP.
5283 switch (CmpInstr.getOpcode()) {
5284 default:
5285 break;
5286 CASE_ND(SUB64ri32)
5287 CASE_ND(SUB32ri)
5288 CASE_ND(SUB16ri)
5289 CASE_ND(SUB8ri)
5290 CASE_ND(SUB64rm)
5291 CASE_ND(SUB32rm)
5292 CASE_ND(SUB16rm)
5293 CASE_ND(SUB8rm)
5294 CASE_ND(SUB64rr)
5295 CASE_ND(SUB32rr)
5296 CASE_ND(SUB16rr)
5297 CASE_ND(SUB8rr) {
5298 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
5299 return false;
5300 // There is no use of the destination register; we can replace SUB with CMP.
5301 unsigned NewOpcode = 0;
5302#define FROM_TO(A, B) \
5303 CASE_ND(A) NewOpcode = X86::B; \
5304 break;
5305 switch (CmpInstr.getOpcode()) {
5306 default:
5307 llvm_unreachable("Unreachable!");
5308 FROM_TO(SUB64rm, CMP64rm)
5309 FROM_TO(SUB32rm, CMP32rm)
5310 FROM_TO(SUB16rm, CMP16rm)
5311 FROM_TO(SUB8rm, CMP8rm)
5312 FROM_TO(SUB64rr, CMP64rr)
5313 FROM_TO(SUB32rr, CMP32rr)
5314 FROM_TO(SUB16rr, CMP16rr)
5315 FROM_TO(SUB8rr, CMP8rr)
5316 FROM_TO(SUB64ri32, CMP64ri32)
5317 FROM_TO(SUB32ri, CMP32ri)
5318 FROM_TO(SUB16ri, CMP16ri)
5319 FROM_TO(SUB8ri, CMP8ri)
5320 }
5321#undef FROM_TO
5322 CmpInstr.setDesc(get(NewOpcode));
5323 CmpInstr.removeOperand(0);
5324 // Mutating this instruction invalidates any debug data associated with it.
5325 CmpInstr.dropDebugNumber();
5326 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
5327 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
5328 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
5329 return false;
5330 }
5331 }
5332
5333 // The following code tries to remove the comparison by re-using EFLAGS
5334 // from earlier instructions.
5335
5336 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
5337
5338 // Transformation currently requires SSA values.
5339 if (SrcReg2.isPhysical())
5340 return false;
5341 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
5342 assert(SrcRegDef && "Must have a definition (SSA)");
5343
5344 MachineInstr *MI = nullptr;
5345 MachineInstr *Sub = nullptr;
5346 MachineInstr *Movr0Inst = nullptr;
5347 SmallVector<std::pair<MachineInstr *, unsigned>, 2> InstsToUpdate;
5348 bool NoSignFlag = false;
5349 bool ClearsOverflowFlag = false;
5350 bool ShouldUpdateCC = false;
5351 bool IsSwapped = false;
5352 bool HasNF = Subtarget.hasNF();
5353 unsigned OpNo = 0;
5354 X86::CondCode NewCC = X86::COND_INVALID;
5355 int64_t ImmDelta = 0;
5356
5357 // Search backward from CmpInstr for the next instruction defining EFLAGS.
5358 const TargetRegisterInfo *TRI = &getRegisterInfo();
5359 MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
5360 MachineBasicBlock::reverse_iterator From =
5361 std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
5362 for (MachineBasicBlock *MBB = &CmpMBB;;) {
5363 for (MachineInstr &Inst : make_range(From, MBB->rend())) {
5364 // Try to use EFLAGS from the instruction defining %SrcReg. Example:
5365 // %eax = addl ...
5366 // ... // EFLAGS not changed
5367 // testl %eax, %eax // <-- can be removed
5368 if (&Inst == SrcRegDef) {
5369 if (IsCmpZero &&
5370 isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
5371 MI = &Inst;
5372 break;
5373 }
5374
5375 // Look back for the following pattern, in which case the
5376 // test16rr/test64rr instruction could be erased.
5377 //
5378 // Example for test16rr:
5379 // %reg = and32ri %in_reg, 5
5380 // ... // EFLAGS not changed.
5381 // %src_reg = copy %reg.sub_16bit:gr32
5382 // test16rr %src_reg, %src_reg, implicit-def $eflags
5383 // Example for test64rr:
5384 // %reg = and32ri %in_reg, 5
5385 // ... // EFLAGS not changed.
5386 // %src_reg = subreg_to_reg %reg, %subreg.sub_index
5387 // test64rr %src_reg, %src_reg, implicit-def $eflags
5388 MachineInstr *AndInstr = nullptr;
5389 if (IsCmpZero &&
5390 findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
5391 Subtarget, NoSignFlag, ClearsOverflowFlag)) {
5392 assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
5393 MI = AndInstr;
5394 break;
5395 }
5396 // Cannot find other candidates before definition of SrcReg.
5397 return false;
5398 }
5399
5400 if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
5401 // Try to use EFLAGS produced by an instruction reading %SrcReg.
5402 // Example:
5403 // %eax = ...
5404 // ...
5405 // popcntl %eax
5406 // ... // EFLAGS not changed
5407 // testl %eax, %eax // <-- can be removed
5408 if (IsCmpZero) {
5409 std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
5410 if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
5411 Inst.getOperand(OpNo).getReg() == SrcReg) {
5412 ShouldUpdateCC = true;
5413 MI = &Inst;
5414 break;
5415 }
5416 }
5417
5418 // Try to use EFLAGS from an instruction with similar flag results.
5419 // Example:
5420 // sub x, y or cmp x, y
5421 // ... // EFLAGS not changed
5422 // cmp x, y // <-- can be removed
5423 if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
5424 Inst, &IsSwapped, &ImmDelta)) {
5425 Sub = &Inst;
5426 break;
5427 }
5428
5429 // MOV32r0 is implemented with xor, which clobbers the condition codes.
5430 // It is safe to move it up if its definition of EFLAGS is dead and
5431 // earlier instructions do not read or write EFLAGS.
5432 if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
5433 Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
5434 Movr0Inst = &Inst;
5435 continue;
5436 }
5437
5438 // For ADDrm/ADDmr instructions with relocations, we skip the optimization
5439 // of replacing non-NF with NF instructions. This keeps backward
5440 // compatibility with old versions of linkers without APX relocation type
5441 // support on Linux.
5442 bool IsWithReloc = X86EnableAPXForRelocation
5443 ? false
5444 : isAddMemInstrWithRelocation(Inst);
5445
5446 // Try to replace non-NF with NF instructions.
5447 if (HasNF && Inst.registerDefIsDead(X86::EFLAGS, TRI) && !IsWithReloc) {
5448 unsigned NewOp = X86::getNFVariant(Inst.getOpcode());
5449 if (!NewOp)
5450 return false;
5451
5452 InstsToUpdate.push_back(std::make_pair(&Inst, NewOp));
5453 continue;
5454 }
5455
5456 // Cannot do anything for any other EFLAG changes.
5457 return false;
5458 }
5459 }
5460
5461 if (MI || Sub)
5462 break;
5463
5464 // Reached begin of basic block. Continue in predecessor if there is
5465 // exactly one.
5466 if (MBB->pred_size() != 1)
5467 return false;
5468 MBB = *MBB->pred_begin();
5469 From = MBB->rbegin();
5470 }
5471
5472 // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
5473 // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
5474 // If we are done with the basic block, we need to check whether EFLAGS is
5475 // live-out.
5476 bool FlagsMayLiveOut = true;
5477 SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate;
5478 MachineBasicBlock::iterator AfterCmpInstr =
5479 std::next(MachineBasicBlock::iterator(CmpInstr));
5480 for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
5481 bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
5482 bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
5483 // If this instruction both uses and updates EFLAGS, we still need to check
5483 // how it uses them.
5484 if (!UseEFLAGS && ModifyEFLAGS) {
5485 // It is safe to remove CmpInstr if EFLAGS is updated again.
5486 FlagsMayLiveOut = false;
5487 break;
5488 }
5489 if (!UseEFLAGS && !ModifyEFLAGS)
5490 continue;
5491
5492 // EFLAGS is used by this instruction.
5493 X86::CondCode OldCC = X86::getCondFromMI(Instr);
5494 if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
5495 return false;
5496
5497 X86::CondCode ReplacementCC = X86::COND_INVALID;
5498 if (MI) {
5499 switch (OldCC) {
5500 default:
5501 break;
5502 case X86::COND_A:
5503 case X86::COND_AE:
5504 case X86::COND_B:
5505 case X86::COND_BE:
5506 // CF is used, we can't perform this optimization.
5507 return false;
5508 case X86::COND_G:
5509 case X86::COND_GE:
5510 case X86::COND_L:
5511 case X86::COND_LE:
5512 // If SF is used, but the instruction doesn't update the SF, then we
5513 // can't do the optimization.
5514 if (NoSignFlag)
5515 return false;
5516 [[fallthrough]];
5517 case X86::COND_O:
5518 case X86::COND_NO:
5519 // If OF is used, the instruction needs to clear it like CmpZero does.
5520 if (!ClearsOverflowFlag)
5521 return false;
5522 break;
5523 case X86::COND_S:
5524 case X86::COND_NS:
5525 // If SF is used, but the instruction doesn't update the SF, then we
5526 // can't do the optimization.
5527 if (NoSignFlag)
5528 return false;
5529 break;
5530 }
5531
5532 // If we're updating the condition code check if we have to reverse the
5533 // condition.
5534 if (ShouldUpdateCC)
5535 switch (OldCC) {
5536 default:
5537 return false;
5538 case X86::COND_E:
5539 ReplacementCC = NewCC;
5540 break;
5541 case X86::COND_NE:
5542 ReplacementCC = GetOppositeBranchCondition(NewCC);
5543 break;
5544 }
5545 } else if (IsSwapped) {
5546 // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
5547 // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
5548 // We swap the condition code and synthesize the new opcode.
5549 ReplacementCC = getSwappedCondition(OldCC);
5550 if (ReplacementCC == X86::COND_INVALID)
5551 return false;
5552 ShouldUpdateCC = true;
5553 } else if (ImmDelta != 0) {
5554 unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg));
5555 // Shift amount for min/max constants to adjust for 8/16/32 instruction
5556 // sizes.
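// e.g. (illustrative values, not from the original source): a 'jl' guarded by
// 'cmp $5, %eax' can reuse the flags of an earlier 'cmp $4, %eax' by being
// rewritten to 'jle', since x <s 5 iff x <=s 4.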
5557 switch (OldCC) {
5558 case X86::COND_L: // x <s (C + 1) --> x <=s C
5559 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5560 return false;
5561 ReplacementCC = X86::COND_LE;
5562 break;
5563 case X86::COND_B: // x <u (C + 1) --> x <=u C
5564 if (ImmDelta != 1 || CmpValue == 0)
5565 return false;
5566 ReplacementCC = X86::COND_BE;
5567 break;
5568 case X86::COND_GE: // x >=s (C + 1) --> x >s C
5569 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5570 return false;
5571 ReplacementCC = X86::COND_G;
5572 break;
5573 case X86::COND_AE: // x >=u (C + 1) --> x >u C
5574 if (ImmDelta != 1 || CmpValue == 0)
5575 return false;
5576 ReplacementCC = X86::COND_A;
5577 break;
5578 case X86::COND_G: // x >s (C - 1) --> x >=s C
5579 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5580 return false;
5581 ReplacementCC = X86::COND_GE;
5582 break;
5583 case X86::COND_A: // x >u (C - 1) --> x >=u C
5584 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5585 return false;
5586 ReplacementCC = X86::COND_AE;
5587 break;
5588 case X86::COND_LE: // x <=s (C - 1) --> x <s C
5589 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5590 return false;
5591 ReplacementCC = X86::COND_L;
5592 break;
5593 case X86::COND_BE: // x <=u (C - 1) --> x <u C
5594 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5595 return false;
5596 ReplacementCC = X86::COND_B;
5597 break;
5598 default:
5599 return false;
5600 }
5601 ShouldUpdateCC = true;
5602 }
5603
5604 if (ShouldUpdateCC && ReplacementCC != OldCC) {
5605 // Push the MachineInstr to OpsToUpdate.
5606 // If it is safe to remove CmpInstr, the condition code of these
5607 // instructions will be modified.
5608 OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
5609 }
5610 if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
5611 // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
5612 FlagsMayLiveOut = false;
5613 break;
5614 }
5615 }
5616
5617 // If we have to update users but EFLAGS is live-out, abort, since we cannot
5618 // easily find all of the users.
5619 if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
5620 for (MachineBasicBlock *Successor : CmpMBB.successors())
5621 if (Successor->isLiveIn(X86::EFLAGS))
5622 return false;
5623 }
5624
5625 // The instruction to be updated is either Sub or MI.
5626 assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
5627 Sub = MI != nullptr ? MI : Sub;
5628 MachineBasicBlock *SubBB = Sub->getParent();
5629 // Move Movr0Inst to the appropriate place before Sub.
5630 if (Movr0Inst) {
5631 // Only move within the same block so we don't accidentally move to a
5632 // block with higher execution frequency.
5633 if (&CmpMBB != SubBB)
5634 return false;
5635 // Look backwards until we find a def that doesn't use the current EFLAGS.
5636 MachineBasicBlock::reverse_iterator InsertI = Sub->getParent()->rbegin(),
5637 InsertE = Sub->getParent()->rend();
5638 for (; InsertI != InsertE; ++InsertI) {
5639 MachineInstr *Instr = &*InsertI;
5640 if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
5641 Instr->modifiesRegister(X86::EFLAGS, TRI)) {
5642 Movr0Inst->getParent()->remove(Movr0Inst);
5643 Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
5644 Movr0Inst);
5645 break;
5646 }
5647 }
5648 if (InsertI == InsertE)
5649 return false;
5650 }
5651
5652 // Replace non-NF with NF instructions.
5653 for (auto &Inst : InstsToUpdate) {
5654 Inst.first->setDesc(get(Inst.second));
5655 Inst.first->removeOperand(
5656 Inst.first->findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5657 }
5658
5659 // Make sure Sub instruction defines EFLAGS and mark the def live.
5660 MachineOperand *FlagDef =
5661 Sub->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
5662 assert(FlagDef && "Unable to locate a def EFLAGS operand");
5663 FlagDef->setIsDead(false);
5664
5665 CmpInstr.eraseFromParent();
5666
5667 // Modify the condition code of instructions in OpsToUpdate.
5668 for (auto &Op : OpsToUpdate) {
5669 Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
5670 .setImm(Op.second);
5671 }
5672 // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
5673 for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
5674 MBB = *MBB->pred_begin()) {
5675 assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
5676 if (!MBB->isLiveIn(X86::EFLAGS))
5677 MBB->addLiveIn(X86::EFLAGS);
5678 }
5679 return true;
5680}
5681
5682/// \returns true if the instruction can be changed to COPY when imm is 0.
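/// e.g. (illustrative MIR, assuming the EFLAGS def is dead):
///   %1:gr32 = ADD32ri %0:gr32, 0, implicit-def dead $eflags
/// can be rewritten as
///   %1:gr32 = COPY %0:gr32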
5683static bool canConvert2Copy(unsigned Opc) {
5684 switch (Opc) {
5685 default:
5686 return false;
5687 CASE_ND(ADD64ri32)
5688 CASE_ND(SUB64ri32)
5689 CASE_ND(OR64ri32)
5690 CASE_ND(XOR64ri32)
5691 CASE_ND(ADD32ri)
5692 CASE_ND(SUB32ri)
5693 CASE_ND(OR32ri)
5694 CASE_ND(XOR32ri)
5695 return true;
5696 }
5697}
5698
5699 /// Convert an ALUrr opcode to the corresponding ALUri opcode, such as
5700/// ADD32rr ==> ADD32ri
5701static unsigned convertALUrr2ALUri(unsigned Opc) {
5702 switch (Opc) {
5703 default:
5704 return 0;
5705#define FROM_TO(FROM, TO) \
5706 case X86::FROM: \
5707 return X86::TO; \
5708 case X86::FROM##_ND: \
5709 return X86::TO##_ND;
5710 FROM_TO(ADD64rr, ADD64ri32)
5711 FROM_TO(ADC64rr, ADC64ri32)
5712 FROM_TO(SUB64rr, SUB64ri32)
5713 FROM_TO(SBB64rr, SBB64ri32)
5714 FROM_TO(AND64rr, AND64ri32)
5715 FROM_TO(OR64rr, OR64ri32)
5716 FROM_TO(XOR64rr, XOR64ri32)
5717 FROM_TO(SHR64rCL, SHR64ri)
5718 FROM_TO(SHL64rCL, SHL64ri)
5719 FROM_TO(SAR64rCL, SAR64ri)
5720 FROM_TO(ROL64rCL, ROL64ri)
5721 FROM_TO(ROR64rCL, ROR64ri)
5722 FROM_TO(RCL64rCL, RCL64ri)
5723 FROM_TO(RCR64rCL, RCR64ri)
5724 FROM_TO(ADD32rr, ADD32ri)
5725 FROM_TO(ADC32rr, ADC32ri)
5726 FROM_TO(SUB32rr, SUB32ri)
5727 FROM_TO(SBB32rr, SBB32ri)
5728 FROM_TO(AND32rr, AND32ri)
5729 FROM_TO(OR32rr, OR32ri)
5730 FROM_TO(XOR32rr, XOR32ri)
5731 FROM_TO(SHR32rCL, SHR32ri)
5732 FROM_TO(SHL32rCL, SHL32ri)
5733 FROM_TO(SAR32rCL, SAR32ri)
5734 FROM_TO(ROL32rCL, ROL32ri)
5735 FROM_TO(ROR32rCL, ROR32ri)
5736 FROM_TO(RCL32rCL, RCL32ri)
5737 FROM_TO(RCR32rCL, RCR32ri)
5738#undef FROM_TO
5739#define FROM_TO(FROM, TO) \
5740 case X86::FROM: \
5741 return X86::TO;
5742 FROM_TO(TEST64rr, TEST64ri32)
5743 FROM_TO(CTEST64rr, CTEST64ri32)
5744 FROM_TO(CMP64rr, CMP64ri32)
5745 FROM_TO(CCMP64rr, CCMP64ri32)
5746 FROM_TO(TEST32rr, TEST32ri)
5747 FROM_TO(CTEST32rr, CTEST32ri)
5748 FROM_TO(CMP32rr, CMP32ri)
5749 FROM_TO(CCMP32rr, CCMP32ri)
5750#undef FROM_TO
5751 }
5752}
5753
5754/// Reg is assigned ImmVal in DefMI, and is used in UseMI.
5755/// If MakeChange is true, this function tries to replace Reg by ImmVal in
5756/// UseMI. If MakeChange is false, just check if folding is possible.
5757//
5758/// \returns true if folding is successful or possible.
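/// e.g. (illustrative MIR): given
///   %0:gr32 = MOV32ri 7
///   %2:gr32 = ADD32rr %1:gr32, %0:gr32, implicit-def $eflags
/// the use can be rewritten via convertALUrr2ALUri to
///   %2:gr32 = ADD32ri %1:gr32, 7, implicit-def $eflags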
5759bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
5760 Register Reg, int64_t ImmVal,
5761 MachineRegisterInfo *MRI,
5762 bool MakeChange) const {
5763 bool Modified = false;
5764
5765 // 64-bit operations accept sign-extended 32-bit immediates.
5766 // 32-bit operations accept all 32-bit immediates, so we don't need to check
5767 // them.
5768 const TargetRegisterClass *RC = nullptr;
5769 if (Reg.isVirtual())
5770 RC = MRI->getRegClass(Reg);
5771 if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) ||
5772 (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) {
5773 if (!isInt<32>(ImmVal))
5774 return false;
5775 }
5776
5777 if (UseMI.findRegisterUseOperand(Reg, /*TRI=*/nullptr)->getSubReg())
5778 return false;
5779 // An immediate has a larger encoding than a register, so avoid folding the
5780 // immediate if it has more than one use and we are optimizing for size.
5781 if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() &&
5782 !MRI->hasOneNonDBGUse(Reg))
5783 return false;
5784
5785 unsigned Opc = UseMI.getOpcode();
5786 unsigned NewOpc;
5787 if (Opc == TargetOpcode::COPY) {
5788 Register ToReg = UseMI.getOperand(0).getReg();
5789 const TargetRegisterClass *RC = nullptr;
5790 if (ToReg.isVirtual())
5791 RC = MRI->getRegClass(ToReg);
5792 bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
5793 (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
5794 bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
5795 (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
5796 bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
5797 (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));
5798
5799 if (ImmVal == 0) {
5800 // We have MOV32r0 only.
5801 if (!GR32Reg)
5802 return false;
5803 }
5804
5805 if (GR64Reg) {
5806 if (isUInt<32>(ImmVal))
5807 NewOpc = X86::MOV32ri64;
5808 else
5809 NewOpc = X86::MOV64ri;
5810 } else if (GR32Reg) {
5811 NewOpc = X86::MOV32ri;
5812 if (ImmVal == 0) {
5813 // MOV32r0 clobbers EFLAGS.
5814 const TargetRegisterInfo *TRI = &getRegisterInfo();
5815 if (UseMI.getParent()->computeRegisterLiveness(
5816 TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
5817 return false;
5818
5819 // MOV32r0 is different from the other cases because it doesn't encode the
5820 // immediate in the instruction. So we directly modify it here.
5821 if (!MakeChange)
5822 return true;
5823 UseMI.setDesc(get(X86::MOV32r0));
5824 UseMI.removeOperand(
5825 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5826 UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
5827 /*isImp=*/true,
5828 /*isKill=*/false,
5829 /*isDead=*/true));
5830 Modified = true;
5831 }
5832 } else if (GR8Reg)
5833 NewOpc = X86::MOV8ri;
5834 else
5835 return false;
5836 } else
5837 NewOpc = convertALUrr2ALUri(Opc);
5838
5839 if (!NewOpc)
5840 return false;
5841
5842 // For SUB instructions the immediate can only be the second source operand.
5843 if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri ||
5844 NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri ||
5845 NewOpc == X86::SUB64ri32_ND || NewOpc == X86::SUB32ri_ND ||
5846 NewOpc == X86::SBB64ri32_ND || NewOpc == X86::SBB32ri_ND) &&
5847 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 2)
5848 return false;
5849 // For CMP instructions the immediate can only be at index 1.
5850 if (((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) ||
5851 (NewOpc == X86::CCMP64ri32 || NewOpc == X86::CCMP32ri)) &&
5852 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 1)
5853 return false;
5854
5855 using namespace X86;
5856 if (isSHL(Opc) || isSHR(Opc) || isSAR(Opc) || isROL(Opc) || isROR(Opc) ||
5857 isRCL(Opc) || isRCR(Opc)) {
5858 unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr);
5859 if (RegIdx < 2)
5860 return false;
5861 if (!isInt<8>(ImmVal))
5862 return false;
5863 assert(Reg == X86::CL);
5864
5865 if (!MakeChange)
5866 return true;
5867 UseMI.setDesc(get(NewOpc));
5868 UseMI.removeOperand(RegIdx);
5869 UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
5870 // Reg is the physical register $cl, so we can't tell through MRI whether
5871 // DefMI is dead. Let the caller handle it, or let the dead-mi-elimination
5872 // pass delete the dead physical-register-defining instruction.
5873 return true;
5874 }
5875
5876 if (!MakeChange)
5877 return true;
5878
5879 if (!Modified) {
5880 // Modify the instruction.
5881 if (ImmVal == 0 && canConvert2Copy(NewOpc) &&
5882 UseMI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) {
5883 // %100 = add %101, 0
5884 // ==>
5885 // %100 = COPY %101
5886 UseMI.setDesc(get(TargetOpcode::COPY));
5887 UseMI.removeOperand(
5888 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5889 UseMI.removeOperand(
5890 UseMI.findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5891 UseMI.untieRegOperand(0);
5892 UseMI.clearFlag(MachineInstr::MIFlag::NoSWrap);
5893 UseMI.clearFlag(MachineInstr::MIFlag::NoUWrap);
5894 } else {
5895 unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
5896 unsigned ImmOpNum = 2;
5897 if (!UseMI.getOperand(0).isDef()) {
5898 Op1 = 0; // TEST, CMP, CTEST, CCMP
5899 ImmOpNum = 1;
5900 }
5901 if (Opc == TargetOpcode::COPY)
5902 ImmOpNum = 1;
5903 if (findCommutedOpIndices(UseMI, Op1, Op2) &&
5904 UseMI.getOperand(Op1).getReg() == Reg)
5905 commuteInstruction(UseMI);
5906
5907 assert(UseMI.getOperand(ImmOpNum).getReg() == Reg);
5908 UseMI.setDesc(get(NewOpc));
5909 UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal);
5910 }
5911 }
5912
5913 if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))
5914 DefMI->eraseFromBundle();
5915
5916 return true;
5917}
5918
5919/// foldImmediate - 'Reg' is known to be defined by a move immediate
5920/// instruction, try to fold the immediate into the use instruction.
5921 bool X86InstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
5922 Register Reg, MachineRegisterInfo *MRI) const {
5923 int64_t ImmVal;
5924 if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
5925 return false;
5926
5927 return foldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
5928}
5929
5930/// Expand a single-def pseudo instruction to a two-addr
5931/// instruction with two undef reads of the register being defined.
5932/// This is used for mapping:
5933/// %xmm4 = V_SET0
5934/// to:
5935/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
5936///
5937 static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
5938 const MCInstrDesc &Desc) {
5939 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5940 Register Reg = MIB.getReg(0);
5941 MIB->setDesc(Desc);
5942
5943 // MachineInstr::addOperand() will insert explicit operands before any
5944 // implicit operands.
5945 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
5946 // But we don't trust that.
5947 assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
5948 return true;
5949}
5950
5951/// Expand a single-def pseudo instruction to a two-addr
5952/// instruction with two %k0 reads.
5953/// This is used for mapping:
5954/// %k4 = K_SET1
5955/// to:
5956/// %k4 = KXNORrr %k0, %k0
5957 static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
5958 Register Reg) {
5959 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5960 MIB->setDesc(Desc);
5961 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
5962 return true;
5963}
5964
5965 static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
5966 bool MinusOne) {
5967 MachineBasicBlock &MBB = *MIB->getParent();
5968 const DebugLoc &DL = MIB->getDebugLoc();
5969 Register Reg = MIB.getReg(0);
5970
5971 // Insert the XOR.
5972 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
5973 .addReg(Reg, RegState::Undef)
5974 .addReg(Reg, RegState::Undef);
5975
5976 // Turn the pseudo into an INC or DEC.
5977 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
5978 MIB.addReg(Reg);
5979
5980 return true;
5981}
5982
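// An illustrative sketch of the lowering below (encodings are approximate):
//   movq $-1, %rax   ; 7 bytes
// becomes
//   pushq $-1        ; 2 bytes
//   popq  %rax       ; 1 byte
// which is why this pseudo is typically selected when optimizing for size.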
5983 static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
5984 const TargetInstrInfo &TII,
5985 const X86Subtarget &Subtarget) {
5986 MachineBasicBlock &MBB = *MIB->getParent();
5987 const DebugLoc &DL = MIB->getDebugLoc();
5988 int64_t Imm = MIB->getOperand(1).getImm();
5989 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
5990 MachineBasicBlock::iterator I = MIB.getInstr();
5991
5992 int StackAdjustment;
5993
5994 if (Subtarget.is64Bit()) {
5995 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
5996 MIB->getOpcode() == X86::MOV32ImmSExti8);
5997
5998 // Can't use push/pop lowering if the function might write to the red zone.
5999 X86MachineFunctionInfo *X86FI =
6000 MBB.getParent()->getInfo<X86MachineFunctionInfo>();
6001 if (X86FI->getUsesRedZone()) {
6002 MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
6003 ? X86::MOV32ri
6004 : X86::MOV64ri));
6005 return true;
6006 }
6007
6008 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
6009 // widen the register if necessary.
6010 StackAdjustment = 8;
6011 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
6012 MIB->setDesc(TII.get(X86::POP64r));
6013 MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
6014 } else {
6015 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
6016 StackAdjustment = 4;
6017 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm);
6018 MIB->setDesc(TII.get(X86::POP32r));
6019 }
6020 MIB->removeOperand(1);
6021 MIB->addImplicitDefUseOperands(*MBB.getParent());
6022
6023 // Build CFI if necessary.
6024 MachineFunction &MF = *MBB.getParent();
6025 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
6026 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
6027 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
6028 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
6029 if (EmitCFI) {
6030 TFL->BuildCFI(
6031 MBB, I, DL,
6032 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
6033 TFL->BuildCFI(
6034 MBB, std::next(I), DL,
6035 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
6036 }
6037
6038 return true;
6039}
6040
6041 // LoadStackGuard has so far only been implemented for 64-bit MachO. A
6042 // different code sequence is needed for other targets.
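// On 64-bit MachO the resulting sequence is, illustratively:
//   movq ___stack_chk_guard@GOTPCREL(%rip), %reg
//   movq (%reg), %reg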
6043 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
6044 const TargetInstrInfo &TII) {
6045 MachineBasicBlock &MBB = *MIB->getParent();
6046 const DebugLoc &DL = MIB->getDebugLoc();
6047 Register Reg = MIB.getReg(0);
6048 const GlobalValue *GV =
6049 cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
6050 auto Flags = MachineMemOperand::MOLoad |
6051 MachineMemOperand::MODereferenceable |
6052 MachineMemOperand::MOInvariant;
6053 MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
6054 MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
6055 MachineBasicBlock::iterator I = MIB.getInstr();
6056
6057 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
6058 .addReg(X86::RIP)
6059 .addImm(1)
6060 .addReg(0)
6061 .addGlobalAddress(GV, 0, X86II::MO_GOTPCREL)
6062 .addReg(0)
6063 .addMemOperand(MMO);
6064 MIB->setDebugLoc(DL);
6065 MIB->setDesc(TII.get(X86::MOV64rm));
6066 MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
6067}
6068
6069 static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
6070 MachineBasicBlock &MBB = *MIB->getParent();
6071 MachineFunction &MF = *MBB.getParent();
6072 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
6073 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
6074 unsigned XorOp =
6075 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
6076 MIB->setDesc(TII.get(XorOp));
6077 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
6078 return true;
6079}
6080
6081// This is used to handle spills for 128/256-bit registers when we have AVX512,
6082// but not VLX. If it uses an extended register we need to use an instruction
6083 // that loads the lower 128/256 bits but is available with only AVX512F.
6084 static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
6085 const TargetRegisterInfo *TRI,
6086 const MCInstrDesc &LoadDesc,
6087 const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
6088 Register DestReg = MIB.getReg(0);
6089 // Check if DestReg is XMM16-31 or YMM16-31.
6090 if (TRI->getEncodingValue(DestReg) < 16) {
6091 // We can use a normal VEX encoded load.
6092 MIB->setDesc(LoadDesc);
6093 } else {
6094 // Use a 128/256-bit VBROADCAST instruction.
6095 MIB->setDesc(BroadcastDesc);
6096 // Change the destination to a 512-bit register.
6097 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
6098 MIB->getOperand(0).setReg(DestReg);
6099 }
6100 return true;
6101}
6102
6103// This is used to handle spills for 128/256-bit registers when we have AVX512,
6104// but not VLX. If it uses an extended register we need to use an instruction
6105 // that stores the lower 128/256 bits but is available with only AVX512F.
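// e.g. (illustrative) a 128-bit store from $xmm16 cannot use the VEX
// VMOVAPSmr encoding, so it becomes a VEXTRACTF32X4Zmri from $zmm16 with
// index 0, storing the low 128 bits.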
6106 static bool expandNOVLXStore(MachineInstrBuilder &MIB,
6107 const TargetRegisterInfo *TRI,
6108 const MCInstrDesc &StoreDesc,
6109 const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
6110 Register SrcReg = MIB.getReg(X86::AddrNumOperands);
6111 // Check if SrcReg is XMM16-31 or YMM16-31.
6112 if (TRI->getEncodingValue(SrcReg) < 16) {
6113 // We can use a normal VEX encoded store.
6114 MIB->setDesc(StoreDesc);
6115 } else {
6116 // Use a VEXTRACTF instruction.
6117 MIB->setDesc(ExtractDesc);
6118 // Change the source to a 512-bit register.
6119 SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
6120 MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
6121 MIB.addImm(0x0); // Append immediate to extract from the lower bits.
6122 }
6123
6124 return true;
6125}
6126
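// e.g. (illustrative) '$eax = SHLDROT32ri $eax, 5' becomes
//   $eax = SHLD32rri8 $eax, $eax, 5
// i.e. a rotate expressed as a double shift with both sources the same
// register.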
6127 static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
6128 MIB->setDesc(Desc);
6129 int64_t ShiftAmt = MIB->getOperand(2).getImm();
6130 // Temporarily remove the immediate so we can add another source register.
6131 MIB->removeOperand(2);
6132 // Add the register. Don't copy the kill flag if there is one.
6133 MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
6134 // Add back the immediate.
6135 MIB.addImm(ShiftAmt);
6136 return true;
6137}
6138
6139 static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI,
6140 const TargetInstrInfo &TII, bool HasAVX) {
6141 unsigned NewOpc;
6142 if (MI.getOpcode() == X86::MOVSHPrm) {
6143 NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
6144 Register Reg = MI.getOperand(0).getReg();
6145 if (Reg > X86::XMM15)
6146 NewOpc = X86::VMOVSSZrm;
6147 } else {
6148 NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
6149 Register Reg = MI.getOperand(5).getReg();
6150 if (Reg > X86::XMM15)
6151 NewOpc = X86::VMOVSSZmr;
6152 }
6153
6154 MIB->setDesc(TII.get(NewOpc));
6155 return true;
6156}
6157
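// e.g. (illustrative) '$ymm0 = AVX_SET0' expands below to
//   $xmm0 = VXORPSrr undef $xmm0, undef $xmm0, implicit-def $ymm0
// so only the 128-bit zeroing idiom is emitted while the whole ymm register
// is still marked as defined.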
6158 bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
6159 bool HasAVX = Subtarget.hasAVX();
6160 MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
6161 switch (MI.getOpcode()) {
6162 case X86::MOV32r0:
6163 return Expand2AddrUndef(MIB, get(X86::XOR32rr));
6164 case X86::MOV32r1:
6165 return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
6166 case X86::MOV32r_1:
6167 return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
6168 case X86::MOV32ImmSExti8:
6169 case X86::MOV64ImmSExti8:
6170 return ExpandMOVImmSExti8(MIB, *this, Subtarget);
6171 case X86::SETB_C32r:
6172 return Expand2AddrUndef(MIB, get(X86::SBB32rr));
6173 case X86::SETB_C64r:
6174 return Expand2AddrUndef(MIB, get(X86::SBB64rr));
6175 case X86::MMX_SET0:
6176 return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
6177 case X86::V_SET0:
6178 case X86::FsFLD0SS:
6179 case X86::FsFLD0SD:
6180 case X86::FsFLD0SH:
6181 case X86::FsFLD0F128:
6182 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
6183 case X86::AVX_SET0: {
6184 assert(HasAVX && "AVX not supported");
6185 const TargetRegisterInfo *TRI = &getRegisterInfo();
6186 Register SrcReg = MIB.getReg(0);
6187 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6188 MIB->getOperand(0).setReg(XReg);
6189 Expand2AddrUndef(MIB, get(X86::VXORPSrr));
6190 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6191 return true;
6192 }
6193 case X86::AVX512_128_SET0:
6194 case X86::AVX512_FsFLD0SH:
6195 case X86::AVX512_FsFLD0SS:
6196 case X86::AVX512_FsFLD0SD:
6197 case X86::AVX512_FsFLD0F128: {
6198 bool HasVLX = Subtarget.hasVLX();
6199 Register SrcReg = MIB.getReg(0);
6200 const TargetRegisterInfo *TRI = &getRegisterInfo();
6201 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6202 return Expand2AddrUndef(MIB,
6203 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6204 // Extended register without VLX. Use a larger XOR.
6205 SrcReg =
6206 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
6207 MIB->getOperand(0).setReg(SrcReg);
6208 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6209 }
6210 case X86::AVX512_256_SET0:
6211 case X86::AVX512_512_SET0: {
6212 bool HasVLX = Subtarget.hasVLX();
6213 Register SrcReg = MIB.getReg(0);
6214 const TargetRegisterInfo *TRI = &getRegisterInfo();
6215 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
6216 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6217 MIB->getOperand(0).setReg(XReg);
6218 Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6219 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6220 return true;
6221 }
6222 if (MI.getOpcode() == X86::AVX512_256_SET0) {
6223 // No VLX so we must reference a zmm.
6224 MCRegister ZReg =
6225 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
6226 MIB->getOperand(0).setReg(ZReg);
6227 }
6228 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6229 }
6230 case X86::MOVSHPmr:
6231 case X86::MOVSHPrm:
6232 return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
6233 case X86::V_SETALLONES:
6234 return Expand2AddrUndef(MIB,
6235 get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
6236 case X86::AVX2_SETALLONES:
6237 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6238 case X86::AVX1_SETALLONES: {
6239 Register Reg = MIB.getReg(0);
6240 // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
6241 MIB->setDesc(get(X86::VCMPPSYrri));
6242 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
6243 return true;
6244 }
6245 case X86::AVX512_128_SETALLONES:
6246 case X86::AVX512_256_SETALLONES:
6247 case X86::AVX512_512_SETALLONES: {
6248 Register Reg = MIB.getReg(0);
6249 unsigned Opc;
6250 switch (MI.getOpcode()) {
6251 case X86::AVX512_128_SETALLONES: {
6252 if (X86::VR128RegClass.contains(Reg))
6253 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr));
6254
6255 Opc = X86::VPTERNLOGDZ128rri;
6256 break;
6257 }
6258 case X86::AVX512_256_SETALLONES: {
6259 if (X86::VR256RegClass.contains(Reg))
6260 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6261
6262 Opc = X86::VPTERNLOGDZ256rri;
6263 break;
6264 }
6265 case X86::AVX512_512_SETALLONES:
6266 Opc = X86::VPTERNLOGDZrri;
6267 break;
6268 }
6269 MIB->setDesc(get(Opc));
6270 // VPTERNLOGD needs 3 register inputs and an immediate.
6271 // 0xff will return 1s for any input.
6272 MIB.addReg(Reg, RegState::Undef)
6273 .addReg(Reg, RegState::Undef)
6274 .addReg(Reg, RegState::Undef)
6275 .addImm(0xff);
6276 return true;
6277 }
6278 case X86::AVX512_512_SEXT_MASK_32:
6279 case X86::AVX512_512_SEXT_MASK_64: {
6280 Register Reg = MIB.getReg(0);
6281 Register MaskReg = MIB.getReg(1);
6282 RegState MaskState = getRegState(MIB->getOperand(1));
6283 unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
6284 ? X86::VPTERNLOGQZrrikz
6285 : X86::VPTERNLOGDZrrikz;
6286 MI.removeOperand(1);
6287 MIB->setDesc(get(Opc));
6288 // VPTERNLOG needs 3 register inputs and an immediate.
6289 // 0xff will return 1s for any input.
6290 MIB.addReg(Reg, RegState::Undef)
6291 .addReg(MaskReg, MaskState)
6292 .addReg(Reg, RegState::Undef)
6293 .addReg(Reg, RegState::Undef)
6294 .addImm(0xff);
6295 return true;
6296 }
6297 case X86::VMOVAPSZ128rm_NOVLX:
6298 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
6299 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6300 case X86::VMOVUPSZ128rm_NOVLX:
6301 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
6302 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6303 case X86::VMOVAPSZ256rm_NOVLX:
6304 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
6305 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6306 case X86::VMOVUPSZ256rm_NOVLX:
6307 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
6308 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6309 case X86::VMOVAPSZ128mr_NOVLX:
6310 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
6311 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6312 case X86::VMOVUPSZ128mr_NOVLX:
6313 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
6314 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6315 case X86::VMOVAPSZ256mr_NOVLX:
6316 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
6317 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6318 case X86::VMOVUPSZ256mr_NOVLX:
6319 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
6320 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6321 case X86::MOV32ri64: {
6322 Register Reg = MIB.getReg(0);
6323 Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
6324 MI.setDesc(get(X86::MOV32ri));
6325 MIB->getOperand(0).setReg(Reg32);
6326 MIB.addReg(Reg, RegState::ImplicitDefine);
6327 return true;
6328 }
6329
6330 case X86::RDFLAGS32:
6331 case X86::RDFLAGS64: {
6332 unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
6333 MachineBasicBlock &MBB = *MIB->getParent();
6334
6335 MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
6336 get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
6337 .getInstr();
6338
6339 // Permit reads of the EFLAGS and DF registers without them being defined.
6340 // This intrinsic exists to read external processor state in flags, such as
6341 // the trap flag, interrupt flag, and direction flag, none of which are
6342 // modeled by the backend.
6343 assert(NewMI->getOperand(2).getReg() == X86::EFLAGS &&
6344 "Unexpected register in operand! Should be EFLAGS.");
6345 NewMI->getOperand(2).setIsUndef();
6346 assert(NewMI->getOperand(3).getReg() == X86::DF &&
6347 "Unexpected register in operand! Should be DF.");
6348 NewMI->getOperand(3).setIsUndef();
6349
6350 MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r));
6351 return true;
6352 }
6353
6354 case X86::WRFLAGS32:
6355 case X86::WRFLAGS64: {
6356 unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64;
6357 MachineBasicBlock &MBB = *MIB->getParent();
6358
6359 BuildMI(MBB, MI, MIB->getDebugLoc(),
6360 get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
6361 .addReg(MI.getOperand(0).getReg());
6362 BuildMI(MBB, MI, MIB->getDebugLoc(),
6363 get(Is64Bit ? X86::POPF64 : X86::POPF32));
6364 MI.eraseFromParent();
6365 return true;
6366 }
6367
6368 // KNL does not recognize dependency-breaking idioms for mask registers,
6369 // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
6370 // Using %k0 as the undef input register is a performance heuristic based
6371 // on the assumption that %k0 is used less frequently than the other mask
6372 // registers, since it is not usable as a write mask.
6373 // FIXME: A more advanced approach would be to choose the best input mask
6374 // register based on context.
6375 case X86::KSET0B:
6376 return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
6377 case X86::KSET0W:
6378 return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
6379 case X86::KSET0D:
6380 return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
6381 case X86::KSET0Q:
6382 return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
6383 case X86::KSET1B:
6384 return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
6385 case X86::KSET1W:
6386 return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
6387 case X86::KSET1D:
6388 return Expand2AddrKreg(MIB, get(X86::KXNORDkk), X86::K0);
6389 case X86::KSET1Q:
6390 return Expand2AddrKreg(MIB, get(X86::KXNORQkk), X86::K0);
6391 case TargetOpcode::LOAD_STACK_GUARD:
6392 expandLoadStackGuard(MIB, *this);
6393 return true;
6394 case X86::XOR64_FP:
6395 case X86::XOR32_FP:
6396 return expandXorFP(MIB, *this);
6397 case X86::SHLDROT32ri:
6398 return expandSHXDROT(MIB, get(X86::SHLD32rri8));
6399 case X86::SHLDROT64ri:
6400 return expandSHXDROT(MIB, get(X86::SHLD64rri8));
6401 case X86::SHRDROT32ri:
6402 return expandSHXDROT(MIB, get(X86::SHRD32rri8));
6403 case X86::SHRDROT64ri:
6404 return expandSHXDROT(MIB, get(X86::SHRD64rri8));
6405 case X86::ADD8rr_DB:
6406 MIB->setDesc(get(X86::OR8rr));
6407 break;
6408 case X86::ADD16rr_DB:
6409 MIB->setDesc(get(X86::OR16rr));
6410 break;
6411 case X86::ADD32rr_DB:
6412 MIB->setDesc(get(X86::OR32rr));
6413 break;
6414 case X86::ADD64rr_DB:
6415 MIB->setDesc(get(X86::OR64rr));
6416 break;
6417 case X86::ADD8ri_DB:
6418 MIB->setDesc(get(X86::OR8ri));
6419 break;
6420 case X86::ADD16ri_DB:
6421 MIB->setDesc(get(X86::OR16ri));
6422 break;
6423 case X86::ADD32ri_DB:
6424 MIB->setDesc(get(X86::OR32ri));
6425 break;
6426 case X86::ADD64ri32_DB:
6427 MIB->setDesc(get(X86::OR64ri32));
6428 break;
6429 }
6430 return false;
6431}
6432
6433/// Return true for all instructions that only update
6434 /// the first 32 or 64 bits of the destination register and leave the rest
6435/// unmodified. This can be used to avoid folding loads if the instructions
6436/// only update part of the destination register, and the non-updated part is
6437/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
6438 /// instructions breaks the partial register dependency and can improve
6439/// performance. e.g.:
6440///
6441/// movss (%rdi), %xmm0
6442/// cvtss2sd %xmm0, %xmm0
6443///
6444/// Instead of
6445/// cvtss2sd (%rdi), %xmm0
6446///
6447/// FIXME: This should be turned into a TSFlags.
6448///
6449static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
6450 bool ForLoadFold = false) {
6451 switch (Opcode) {
6452 case X86::CVTSI2SSrr:
6453 case X86::CVTSI2SSrm:
6454 case X86::CVTSI642SSrr:
6455 case X86::CVTSI642SSrm:
6456 case X86::CVTSI2SDrr:
6457 case X86::CVTSI2SDrm:
6458 case X86::CVTSI642SDrr:
6459 case X86::CVTSI642SDrm:
6460 // Load folding won't affect the undef register update since the input is
6461 // a GPR.
6462 return !ForLoadFold;
6463 case X86::CVTSD2SSrr:
6464 case X86::CVTSD2SSrm:
6465 case X86::CVTSS2SDrr:
6466 case X86::CVTSS2SDrm:
6467 case X86::MOVHPDrm:
6468 case X86::MOVHPSrm:
6469 case X86::MOVLPDrm:
6470 case X86::MOVLPSrm:
6471 case X86::RCPSSr:
6472 case X86::RCPSSm:
6473 case X86::RCPSSr_Int:
6474 case X86::RCPSSm_Int:
6475 case X86::ROUNDSDri:
6476 case X86::ROUNDSDmi:
6477 case X86::ROUNDSSri:
6478 case X86::ROUNDSSmi:
6479 case X86::RSQRTSSr:
6480 case X86::RSQRTSSm:
6481 case X86::RSQRTSSr_Int:
6482 case X86::RSQRTSSm_Int:
6483 case X86::SQRTSSr:
6484 case X86::SQRTSSm:
6485 case X86::SQRTSSr_Int:
6486 case X86::SQRTSSm_Int:
6487 case X86::SQRTSDr:
6488 case X86::SQRTSDm:
6489 case X86::SQRTSDr_Int:
6490 case X86::SQRTSDm_Int:
6491 return true;
6492 case X86::VFCMULCPHZ128rm:
6493 case X86::VFCMULCPHZ128rmb:
6494 case X86::VFCMULCPHZ128rmbkz:
6495 case X86::VFCMULCPHZ128rmkz:
6496 case X86::VFCMULCPHZ128rr:
6497 case X86::VFCMULCPHZ128rrkz:
6498 case X86::VFCMULCPHZ256rm:
6499 case X86::VFCMULCPHZ256rmb:
6500 case X86::VFCMULCPHZ256rmbkz:
6501 case X86::VFCMULCPHZ256rmkz:
6502 case X86::VFCMULCPHZ256rr:
6503 case X86::VFCMULCPHZ256rrkz:
6504 case X86::VFCMULCPHZrm:
6505 case X86::VFCMULCPHZrmb:
6506 case X86::VFCMULCPHZrmbkz:
6507 case X86::VFCMULCPHZrmkz:
6508 case X86::VFCMULCPHZrr:
6509 case X86::VFCMULCPHZrrb:
6510 case X86::VFCMULCPHZrrbkz:
6511 case X86::VFCMULCPHZrrkz:
6512 case X86::VFMULCPHZ128rm:
6513 case X86::VFMULCPHZ128rmb:
6514 case X86::VFMULCPHZ128rmbkz:
6515 case X86::VFMULCPHZ128rmkz:
6516 case X86::VFMULCPHZ128rr:
6517 case X86::VFMULCPHZ128rrkz:
6518 case X86::VFMULCPHZ256rm:
6519 case X86::VFMULCPHZ256rmb:
6520 case X86::VFMULCPHZ256rmbkz:
6521 case X86::VFMULCPHZ256rmkz:
6522 case X86::VFMULCPHZ256rr:
6523 case X86::VFMULCPHZ256rrkz:
6524 case X86::VFMULCPHZrm:
6525 case X86::VFMULCPHZrmb:
6526 case X86::VFMULCPHZrmbkz:
6527 case X86::VFMULCPHZrmkz:
6528 case X86::VFMULCPHZrr:
6529 case X86::VFMULCPHZrrb:
6530 case X86::VFMULCPHZrrbkz:
6531 case X86::VFMULCPHZrrkz:
6532 case X86::VFCMULCSHZrm:
6533 case X86::VFCMULCSHZrmkz:
6534 case X86::VFCMULCSHZrr:
6535 case X86::VFCMULCSHZrrb:
6536 case X86::VFCMULCSHZrrbkz:
6537 case X86::VFCMULCSHZrrkz:
6538 case X86::VFMULCSHZrm:
6539 case X86::VFMULCSHZrmkz:
6540 case X86::VFMULCSHZrr:
6541 case X86::VFMULCSHZrrb:
6542 case X86::VFMULCSHZrrbkz:
6543 case X86::VFMULCSHZrrkz:
6544 return Subtarget.hasMULCFalseDeps();
6545 case X86::VPERMDYrm:
6546 case X86::VPERMDYrr:
6547 case X86::VPERMQYmi:
6548 case X86::VPERMQYri:
6549 case X86::VPERMPSYrm:
6550 case X86::VPERMPSYrr:
6551 case X86::VPERMPDYmi:
6552 case X86::VPERMPDYri:
6553 case X86::VPERMDZ256rm:
6554 case X86::VPERMDZ256rmb:
6555 case X86::VPERMDZ256rmbkz:
6556 case X86::VPERMDZ256rmkz:
6557 case X86::VPERMDZ256rr:
6558 case X86::VPERMDZ256rrkz:
6559 case X86::VPERMDZrm:
6560 case X86::VPERMDZrmb:
6561 case X86::VPERMDZrmbkz:
6562 case X86::VPERMDZrmkz:
6563 case X86::VPERMDZrr:
6564 case X86::VPERMDZrrkz:
6565 case X86::VPERMQZ256mbi:
6566 case X86::VPERMQZ256mbikz:
6567 case X86::VPERMQZ256mi:
6568 case X86::VPERMQZ256mikz:
6569 case X86::VPERMQZ256ri:
6570 case X86::VPERMQZ256rikz:
6571 case X86::VPERMQZ256rm:
6572 case X86::VPERMQZ256rmb:
6573 case X86::VPERMQZ256rmbkz:
6574 case X86::VPERMQZ256rmkz:
6575 case X86::VPERMQZ256rr:
6576 case X86::VPERMQZ256rrkz:
6577 case X86::VPERMQZmbi:
6578 case X86::VPERMQZmbikz:
6579 case X86::VPERMQZmi:
6580 case X86::VPERMQZmikz:
6581 case X86::VPERMQZri:
6582 case X86::VPERMQZrikz:
6583 case X86::VPERMQZrm:
6584 case X86::VPERMQZrmb:
6585 case X86::VPERMQZrmbkz:
6586 case X86::VPERMQZrmkz:
6587 case X86::VPERMQZrr:
6588 case X86::VPERMQZrrkz:
6589 case X86::VPERMPSZ256rm:
6590 case X86::VPERMPSZ256rmb:
6591 case X86::VPERMPSZ256rmbkz:
6592 case X86::VPERMPSZ256rmkz:
6593 case X86::VPERMPSZ256rr:
6594 case X86::VPERMPSZ256rrkz:
6595 case X86::VPERMPSZrm:
6596 case X86::VPERMPSZrmb:
6597 case X86::VPERMPSZrmbkz:
6598 case X86::VPERMPSZrmkz:
6599 case X86::VPERMPSZrr:
6600 case X86::VPERMPSZrrkz:
6601 case X86::VPERMPDZ256mbi:
6602 case X86::VPERMPDZ256mbikz:
6603 case X86::VPERMPDZ256mi:
6604 case X86::VPERMPDZ256mikz:
6605 case X86::VPERMPDZ256ri:
6606 case X86::VPERMPDZ256rikz:
6607 case X86::VPERMPDZ256rm:
6608 case X86::VPERMPDZ256rmb:
6609 case X86::VPERMPDZ256rmbkz:
6610 case X86::VPERMPDZ256rmkz:
6611 case X86::VPERMPDZ256rr:
6612 case X86::VPERMPDZ256rrkz:
6613 case X86::VPERMPDZmbi:
6614 case X86::VPERMPDZmbikz:
6615 case X86::VPERMPDZmi:
6616 case X86::VPERMPDZmikz:
6617 case X86::VPERMPDZri:
6618 case X86::VPERMPDZrikz:
6619 case X86::VPERMPDZrm:
6620 case X86::VPERMPDZrmb:
6621 case X86::VPERMPDZrmbkz:
6622 case X86::VPERMPDZrmkz:
6623 case X86::VPERMPDZrr:
6624 case X86::VPERMPDZrrkz:
6625 return Subtarget.hasPERMFalseDeps();
6626 case X86::VRANGEPDZ128rmbi:
6627 case X86::VRANGEPDZ128rmbikz:
6628 case X86::VRANGEPDZ128rmi:
6629 case X86::VRANGEPDZ128rmikz:
6630 case X86::VRANGEPDZ128rri:
6631 case X86::VRANGEPDZ128rrikz:
6632 case X86::VRANGEPDZ256rmbi:
6633 case X86::VRANGEPDZ256rmbikz:
6634 case X86::VRANGEPDZ256rmi:
6635 case X86::VRANGEPDZ256rmikz:
6636 case X86::VRANGEPDZ256rri:
6637 case X86::VRANGEPDZ256rrikz:
6638 case X86::VRANGEPDZrmbi:
6639 case X86::VRANGEPDZrmbikz:
6640 case X86::VRANGEPDZrmi:
6641 case X86::VRANGEPDZrmikz:
6642 case X86::VRANGEPDZrri:
6643 case X86::VRANGEPDZrrib:
6644 case X86::VRANGEPDZrribkz:
6645 case X86::VRANGEPDZrrikz:
6646 case X86::VRANGEPSZ128rmbi:
6647 case X86::VRANGEPSZ128rmbikz:
6648 case X86::VRANGEPSZ128rmi:
6649 case X86::VRANGEPSZ128rmikz:
6650 case X86::VRANGEPSZ128rri:
6651 case X86::VRANGEPSZ128rrikz:
6652 case X86::VRANGEPSZ256rmbi:
6653 case X86::VRANGEPSZ256rmbikz:
6654 case X86::VRANGEPSZ256rmi:
6655 case X86::VRANGEPSZ256rmikz:
6656 case X86::VRANGEPSZ256rri:
6657 case X86::VRANGEPSZ256rrikz:
6658 case X86::VRANGEPSZrmbi:
6659 case X86::VRANGEPSZrmbikz:
6660 case X86::VRANGEPSZrmi:
6661 case X86::VRANGEPSZrmikz:
6662 case X86::VRANGEPSZrri:
6663 case X86::VRANGEPSZrrib:
6664 case X86::VRANGEPSZrribkz:
6665 case X86::VRANGEPSZrrikz:
6666 case X86::VRANGESDZrmi:
6667 case X86::VRANGESDZrmikz:
6668 case X86::VRANGESDZrri:
6669 case X86::VRANGESDZrrib:
6670 case X86::VRANGESDZrribkz:
6671 case X86::VRANGESDZrrikz:
6672 case X86::VRANGESSZrmi:
6673 case X86::VRANGESSZrmikz:
6674 case X86::VRANGESSZrri:
6675 case X86::VRANGESSZrrib:
6676 case X86::VRANGESSZrribkz:
6677 case X86::VRANGESSZrrikz:
6678 return Subtarget.hasRANGEFalseDeps();
6679 case X86::VGETMANTSSZrmi:
6680 case X86::VGETMANTSSZrmikz:
6681 case X86::VGETMANTSSZrri:
6682 case X86::VGETMANTSSZrrib:
6683 case X86::VGETMANTSSZrribkz:
6684 case X86::VGETMANTSSZrrikz:
6685 case X86::VGETMANTSDZrmi:
6686 case X86::VGETMANTSDZrmikz:
6687 case X86::VGETMANTSDZrri:
6688 case X86::VGETMANTSDZrrib:
6689 case X86::VGETMANTSDZrribkz:
6690 case X86::VGETMANTSDZrrikz:
6691 case X86::VGETMANTSHZrmi:
6692 case X86::VGETMANTSHZrmikz:
6693 case X86::VGETMANTSHZrri:
6694 case X86::VGETMANTSHZrrib:
6695 case X86::VGETMANTSHZrribkz:
6696 case X86::VGETMANTSHZrrikz:
6697 case X86::VGETMANTPSZ128rmbi:
6698 case X86::VGETMANTPSZ128rmbikz:
6699 case X86::VGETMANTPSZ128rmi:
6700 case X86::VGETMANTPSZ128rmikz:
6701 case X86::VGETMANTPSZ256rmbi:
6702 case X86::VGETMANTPSZ256rmbikz:
6703 case X86::VGETMANTPSZ256rmi:
6704 case X86::VGETMANTPSZ256rmikz:
6705 case X86::VGETMANTPSZrmbi:
6706 case X86::VGETMANTPSZrmbikz:
6707 case X86::VGETMANTPSZrmi:
6708 case X86::VGETMANTPSZrmikz:
6709 case X86::VGETMANTPDZ128rmbi:
6710 case X86::VGETMANTPDZ128rmbikz:
6711 case X86::VGETMANTPDZ128rmi:
6712 case X86::VGETMANTPDZ128rmikz:
6713 case X86::VGETMANTPDZ256rmbi:
6714 case X86::VGETMANTPDZ256rmbikz:
6715 case X86::VGETMANTPDZ256rmi:
6716 case X86::VGETMANTPDZ256rmikz:
6717 case X86::VGETMANTPDZrmbi:
6718 case X86::VGETMANTPDZrmbikz:
6719 case X86::VGETMANTPDZrmi:
6720 case X86::VGETMANTPDZrmikz:
6721 return Subtarget.hasGETMANTFalseDeps();
6722 case X86::VPMULLQZ128rm:
6723 case X86::VPMULLQZ128rmb:
6724 case X86::VPMULLQZ128rmbkz:
6725 case X86::VPMULLQZ128rmkz:
6726 case X86::VPMULLQZ128rr:
6727 case X86::VPMULLQZ128rrkz:
6728 case X86::VPMULLQZ256rm:
6729 case X86::VPMULLQZ256rmb:
6730 case X86::VPMULLQZ256rmbkz:
6731 case X86::VPMULLQZ256rmkz:
6732 case X86::VPMULLQZ256rr:
6733 case X86::VPMULLQZ256rrkz:
6734 case X86::VPMULLQZrm:
6735 case X86::VPMULLQZrmb:
6736 case X86::VPMULLQZrmbkz:
6737 case X86::VPMULLQZrmkz:
6738 case X86::VPMULLQZrr:
6739 case X86::VPMULLQZrrkz:
6740 return Subtarget.hasMULLQFalseDeps();
6741 // GPR
6742 case X86::POPCNT32rm:
6743 case X86::POPCNT32rr:
6744 case X86::POPCNT64rm:
6745 case X86::POPCNT64rr:
6746 return Subtarget.hasPOPCNTFalseDeps();
6747 case X86::LZCNT32rm:
6748 case X86::LZCNT32rr:
6749 case X86::LZCNT64rm:
6750 case X86::LZCNT64rr:
6751 case X86::TZCNT32rm:
6752 case X86::TZCNT32rr:
6753 case X86::TZCNT64rm:
6754 case X86::TZCNT64rr:
6755 return Subtarget.hasLZCNTFalseDeps();
6756 }
6757
6758 return false;
6759}
6760
6761/// Inform the BreakFalseDeps pass how many idle
6762/// instructions we would like before a partial register update.
6763 unsigned X86InstrInfo::getPartialRegUpdateClearance(
6764 const MachineInstr &MI, unsigned OpNum,
6765 const TargetRegisterInfo *TRI) const {
6766
6767 if (OpNum != 0)
6768 return 0;
6769
6770 // NDD ops with 8/16b results may appear to be partial register
6771 // updates after register allocation.
6772 bool HasNDDPartialWrite = false;
6773 if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
6774 Register Reg = MI.getOperand(0).getReg();
6775 if (!Reg.isVirtual())
6776 HasNDDPartialWrite =
6777 X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
6778 }
6779
6780 if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
6781 return 0;
6782
6783 // Check if the result register is also used as a source.
6784 // For non-NDD ops, this means a partial update is wanted, hence we return 0.
6785 // For NDD ops, this means it is possible to compress the instruction
6786 // to a legacy form in CompressEVEX, which would create an unwanted partial
6787 // update, so we return the clearance.
6788 const MachineOperand &MO = MI.getOperand(0);
6789 Register Reg = MO.getReg();
6790 bool ReadsReg = false;
6791 if (Reg.isVirtual())
6792 ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
6793 else
6794 ReadsReg = MI.readsRegister(Reg, TRI);
6795 if (ReadsReg != HasNDDPartialWrite)
6796 return 0;
6797
6798 // If any instructions in the clearance range are reading Reg, insert a
6799 // dependency breaking instruction, which is inexpensive and is likely to
6800 // be hidden in other instructions' cycles.
6801 return PartialRegUpdateClearance;
6802}
6803
6804 // Return true for any instruction that copies the high bits of the first source
6805// operand into the unused high bits of the destination operand.
6806// Also returns true for instructions that have two inputs where one may
6807// be undef and we want it to use the same register as the other input.
6808static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
6809 bool ForLoadFold = false) {
6810 // Set the OpNum parameter to the first source operand.
6811 switch (Opcode) {
6812 case X86::MMX_PUNPCKHBWrr:
6813 case X86::MMX_PUNPCKHWDrr:
6814 case X86::MMX_PUNPCKHDQrr:
6815 case X86::MMX_PUNPCKLBWrr:
6816 case X86::MMX_PUNPCKLWDrr:
6817 case X86::MMX_PUNPCKLDQrr:
6818 case X86::MOVHLPSrr:
6819 case X86::PACKSSWBrr:
6820 case X86::PACKUSWBrr:
6821 case X86::PACKSSDWrr:
6822 case X86::PACKUSDWrr:
6823 case X86::PUNPCKHBWrr:
6824 case X86::PUNPCKLBWrr:
6825 case X86::PUNPCKHWDrr:
6826 case X86::PUNPCKLWDrr:
6827 case X86::PUNPCKHDQrr:
6828 case X86::PUNPCKLDQrr:
6829 case X86::PUNPCKHQDQrr:
6830 case X86::PUNPCKLQDQrr:
6831 case X86::SHUFPDrri:
6832 case X86::SHUFPSrri:
6833 // These instructions are sometimes used with an undef first or second
6834 // source. Return true here so BreakFalseDeps will assign this source to the
6835 // same register as the first source to avoid a false dependency.
6836 // Operand 1 of these instructions is tied so they're separate from their
6837 // VEX counterparts.
6838 return OpNum == 2 && !ForLoadFold;
6839
6840 case X86::VMOVLHPSrr:
6841 case X86::VMOVLHPSZrr:
6842 case X86::VPACKSSWBrr:
6843 case X86::VPACKUSWBrr:
6844 case X86::VPACKSSDWrr:
6845 case X86::VPACKUSDWrr:
6846 case X86::VPACKSSWBZ128rr:
6847 case X86::VPACKUSWBZ128rr:
6848 case X86::VPACKSSDWZ128rr:
6849 case X86::VPACKUSDWZ128rr:
6850 case X86::VPERM2F128rri:
6851 case X86::VPERM2I128rri:
6852 case X86::VSHUFF32X4Z256rri:
6853 case X86::VSHUFF32X4Zrri:
6854 case X86::VSHUFF64X2Z256rri:
6855 case X86::VSHUFF64X2Zrri:
6856 case X86::VSHUFI32X4Z256rri:
6857 case X86::VSHUFI32X4Zrri:
6858 case X86::VSHUFI64X2Z256rri:
6859 case X86::VSHUFI64X2Zrri:
6860 case X86::VPUNPCKHBWrr:
6861 case X86::VPUNPCKLBWrr:
6862 case X86::VPUNPCKHBWYrr:
6863 case X86::VPUNPCKLBWYrr:
6864 case X86::VPUNPCKHBWZ128rr:
6865 case X86::VPUNPCKLBWZ128rr:
6866 case X86::VPUNPCKHBWZ256rr:
6867 case X86::VPUNPCKLBWZ256rr:
6868 case X86::VPUNPCKHBWZrr:
6869 case X86::VPUNPCKLBWZrr:
6870 case X86::VPUNPCKHWDrr:
6871 case X86::VPUNPCKLWDrr:
6872 case X86::VPUNPCKHWDYrr:
6873 case X86::VPUNPCKLWDYrr:
6874 case X86::VPUNPCKHWDZ128rr:
6875 case X86::VPUNPCKLWDZ128rr:
6876 case X86::VPUNPCKHWDZ256rr:
6877 case X86::VPUNPCKLWDZ256rr:
6878 case X86::VPUNPCKHWDZrr:
6879 case X86::VPUNPCKLWDZrr:
6880 case X86::VPUNPCKHDQrr:
6881 case X86::VPUNPCKLDQrr:
6882 case X86::VPUNPCKHDQYrr:
6883 case X86::VPUNPCKLDQYrr:
6884 case X86::VPUNPCKHDQZ128rr:
6885 case X86::VPUNPCKLDQZ128rr:
6886 case X86::VPUNPCKHDQZ256rr:
6887 case X86::VPUNPCKLDQZ256rr:
6888 case X86::VPUNPCKHDQZrr:
6889 case X86::VPUNPCKLDQZrr:
6890 case X86::VPUNPCKHQDQrr:
6891 case X86::VPUNPCKLQDQrr:
6892 case X86::VPUNPCKHQDQYrr:
6893 case X86::VPUNPCKLQDQYrr:
6894 case X86::VPUNPCKHQDQZ128rr:
6895 case X86::VPUNPCKLQDQZ128rr:
6896 case X86::VPUNPCKHQDQZ256rr:
6897 case X86::VPUNPCKLQDQZ256rr:
6898 case X86::VPUNPCKHQDQZrr:
6899 case X86::VPUNPCKLQDQZrr:
6900 // These instructions are sometimes used with an undef first or second
6901 // source. Return true here so BreakFalseDeps will assign this source to the
6902 // same register as the first source to avoid a false dependency.
6903 return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
6904
6905 case X86::VCVTSI2SSrr:
6906 case X86::VCVTSI2SSrm:
6907 case X86::VCVTSI2SSrr_Int:
6908 case X86::VCVTSI2SSrm_Int:
6909 case X86::VCVTSI642SSrr:
6910 case X86::VCVTSI642SSrm:
6911 case X86::VCVTSI642SSrr_Int:
6912 case X86::VCVTSI642SSrm_Int:
6913 case X86::VCVTSI2SDrr:
6914 case X86::VCVTSI2SDrm:
6915 case X86::VCVTSI2SDrr_Int:
6916 case X86::VCVTSI2SDrm_Int:
6917 case X86::VCVTSI642SDrr:
6918 case X86::VCVTSI642SDrm:
6919 case X86::VCVTSI642SDrr_Int:
6920 case X86::VCVTSI642SDrm_Int:
6921 // AVX-512
6922 case X86::VCVTSI2SSZrr:
6923 case X86::VCVTSI2SSZrm:
6924 case X86::VCVTSI2SSZrr_Int:
6925 case X86::VCVTSI2SSZrrb_Int:
6926 case X86::VCVTSI2SSZrm_Int:
6927 case X86::VCVTSI642SSZrr:
6928 case X86::VCVTSI642SSZrm:
6929 case X86::VCVTSI642SSZrr_Int:
6930 case X86::VCVTSI642SSZrrb_Int:
6931 case X86::VCVTSI642SSZrm_Int:
6932 case X86::VCVTSI2SDZrr:
6933 case X86::VCVTSI2SDZrm:
6934 case X86::VCVTSI2SDZrr_Int:
6935 case X86::VCVTSI2SDZrm_Int:
6936 case X86::VCVTSI642SDZrr:
6937 case X86::VCVTSI642SDZrm:
6938 case X86::VCVTSI642SDZrr_Int:
6939 case X86::VCVTSI642SDZrrb_Int:
6940 case X86::VCVTSI642SDZrm_Int:
6941 case X86::VCVTUSI2SSZrr:
6942 case X86::VCVTUSI2SSZrm:
6943 case X86::VCVTUSI2SSZrr_Int:
6944 case X86::VCVTUSI2SSZrrb_Int:
6945 case X86::VCVTUSI2SSZrm_Int:
6946 case X86::VCVTUSI642SSZrr:
6947 case X86::VCVTUSI642SSZrm:
6948 case X86::VCVTUSI642SSZrr_Int:
6949 case X86::VCVTUSI642SSZrrb_Int:
6950 case X86::VCVTUSI642SSZrm_Int:
6951 case X86::VCVTUSI2SDZrr:
6952 case X86::VCVTUSI2SDZrm:
6953 case X86::VCVTUSI2SDZrr_Int:
6954 case X86::VCVTUSI2SDZrm_Int:
6955 case X86::VCVTUSI642SDZrr:
6956 case X86::VCVTUSI642SDZrm:
6957 case X86::VCVTUSI642SDZrr_Int:
6958 case X86::VCVTUSI642SDZrrb_Int:
6959 case X86::VCVTUSI642SDZrm_Int:
6960 case X86::VCVTSI2SHZrr:
6961 case X86::VCVTSI2SHZrm:
6962 case X86::VCVTSI2SHZrr_Int:
6963 case X86::VCVTSI2SHZrrb_Int:
6964 case X86::VCVTSI2SHZrm_Int:
6965 case X86::VCVTSI642SHZrr:
6966 case X86::VCVTSI642SHZrm:
6967 case X86::VCVTSI642SHZrr_Int:
6968 case X86::VCVTSI642SHZrrb_Int:
6969 case X86::VCVTSI642SHZrm_Int:
6970 case X86::VCVTUSI2SHZrr:
6971 case X86::VCVTUSI2SHZrm:
6972 case X86::VCVTUSI2SHZrr_Int:
6973 case X86::VCVTUSI2SHZrrb_Int:
6974 case X86::VCVTUSI2SHZrm_Int:
6975 case X86::VCVTUSI642SHZrr:
6976 case X86::VCVTUSI642SHZrm:
6977 case X86::VCVTUSI642SHZrr_Int:
6978 case X86::VCVTUSI642SHZrrb_Int:
6979 case X86::VCVTUSI642SHZrm_Int:
6980 // Load folding won't affect the undef register update since the input is
6981 // a GPR.
6982 return OpNum == 1 && !ForLoadFold;
6983 case X86::VCVTSD2SSrr:
6984 case X86::VCVTSD2SSrm:
6985 case X86::VCVTSD2SSrr_Int:
6986 case X86::VCVTSD2SSrm_Int:
6987 case X86::VCVTSS2SDrr:
6988 case X86::VCVTSS2SDrm:
6989 case X86::VCVTSS2SDrr_Int:
6990 case X86::VCVTSS2SDrm_Int:
6991 case X86::VRCPSSr:
6992 case X86::VRCPSSr_Int:
6993 case X86::VRCPSSm:
6994 case X86::VRCPSSm_Int:
6995 case X86::VROUNDSDri:
6996 case X86::VROUNDSDmi:
6997 case X86::VROUNDSDri_Int:
6998 case X86::VROUNDSDmi_Int:
6999 case X86::VROUNDSSri:
7000 case X86::VROUNDSSmi:
7001 case X86::VROUNDSSri_Int:
7002 case X86::VROUNDSSmi_Int:
7003 case X86::VRSQRTSSr:
7004 case X86::VRSQRTSSr_Int:
7005 case X86::VRSQRTSSm:
7006 case X86::VRSQRTSSm_Int:
7007 case X86::VSQRTSSr:
7008 case X86::VSQRTSSr_Int:
7009 case X86::VSQRTSSm:
7010 case X86::VSQRTSSm_Int:
7011 case X86::VSQRTSDr:
7012 case X86::VSQRTSDr_Int:
7013 case X86::VSQRTSDm:
7014 case X86::VSQRTSDm_Int:
7015 // AVX-512
7016 case X86::VCVTSD2SSZrr:
7017 case X86::VCVTSD2SSZrr_Int:
7018 case X86::VCVTSD2SSZrrb_Int:
7019 case X86::VCVTSD2SSZrm:
7020 case X86::VCVTSD2SSZrm_Int:
7021 case X86::VCVTSS2SDZrr:
7022 case X86::VCVTSS2SDZrr_Int:
7023 case X86::VCVTSS2SDZrrb_Int:
7024 case X86::VCVTSS2SDZrm:
7025 case X86::VCVTSS2SDZrm_Int:
7026 case X86::VGETEXPSDZr:
7027 case X86::VGETEXPSDZrb:
7028 case X86::VGETEXPSDZm:
7029 case X86::VGETEXPSSZr:
7030 case X86::VGETEXPSSZrb:
7031 case X86::VGETEXPSSZm:
7032 case X86::VGETMANTSDZrri:
7033 case X86::VGETMANTSDZrrib:
7034 case X86::VGETMANTSDZrmi:
7035 case X86::VGETMANTSSZrri:
7036 case X86::VGETMANTSSZrrib:
7037 case X86::VGETMANTSSZrmi:
7038 case X86::VRNDSCALESDZrri:
7039 case X86::VRNDSCALESDZrri_Int:
7040 case X86::VRNDSCALESDZrrib_Int:
7041 case X86::VRNDSCALESDZrmi:
7042 case X86::VRNDSCALESDZrmi_Int:
7043 case X86::VRNDSCALESSZrri:
7044 case X86::VRNDSCALESSZrri_Int:
7045 case X86::VRNDSCALESSZrrib_Int:
7046 case X86::VRNDSCALESSZrmi:
7047 case X86::VRNDSCALESSZrmi_Int:
7048 case X86::VRCP14SDZrr:
7049 case X86::VRCP14SDZrm:
7050 case X86::VRCP14SSZrr:
7051 case X86::VRCP14SSZrm:
7052 case X86::VRCPSHZrr:
7053 case X86::VRCPSHZrm:
7054 case X86::VRSQRTSHZrr:
7055 case X86::VRSQRTSHZrm:
7056 case X86::VREDUCESHZrmi:
7057 case X86::VREDUCESHZrri:
7058 case X86::VREDUCESHZrrib:
7059 case X86::VGETEXPSHZr:
7060 case X86::VGETEXPSHZrb:
7061 case X86::VGETEXPSHZm:
7062 case X86::VGETMANTSHZrri:
7063 case X86::VGETMANTSHZrrib:
7064 case X86::VGETMANTSHZrmi:
7065 case X86::VRNDSCALESHZrri:
7066 case X86::VRNDSCALESHZrri_Int:
7067 case X86::VRNDSCALESHZrrib_Int:
7068 case X86::VRNDSCALESHZrmi:
7069 case X86::VRNDSCALESHZrmi_Int:
7070 case X86::VSQRTSHZr:
7071 case X86::VSQRTSHZr_Int:
7072 case X86::VSQRTSHZrb_Int:
7073 case X86::VSQRTSHZm:
7074 case X86::VSQRTSHZm_Int:
7075 case X86::VRCP28SDZr:
7076 case X86::VRCP28SDZrb:
7077 case X86::VRCP28SDZm:
7078 case X86::VRCP28SSZr:
7079 case X86::VRCP28SSZrb:
7080 case X86::VRCP28SSZm:
7081 case X86::VREDUCESSZrmi:
7082 case X86::VREDUCESSZrri:
7083 case X86::VREDUCESSZrrib:
7084 case X86::VRSQRT14SDZrr:
7085 case X86::VRSQRT14SDZrm:
7086 case X86::VRSQRT14SSZrr:
7087 case X86::VRSQRT14SSZrm:
7088 case X86::VRSQRT28SDZr:
7089 case X86::VRSQRT28SDZrb:
7090 case X86::VRSQRT28SDZm:
7091 case X86::VRSQRT28SSZr:
7092 case X86::VRSQRT28SSZrb:
7093 case X86::VRSQRT28SSZm:
7094 case X86::VSQRTSSZr:
7095 case X86::VSQRTSSZr_Int:
7096 case X86::VSQRTSSZrb_Int:
7097 case X86::VSQRTSSZm:
7098 case X86::VSQRTSSZm_Int:
7099 case X86::VSQRTSDZr:
7100 case X86::VSQRTSDZr_Int:
7101 case X86::VSQRTSDZrb_Int:
7102 case X86::VSQRTSDZm:
7103 case X86::VSQRTSDZm_Int:
7104 case X86::VCVTSD2SHZrr:
7105 case X86::VCVTSD2SHZrr_Int:
7106 case X86::VCVTSD2SHZrrb_Int:
7107 case X86::VCVTSD2SHZrm:
7108 case X86::VCVTSD2SHZrm_Int:
7109 case X86::VCVTSS2SHZrr:
7110 case X86::VCVTSS2SHZrr_Int:
7111 case X86::VCVTSS2SHZrrb_Int:
7112 case X86::VCVTSS2SHZrm:
7113 case X86::VCVTSS2SHZrm_Int:
7114 case X86::VCVTSH2SDZrr:
7115 case X86::VCVTSH2SDZrr_Int:
7116 case X86::VCVTSH2SDZrrb_Int:
7117 case X86::VCVTSH2SDZrm:
7118 case X86::VCVTSH2SDZrm_Int:
7119 case X86::VCVTSH2SSZrr:
7120 case X86::VCVTSH2SSZrr_Int:
7121 case X86::VCVTSH2SSZrrb_Int:
7122 case X86::VCVTSH2SSZrm:
7123 case X86::VCVTSH2SSZrm_Int:
7124 return OpNum == 1;
7125 case X86::VMOVSSZrrk:
7126 case X86::VMOVSDZrrk:
7127 return OpNum == 3 && !ForLoadFold;
7128 case X86::VMOVSSZrrkz:
7129 case X86::VMOVSDZrrkz:
7130 return OpNum == 2 && !ForLoadFold;
7131 }
7132
7133 return false;
7134}
7135
7136/// Inform the BreakFalseDeps pass how many idle instructions we would like
7137/// before certain undef register reads.
7138///
7139/// This catches the VCVTSI2SD family of instructions:
7140///
7141/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
7142///
7143 /// We should be careful *not* to catch VXOR idioms which are presumably
7144/// handled specially in the pipeline:
7145///
7146/// vxorps undef %xmm1, undef %xmm1, %xmm1
7147///
7148/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
7149/// high bits that are passed-through are not live.
7150unsigned
7151 X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
7152 const TargetRegisterInfo *TRI) const {
7153 const MachineOperand &MO = MI.getOperand(OpNum);
7154 if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum))
7155 return UndefRegClearance;
7156
7157 return 0;
7158}
7159
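// e.g. (illustrative) for an undef read of $ymm1, the dependency-breaking
// idiom inserted below is
//   $xmm1 = VXORPSrr undef $xmm1, undef $xmm1, implicit-def $ymm1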
7160 void X86InstrInfo::breakPartialRegDependency(
7161 MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
7162 Register Reg = MI.getOperand(OpNum).getReg();
7163 // If MI kills this register, the false dependence is already broken.
7164 if (MI.killsRegister(Reg, TRI))
7165 return;
7166
7167 if (X86::VR128RegClass.contains(Reg)) {
7168 // These instructions are all floating point domain, so xorps is the best
7169 // choice.
7170 unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
7171 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
7172 .addReg(Reg, RegState::Undef)
7173 .addReg(Reg, RegState::Undef);
7174 MI.addRegisterKilled(Reg, TRI, true);
7175 } else if (X86::VR256RegClass.contains(Reg)) {
7176 // Use vxorps to clear the full ymm register.
7177 // It wants to read and write the xmm sub-register.
7178 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7179 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
7180 .addReg(XReg, RegState::Undef)
7181 .addReg(XReg, RegState::Undef)
7182 .addReg(Reg, RegState::ImplicitDefine);
7183 MI.addRegisterKilled(Reg, TRI, true);
7184 } else if (X86::VR128XRegClass.contains(Reg)) {
7185 // Only handle VLX targets.
7186 if (!Subtarget.hasVLX())
7187 return;
7188 // Since vxorps requires AVX512DQ, vpxord should be the best choice.
7189 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
7190 .addReg(Reg, RegState::Undef)
7191 .addReg(Reg, RegState::Undef);
7192 MI.addRegisterKilled(Reg, TRI, true);
7193 } else if (X86::VR256XRegClass.contains(Reg) ||
7194 X86::VR512RegClass.contains(Reg)) {
7195 // Only handle VLX targets.
7196 if (!Subtarget.hasVLX())
7197 return;
7198 // Use vpxord to clear the full ymm/zmm register.
7199 // It wants to read and write the xmm sub-register.
7200 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7201 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
7202 .addReg(XReg, RegState::Undef)
7203 .addReg(XReg, RegState::Undef)
7204 .addReg(Reg, RegState::ImplicitDefine);
7205 MI.addRegisterKilled(Reg, TRI, true);
7206 } else if (X86::GR64RegClass.contains(Reg)) {
7207 // Use XOR32rr because it has a shorter encoding and implicitly zeroes the
7208 // upper 32 bits as well.
7209 Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
7210 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
7211 .addReg(XReg, RegState::Undef)
7212 .addReg(XReg, RegState::Undef)
7213 .addReg(Reg, RegState::ImplicitDefine);
7214 MI.addRegisterKilled(Reg, TRI, true);
7215 } else if (X86::GR32RegClass.contains(Reg)) {
7216 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7217 .addReg(Reg, RegState::Undef)
7218 .addReg(Reg, RegState::Undef);
7219 MI.addRegisterKilled(Reg, TRI, true);
7220 } else if ((X86::GR16RegClass.contains(Reg) ||
7221 X86::GR8RegClass.contains(Reg)) &&
7222 X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
7223 // This case is only expected for NDD ops, which appear to be partial
7224 // writes but are not, due to the zeroing of the upper part. Here
7225 // we add an implicit def of the superregister, which prevents
7226 // CompressEVEX from converting this to a legacy form.
7227 Register SuperReg = getX86SubSuperRegister(Reg, 64);
7228 MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
7229 if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
7230 BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
7231 }
7232}
7233
7234 static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
7235 int PtrOffset = 0) {
7236 unsigned NumAddrOps = MOs.size();
7237
7238 if (NumAddrOps < 4) {
7239 // FrameIndex only - add an immediate offset (whether it's zero or not).
7240 for (unsigned i = 0; i != NumAddrOps; ++i)
7241 MIB.add(MOs[i]);
7242 addOffset(MIB, PtrOffset);
7243 } else {
7244 // General Memory Addressing - we need to add any offset to an existing
7245 // offset.
7246 assert(MOs.size() == 5 && "Unexpected memory operand list length");
7247 for (unsigned i = 0; i != NumAddrOps; ++i) {
7248 const MachineOperand &MO = MOs[i];
7249 if (i == 3 && PtrOffset != 0) {
7250 MIB.addDisp(MO, PtrOffset);
7251 } else {
7252 MIB.add(MO);
7253 }
7254 }
7255 }
7256}
7257
7258 static void updateOperandRegConstraints(MachineFunction &MF,
7259 MachineInstr &NewMI,
7260 const TargetInstrInfo &TII) {
7261 MachineRegisterInfo &MRI = MF.getRegInfo();
7262
7263 for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7264 MachineOperand &MO = NewMI.getOperand(Idx);
7265 // We only need to update constraints on virtual register operands.
7266 if (!MO.isReg())
7267 continue;
7268 Register Reg = MO.getReg();
7269 if (!Reg.isVirtual())
7270 continue;
7271
7272 auto *NewRC =
7273 MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx));
7274 if (!NewRC) {
7275 LLVM_DEBUG(
7276 dbgs() << "WARNING: Unable to update register constraint for operand "
7277 << Idx << " of instruction:\n";
7278 NewMI.dump(); dbgs() << "\n");
7279 }
7280 }
7281}
7282
7283static MachineInstr *fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
7284 ArrayRef<MachineOperand> MOs,
7285 MachineBasicBlock::iterator InsertPt,
7286 MachineInstr &MI,
7287 const TargetInstrInfo &TII) {
7288 // Create the base instruction with the memory operand as the first part.
7289 // Omit the implicit operands, something BuildMI can't do.
7290 MachineInstr *NewMI =
7291 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7292 MachineInstrBuilder MIB(MF, NewMI);
7293 addOperands(MIB, MOs);
7294
7295 // Loop over the rest of the ri operands, converting them over.
7296 unsigned NumOps = MI.getDesc().getNumOperands() - 2;
7297 for (unsigned i = 0; i != NumOps; ++i) {
7298 MachineOperand &MO = MI.getOperand(i + 2);
7299 MIB.add(MO);
7300 }
7301 for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2))
7302 MIB.add(MO);
7303
7304 updateOperandRegConstraints(MF, *NewMI, TII);
7305
7306 MachineBasicBlock *MBB = InsertPt->getParent();
7307 MBB->insert(InsertPt, NewMI);
7308
7309 return MIB;
7310}
7311
7312static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode,
7313 unsigned OpNo, ArrayRef<MachineOperand> MOs,
7314 MachineBasicBlock::iterator InsertPt,
7315 MachineInstr &MI, const TargetInstrInfo &TII,
7316 int PtrOffset = 0) {
7317 // Omit the implicit operands, something BuildMI can't do.
7318 MachineInstr *NewMI =
7319 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7320 MachineInstrBuilder MIB(MF, NewMI);
7321
7322 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
7323 MachineOperand &MO = MI.getOperand(i);
7324 if (i == OpNo) {
7325 assert(MO.isReg() && "Expected to fold into reg operand!");
7326 addOperands(MIB, MOs, PtrOffset);
7327 } else {
7328 MIB.add(MO);
7329 }
7330 }
7331
7332 updateOperandRegConstraints(MF, *NewMI, TII);
7333
7334 // Copy the NoFPExcept flag from the instruction we're fusing.
7335 if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
7336 NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept);
7337
7338 MachineBasicBlock *MBB = InsertPt->getParent();
7339 MBB->insert(InsertPt, NewMI);
7340
7341 return MIB;
7342}
7343
7344static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
7345 ArrayRef<MachineOperand> MOs,
7346 MachineBasicBlock::iterator InsertPt,
7347 MachineInstr &MI) {
7348 MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
7349 MI.getDebugLoc(), TII.get(Opcode));
7350 addOperands(MIB, MOs);
7351 return MIB.addImm(0);
7352}
7353
7354MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
7355 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7356 ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
7357 unsigned Size, Align Alignment) const {
7358 switch (MI.getOpcode()) {
7359 case X86::INSERTPSrri:
7360 case X86::VINSERTPSrri:
7361 case X86::VINSERTPSZrri:
7362 // Attempt to convert the load of inserted vector into a fold load
7363 // of a single float.
7364 if (OpNum == 2) {
7365 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
7366 unsigned ZMask = Imm & 15;
7367 unsigned DstIdx = (Imm >> 4) & 3;
7368 unsigned SrcIdx = (Imm >> 6) & 3;
7369
7370 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7371 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7372 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7373 if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
7374 (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) {
7375 int PtrOffset = SrcIdx * 4;
7376 unsigned NewImm = (DstIdx << 4) | ZMask;
7377 unsigned NewOpCode =
7378 (MI.getOpcode() == X86::VINSERTPSZrri) ? X86::VINSERTPSZrmi
7379 : (MI.getOpcode() == X86::VINSERTPSrri) ? X86::VINSERTPSrmi
7380 : X86::INSERTPSrmi;
7381 MachineInstr *NewMI =
7382 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7383 NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
7384 return NewMI;
7385 }
7386 }
7387 break;
7388 case X86::MOVHLPSrr:
7389 case X86::VMOVHLPSrr:
7390 case X86::VMOVHLPSZrr:
7391 // Move the upper 64-bits of the second operand to the lower 64-bits.
7392 // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
7393 // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
7394 if (OpNum == 2) {
7395 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7396 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7397 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7398 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7399 unsigned NewOpCode =
7400 (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm
7401 : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7402 : X86::MOVLPSrm;
7403 MachineInstr *NewMI =
7404 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7405 return NewMI;
7406 }
7407 }
7408 break;
7409 case X86::UNPCKLPDrr:
7410 // If we won't be able to fold this to the memory form of UNPCKL, use
7411 // MOVHPD instead. Done as custom because we can't have this in the load
7412 // table twice.
7413 if (OpNum == 2) {
7414 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7415 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7416 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7417 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7418 MachineInstr *NewMI =
7419 fuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7420 return NewMI;
7421 }
7422 }
7423 break;
7424 case X86::MOV32r0:
7425 if (auto *NewMI =
7426 makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,
7427 InsertPt, MI))
7428 return NewMI;
7429 break;
7430 }
7431
7432 return nullptr;
7433}
7434
7435 static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
7436 MachineInstr &MI) {
7437 if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7438 !MI.getOperand(1).isReg())
7439 return false;
7440
7441 // There are two cases we need to handle depending on where in the pipeline
7442 // the folding attempt is being made.
7443 // -Register has the undef flag set.
7444 // -Register is produced by the IMPLICIT_DEF instruction.
7445
7446 if (MI.getOperand(1).isUndef())
7447 return true;
7448
7449 MachineRegisterInfo &RegInfo = MF.getRegInfo();
7450 MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7451 return VRegDef && VRegDef->isImplicitDef();
7452}
7453
7454unsigned X86InstrInfo::commuteOperandsForFold(MachineInstr &MI,
7455 unsigned Idx1) const {
7456 unsigned Idx2 = CommuteAnyOperandIndex;
7457 if (!findCommutedOpIndices(MI, Idx1, Idx2))
7458 return Idx1;
7459
7460 bool HasDef = MI.getDesc().getNumDefs();
7461 Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7462 Register Reg1 = MI.getOperand(Idx1).getReg();
7463 Register Reg2 = MI.getOperand(Idx2).getReg();
7464 bool Tied1 = 0 == MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO);
7465 bool Tied2 = 0 == MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO);
7466
7467 // If either of the commutable operands are tied to the destination
7468 // then we can not commute + fold.
7469 if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2))
7470 return Idx1;
7471
7472 return commuteInstruction(MI, false, Idx1, Idx2) ? Idx2 : Idx1;
7473}
7474
7475static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx) {
7476 if (PrintFailedFusing && !MI.isCopy())
7477 dbgs() << "We failed to fuse operand " << Idx << " in " << MI;
7478}
7479
7480 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7481 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7482 ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
7483 unsigned Size, Align Alignment, bool AllowCommute) const {
7484 bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
7485 unsigned Opc = MI.getOpcode();
7486
7487 // For CPUs that favor the register form of a call or push,
7488 // do not fold loads into calls or pushes, unless optimizing for size
7489 // aggressively.
7490 if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
7491 (Opc == X86::CALL32r || Opc == X86::CALL64r ||
7492 Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
7493 Opc == X86::PUSH32r || Opc == X86::PUSH64r))
7494 return nullptr;
7495
7496 // Avoid partial and undef register update stalls unless optimizing for size.
7497 if (!MF.getFunction().hasOptSize() &&
7498 (hasPartialRegUpdate(Opc, Subtarget, /*ForLoadFold*/ true) ||
7499 shouldPreventUndefRegUpdateMemFold(MF, MI)))
7500 return nullptr;
7501
7502 unsigned NumOps = MI.getDesc().getNumOperands();
7503 bool IsTwoAddr = NumOps > 1 && OpNum < 2 && MI.getOperand(0).isReg() &&
7504 MI.getOperand(1).isReg() &&
7505 MI.getOperand(0).getReg() == MI.getOperand(1).getReg();
7506
7507 // FIXME: AsmPrinter doesn't know how to handle
7508 // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
7509 if (Opc == X86::ADD32ri &&
7510 MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
7511 return nullptr;
7512
7513 // GOTTPOFF relocation loads can only be folded into add instructions.
7514 // FIXME: Need to exclude other relocations that only support specific
7515 // instructions.
7516 if (MOs.size() == X86::AddrNumOperands &&
7517 MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
7518 Opc != X86::ADD64rr)
7519 return nullptr;
7520
7521 // Don't fold loads into indirect calls that need a KCFI check as we'll
7522 // have to unfold these in X86TargetLowering::EmitKCFICheck anyway.
7523 if (MI.isCall() && MI.getCFIType())
7524 return nullptr;
7525
7526 // Attempt to fold any custom cases we have.
7527 if (auto *CustomMI = foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt,
7528 Size, Alignment))
7529 return CustomMI;
7530
7531 // Folding a memory location into the two-address part of a two-address
7532 // instruction is different than folding it other places. It requires
7533 // replacing the *two* registers with the memory location.
7534 //
7535 // Utilize the mapping NonNDD -> RMW for the NDD variant.
7536 unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U;
7537 const X86FoldTableEntry *I =
7538 IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
7539 : lookupFoldTable(Opc, OpNum);
7540
7541 MachineInstr *NewMI = nullptr;
7542 if (I) {
7543 unsigned Opcode = I->DstOp;
7544 if (Alignment <
7545 Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
7546 return nullptr;
7547 bool NarrowToMOV32rm = false;
7548 if (Size) {
7549 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7550 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7551 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7552 // Check if it's safe to fold the load. If the size of the object is
7553 // narrower than the load width, then it's not.
7554 // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7555 if ((I->Flags & TB_FOLDED_LOAD) && Size < RCSize) {
7556 // If this is a 64-bit load, but the spill slot is 32, then we can do
7557 // a 32-bit load which is implicitly zero-extended. This likely is
7558 // due to live interval analysis remat'ing a load from stack slot.
7559 if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7560 return nullptr;
7561 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7562 return nullptr;
7563 Opcode = X86::MOV32rm;
7564 NarrowToMOV32rm = true;
7565 }
7566 // For stores, make sure the size of the object is equal to the size of
7567 // the store. If the object is larger, the extra bits would be garbage. If
7568 // the object is smaller we might overwrite another object or fault.
7569 if ((I->Flags & TB_FOLDED_STORE) && Size != RCSize)
7570 return nullptr;
7571 }
7572
7573 NewMI = IsTwoAddr ? fuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this)
7574 : fuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7575
7576 if (NarrowToMOV32rm) {
7577 // This is the special case where we use a MOV32rm to load a 32-bit
7578 // value and zero-extend the top bits; change the destination register
7579 // to a 32-bit one.
7580 Register DstReg = NewMI->getOperand(0).getReg();
7581 if (DstReg.isPhysical())
7582 NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7583 else
7584 NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7585 }
7586 return NewMI;
7587 }
7588
7589 if (AllowCommute) {
7590 // If the instruction and target operand are commutable, commute the
7591 // instruction and try again.
7592 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
7593 if (CommuteOpIdx2 == OpNum) {
7594 printFailMsgforFold(MI, OpNum);
7595 return nullptr;
7596 }
7597 // Attempt to fold with the commuted version of the instruction.
7598 NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7599 Alignment, /*AllowCommute=*/false);
7600 if (NewMI)
7601 return NewMI;
7602 // Folding failed again - undo the commute before returning.
7603 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
7604 }
7605
7606 printFailMsgforFold(MI, OpNum);
7607 return nullptr;
7608}
7609
7610 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7611 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7612 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7613 VirtRegMap *VRM) const {
7614 // Check switch flag
7615 if (NoFusing)
7616 return nullptr;
7617
7618 // Avoid partial and undef register update stalls unless optimizing for size.
7619 if (!MF.getFunction().hasOptSize() &&
7620 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7621 shouldPreventUndefRegUpdateMemFold(MF, MI)))
7622 return nullptr;
7623
7624 // Don't fold subreg spills, or reloads that use a high subreg.
7625 for (auto Op : Ops) {
7626 MachineOperand &MO = MI.getOperand(Op);
7627 auto SubReg = MO.getSubReg();
7628 // MOV32r0 is special b/c it's used to clear a 64-bit register too.
7629 // (See patterns for MOV32r0 in TD files).
7630 if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit)
7631 continue;
7632 if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
7633 return nullptr;
7634 }
7635
7636 const MachineFrameInfo &MFI = MF.getFrameInfo();
7637 unsigned Size = MFI.getObjectSize(FrameIndex);
7638 Align Alignment = MFI.getObjectAlign(FrameIndex);
7639 // If the function stack isn't realigned we don't want to fold instructions
7640 // that need increased alignment.
7641 if (!RI.hasStackRealignment(MF))
7642 Alignment =
7643 std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
7644
7645 auto Impl = [&]() {
7646 return foldMemoryOperandImpl(MF, MI, Ops[0],
7647 MachineOperand::CreateFI(FrameIndex), InsertPt,
7648 Size, Alignment, /*AllowCommute=*/true);
7649 };
7650 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7651 unsigned NewOpc = 0;
7652 unsigned RCSize = 0;
7653 unsigned Opc = MI.getOpcode();
7654 switch (Opc) {
7655 default:
7656 // NDD can be folded into RMW though its Op0 and Op1 are not tied.
7657 return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl()
7658 : nullptr;
7659 case X86::TEST8rr:
7660 NewOpc = X86::CMP8ri;
7661 RCSize = 1;
7662 break;
7663 case X86::TEST16rr:
7664 NewOpc = X86::CMP16ri;
7665 RCSize = 2;
7666 break;
7667 case X86::TEST32rr:
7668 NewOpc = X86::CMP32ri;
7669 RCSize = 4;
7670 break;
7671 case X86::TEST64rr:
7672 NewOpc = X86::CMP64ri32;
7673 RCSize = 8;
7674 break;
7675 }
7676 // Check if it's safe to fold the load. If the size of the object is
7677 // narrower than the load width, then it's not.
7678 if (Size < RCSize)
7679 return nullptr;
7680 // Change to CMPXXri r, 0 first.
7681 MI.setDesc(get(NewOpc));
7682 MI.getOperand(1).ChangeToImmediate(0);
7683 } else if (Ops.size() != 1)
7684 return nullptr;
7685
7686 return Impl();
7687}
7688
7689/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
7690/// because the latter uses contents that wouldn't be defined in the folded
7691/// version. For instance, this transformation isn't legal:
7692/// movss (%rdi), %xmm0
7693/// addps %xmm0, %xmm0
7694/// ->
7695/// addps (%rdi), %xmm0
7696///
7697/// But this one is:
7698/// movss (%rdi), %xmm0
7699/// addss %xmm0, %xmm0
7700/// ->
7701/// addss (%rdi), %xmm0
7702///
7703 static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
7704 const MachineInstr &UserMI,
7705 const MachineFunction &MF) {
7706 unsigned Opc = LoadMI.getOpcode();
7707 unsigned UserOpc = UserMI.getOpcode();
7708 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7709 const TargetRegisterClass *RC =
7710 MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
7711 unsigned RegSize = TRI.getRegSizeInBits(*RC);
7712
7713 if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
7714 Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
7715 Opc == X86::VMOVSSZrm_alt) &&
7716 RegSize > 32) {
7717 // These instructions only load 32 bits, we can't fold them if the
7718 // destination register is wider than 32 bits (4 bytes), and its user
7719 // instruction isn't scalar (SS).
7720 switch (UserOpc) {
7721 case X86::CVTSS2SDrr_Int:
7722 case X86::VCVTSS2SDrr_Int:
7723 case X86::VCVTSS2SDZrr_Int:
7724 case X86::VCVTSS2SDZrrk_Int:
7725 case X86::VCVTSS2SDZrrkz_Int:
7726 case X86::CVTSS2SIrr_Int:
7727 case X86::CVTSS2SI64rr_Int:
7728 case X86::VCVTSS2SIrr_Int:
7729 case X86::VCVTSS2SI64rr_Int:
7730 case X86::VCVTSS2SIZrr_Int:
7731 case X86::VCVTSS2SI64Zrr_Int:
7732 case X86::CVTTSS2SIrr_Int:
7733 case X86::CVTTSS2SI64rr_Int:
7734 case X86::VCVTTSS2SIrr_Int:
7735 case X86::VCVTTSS2SI64rr_Int:
7736 case X86::VCVTTSS2SIZrr_Int:
7737 case X86::VCVTTSS2SI64Zrr_Int:
7738 case X86::VCVTSS2USIZrr_Int:
7739 case X86::VCVTSS2USI64Zrr_Int:
7740 case X86::VCVTTSS2USIZrr_Int:
7741 case X86::VCVTTSS2USI64Zrr_Int:
7742 case X86::RCPSSr_Int:
7743 case X86::VRCPSSr_Int:
7744 case X86::RSQRTSSr_Int:
7745 case X86::VRSQRTSSr_Int:
7746 case X86::ROUNDSSri_Int:
7747 case X86::VROUNDSSri_Int:
7748 case X86::COMISSrr_Int:
7749 case X86::VCOMISSrr_Int:
7750 case X86::VCOMISSZrr_Int:
7751 case X86::UCOMISSrr_Int:
7752 case X86::VUCOMISSrr_Int:
7753 case X86::VUCOMISSZrr_Int:
7754 case X86::ADDSSrr_Int:
7755 case X86::VADDSSrr_Int:
7756 case X86::VADDSSZrr_Int:
7757 case X86::CMPSSrri_Int:
7758 case X86::VCMPSSrri_Int:
7759 case X86::VCMPSSZrri_Int:
7760 case X86::DIVSSrr_Int:
7761 case X86::VDIVSSrr_Int:
7762 case X86::VDIVSSZrr_Int:
7763 case X86::MAXSSrr_Int:
7764 case X86::VMAXSSrr_Int:
7765 case X86::VMAXSSZrr_Int:
7766 case X86::MINSSrr_Int:
7767 case X86::VMINSSrr_Int:
7768 case X86::VMINSSZrr_Int:
7769 case X86::MULSSrr_Int:
7770 case X86::VMULSSrr_Int:
7771 case X86::VMULSSZrr_Int:
7772 case X86::SQRTSSr_Int:
7773 case X86::VSQRTSSr_Int:
7774 case X86::VSQRTSSZr_Int:
7775 case X86::SUBSSrr_Int:
7776 case X86::VSUBSSrr_Int:
7777 case X86::VSUBSSZrr_Int:
7778 case X86::VADDSSZrrk_Int:
7779 case X86::VADDSSZrrkz_Int:
7780 case X86::VCMPSSZrrik_Int:
7781 case X86::VDIVSSZrrk_Int:
7782 case X86::VDIVSSZrrkz_Int:
7783 case X86::VMAXSSZrrk_Int:
7784 case X86::VMAXSSZrrkz_Int:
7785 case X86::VMINSSZrrk_Int:
7786 case X86::VMINSSZrrkz_Int:
7787 case X86::VMULSSZrrk_Int:
7788 case X86::VMULSSZrrkz_Int:
7789 case X86::VSQRTSSZrk_Int:
7790 case X86::VSQRTSSZrkz_Int:
7791 case X86::VSUBSSZrrk_Int:
7792 case X86::VSUBSSZrrkz_Int:
7793 case X86::VFMADDSS4rr_Int:
7794 case X86::VFNMADDSS4rr_Int:
7795 case X86::VFMSUBSS4rr_Int:
7796 case X86::VFNMSUBSS4rr_Int:
7797 case X86::VFMADD132SSr_Int:
7798 case X86::VFNMADD132SSr_Int:
7799 case X86::VFMADD213SSr_Int:
7800 case X86::VFNMADD213SSr_Int:
7801 case X86::VFMADD231SSr_Int:
7802 case X86::VFNMADD231SSr_Int:
7803 case X86::VFMSUB132SSr_Int:
7804 case X86::VFNMSUB132SSr_Int:
7805 case X86::VFMSUB213SSr_Int:
7806 case X86::VFNMSUB213SSr_Int:
7807 case X86::VFMSUB231SSr_Int:
7808 case X86::VFNMSUB231SSr_Int:
7809 case X86::VFMADD132SSZr_Int:
7810 case X86::VFNMADD132SSZr_Int:
7811 case X86::VFMADD213SSZr_Int:
7812 case X86::VFNMADD213SSZr_Int:
7813 case X86::VFMADD231SSZr_Int:
7814 case X86::VFNMADD231SSZr_Int:
7815 case X86::VFMSUB132SSZr_Int:
7816 case X86::VFNMSUB132SSZr_Int:
7817 case X86::VFMSUB213SSZr_Int:
7818 case X86::VFNMSUB213SSZr_Int:
7819 case X86::VFMSUB231SSZr_Int:
7820 case X86::VFNMSUB231SSZr_Int:
7821 case X86::VFMADD132SSZrk_Int:
7822 case X86::VFNMADD132SSZrk_Int:
7823 case X86::VFMADD213SSZrk_Int:
7824 case X86::VFNMADD213SSZrk_Int:
7825 case X86::VFMADD231SSZrk_Int:
7826 case X86::VFNMADD231SSZrk_Int:
7827 case X86::VFMSUB132SSZrk_Int:
7828 case X86::VFNMSUB132SSZrk_Int:
7829 case X86::VFMSUB213SSZrk_Int:
7830 case X86::VFNMSUB213SSZrk_Int:
7831 case X86::VFMSUB231SSZrk_Int:
7832 case X86::VFNMSUB231SSZrk_Int:
7833 case X86::VFMADD132SSZrkz_Int:
7834 case X86::VFNMADD132SSZrkz_Int:
7835 case X86::VFMADD213SSZrkz_Int:
7836 case X86::VFNMADD213SSZrkz_Int:
7837 case X86::VFMADD231SSZrkz_Int:
7838 case X86::VFNMADD231SSZrkz_Int:
7839 case X86::VFMSUB132SSZrkz_Int:
7840 case X86::VFNMSUB132SSZrkz_Int:
7841 case X86::VFMSUB213SSZrkz_Int:
7842 case X86::VFNMSUB213SSZrkz_Int:
7843 case X86::VFMSUB231SSZrkz_Int:
7844 case X86::VFNMSUB231SSZrkz_Int:
7845 case X86::VFIXUPIMMSSZrri:
7846 case X86::VFIXUPIMMSSZrrik:
7847 case X86::VFIXUPIMMSSZrrikz:
7848 case X86::VFPCLASSSSZri:
7849 case X86::VFPCLASSSSZrik:
7850 case X86::VGETEXPSSZr:
7851 case X86::VGETEXPSSZrk:
7852 case X86::VGETEXPSSZrkz:
7853 case X86::VGETMANTSSZrri:
7854 case X86::VGETMANTSSZrrik:
7855 case X86::VGETMANTSSZrrikz:
7856 case X86::VRANGESSZrri:
7857 case X86::VRANGESSZrrik:
7858 case X86::VRANGESSZrrikz:
7859 case X86::VRCP14SSZrr:
7860 case X86::VRCP14SSZrrk:
7861 case X86::VRCP14SSZrrkz:
7862 case X86::VRCP28SSZr:
7863 case X86::VRCP28SSZrk:
7864 case X86::VRCP28SSZrkz:
7865 case X86::VREDUCESSZrri:
7866 case X86::VREDUCESSZrrik:
7867 case X86::VREDUCESSZrrikz:
7868 case X86::VRNDSCALESSZrri_Int:
7869 case X86::VRNDSCALESSZrrik_Int:
7870 case X86::VRNDSCALESSZrrikz_Int:
7871 case X86::VRSQRT14SSZrr:
7872 case X86::VRSQRT14SSZrrk:
7873 case X86::VRSQRT14SSZrrkz:
7874 case X86::VRSQRT28SSZr:
7875 case X86::VRSQRT28SSZrk:
7876 case X86::VRSQRT28SSZrkz:
7877 case X86::VSCALEFSSZrr:
7878 case X86::VSCALEFSSZrrk:
7879 case X86::VSCALEFSSZrrkz:
7880 return false;
7881 default:
7882 return true;
7883 }
7884 }
7885
7886 if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
7887 Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
7888 Opc == X86::VMOVSDZrm_alt) &&
7889 RegSize > 64) {
7890 // These instructions only load 64 bits, we can't fold them if the
7891 // destination register is wider than 64 bits (8 bytes), and its user
7892 // instruction isn't scalar (SD).
7893 switch (UserOpc) {
7894 case X86::CVTSD2SSrr_Int:
7895 case X86::VCVTSD2SSrr_Int:
7896 case X86::VCVTSD2SSZrr_Int:
7897 case X86::VCVTSD2SSZrrk_Int:
7898 case X86::VCVTSD2SSZrrkz_Int:
7899 case X86::CVTSD2SIrr_Int:
7900 case X86::CVTSD2SI64rr_Int:
7901 case X86::VCVTSD2SIrr_Int:
7902 case X86::VCVTSD2SI64rr_Int:
7903 case X86::VCVTSD2SIZrr_Int:
7904 case X86::VCVTSD2SI64Zrr_Int:
7905 case X86::CVTTSD2SIrr_Int:
7906 case X86::CVTTSD2SI64rr_Int:
7907 case X86::VCVTTSD2SIrr_Int:
7908 case X86::VCVTTSD2SI64rr_Int:
7909 case X86::VCVTTSD2SIZrr_Int:
7910 case X86::VCVTTSD2SI64Zrr_Int:
7911 case X86::VCVTSD2USIZrr_Int:
7912 case X86::VCVTSD2USI64Zrr_Int:
7913 case X86::VCVTTSD2USIZrr_Int:
7914 case X86::VCVTTSD2USI64Zrr_Int:
7915 case X86::ROUNDSDri_Int:
7916 case X86::VROUNDSDri_Int:
7917 case X86::COMISDrr_Int:
7918 case X86::VCOMISDrr_Int:
7919 case X86::VCOMISDZrr_Int:
7920 case X86::UCOMISDrr_Int:
7921 case X86::VUCOMISDrr_Int:
7922 case X86::VUCOMISDZrr_Int:
7923 case X86::ADDSDrr_Int:
7924 case X86::VADDSDrr_Int:
7925 case X86::VADDSDZrr_Int:
7926 case X86::CMPSDrri_Int:
7927 case X86::VCMPSDrri_Int:
7928 case X86::VCMPSDZrri_Int:
7929 case X86::DIVSDrr_Int:
7930 case X86::VDIVSDrr_Int:
7931 case X86::VDIVSDZrr_Int:
7932 case X86::MAXSDrr_Int:
7933 case X86::VMAXSDrr_Int:
7934 case X86::VMAXSDZrr_Int:
7935 case X86::MINSDrr_Int:
7936 case X86::VMINSDrr_Int:
7937 case X86::VMINSDZrr_Int:
7938 case X86::MULSDrr_Int:
7939 case X86::VMULSDrr_Int:
7940 case X86::VMULSDZrr_Int:
7941 case X86::SQRTSDr_Int:
7942 case X86::VSQRTSDr_Int:
7943 case X86::VSQRTSDZr_Int:
7944 case X86::SUBSDrr_Int:
7945 case X86::VSUBSDrr_Int:
7946 case X86::VSUBSDZrr_Int:
7947 case X86::VADDSDZrrk_Int:
7948 case X86::VADDSDZrrkz_Int:
7949 case X86::VCMPSDZrrik_Int:
7950 case X86::VDIVSDZrrk_Int:
7951 case X86::VDIVSDZrrkz_Int:
7952 case X86::VMAXSDZrrk_Int:
7953 case X86::VMAXSDZrrkz_Int:
7954 case X86::VMINSDZrrk_Int:
7955 case X86::VMINSDZrrkz_Int:
7956 case X86::VMULSDZrrk_Int:
7957 case X86::VMULSDZrrkz_Int:
7958 case X86::VSQRTSDZrk_Int:
7959 case X86::VSQRTSDZrkz_Int:
7960 case X86::VSUBSDZrrk_Int:
7961 case X86::VSUBSDZrrkz_Int:
7962 case X86::VFMADDSD4rr_Int:
7963 case X86::VFNMADDSD4rr_Int:
7964 case X86::VFMSUBSD4rr_Int:
7965 case X86::VFNMSUBSD4rr_Int:
7966 case X86::VFMADD132SDr_Int:
7967 case X86::VFNMADD132SDr_Int:
7968 case X86::VFMADD213SDr_Int:
7969 case X86::VFNMADD213SDr_Int:
7970 case X86::VFMADD231SDr_Int:
7971 case X86::VFNMADD231SDr_Int:
7972 case X86::VFMSUB132SDr_Int:
7973 case X86::VFNMSUB132SDr_Int:
7974 case X86::VFMSUB213SDr_Int:
7975 case X86::VFNMSUB213SDr_Int:
7976 case X86::VFMSUB231SDr_Int:
7977 case X86::VFNMSUB231SDr_Int:
7978 case X86::VFMADD132SDZr_Int:
7979 case X86::VFNMADD132SDZr_Int:
7980 case X86::VFMADD213SDZr_Int:
7981 case X86::VFNMADD213SDZr_Int:
7982 case X86::VFMADD231SDZr_Int:
7983 case X86::VFNMADD231SDZr_Int:
7984 case X86::VFMSUB132SDZr_Int:
7985 case X86::VFNMSUB132SDZr_Int:
7986 case X86::VFMSUB213SDZr_Int:
7987 case X86::VFNMSUB213SDZr_Int:
7988 case X86::VFMSUB231SDZr_Int:
7989 case X86::VFNMSUB231SDZr_Int:
7990 case X86::VFMADD132SDZrk_Int:
7991 case X86::VFNMADD132SDZrk_Int:
7992 case X86::VFMADD213SDZrk_Int:
7993 case X86::VFNMADD213SDZrk_Int:
7994 case X86::VFMADD231SDZrk_Int:
7995 case X86::VFNMADD231SDZrk_Int:
7996 case X86::VFMSUB132SDZrk_Int:
7997 case X86::VFNMSUB132SDZrk_Int:
7998 case X86::VFMSUB213SDZrk_Int:
7999 case X86::VFNMSUB213SDZrk_Int:
8000 case X86::VFMSUB231SDZrk_Int:
8001 case X86::VFNMSUB231SDZrk_Int:
8002 case X86::VFMADD132SDZrkz_Int:
8003 case X86::VFNMADD132SDZrkz_Int:
8004 case X86::VFMADD213SDZrkz_Int:
8005 case X86::VFNMADD213SDZrkz_Int:
8006 case X86::VFMADD231SDZrkz_Int:
8007 case X86::VFNMADD231SDZrkz_Int:
8008 case X86::VFMSUB132SDZrkz_Int:
8009 case X86::VFNMSUB132SDZrkz_Int:
8010 case X86::VFMSUB213SDZrkz_Int:
8011 case X86::VFNMSUB213SDZrkz_Int:
8012 case X86::VFMSUB231SDZrkz_Int:
8013 case X86::VFNMSUB231SDZrkz_Int:
8014 case X86::VFIXUPIMMSDZrri:
8015 case X86::VFIXUPIMMSDZrrik:
8016 case X86::VFIXUPIMMSDZrrikz:
8017 case X86::VFPCLASSSDZri:
8018 case X86::VFPCLASSSDZrik:
8019 case X86::VGETEXPSDZr:
8020 case X86::VGETEXPSDZrk:
8021 case X86::VGETEXPSDZrkz:
8022 case X86::VGETMANTSDZrri:
8023 case X86::VGETMANTSDZrrik:
8024 case X86::VGETMANTSDZrrikz:
8025 case X86::VRANGESDZrri:
8026 case X86::VRANGESDZrrik:
8027 case X86::VRANGESDZrrikz:
8028 case X86::VRCP14SDZrr:
8029 case X86::VRCP14SDZrrk:
8030 case X86::VRCP14SDZrrkz:
8031 case X86::VRCP28SDZr:
8032 case X86::VRCP28SDZrk:
8033 case X86::VRCP28SDZrkz:
8034 case X86::VREDUCESDZrri:
8035 case X86::VREDUCESDZrrik:
8036 case X86::VREDUCESDZrrikz:
8037 case X86::VRNDSCALESDZrri_Int:
8038 case X86::VRNDSCALESDZrrik_Int:
8039 case X86::VRNDSCALESDZrrikz_Int:
8040 case X86::VRSQRT14SDZrr:
8041 case X86::VRSQRT14SDZrrk:
8042 case X86::VRSQRT14SDZrrkz:
8043 case X86::VRSQRT28SDZr:
8044 case X86::VRSQRT28SDZrk:
8045 case X86::VRSQRT28SDZrkz:
8046 case X86::VSCALEFSDZrr:
8047 case X86::VSCALEFSDZrrk:
8048 case X86::VSCALEFSDZrrkz:
8049 return false;
8050 default:
8051 return true;
8052 }
8053 }
8054
8055 if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
8056 // These instructions only load 16 bits, we can't fold them if the
8057 // destination register is wider than 16 bits (2 bytes), and its user
8058 // instruction isn't scalar (SH).
8059 switch (UserOpc) {
8060 case X86::VADDSHZrr_Int:
8061 case X86::VCMPSHZrri_Int:
8062 case X86::VDIVSHZrr_Int:
8063 case X86::VMAXSHZrr_Int:
8064 case X86::VMINSHZrr_Int:
8065 case X86::VMULSHZrr_Int:
8066 case X86::VSUBSHZrr_Int:
8067 case X86::VADDSHZrrk_Int:
8068 case X86::VADDSHZrrkz_Int:
8069 case X86::VCMPSHZrrik_Int:
8070 case X86::VDIVSHZrrk_Int:
8071 case X86::VDIVSHZrrkz_Int:
8072 case X86::VMAXSHZrrk_Int:
8073 case X86::VMAXSHZrrkz_Int:
8074 case X86::VMINSHZrrk_Int:
8075 case X86::VMINSHZrrkz_Int:
8076 case X86::VMULSHZrrk_Int:
8077 case X86::VMULSHZrrkz_Int:
8078 case X86::VSUBSHZrrk_Int:
8079 case X86::VSUBSHZrrkz_Int:
8080 case X86::VFMADD132SHZr_Int:
8081 case X86::VFNMADD132SHZr_Int:
8082 case X86::VFMADD213SHZr_Int:
8083 case X86::VFNMADD213SHZr_Int:
8084 case X86::VFMADD231SHZr_Int:
8085 case X86::VFNMADD231SHZr_Int:
8086 case X86::VFMSUB132SHZr_Int:
8087 case X86::VFNMSUB132SHZr_Int:
8088 case X86::VFMSUB213SHZr_Int:
8089 case X86::VFNMSUB213SHZr_Int:
8090 case X86::VFMSUB231SHZr_Int:
8091 case X86::VFNMSUB231SHZr_Int:
8092 case X86::VFMADD132SHZrk_Int:
8093 case X86::VFNMADD132SHZrk_Int:
8094 case X86::VFMADD213SHZrk_Int:
8095 case X86::VFNMADD213SHZrk_Int:
8096 case X86::VFMADD231SHZrk_Int:
8097 case X86::VFNMADD231SHZrk_Int:
8098 case X86::VFMSUB132SHZrk_Int:
8099 case X86::VFNMSUB132SHZrk_Int:
8100 case X86::VFMSUB213SHZrk_Int:
8101 case X86::VFNMSUB213SHZrk_Int:
8102 case X86::VFMSUB231SHZrk_Int:
8103 case X86::VFNMSUB231SHZrk_Int:
8104 case X86::VFMADD132SHZrkz_Int:
8105 case X86::VFNMADD132SHZrkz_Int:
8106 case X86::VFMADD213SHZrkz_Int:
8107 case X86::VFNMADD213SHZrkz_Int:
8108 case X86::VFMADD231SHZrkz_Int:
8109 case X86::VFNMADD231SHZrkz_Int:
8110 case X86::VFMSUB132SHZrkz_Int:
8111 case X86::VFNMSUB132SHZrkz_Int:
8112 case X86::VFMSUB213SHZrkz_Int:
8113 case X86::VFNMSUB213SHZrkz_Int:
8114 case X86::VFMSUB231SHZrkz_Int:
8115 case X86::VFNMSUB231SHZrkz_Int:
8116 return false;
8117 default:
8118 return true;
8119 }
8120 }
8121
8122 return false;
8123}
8124
8125 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
8126 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
8127 MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
8128 LiveIntervals *LIS) const {
8129
8130 // If LoadMI is a masked load, check MI having the same mask.
8131 const MCInstrDesc &MCID = get(LoadMI.getOpcode());
8132 unsigned NumOps = MCID.getNumOperands();
8133 if (NumOps >= 3) {
8134 Register MaskReg;
8135 const MachineOperand &Op1 = LoadMI.getOperand(1);
8136 const MachineOperand &Op2 = LoadMI.getOperand(2);
8137
8138 auto IsVKWMClass = [](const TargetRegisterClass *RC) {
8139 return RC == &X86::VK2WMRegClass || RC == &X86::VK4WMRegClass ||
8140 RC == &X86::VK8WMRegClass || RC == &X86::VK16WMRegClass ||
8141 RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass;
8142 };
8143
8144 if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1)))
8145 MaskReg = Op1.getReg();
8146 else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2)))
8147 MaskReg = Op2.getReg();
8148
8149 if (MaskReg) {
8150 bool HasSameMask = false;
8151 for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) {
8152 const MachineOperand &Op = MI.getOperand(I);
8153 if (Op.isReg() && Op.getReg() == MaskReg) {
8154 HasSameMask = true;
8155 break;
8156 }
8157 }
8158 if (!HasSameMask)
8159 return nullptr;
8160 }
8161 }
8162
8163 // TODO: Support the case where LoadMI loads a wide register, but MI
8164 // only uses a subreg.
8165 for (auto Op : Ops) {
8166 if (MI.getOperand(Op).getSubReg())
8167 return nullptr;
8168 }
8169
8170 // If loading from a FrameIndex, fold directly from the FrameIndex.
8171 int FrameIndex;
8172 if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
8173 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8174 return nullptr;
8175 return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
8176 }
8177
8178 // Check switch flag
8179 if (NoFusing)
8180 return nullptr;
8181
8182 // Avoid partial and undef register update stalls unless optimizing for size.
8183 if (!MF.getFunction().hasOptSize() &&
8184 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
8185 shouldPreventUndefRegUpdateMemFold(MF, MI)))
8186 return nullptr;
8187
8188 // Do not fold an NDD instruction and a memory instruction with relocation,
8189 // to avoid emitting an APX relocation when the flag is disabled for
8190 // backward compatibility.
8191 uint64_t TSFlags = MI.getDesc().TSFlags;
8192 if (!X86EnableAPXForRelocation && isAddMemInstrWithRelocation(LoadMI) &&
8193 X86II::hasNewDataDest(TSFlags))
8194 return nullptr;
8195
8196 // Determine the alignment of the load.
8197 Align Alignment;
8198 unsigned LoadOpc = LoadMI.getOpcode();
8199 if (LoadMI.hasOneMemOperand())
8200 Alignment = (*LoadMI.memoperands_begin())->getAlign();
8201 else
8202 switch (LoadOpc) {
8203 case X86::AVX512_512_SET0:
8204 case X86::AVX512_512_SETALLONES:
8205 Alignment = Align(64);
8206 break;
8207 case X86::AVX2_SETALLONES:
8208 case X86::AVX1_SETALLONES:
8209 case X86::AVX_SET0:
8210 case X86::AVX512_256_SET0:
8211 case X86::AVX512_256_SETALLONES:
8212 Alignment = Align(32);
8213 break;
8214 case X86::V_SET0:
8215 case X86::V_SETALLONES:
8216 case X86::AVX512_128_SET0:
8217 case X86::FsFLD0F128:
8218 case X86::AVX512_FsFLD0F128:
8219 case X86::AVX512_128_SETALLONES:
8220 Alignment = Align(16);
8221 break;
8222 case X86::MMX_SET0:
8223 case X86::FsFLD0SD:
8224 case X86::AVX512_FsFLD0SD:
8225 Alignment = Align(8);
8226 break;
8227 case X86::FsFLD0SS:
8228 case X86::AVX512_FsFLD0SS:
8229 Alignment = Align(4);
8230 break;
8231 case X86::FsFLD0SH:
8232 case X86::AVX512_FsFLD0SH:
8233 Alignment = Align(2);
8234 break;
8235 default:
8236 return nullptr;
8237 }
8238 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
8239 unsigned NewOpc = 0;
8240 switch (MI.getOpcode()) {
8241 default:
8242 return nullptr;
8243 case X86::TEST8rr:
8244 NewOpc = X86::CMP8ri;
8245 break;
8246 case X86::TEST16rr:
8247 NewOpc = X86::CMP16ri;
8248 break;
8249 case X86::TEST32rr:
8250 NewOpc = X86::CMP32ri;
8251 break;
8252 case X86::TEST64rr:
8253 NewOpc = X86::CMP64ri32;
8254 break;
8255 }
8256 // Change to CMPXXri r, 0 first.
8257 MI.setDesc(get(NewOpc));
8258 MI.getOperand(1).ChangeToImmediate(0);
8259 } else if (Ops.size() != 1)
8260 return nullptr;
8261
8262 // Make sure the subregisters match.
8263 // Otherwise we risk changing the size of the load.
8264 if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
8265 return nullptr;
8266
8267 SmallVector<MachineOperand, X86::AddrNumOperands> MOs;
8268 switch (LoadOpc) {
8269 case X86::MMX_SET0:
8270 case X86::V_SET0:
8271 case X86::V_SETALLONES:
8272 case X86::AVX2_SETALLONES:
8273 case X86::AVX1_SETALLONES:
8274 case X86::AVX_SET0:
8275 case X86::AVX512_128_SET0:
8276 case X86::AVX512_256_SET0:
8277 case X86::AVX512_512_SET0:
8278 case X86::AVX512_128_SETALLONES:
8279 case X86::AVX512_256_SETALLONES:
8280 case X86::AVX512_512_SETALLONES:
8281 case X86::FsFLD0SH:
8282 case X86::AVX512_FsFLD0SH:
8283 case X86::FsFLD0SD:
8284 case X86::AVX512_FsFLD0SD:
8285 case X86::FsFLD0SS:
8286 case X86::AVX512_FsFLD0SS:
8287 case X86::FsFLD0F128:
8288 case X86::AVX512_FsFLD0F128: {
8289 // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
8290 // Create a constant-pool entry and operands to load from it.
8291
8292 // Large code model can't fold loads this way.
8293 if (MF.getTarget().getCodeModel() == CodeModel::Large)
8294 return nullptr;
8295
8296 // x86-32 PIC requires a PIC base register for constant pools.
8297 unsigned PICBase = 0;
8298 // Since we're using Small or Kernel code model, we can always use
8299 // RIP-relative addressing for a smaller encoding.
8300 if (Subtarget.is64Bit()) {
8301 PICBase = X86::RIP;
8302 } else if (MF.getTarget().isPositionIndependent()) {
8303 // FIXME: PICBase = getGlobalBaseReg(&MF);
8304 // This doesn't work for several reasons.
8305 // 1. GlobalBaseReg may have been spilled.
8306 // 2. It may not be live at MI.
8307 return nullptr;
8308 }
8309
8310 // Create a constant-pool entry.
8311 MachineConstantPool &MCP = *MF.getConstantPool();
8312 Type *Ty;
8313 bool IsAllOnes = false;
8314 switch (LoadOpc) {
8315 case X86::FsFLD0SS:
8316 case X86::AVX512_FsFLD0SS:
8317 Ty = Type::getFloatTy(MF.getFunction().getContext());
8318 break;
8319 case X86::FsFLD0SD:
8320 case X86::AVX512_FsFLD0SD:
8321 Ty = Type::getDoubleTy(MF.getFunction().getContext());
8322 break;
8323 case X86::FsFLD0F128:
8324 case X86::AVX512_FsFLD0F128:
8325 Ty = Type::getFP128Ty(MF.getFunction().getContext());
8326 break;
8327 case X86::FsFLD0SH:
8328 case X86::AVX512_FsFLD0SH:
8329 Ty = Type::getHalfTy(MF.getFunction().getContext());
8330 break;
8331 case X86::AVX512_512_SETALLONES:
8332 IsAllOnes = true;
8333 [[fallthrough]];
8334 case X86::AVX512_512_SET0:
8335 Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
8336 16);
8337 break;
8338 case X86::AVX1_SETALLONES:
8339 case X86::AVX2_SETALLONES:
8340 case X86::AVX512_256_SETALLONES:
8341 IsAllOnes = true;
8342 [[fallthrough]];
8343 case X86::AVX512_256_SET0:
8344 case X86::AVX_SET0:
8345 Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
8346 8);
8347
8348 break;
8349 case X86::MMX_SET0:
8350 Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
8351 2);
8352 break;
8353 case X86::V_SETALLONES:
8354 case X86::AVX512_128_SETALLONES:
8355 IsAllOnes = true;
8356 [[fallthrough]];
8357 case X86::V_SET0:
8358 case X86::AVX512_128_SET0:
8359 Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
8360 4);
8361 break;
8362 }
8363
8364 const Constant *C =
8365 IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty);
8366 unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
8367
8368 // Create operands to load from the constant pool entry.
8369 MOs.push_back(MachineOperand::CreateReg(PICBase, false));
8370 MOs.push_back(MachineOperand::CreateImm(1));
8371 MOs.push_back(MachineOperand::CreateReg(0, false));
8372 MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
8373 MOs.push_back(MachineOperand::CreateReg(0, false));
8374 break;
8375 }
8376 case X86::VPBROADCASTBZ128rm:
8377 case X86::VPBROADCASTBZ256rm:
8378 case X86::VPBROADCASTBZrm:
8379 case X86::VBROADCASTF32X2Z256rm:
8380 case X86::VBROADCASTF32X2Zrm:
8381 case X86::VBROADCASTI32X2Z128rm:
8382 case X86::VBROADCASTI32X2Z256rm:
8383 case X86::VBROADCASTI32X2Zrm:
8384 // No instructions currently fuse with 8-bit or 32-bit x 2 broadcasts.
8385 return nullptr;
8386
8387#define FOLD_BROADCAST(SIZE) \
8388 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
8389 LoadMI.operands_begin() + NumOps); \
8390 return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
8391 /*AllowCommute=*/true);
8392 case X86::VPBROADCASTWZ128rm:
8393 case X86::VPBROADCASTWZ256rm:
8394 case X86::VPBROADCASTWZrm:
8395 FOLD_BROADCAST(16);
8396 case X86::VPBROADCASTDZ128rm:
8397 case X86::VPBROADCASTDZ256rm:
8398 case X86::VPBROADCASTDZrm:
8399 case X86::VBROADCASTSSZ128rm:
8400 case X86::VBROADCASTSSZ256rm:
8401 case X86::VBROADCASTSSZrm:
8402 FOLD_BROADCAST(32);
8403 case X86::VPBROADCASTQZ128rm:
8404 case X86::VPBROADCASTQZ256rm:
8405 case X86::VPBROADCASTQZrm:
8406 case X86::VBROADCASTSDZ256rm:
8407 case X86::VBROADCASTSDZrm:
8408 FOLD_BROADCAST(64);
8409 default: {
8410 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8411 return nullptr;
8412
8413 // Folding a normal load. Just copy the load's address operands.
8414 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
8415 LoadMI.operands_begin() + NumOps);
8416 break;
8417 }
8418 }
8419 return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
8420 /*Size=*/0, Alignment, /*AllowCommute=*/true);
8421}
8422
8424X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
8425 unsigned OpNum, ArrayRef<MachineOperand> MOs,
8427 unsigned BitsSize, bool AllowCommute) const {
8428
8429 if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
8430 return matchBroadcastSize(*I, BitsSize)
8431 ? fuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
8432 : nullptr;
8433
8434 if (AllowCommute) {
8435 // If the instruction and target operand are commutable, commute the
8436 // instruction and try again.
8437 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
8438 if (CommuteOpIdx2 == OpNum) {
8439 printFailMsgforFold(MI, OpNum);
8440 return nullptr;
8441 }
8442 MachineInstr *NewMI =
8443 foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize,
8444 /*AllowCommute=*/false);
8445 if (NewMI)
8446 return NewMI;
8447 // Folding failed again - undo the commute before returning.
8448 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
8449 }
8450
8451 printFailMsgforFold(MI, OpNum);
8452 return nullptr;
8453}
8454
8455 static SmallVector<MachineMemOperand *, 2>
8456 extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
8457 SmallVector<MachineMemOperand *, 2> LoadMMOs;
8458
8459 for (MachineMemOperand *MMO : MMOs) {
8460 if (!MMO->isLoad())
8461 continue;
8462
8463 if (!MMO->isStore()) {
8464 // Reuse the MMO.
8465 LoadMMOs.push_back(MMO);
8466 } else {
8467 // Clone the MMO and unset the store flag.
8468 LoadMMOs.push_back(MF.getMachineMemOperand(
8469 MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
8470 }
8471 }
8472
8473 return LoadMMOs;
8474}
8475
8476 static SmallVector<MachineMemOperand *, 2>
8477 extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
8478 SmallVector<MachineMemOperand *, 2> StoreMMOs;
8479
8480 for (MachineMemOperand *MMO : MMOs) {
8481 if (!MMO->isStore())
8482 continue;
8483
8484 if (!MMO->isLoad()) {
8485 // Reuse the MMO.
8486 StoreMMOs.push_back(MMO);
8487 } else {
8488 // Clone the MMO and unset the load flag.
8489 StoreMMOs.push_back(MF.getMachineMemOperand(
8490 MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
8491 }
8492 }
8493
8494 return StoreMMOs;
8495}
8496
8497 static unsigned getBroadcastOpcode(const X86FoldTableEntry *I,
8498 const TargetRegisterClass *RC,
8499 const X86Subtarget &STI) {
8500 assert(STI.hasAVX512() && "Expected at least AVX512!");
8501 unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
8502 assert((SpillSize == 64 || STI.hasVLX()) &&
8503 "Can't broadcast less than 64 bytes without AVX512VL!");
8504
8505#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64) \
8506 case TYPE: \
8507 switch (SpillSize) { \
8508 default: \
8509 llvm_unreachable("Unknown spill size"); \
8510 case 16: \
8511 return X86::OP16; \
8512 case 32: \
8513 return X86::OP32; \
8514 case 64: \
8515 return X86::OP64; \
8516 } \
8517 break;
8518
8519 switch (I->Flags & TB_BCAST_MASK) {
8520 default:
8521 llvm_unreachable("Unexpected broadcast type!");
8522 CASE_BCAST_TYPE_OPC(TB_BCAST_W, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8523 VPBROADCASTWZrm)
8524 CASE_BCAST_TYPE_OPC(TB_BCAST_D, VPBROADCASTDZ128rm, VPBROADCASTDZ256rm,
8525 VPBROADCASTDZrm)
8526 CASE_BCAST_TYPE_OPC(TB_BCAST_Q, VPBROADCASTQZ128rm, VPBROADCASTQZ256rm,
8527 VPBROADCASTQZrm)
8528 CASE_BCAST_TYPE_OPC(TB_BCAST_SH, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8529 VPBROADCASTWZrm)
8530 CASE_BCAST_TYPE_OPC(TB_BCAST_SS, VBROADCASTSSZ128rm, VBROADCASTSSZ256rm,
8531 VBROADCASTSSZrm)
8532 CASE_BCAST_TYPE_OPC(TB_BCAST_SD, VMOVDDUPZ128rm, VBROADCASTSDZ256rm,
8533 VBROADCASTSDZrm)
8534 }
8535}
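// For instance, a TB_BCAST_SD entry with a 16-byte spill size maps to
// VMOVDDUPZ128rm (there is no 128-bit VBROADCASTSD), while 32/64-byte spills
// select VBROADCASTSDZ256rm / VBROADCASTSDZrm via the switch above.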
8536
8537 bool X86InstrInfo::unfoldMemoryOperand(
8538 MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad,
8539 bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
8540 const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
8541 if (I == nullptr)
8542 return false;
8543 unsigned Opc = I->DstOp;
8544 unsigned Index = I->Flags & TB_INDEX_MASK;
8545 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8546 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8547 if (UnfoldLoad && !FoldedLoad)
8548 return false;
8549 UnfoldLoad &= FoldedLoad;
8550 if (UnfoldStore && !FoldedStore)
8551 return false;
8552 UnfoldStore &= FoldedStore;
8553
8554 const MCInstrDesc &MCID = get(Opc);
8555
8556 const TargetRegisterClass *RC = getRegClass(MCID, Index);
8557 const TargetRegisterInfo &TRI = getRegisterInfo();
8558 // TODO: Check if 32-byte or greater accesses are slow too?
8559 if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
8560 Subtarget.isUnalignedMem16Slow())
8561 // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
8562 // conservatively assume the address is unaligned. That's bad for
8563 // performance.
8564 return false;
8565 SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
8566 SmallVector<MachineOperand, 2> BeforeOps;
8567 SmallVector<MachineOperand, 2> AfterOps;
8568 SmallVector<MachineOperand, 4> ImpOps;
8569 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
8570 MachineOperand &Op = MI.getOperand(i);
8571 if (i >= Index && i < Index + X86::AddrNumOperands)
8572 AddrOps.push_back(Op);
8573 else if (Op.isReg() && Op.isImplicit())
8574 ImpOps.push_back(Op);
8575 else if (i < Index)
8576 BeforeOps.push_back(Op);
8577 else if (i > Index)
8578 AfterOps.push_back(Op);
8579 }
8580
8581 // Emit the load or broadcast instruction.
8582 if (UnfoldLoad) {
8583 auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
8584
8585 unsigned Opc;
8586 if (I->Flags & TB_BCAST_MASK) {
8587 Opc = getBroadcastOpcode(I, RC, Subtarget);
8588 } else {
8589 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8590 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8591 Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
8592 }
8593
8594 DebugLoc DL;
8595 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
8596 for (const MachineOperand &AddrOp : AddrOps)
8597 MIB.add(AddrOp);
8598 MIB.setMemRefs(MMOs);
8599 NewMIs.push_back(MIB);
8600
8601 if (UnfoldStore) {
8602 // Address operands cannot be marked isKill.
8603 for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
8604 MachineOperand &MO = NewMIs[0]->getOperand(i);
8605 if (MO.isReg())
8606 MO.setIsKill(false);
8607 }
8608 }
8609 }
8610
8611 // Emit the data processing instruction.
8612 MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
8613 MachineInstrBuilder MIB(MF, DataMI);
8614
8615 if (FoldedStore)
8616 MIB.addReg(Reg, RegState::Define);
8617 for (MachineOperand &BeforeOp : BeforeOps)
8618 MIB.add(BeforeOp);
8619 if (FoldedLoad)
8620 MIB.addReg(Reg);
8621 for (MachineOperand &AfterOp : AfterOps)
8622 MIB.add(AfterOp);
8623 for (MachineOperand &ImpOp : ImpOps) {
8624 MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
8625 RegState::Implicit |
8626 getKillRegState(ImpOp.isKill()) |
8627 getDeadRegState(ImpOp.isDead()) |
8628 getUndefRegState(ImpOp.isUndef()));
8629 }
8630 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8631 switch (DataMI->getOpcode()) {
8632 default:
8633 break;
8634 case X86::CMP64ri32:
8635 case X86::CMP32ri:
8636 case X86::CMP16ri:
8637 case X86::CMP8ri: {
8638 MachineOperand &MO0 = DataMI->getOperand(0);
8639 MachineOperand &MO1 = DataMI->getOperand(1);
8640 if (MO1.isImm() && MO1.getImm() == 0) {
8641 unsigned NewOpc;
8642 switch (DataMI->getOpcode()) {
8643 default:
8644 llvm_unreachable("Unreachable!");
8645 case X86::CMP64ri32:
8646 NewOpc = X86::TEST64rr;
8647 break;
8648 case X86::CMP32ri:
8649 NewOpc = X86::TEST32rr;
8650 break;
8651 case X86::CMP16ri:
8652 NewOpc = X86::TEST16rr;
8653 break;
8654 case X86::CMP8ri:
8655 NewOpc = X86::TEST8rr;
8656 break;
8657 }
8658 DataMI->setDesc(get(NewOpc));
8659 MO1.ChangeToRegister(MO0.getReg(), false);
8660 }
8661 }
8662 }
8663 NewMIs.push_back(DataMI);
8664
8665 // Emit the store instruction.
8666 if (UnfoldStore) {
8667 const TargetRegisterClass *DstRC = getRegClass(MCID, 0);
8668 auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
8669 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
8670 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8671 unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
8672 DebugLoc DL;
8673 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
8674 for (const MachineOperand &AddrOp : AddrOps)
8675 MIB.add(AddrOp);
8676 MIB.addReg(Reg, RegState::Kill);
8677 MIB.setMemRefs(MMOs);
8678 NewMIs.push_back(MIB);
8679 }
8680
8681 return true;
8682}
8683
8684 bool X86InstrInfo::unfoldMemoryOperand(
8685 SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
8686 if (!N->isMachineOpcode())
8687 return false;
8688
8689 const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
8690 if (I == nullptr)
8691 return false;
8692 unsigned Opc = I->DstOp;
8693 unsigned Index = I->Flags & TB_INDEX_MASK;
8694 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8695 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8696 const MCInstrDesc &MCID = get(Opc);
8697 MachineFunction &MF = DAG.getMachineFunction();
8698 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
8699 const TargetRegisterClass *RC = getRegClass(MCID, Index);
8700 unsigned NumDefs = MCID.NumDefs;
8701 std::vector<SDValue> AddrOps;
8702 std::vector<SDValue> BeforeOps;
8703 std::vector<SDValue> AfterOps;
8704 SDLoc dl(N);
8705 unsigned NumOps = N->getNumOperands();
8706 for (unsigned i = 0; i != NumOps - 1; ++i) {
8707 SDValue Op = N->getOperand(i);
8708 if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
8709 AddrOps.push_back(Op);
8710 else if (i < Index - NumDefs)
8711 BeforeOps.push_back(Op);
8712 else if (i > Index - NumDefs)
8713 AfterOps.push_back(Op);
8714 }
8715 SDValue Chain = N->getOperand(NumOps - 1);
8716 AddrOps.push_back(Chain);
8717
8718 // Emit the load instruction.
8719 SDNode *Load = nullptr;
8720 if (FoldedLoad) {
8721 EVT VT = *TRI.legalclasstypes_begin(*RC);
8722 auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8723 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8724 Subtarget.isUnalignedMem16Slow())
8725 // Do not introduce a slow unaligned load.
8726 return false;
8727 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8728 // memory access is slow above.
8729
8730 unsigned Opc;
8731 if (I->Flags & TB_BCAST_MASK) {
8732 Opc = getBroadcastOpcode(I, RC, Subtarget);
8733 } else {
8734 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8735 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8736 Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
8737 }
8738
8739 Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
8740 NewNodes.push_back(Load);
8741
8742 // Preserve memory reference information.
8743 DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
8744 }
8745
8746 // Emit the data processing instruction.
8747 std::vector<EVT> VTs;
8748 const TargetRegisterClass *DstRC = nullptr;
8749 if (MCID.getNumDefs() > 0) {
8750 DstRC = getRegClass(MCID, 0);
8751 VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
8752 }
8753 for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
8754 EVT VT = N->getValueType(i);
8755 if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
8756 VTs.push_back(VT);
8757 }
8758 if (Load)
8759 BeforeOps.push_back(SDValue(Load, 0));
8760 llvm::append_range(BeforeOps, AfterOps);
8761 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8762 switch (Opc) {
8763 default:
8764 break;
8765 case X86::CMP64ri32:
8766 case X86::CMP32ri:
8767 case X86::CMP16ri:
8768 case X86::CMP8ri:
8769 if (isNullConstant(BeforeOps[1])) {
8770 switch (Opc) {
8771 default:
8772 llvm_unreachable("Unreachable!");
8773 case X86::CMP64ri32:
8774 Opc = X86::TEST64rr;
8775 break;
8776 case X86::CMP32ri:
8777 Opc = X86::TEST32rr;
8778 break;
8779 case X86::CMP16ri:
8780 Opc = X86::TEST16rr;
8781 break;
8782 case X86::CMP8ri:
8783 Opc = X86::TEST8rr;
8784 break;
8785 }
8786 BeforeOps[1] = BeforeOps[0];
8787 }
8788 }
8789 SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
8790 NewNodes.push_back(NewNode);
8791
8792 // Emit the store instruction.
8793 if (FoldedStore) {
8794 AddrOps.pop_back();
8795 AddrOps.push_back(SDValue(NewNode, 0));
8796 AddrOps.push_back(Chain);
8797 auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8798 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8799 Subtarget.isUnalignedMem16Slow())
8800 // Do not introduce a slow unaligned store.
8801 return false;
8802 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8803 // memory access is slow above.
8804 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8805 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8806 SDNode *Store =
8807 DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
8808 dl, MVT::Other, AddrOps);
8809 NewNodes.push_back(Store);
8810
8811 // Preserve memory reference information.
8812 DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
8813 }
8814
8815 return true;
8816}
8817
8818 unsigned
8819 X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad,
8820 bool UnfoldStore,
8821 unsigned *LoadRegIndex) const {
8822 const X86FoldTableEntry *I = lookupUnfoldTable(Opc);
8823 if (I == nullptr)
8824 return 0;
8825 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8826 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8827 if (UnfoldLoad && !FoldedLoad)
8828 return 0;
8829 if (UnfoldStore && !FoldedStore)
8830 return 0;
8831 if (LoadRegIndex)
8832 *LoadRegIndex = I->Flags & TB_INDEX_MASK;
8833 return I->DstOp;
8834}
8835
8836 bool X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
8837 int64_t &Offset1,
8838 int64_t &Offset2) const {
8839 if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
8840 return false;
8841
8842 auto IsLoadOpcode = [&](unsigned Opcode) {
8843 switch (Opcode) {
8844 default:
8845 return false;
8846 case X86::MOV8rm:
8847 case X86::MOV16rm:
8848 case X86::MOV32rm:
8849 case X86::MOV64rm:
8850 case X86::LD_Fp32m:
8851 case X86::LD_Fp64m:
8852 case X86::LD_Fp80m:
8853 case X86::MOVSSrm:
8854 case X86::MOVSSrm_alt:
8855 case X86::MOVSDrm:
8856 case X86::MOVSDrm_alt:
8857 case X86::MMX_MOVD64rm:
8858 case X86::MMX_MOVQ64rm:
8859 case X86::MOVAPSrm:
8860 case X86::MOVUPSrm:
8861 case X86::MOVAPDrm:
8862 case X86::MOVUPDrm:
8863 case X86::MOVDQArm:
8864 case X86::MOVDQUrm:
8865 // AVX load instructions
8866 case X86::VMOVSSrm:
8867 case X86::VMOVSSrm_alt:
8868 case X86::VMOVSDrm:
8869 case X86::VMOVSDrm_alt:
8870 case X86::VMOVAPSrm:
8871 case X86::VMOVUPSrm:
8872 case X86::VMOVAPDrm:
8873 case X86::VMOVUPDrm:
8874 case X86::VMOVDQArm:
8875 case X86::VMOVDQUrm:
8876 case X86::VMOVAPSYrm:
8877 case X86::VMOVUPSYrm:
8878 case X86::VMOVAPDYrm:
8879 case X86::VMOVUPDYrm:
8880 case X86::VMOVDQAYrm:
8881 case X86::VMOVDQUYrm:
8882 // AVX512 load instructions
8883 case X86::VMOVSSZrm:
8884 case X86::VMOVSSZrm_alt:
8885 case X86::VMOVSDZrm:
8886 case X86::VMOVSDZrm_alt:
8887 case X86::VMOVAPSZ128rm:
8888 case X86::VMOVUPSZ128rm:
8889 case X86::VMOVAPSZ128rm_NOVLX:
8890 case X86::VMOVUPSZ128rm_NOVLX:
8891 case X86::VMOVAPDZ128rm:
8892 case X86::VMOVUPDZ128rm:
8893 case X86::VMOVDQU8Z128rm:
8894 case X86::VMOVDQU16Z128rm:
8895 case X86::VMOVDQA32Z128rm:
8896 case X86::VMOVDQU32Z128rm:
8897 case X86::VMOVDQA64Z128rm:
8898 case X86::VMOVDQU64Z128rm:
8899 case X86::VMOVAPSZ256rm:
8900 case X86::VMOVUPSZ256rm:
8901 case X86::VMOVAPSZ256rm_NOVLX:
8902 case X86::VMOVUPSZ256rm_NOVLX:
8903 case X86::VMOVAPDZ256rm:
8904 case X86::VMOVUPDZ256rm:
8905 case X86::VMOVDQU8Z256rm:
8906 case X86::VMOVDQU16Z256rm:
8907 case X86::VMOVDQA32Z256rm:
8908 case X86::VMOVDQU32Z256rm:
8909 case X86::VMOVDQA64Z256rm:
8910 case X86::VMOVDQU64Z256rm:
8911 case X86::VMOVAPSZrm:
8912 case X86::VMOVUPSZrm:
8913 case X86::VMOVAPDZrm:
8914 case X86::VMOVUPDZrm:
8915 case X86::VMOVDQU8Zrm:
8916 case X86::VMOVDQU16Zrm:
8917 case X86::VMOVDQA32Zrm:
8918 case X86::VMOVDQU32Zrm:
8919 case X86::VMOVDQA64Zrm:
8920 case X86::VMOVDQU64Zrm:
8921 case X86::KMOVBkm:
8922 case X86::KMOVBkm_EVEX:
8923 case X86::KMOVWkm:
8924 case X86::KMOVWkm_EVEX:
8925 case X86::KMOVDkm:
8926 case X86::KMOVDkm_EVEX:
8927 case X86::KMOVQkm:
8928 case X86::KMOVQkm_EVEX:
8929 return true;
8930 }
8931 };
8932
8933 if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
8934 !IsLoadOpcode(Load2->getMachineOpcode()))
8935 return false;
8936
8937 // Lambda to check if both the loads have the same value for an operand index.
8938 auto HasSameOp = [&](int I) {
8939 return Load1->getOperand(I) == Load2->getOperand(I);
8940 };
8941
8942 // All operands except the displacement should match.
8943 if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
8944 !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
8945 return false;
8946
8947 // Chain Operand must be the same.
8948 if (!HasSameOp(5))
8949 return false;
8950
8951 // Now let's examine if the displacements are constants.
8952   auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp));
8953   auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp));
8954 if (!Disp1 || !Disp2)
8955 return false;
8956
8957 Offset1 = Disp1->getSExtValue();
8958 Offset2 = Disp2->getSExtValue();
8959 return true;
8960}
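
// Illustrative sketch (editorial, not from the LLVM sources): two loads
// qualify when the base, scale, index, segment and chain operands all match
// and only the constant displacement differs, e.g., in selection-DAG terms:
//
//   t1: i32,ch = MOV32rm t0, BaseReg, 1, undef, 0, SegReg   // Offset1 = 0
//   t2: i32,ch = MOV32rm t0, BaseReg, 1, undef, 4, SegReg   // Offset2 = 4
//
// for which the hook returns true with Offset1 = 0 and Offset2 = 4.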
8961
8962 bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
8963 int64_t Offset1, int64_t Offset2,
8964 unsigned NumLoads) const {
8965 assert(Offset2 > Offset1);
8966 if ((Offset2 - Offset1) / 8 > 64)
8967 return false;
8968
8969 unsigned Opc1 = Load1->getMachineOpcode();
8970 unsigned Opc2 = Load2->getMachineOpcode();
8971 if (Opc1 != Opc2)
8972 return false; // FIXME: overly conservative?
8973
8974 switch (Opc1) {
8975 default:
8976 break;
8977 case X86::LD_Fp32m:
8978 case X86::LD_Fp64m:
8979 case X86::LD_Fp80m:
8980 case X86::MMX_MOVD64rm:
8981 case X86::MMX_MOVQ64rm:
8982 return false;
8983 }
8984
8985 EVT VT = Load1->getValueType(0);
8986 switch (VT.getSimpleVT().SimpleTy) {
8987 default:
8988 // XMM registers. In 64-bit mode we can be a bit more aggressive since we
8989 // have 16 of them to play with.
8990 if (Subtarget.is64Bit()) {
8991 if (NumLoads >= 3)
8992 return false;
8993 } else if (NumLoads) {
8994 return false;
8995 }
8996 break;
8997 case MVT::i8:
8998 case MVT::i16:
8999 case MVT::i32:
9000 case MVT::i64:
9001 case MVT::f32:
9002 case MVT::f64:
9003 if (NumLoads)
9004 return false;
9005 break;
9006 }
9007
9008 return true;
9009}
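
// Illustrative sketch (editorial, not from the LLVM sources): with the
// distance check above, two same-opcode loads at displacements 0 and 256
// may still be clustered, since (256 - 0) / 8 = 32 <= 64. The NumLoads
// limits then apply: scalar loads pair up only when no load is already
// grouped, and the default (XMM) case allows clustering only while fewer
// than three loads are grouped in 64-bit mode, and none in 32-bit mode.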
9010
9011 bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
9012 const MachineBasicBlock *MBB,
9013 const MachineFunction &MF) const {
9014
9015 // ENDBR instructions should not be scheduled around.
9016 unsigned Opcode = MI.getOpcode();
9017 if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
9018 Opcode == X86::PLDTILECFGV)
9019 return true;
9020
9021 // Frame setup and destroy can't be scheduled around.
9022 if (MI.getFlag(MachineInstr::FrameSetup) ||
9023       MI.getFlag(MachineInstr::FrameDestroy))
9024 return true;
9025
9026   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
9027}
9028
9029 bool X86InstrInfo::reverseBranchCondition(
9030     SmallVectorImpl<MachineOperand> &Cond) const {
9031 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
9032 X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
9033 Cond[0].setImm(GetOppositeBranchCondition(CC));
9034 return false;
9035}
9036
9037 bool X86InstrInfo::isSafeToMoveRegClassDefs(
9038 const TargetRegisterClass *RC) const {
9039 // FIXME: Return false for x87 stack register classes for now. We can't
9040 // allow any loads of these registers before FpGet_ST0_80.
9041 return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
9042 RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
9043 RC == &X86::RFP80RegClass);
9044}
9045
9046/// Return a virtual register initialized with the
9047/// global base register value. Output instructions required to
9048/// initialize the register in the function entry block, if necessary.
9049///
9050/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
9051///
9052 Register X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
9053   X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
9054 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
9055 if (GlobalBaseReg)
9056 return GlobalBaseReg;
9057
9058 // Create the register. The code to initialize it is inserted
9059 // later, by the CGBR pass (below).
9060 MachineRegisterInfo &RegInfo = MF->getRegInfo();
9061 GlobalBaseReg = RegInfo.createVirtualRegister(
9062 Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
9063 X86FI->setGlobalBaseReg(GlobalBaseReg);
9064 return GlobalBaseReg;
9065}
9066
9067// FIXME: Some shuffle and unpack instructions have equivalents in different
9068// domains, but they require a bit more work than just switching opcodes.
9069
9070static const uint16_t *lookup(unsigned opcode, unsigned domain,
9071 ArrayRef<uint16_t[3]> Table) {
9072 for (const uint16_t(&Row)[3] : Table)
9073 if (Row[domain - 1] == opcode)
9074 return Row;
9075 return nullptr;
9076}
9077
9078static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
9079 ArrayRef<uint16_t[4]> Table) {
9080   // If this is the integer domain, make sure to check both integer columns.
9081 for (const uint16_t(&Row)[4] : Table)
9082 if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
9083 return Row;
9084 return nullptr;
9085}
9086
9087// Helper to attempt to widen/narrow blend masks.
9088static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
9089 unsigned NewWidth, unsigned *pNewMask = nullptr) {
9090 assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
9091 "Illegal blend mask scale");
9092 unsigned NewMask = 0;
9093
9094 if ((OldWidth % NewWidth) == 0) {
9095 unsigned Scale = OldWidth / NewWidth;
9096 unsigned SubMask = (1u << Scale) - 1;
9097 for (unsigned i = 0; i != NewWidth; ++i) {
9098 unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
9099 if (Sub == SubMask)
9100 NewMask |= (1u << i);
9101 else if (Sub != 0x0)
9102 return false;
9103 }
9104 } else {
9105 unsigned Scale = NewWidth / OldWidth;
9106 unsigned SubMask = (1u << Scale) - 1;
9107 for (unsigned i = 0; i != OldWidth; ++i) {
9108 if (OldMask & (1 << i)) {
9109 NewMask |= (SubMask << (i * Scale));
9110 }
9111 }
9112 }
9113
9114 if (pNewMask)
9115 *pNewMask = NewMask;
9116 return true;
9117}
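
// Illustrative worked example (editorial, not from the LLVM sources):
// narrowing the 4-lane mask 0b1100 to 2 lanes groups bit pairs; both pairs
// are all-ones or all-zeros, so the call succeeds with NewMask = 0b10.
// Narrowing 0b1000 fails because its upper pair 0b10 is mixed. Widening the
// 2-lane mask 0b10 to 4 lanes replicates each bit, giving 0b1100.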
9118
9119 uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
9120 unsigned Opcode = MI.getOpcode();
9121 unsigned NumOperands = MI.getDesc().getNumOperands();
9122
9123 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
9124 uint16_t validDomains = 0;
9125 if (MI.getOperand(NumOperands - 1).isImm()) {
9126 unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
9127 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
9128 validDomains |= 0x2; // PackedSingle
9129 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
9130 validDomains |= 0x4; // PackedDouble
9131 if (!Is256 || Subtarget.hasAVX2())
9132 validDomains |= 0x8; // PackedInt
9133 }
9134 return validDomains;
9135 };
9136
9137 switch (Opcode) {
9138 case X86::BLENDPDrmi:
9139 case X86::BLENDPDrri:
9140 case X86::VBLENDPDrmi:
9141 case X86::VBLENDPDrri:
9142 return GetBlendDomains(2, false);
9143 case X86::VBLENDPDYrmi:
9144 case X86::VBLENDPDYrri:
9145 return GetBlendDomains(4, true);
9146 case X86::BLENDPSrmi:
9147 case X86::BLENDPSrri:
9148 case X86::VBLENDPSrmi:
9149 case X86::VBLENDPSrri:
9150 case X86::VPBLENDDrmi:
9151 case X86::VPBLENDDrri:
9152 return GetBlendDomains(4, false);
9153 case X86::VBLENDPSYrmi:
9154 case X86::VBLENDPSYrri:
9155 case X86::VPBLENDDYrmi:
9156 case X86::VPBLENDDYrri:
9157 return GetBlendDomains(8, true);
9158 case X86::PBLENDWrmi:
9159 case X86::PBLENDWrri:
9160 case X86::VPBLENDWrmi:
9161 case X86::VPBLENDWrri:
9162 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
9163 case X86::VPBLENDWYrmi:
9164 case X86::VPBLENDWYrri:
9165 return GetBlendDomains(8, false);
9166 case X86::VPANDDZ128rr:
9167 case X86::VPANDDZ128rm:
9168 case X86::VPANDDZ256rr:
9169 case X86::VPANDDZ256rm:
9170 case X86::VPANDQZ128rr:
9171 case X86::VPANDQZ128rm:
9172 case X86::VPANDQZ256rr:
9173 case X86::VPANDQZ256rm:
9174 case X86::VPANDNDZ128rr:
9175 case X86::VPANDNDZ128rm:
9176 case X86::VPANDNDZ256rr:
9177 case X86::VPANDNDZ256rm:
9178 case X86::VPANDNQZ128rr:
9179 case X86::VPANDNQZ128rm:
9180 case X86::VPANDNQZ256rr:
9181 case X86::VPANDNQZ256rm:
9182 case X86::VPORDZ128rr:
9183 case X86::VPORDZ128rm:
9184 case X86::VPORDZ256rr:
9185 case X86::VPORDZ256rm:
9186 case X86::VPORQZ128rr:
9187 case X86::VPORQZ128rm:
9188 case X86::VPORQZ256rr:
9189 case X86::VPORQZ256rm:
9190 case X86::VPXORDZ128rr:
9191 case X86::VPXORDZ128rm:
9192 case X86::VPXORDZ256rr:
9193 case X86::VPXORDZ256rm:
9194 case X86::VPXORQZ128rr:
9195 case X86::VPXORQZ128rm:
9196 case X86::VPXORQZ256rr:
9197 case X86::VPXORQZ256rm:
9198     // If we don't have DQI, see if we can still switch from an EVEX integer
9199     // instruction to a VEX floating point instruction.
9200 if (Subtarget.hasDQI())
9201 return 0;
9202
9203 if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
9204 return 0;
9205 if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
9206 return 0;
9207 // Register forms will have 3 operands. Memory form will have more.
9208 if (NumOperands == 3 &&
9209 RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
9210 return 0;
9211
9212 // All domains are valid.
9213 return 0xe;
9214 case X86::MOVHLPSrr:
9215 // We can swap domains when both inputs are the same register.
9216 // FIXME: This doesn't catch all the cases we would like. If the input
9217 // register isn't KILLed by the instruction, the two address instruction
9218 // pass puts a COPY on one input. The other input uses the original
9219 // register. This prevents the same physical register from being used by
9220 // both inputs.
9221 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9222 MI.getOperand(0).getSubReg() == 0 &&
9223 MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
9224 return 0x6;
9225 return 0;
9226 case X86::SHUFPDrri:
9227 return 0x6;
9228 }
9229 return 0;
9230}
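
// Illustrative note (editorial, not from the LLVM sources): the value
// returned above is a bitmask of legal domains, matching the encoding used
// throughout this file: 0x2 = PackedSingle, 0x4 = PackedDouble,
// 0x8 = PackedInt, so 0x6 means "either FP domain" and 0xe means "any".
// For example, BLENDPDrri with Imm = 0x3 reports 0xe: the 2-bit PD mask
// widens to the 4-bit PS mask 0xf and to the 8-bit PBLENDW mask 0xff.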
9231
9232#include "X86ReplaceableInstrs.def"
9233
9234 bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
9235 unsigned Domain) const {
9236 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9237 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9238 assert(dom && "Not an SSE instruction");
9239
9240 unsigned Opcode = MI.getOpcode();
9241 unsigned NumOperands = MI.getDesc().getNumOperands();
9242
9243 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
9244 if (MI.getOperand(NumOperands - 1).isImm()) {
9245 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
9246 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
9247 unsigned NewImm = Imm;
9248
9249 const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
9250 if (!table)
9251 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9252
9253 if (Domain == 1) { // PackedSingle
9254 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9255 } else if (Domain == 2) { // PackedDouble
9256 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
9257 } else if (Domain == 3) { // PackedInt
9258 if (Subtarget.hasAVX2()) {
9259 // If we are already VPBLENDW use that, else use VPBLENDD.
9260 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
9261 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9262 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9263 }
9264 } else {
9265 assert(!Is256 && "128-bit vector expected");
9266 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
9267 }
9268 }
9269
9270 assert(table && table[Domain - 1] && "Unknown domain op");
9271 MI.setDesc(get(table[Domain - 1]));
9272 MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
9273 }
9274 return true;
9275 };
9276
9277 switch (Opcode) {
9278 case X86::BLENDPDrmi:
9279 case X86::BLENDPDrri:
9280 case X86::VBLENDPDrmi:
9281 case X86::VBLENDPDrri:
9282 return SetBlendDomain(2, false);
9283 case X86::VBLENDPDYrmi:
9284 case X86::VBLENDPDYrri:
9285 return SetBlendDomain(4, true);
9286 case X86::BLENDPSrmi:
9287 case X86::BLENDPSrri:
9288 case X86::VBLENDPSrmi:
9289 case X86::VBLENDPSrri:
9290 case X86::VPBLENDDrmi:
9291 case X86::VPBLENDDrri:
9292 return SetBlendDomain(4, false);
9293 case X86::VBLENDPSYrmi:
9294 case X86::VBLENDPSYrri:
9295 case X86::VPBLENDDYrmi:
9296 case X86::VPBLENDDYrri:
9297 return SetBlendDomain(8, true);
9298 case X86::PBLENDWrmi:
9299 case X86::PBLENDWrri:
9300 case X86::VPBLENDWrmi:
9301 case X86::VPBLENDWrri:
9302 return SetBlendDomain(8, false);
9303 case X86::VPBLENDWYrmi:
9304 case X86::VPBLENDWYrri:
9305 return SetBlendDomain(16, true);
9306 case X86::VPANDDZ128rr:
9307 case X86::VPANDDZ128rm:
9308 case X86::VPANDDZ256rr:
9309 case X86::VPANDDZ256rm:
9310 case X86::VPANDQZ128rr:
9311 case X86::VPANDQZ128rm:
9312 case X86::VPANDQZ256rr:
9313 case X86::VPANDQZ256rm:
9314 case X86::VPANDNDZ128rr:
9315 case X86::VPANDNDZ128rm:
9316 case X86::VPANDNDZ256rr:
9317 case X86::VPANDNDZ256rm:
9318 case X86::VPANDNQZ128rr:
9319 case X86::VPANDNQZ128rm:
9320 case X86::VPANDNQZ256rr:
9321 case X86::VPANDNQZ256rm:
9322 case X86::VPORDZ128rr:
9323 case X86::VPORDZ128rm:
9324 case X86::VPORDZ256rr:
9325 case X86::VPORDZ256rm:
9326 case X86::VPORQZ128rr:
9327 case X86::VPORQZ128rm:
9328 case X86::VPORQZ256rr:
9329 case X86::VPORQZ256rm:
9330 case X86::VPXORDZ128rr:
9331 case X86::VPXORDZ128rm:
9332 case X86::VPXORDZ256rr:
9333 case X86::VPXORDZ256rm:
9334 case X86::VPXORQZ128rr:
9335 case X86::VPXORQZ128rm:
9336 case X86::VPXORQZ256rr:
9337 case X86::VPXORQZ256rm: {
9338 // Without DQI, convert EVEX instructions to VEX instructions.
9339 if (Subtarget.hasDQI())
9340 return false;
9341
9342 const uint16_t *table =
9343 lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
9344 assert(table && "Instruction not found in table?");
9345 // Don't change integer Q instructions to D instructions and
9346     // use D instructions if we started with a PS instruction.
9347 if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9348 Domain = 4;
9349 MI.setDesc(get(table[Domain - 1]));
9350 return true;
9351 }
9352 case X86::UNPCKHPDrr:
9353 case X86::MOVHLPSrr:
9354 // We just need to commute the instruction which will switch the domains.
9355 if (Domain != dom && Domain != 3 &&
9356 MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9357 MI.getOperand(0).getSubReg() == 0 &&
9358 MI.getOperand(1).getSubReg() == 0 &&
9359 MI.getOperand(2).getSubReg() == 0) {
9360 commuteInstruction(MI, false);
9361 return true;
9362 }
9363 // We must always return true for MOVHLPSrr.
9364 if (Opcode == X86::MOVHLPSrr)
9365 return true;
9366 break;
9367 case X86::SHUFPDrri: {
9368 if (Domain == 1) {
9369 unsigned Imm = MI.getOperand(3).getImm();
9370 unsigned NewImm = 0x44;
9371 if (Imm & 1)
9372 NewImm |= 0x0a;
9373 if (Imm & 2)
9374 NewImm |= 0xa0;
9375 MI.getOperand(3).setImm(NewImm);
9376 MI.setDesc(get(X86::SHUFPSrri));
9377 }
9378 return true;
9379 }
9380 }
9381 return false;
9382}
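
// Illustrative worked example (editorial, not from the LLVM sources): for
// SHUFPDrri with Imm = 1, the rewrite above yields 0x44 | 0x0a = 0x4e, the
// SHUFPS selector [2,3,0,1]. That selects the float pair forming double
// element 1 of the first source and the pair forming double element 0 of
// the second source, reproducing the original SHUFPD result exactly.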
9383
9384std::pair<uint16_t, uint16_t>
9385 X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
9386 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9387 unsigned opcode = MI.getOpcode();
9388 uint16_t validDomains = 0;
9389 if (domain) {
9390 // Attempt to match for custom instructions.
9391 validDomains = getExecutionDomainCustom(MI);
9392 if (validDomains)
9393 return std::make_pair(domain, validDomains);
9394
9395 if (lookup(opcode, domain, ReplaceableInstrs)) {
9396 validDomains = 0xe;
9397 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9398 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
9399 } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
9400 validDomains = 0x6;
9401 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
9402       // Insert/extract instructions should only affect the domain if AVX2
9403       // is enabled.
9404 if (!Subtarget.hasAVX2())
9405 return std::make_pair(0, 0);
9406 validDomains = 0xe;
9407 } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
9408 validDomains = 0xe;
9409 } else if (Subtarget.hasDQI() &&
9410 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
9411 validDomains = 0xe;
9412 } else if (Subtarget.hasDQI()) {
9413 if (const uint16_t *table =
9414 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
9415 if (domain == 1 || (domain == 3 && table[3] == opcode))
9416 validDomains = 0xa;
9417 else
9418 validDomains = 0xc;
9419 }
9420 }
9421 }
9422 return std::make_pair(domain, validDomains);
9423}
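
// Illustrative note (editorial, not from the LLVM sources): callers receive
// a (current domain, legal-domain bitmask) pair. For instance, ANDPSrr on a
// plain SSE2 target reports domain 1 (PackedSingle) with mask 0xe, allowing
// the execution-domain fix pass to rewrite it as ANDPDrr or PANDrr and so
// avoid a domain-crossing penalty.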
9424
9425 void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
9426 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9427 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9428 assert(dom && "Not an SSE instruction");
9429
9430 // Attempt to match for custom instructions.
9431   if (setExecutionDomainCustom(MI, Domain))
9432 return;
9433
9434 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
9435 if (!table) { // try the other table
9436 assert((Subtarget.hasAVX2() || Domain < 3) &&
9437 "256-bit vector operations only available in AVX2");
9438 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
9439 }
9440 if (!table) { // try the FP table
9441 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
9442 assert((!table || Domain < 3) &&
9443 "Can only select PackedSingle or PackedDouble");
9444 }
9445   if (!table) { // try the AVX2 insert/extract table
9446 assert(Subtarget.hasAVX2() &&
9447 "256-bit insert/extract only available in AVX2");
9448 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
9449 }
9450 if (!table) { // try the AVX512 table
9451 assert(Subtarget.hasAVX512() && "Requires AVX-512");
9452 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
9453 // Don't change integer Q instructions to D instructions.
9454 if (table && Domain == 3 && table[3] == MI.getOpcode())
9455 Domain = 4;
9456 }
9457 if (!table) { // try the AVX512DQ table
9458 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9459 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
9460 // Don't change integer Q instructions to D instructions and
9461 // use D instructions if we started with a PS instruction.
9462 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9463 Domain = 4;
9464 }
9465 if (!table) { // try the AVX512DQMasked table
9466 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9467 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
9468 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9469 Domain = 4;
9470 }
9471 assert(table && "Cannot change domain");
9472 MI.setDesc(get(table[Domain - 1]));
9473}
9474
9475 void X86InstrInfo::insertNoop(MachineBasicBlock &MBB,
9476                               MachineBasicBlock::iterator MI) const {
9477   DebugLoc DL;
9478   BuildMI(MBB, MI, DL, get(X86::NOOP));
9479 }
9480
9481/// Return the noop instruction to use for a noop.
9482 MCInst X86InstrInfo::getNop() const {
9483 MCInst Nop;
9484 Nop.setOpcode(X86::NOOP);
9485 return Nop;
9486}
9487
9488 bool X86InstrInfo::isHighLatencyDef(int opc) const {
9489 switch (opc) {
9490 default:
9491 return false;
9492 case X86::DIVPDrm:
9493 case X86::DIVPDrr:
9494 case X86::DIVPSrm:
9495 case X86::DIVPSrr:
9496 case X86::DIVSDrm:
9497 case X86::DIVSDrm_Int:
9498 case X86::DIVSDrr:
9499 case X86::DIVSDrr_Int:
9500 case X86::DIVSSrm:
9501 case X86::DIVSSrm_Int:
9502 case X86::DIVSSrr:
9503 case X86::DIVSSrr_Int:
9504 case X86::SQRTPDm:
9505 case X86::SQRTPDr:
9506 case X86::SQRTPSm:
9507 case X86::SQRTPSr:
9508 case X86::SQRTSDm:
9509 case X86::SQRTSDm_Int:
9510 case X86::SQRTSDr:
9511 case X86::SQRTSDr_Int:
9512 case X86::SQRTSSm:
9513 case X86::SQRTSSm_Int:
9514 case X86::SQRTSSr:
9515 case X86::SQRTSSr_Int:
9516 // AVX instructions with high latency
9517 case X86::VDIVPDrm:
9518 case X86::VDIVPDrr:
9519 case X86::VDIVPDYrm:
9520 case X86::VDIVPDYrr:
9521 case X86::VDIVPSrm:
9522 case X86::VDIVPSrr:
9523 case X86::VDIVPSYrm:
9524 case X86::VDIVPSYrr:
9525 case X86::VDIVSDrm:
9526 case X86::VDIVSDrm_Int:
9527 case X86::VDIVSDrr:
9528 case X86::VDIVSDrr_Int:
9529 case X86::VDIVSSrm:
9530 case X86::VDIVSSrm_Int:
9531 case X86::VDIVSSrr:
9532 case X86::VDIVSSrr_Int:
9533 case X86::VSQRTPDm:
9534 case X86::VSQRTPDr:
9535 case X86::VSQRTPDYm:
9536 case X86::VSQRTPDYr:
9537 case X86::VSQRTPSm:
9538 case X86::VSQRTPSr:
9539 case X86::VSQRTPSYm:
9540 case X86::VSQRTPSYr:
9541 case X86::VSQRTSDm:
9542 case X86::VSQRTSDm_Int:
9543 case X86::VSQRTSDr:
9544 case X86::VSQRTSDr_Int:
9545 case X86::VSQRTSSm:
9546 case X86::VSQRTSSm_Int:
9547 case X86::VSQRTSSr:
9548 case X86::VSQRTSSr_Int:
9549 // AVX512 instructions with high latency
9550 case X86::VDIVPDZ128rm:
9551 case X86::VDIVPDZ128rmb:
9552 case X86::VDIVPDZ128rmbk:
9553 case X86::VDIVPDZ128rmbkz:
9554 case X86::VDIVPDZ128rmk:
9555 case X86::VDIVPDZ128rmkz:
9556 case X86::VDIVPDZ128rr:
9557 case X86::VDIVPDZ128rrk:
9558 case X86::VDIVPDZ128rrkz:
9559 case X86::VDIVPDZ256rm:
9560 case X86::VDIVPDZ256rmb:
9561 case X86::VDIVPDZ256rmbk:
9562 case X86::VDIVPDZ256rmbkz:
9563 case X86::VDIVPDZ256rmk:
9564 case X86::VDIVPDZ256rmkz:
9565 case X86::VDIVPDZ256rr:
9566 case X86::VDIVPDZ256rrk:
9567 case X86::VDIVPDZ256rrkz:
9568 case X86::VDIVPDZrrb:
9569 case X86::VDIVPDZrrbk:
9570 case X86::VDIVPDZrrbkz:
9571 case X86::VDIVPDZrm:
9572 case X86::VDIVPDZrmb:
9573 case X86::VDIVPDZrmbk:
9574 case X86::VDIVPDZrmbkz:
9575 case X86::VDIVPDZrmk:
9576 case X86::VDIVPDZrmkz:
9577 case X86::VDIVPDZrr:
9578 case X86::VDIVPDZrrk:
9579 case X86::VDIVPDZrrkz:
9580 case X86::VDIVPSZ128rm:
9581 case X86::VDIVPSZ128rmb:
9582 case X86::VDIVPSZ128rmbk:
9583 case X86::VDIVPSZ128rmbkz:
9584 case X86::VDIVPSZ128rmk:
9585 case X86::VDIVPSZ128rmkz:
9586 case X86::VDIVPSZ128rr:
9587 case X86::VDIVPSZ128rrk:
9588 case X86::VDIVPSZ128rrkz:
9589 case X86::VDIVPSZ256rm:
9590 case X86::VDIVPSZ256rmb:
9591 case X86::VDIVPSZ256rmbk:
9592 case X86::VDIVPSZ256rmbkz:
9593 case X86::VDIVPSZ256rmk:
9594 case X86::VDIVPSZ256rmkz:
9595 case X86::VDIVPSZ256rr:
9596 case X86::VDIVPSZ256rrk:
9597 case X86::VDIVPSZ256rrkz:
9598 case X86::VDIVPSZrrb:
9599 case X86::VDIVPSZrrbk:
9600 case X86::VDIVPSZrrbkz:
9601 case X86::VDIVPSZrm:
9602 case X86::VDIVPSZrmb:
9603 case X86::VDIVPSZrmbk:
9604 case X86::VDIVPSZrmbkz:
9605 case X86::VDIVPSZrmk:
9606 case X86::VDIVPSZrmkz:
9607 case X86::VDIVPSZrr:
9608 case X86::VDIVPSZrrk:
9609 case X86::VDIVPSZrrkz:
9610 case X86::VDIVSDZrm:
9611 case X86::VDIVSDZrr:
9612 case X86::VDIVSDZrm_Int:
9613 case X86::VDIVSDZrmk_Int:
9614 case X86::VDIVSDZrmkz_Int:
9615 case X86::VDIVSDZrr_Int:
9616 case X86::VDIVSDZrrk_Int:
9617 case X86::VDIVSDZrrkz_Int:
9618 case X86::VDIVSDZrrb_Int:
9619 case X86::VDIVSDZrrbk_Int:
9620 case X86::VDIVSDZrrbkz_Int:
9621 case X86::VDIVSSZrm:
9622 case X86::VDIVSSZrr:
9623 case X86::VDIVSSZrm_Int:
9624 case X86::VDIVSSZrmk_Int:
9625 case X86::VDIVSSZrmkz_Int:
9626 case X86::VDIVSSZrr_Int:
9627 case X86::VDIVSSZrrk_Int:
9628 case X86::VDIVSSZrrkz_Int:
9629 case X86::VDIVSSZrrb_Int:
9630 case X86::VDIVSSZrrbk_Int:
9631 case X86::VDIVSSZrrbkz_Int:
9632 case X86::VSQRTPDZ128m:
9633 case X86::VSQRTPDZ128mb:
9634 case X86::VSQRTPDZ128mbk:
9635 case X86::VSQRTPDZ128mbkz:
9636 case X86::VSQRTPDZ128mk:
9637 case X86::VSQRTPDZ128mkz:
9638 case X86::VSQRTPDZ128r:
9639 case X86::VSQRTPDZ128rk:
9640 case X86::VSQRTPDZ128rkz:
9641 case X86::VSQRTPDZ256m:
9642 case X86::VSQRTPDZ256mb:
9643 case X86::VSQRTPDZ256mbk:
9644 case X86::VSQRTPDZ256mbkz:
9645 case X86::VSQRTPDZ256mk:
9646 case X86::VSQRTPDZ256mkz:
9647 case X86::VSQRTPDZ256r:
9648 case X86::VSQRTPDZ256rk:
9649 case X86::VSQRTPDZ256rkz:
9650 case X86::VSQRTPDZm:
9651 case X86::VSQRTPDZmb:
9652 case X86::VSQRTPDZmbk:
9653 case X86::VSQRTPDZmbkz:
9654 case X86::VSQRTPDZmk:
9655 case X86::VSQRTPDZmkz:
9656 case X86::VSQRTPDZr:
9657 case X86::VSQRTPDZrb:
9658 case X86::VSQRTPDZrbk:
9659 case X86::VSQRTPDZrbkz:
9660 case X86::VSQRTPDZrk:
9661 case X86::VSQRTPDZrkz:
9662 case X86::VSQRTPSZ128m:
9663 case X86::VSQRTPSZ128mb:
9664 case X86::VSQRTPSZ128mbk:
9665 case X86::VSQRTPSZ128mbkz:
9666 case X86::VSQRTPSZ128mk:
9667 case X86::VSQRTPSZ128mkz:
9668 case X86::VSQRTPSZ128r:
9669 case X86::VSQRTPSZ128rk:
9670 case X86::VSQRTPSZ128rkz:
9671 case X86::VSQRTPSZ256m:
9672 case X86::VSQRTPSZ256mb:
9673 case X86::VSQRTPSZ256mbk:
9674 case X86::VSQRTPSZ256mbkz:
9675 case X86::VSQRTPSZ256mk:
9676 case X86::VSQRTPSZ256mkz:
9677 case X86::VSQRTPSZ256r:
9678 case X86::VSQRTPSZ256rk:
9679 case X86::VSQRTPSZ256rkz:
9680 case X86::VSQRTPSZm:
9681 case X86::VSQRTPSZmb:
9682 case X86::VSQRTPSZmbk:
9683 case X86::VSQRTPSZmbkz:
9684 case X86::VSQRTPSZmk:
9685 case X86::VSQRTPSZmkz:
9686 case X86::VSQRTPSZr:
9687 case X86::VSQRTPSZrb:
9688 case X86::VSQRTPSZrbk:
9689 case X86::VSQRTPSZrbkz:
9690 case X86::VSQRTPSZrk:
9691 case X86::VSQRTPSZrkz:
9692 case X86::VSQRTSDZm:
9693 case X86::VSQRTSDZm_Int:
9694 case X86::VSQRTSDZmk_Int:
9695 case X86::VSQRTSDZmkz_Int:
9696 case X86::VSQRTSDZr:
9697 case X86::VSQRTSDZr_Int:
9698 case X86::VSQRTSDZrk_Int:
9699 case X86::VSQRTSDZrkz_Int:
9700 case X86::VSQRTSDZrb_Int:
9701 case X86::VSQRTSDZrbk_Int:
9702 case X86::VSQRTSDZrbkz_Int:
9703 case X86::VSQRTSSZm:
9704 case X86::VSQRTSSZm_Int:
9705 case X86::VSQRTSSZmk_Int:
9706 case X86::VSQRTSSZmkz_Int:
9707 case X86::VSQRTSSZr:
9708 case X86::VSQRTSSZr_Int:
9709 case X86::VSQRTSSZrk_Int:
9710 case X86::VSQRTSSZrkz_Int:
9711 case X86::VSQRTSSZrb_Int:
9712 case X86::VSQRTSSZrbk_Int:
9713 case X86::VSQRTSSZrbkz_Int:
9714
9715 case X86::VGATHERDPDYrm:
9716 case X86::VGATHERDPDZ128rm:
9717 case X86::VGATHERDPDZ256rm:
9718 case X86::VGATHERDPDZrm:
9719 case X86::VGATHERDPDrm:
9720 case X86::VGATHERDPSYrm:
9721 case X86::VGATHERDPSZ128rm:
9722 case X86::VGATHERDPSZ256rm:
9723 case X86::VGATHERDPSZrm:
9724 case X86::VGATHERDPSrm:
9725 case X86::VGATHERPF0DPDm:
9726 case X86::VGATHERPF0DPSm:
9727 case X86::VGATHERPF0QPDm:
9728 case X86::VGATHERPF0QPSm:
9729 case X86::VGATHERPF1DPDm:
9730 case X86::VGATHERPF1DPSm:
9731 case X86::VGATHERPF1QPDm:
9732 case X86::VGATHERPF1QPSm:
9733 case X86::VGATHERQPDYrm:
9734 case X86::VGATHERQPDZ128rm:
9735 case X86::VGATHERQPDZ256rm:
9736 case X86::VGATHERQPDZrm:
9737 case X86::VGATHERQPDrm:
9738 case X86::VGATHERQPSYrm:
9739 case X86::VGATHERQPSZ128rm:
9740 case X86::VGATHERQPSZ256rm:
9741 case X86::VGATHERQPSZrm:
9742 case X86::VGATHERQPSrm:
9743 case X86::VPGATHERDDYrm:
9744 case X86::VPGATHERDDZ128rm:
9745 case X86::VPGATHERDDZ256rm:
9746 case X86::VPGATHERDDZrm:
9747 case X86::VPGATHERDDrm:
9748 case X86::VPGATHERDQYrm:
9749 case X86::VPGATHERDQZ128rm:
9750 case X86::VPGATHERDQZ256rm:
9751 case X86::VPGATHERDQZrm:
9752 case X86::VPGATHERDQrm:
9753 case X86::VPGATHERQDYrm:
9754 case X86::VPGATHERQDZ128rm:
9755 case X86::VPGATHERQDZ256rm:
9756 case X86::VPGATHERQDZrm:
9757 case X86::VPGATHERQDrm:
9758 case X86::VPGATHERQQYrm:
9759 case X86::VPGATHERQQZ128rm:
9760 case X86::VPGATHERQQZ256rm:
9761 case X86::VPGATHERQQZrm:
9762 case X86::VPGATHERQQrm:
9763 case X86::VSCATTERDPDZ128mr:
9764 case X86::VSCATTERDPDZ256mr:
9765 case X86::VSCATTERDPDZmr:
9766 case X86::VSCATTERDPSZ128mr:
9767 case X86::VSCATTERDPSZ256mr:
9768 case X86::VSCATTERDPSZmr:
9769 case X86::VSCATTERPF0DPDm:
9770 case X86::VSCATTERPF0DPSm:
9771 case X86::VSCATTERPF0QPDm:
9772 case X86::VSCATTERPF0QPSm:
9773 case X86::VSCATTERPF1DPDm:
9774 case X86::VSCATTERPF1DPSm:
9775 case X86::VSCATTERPF1QPDm:
9776 case X86::VSCATTERPF1QPSm:
9777 case X86::VSCATTERQPDZ128mr:
9778 case X86::VSCATTERQPDZ256mr:
9779 case X86::VSCATTERQPDZmr:
9780 case X86::VSCATTERQPSZ128mr:
9781 case X86::VSCATTERQPSZ256mr:
9782 case X86::VSCATTERQPSZmr:
9783 case X86::VPSCATTERDDZ128mr:
9784 case X86::VPSCATTERDDZ256mr:
9785 case X86::VPSCATTERDDZmr:
9786 case X86::VPSCATTERDQZ128mr:
9787 case X86::VPSCATTERDQZ256mr:
9788 case X86::VPSCATTERDQZmr:
9789 case X86::VPSCATTERQDZ128mr:
9790 case X86::VPSCATTERQDZ256mr:
9791 case X86::VPSCATTERQDZmr:
9792 case X86::VPSCATTERQQZ128mr:
9793 case X86::VPSCATTERQQZ256mr:
9794 case X86::VPSCATTERQQZmr:
9795 return true;
9796 }
9797}
9798
9799 bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
9800 const MachineRegisterInfo *MRI,
9801 const MachineInstr &DefMI,
9802 unsigned DefIdx,
9803 const MachineInstr &UseMI,
9804 unsigned UseIdx) const {
9805 return isHighLatencyDef(DefMI.getOpcode());
9806}
9807
9808 bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
9809 const MachineBasicBlock *MBB) const {
9810 assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
9811 Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
9812
9813 // Integer binary math/logic instructions have a third source operand:
9814 // the EFLAGS register. That operand must be both defined here and never
9815  // used; i.e., it must be dead. If the EFLAGS operand is live, then we
9816  // cannot change anything because rearranging the operands could affect other
9817 // instructions that depend on the exact status flags (zero, sign, etc.)
9818 // that are set by using these particular operands with this operation.
9819 const MachineOperand *FlagDef =
9820 Inst.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
9821 assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
9822 if (FlagDef && !FlagDef->isDead())
9823 return false;
9824
9825   return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
9826}
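
// Illustrative sketch (editorial, not from the LLVM sources): in MIR terms,
// an integer chain is reassociable only when every EFLAGS def is dead:
//
//   %2:gr32 = ADD32rr %0, %1, implicit-def dead $eflags
//   %3:gr32 = ADD32rr killed %2, %4, implicit-def dead $eflags
//
// If either $eflags def were live (say, feeding a later SETCC or JCC),
// reordering the adds could change the observed flag values.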
9827
9828// TODO: There are many more machine instruction opcodes to match:
9829// 1. Other data types (integer, vectors)
9830// 2. Other math / logic operations (xor, or)
9831// 3. Other forms of the same operation (intrinsics and other variants)
9832 bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
9833 bool Invert) const {
9834 if (Invert)
9835 return false;
9836 switch (Inst.getOpcode()) {
9837 CASE_ND(ADD8rr)
9838 CASE_ND(ADD16rr)
9839 CASE_ND(ADD32rr)
9840 CASE_ND(ADD64rr)
9841 CASE_ND(AND8rr)
9842 CASE_ND(AND16rr)
9843 CASE_ND(AND32rr)
9844 CASE_ND(AND64rr)
9845 CASE_ND(OR8rr)
9846 CASE_ND(OR16rr)
9847 CASE_ND(OR32rr)
9848 CASE_ND(OR64rr)
9849 CASE_ND(XOR8rr)
9850 CASE_ND(XOR16rr)
9851 CASE_ND(XOR32rr)
9852 CASE_ND(XOR64rr)
9853 CASE_ND(IMUL16rr)
9854 CASE_ND(IMUL32rr)
9855 CASE_ND(IMUL64rr)
9856 case X86::PANDrr:
9857 case X86::PORrr:
9858 case X86::PXORrr:
9859 case X86::ANDPDrr:
9860 case X86::ANDPSrr:
9861 case X86::ORPDrr:
9862 case X86::ORPSrr:
9863 case X86::XORPDrr:
9864 case X86::XORPSrr:
9865 case X86::PADDBrr:
9866 case X86::PADDWrr:
9867 case X86::PADDDrr:
9868 case X86::PADDQrr:
9869 case X86::PMULLWrr:
9870 case X86::PMULLDrr:
9871 case X86::PMAXSBrr:
9872 case X86::PMAXSDrr:
9873 case X86::PMAXSWrr:
9874 case X86::PMAXUBrr:
9875 case X86::PMAXUDrr:
9876 case X86::PMAXUWrr:
9877 case X86::PMINSBrr:
9878 case X86::PMINSDrr:
9879 case X86::PMINSWrr:
9880 case X86::PMINUBrr:
9881 case X86::PMINUDrr:
9882 case X86::PMINUWrr:
9883 case X86::VPANDrr:
9884 case X86::VPANDYrr:
9885 case X86::VPANDDZ128rr:
9886 case X86::VPANDDZ256rr:
9887 case X86::VPANDDZrr:
9888 case X86::VPANDQZ128rr:
9889 case X86::VPANDQZ256rr:
9890 case X86::VPANDQZrr:
9891 case X86::VPORrr:
9892 case X86::VPORYrr:
9893 case X86::VPORDZ128rr:
9894 case X86::VPORDZ256rr:
9895 case X86::VPORDZrr:
9896 case X86::VPORQZ128rr:
9897 case X86::VPORQZ256rr:
9898 case X86::VPORQZrr:
9899 case X86::VPXORrr:
9900 case X86::VPXORYrr:
9901 case X86::VPXORDZ128rr:
9902 case X86::VPXORDZ256rr:
9903 case X86::VPXORDZrr:
9904 case X86::VPXORQZ128rr:
9905 case X86::VPXORQZ256rr:
9906 case X86::VPXORQZrr:
9907 case X86::VANDPDrr:
9908 case X86::VANDPSrr:
9909 case X86::VANDPDYrr:
9910 case X86::VANDPSYrr:
9911 case X86::VANDPDZ128rr:
9912 case X86::VANDPSZ128rr:
9913 case X86::VANDPDZ256rr:
9914 case X86::VANDPSZ256rr:
9915 case X86::VANDPDZrr:
9916 case X86::VANDPSZrr:
9917 case X86::VORPDrr:
9918 case X86::VORPSrr:
9919 case X86::VORPDYrr:
9920 case X86::VORPSYrr:
9921 case X86::VORPDZ128rr:
9922 case X86::VORPSZ128rr:
9923 case X86::VORPDZ256rr:
9924 case X86::VORPSZ256rr:
9925 case X86::VORPDZrr:
9926 case X86::VORPSZrr:
9927 case X86::VXORPDrr:
9928 case X86::VXORPSrr:
9929 case X86::VXORPDYrr:
9930 case X86::VXORPSYrr:
9931 case X86::VXORPDZ128rr:
9932 case X86::VXORPSZ128rr:
9933 case X86::VXORPDZ256rr:
9934 case X86::VXORPSZ256rr:
9935 case X86::VXORPDZrr:
9936 case X86::VXORPSZrr:
9937 case X86::KADDBkk:
9938 case X86::KADDWkk:
9939 case X86::KADDDkk:
9940 case X86::KADDQkk:
9941 case X86::KANDBkk:
9942 case X86::KANDWkk:
9943 case X86::KANDDkk:
9944 case X86::KANDQkk:
9945 case X86::KORBkk:
9946 case X86::KORWkk:
9947 case X86::KORDkk:
9948 case X86::KORQkk:
9949 case X86::KXORBkk:
9950 case X86::KXORWkk:
9951 case X86::KXORDkk:
9952 case X86::KXORQkk:
9953 case X86::VPADDBrr:
9954 case X86::VPADDWrr:
9955 case X86::VPADDDrr:
9956 case X86::VPADDQrr:
9957 case X86::VPADDBYrr:
9958 case X86::VPADDWYrr:
9959 case X86::VPADDDYrr:
9960 case X86::VPADDQYrr:
9961 case X86::VPADDBZ128rr:
9962 case X86::VPADDWZ128rr:
9963 case X86::VPADDDZ128rr:
9964 case X86::VPADDQZ128rr:
9965 case X86::VPADDBZ256rr:
9966 case X86::VPADDWZ256rr:
9967 case X86::VPADDDZ256rr:
9968 case X86::VPADDQZ256rr:
9969 case X86::VPADDBZrr:
9970 case X86::VPADDWZrr:
9971 case X86::VPADDDZrr:
9972 case X86::VPADDQZrr:
9973 case X86::VPMULLWrr:
9974 case X86::VPMULLWYrr:
9975 case X86::VPMULLWZ128rr:
9976 case X86::VPMULLWZ256rr:
9977 case X86::VPMULLWZrr:
9978 case X86::VPMULLDrr:
9979 case X86::VPMULLDYrr:
9980 case X86::VPMULLDZ128rr:
9981 case X86::VPMULLDZ256rr:
9982 case X86::VPMULLDZrr:
9983 case X86::VPMULLQZ128rr:
9984 case X86::VPMULLQZ256rr:
9985 case X86::VPMULLQZrr:
9986 case X86::VPMAXSBrr:
9987 case X86::VPMAXSBYrr:
9988 case X86::VPMAXSBZ128rr:
9989 case X86::VPMAXSBZ256rr:
9990 case X86::VPMAXSBZrr:
9991 case X86::VPMAXSDrr:
9992 case X86::VPMAXSDYrr:
9993 case X86::VPMAXSDZ128rr:
9994 case X86::VPMAXSDZ256rr:
9995 case X86::VPMAXSDZrr:
9996 case X86::VPMAXSQZ128rr:
9997 case X86::VPMAXSQZ256rr:
9998 case X86::VPMAXSQZrr:
9999 case X86::VPMAXSWrr:
10000 case X86::VPMAXSWYrr:
10001 case X86::VPMAXSWZ128rr:
10002 case X86::VPMAXSWZ256rr:
10003 case X86::VPMAXSWZrr:
10004 case X86::VPMAXUBrr:
10005 case X86::VPMAXUBYrr:
10006 case X86::VPMAXUBZ128rr:
10007 case X86::VPMAXUBZ256rr:
10008 case X86::VPMAXUBZrr:
10009 case X86::VPMAXUDrr:
10010 case X86::VPMAXUDYrr:
10011 case X86::VPMAXUDZ128rr:
10012 case X86::VPMAXUDZ256rr:
10013 case X86::VPMAXUDZrr:
10014 case X86::VPMAXUQZ128rr:
10015 case X86::VPMAXUQZ256rr:
10016 case X86::VPMAXUQZrr:
10017 case X86::VPMAXUWrr:
10018 case X86::VPMAXUWYrr:
10019 case X86::VPMAXUWZ128rr:
10020 case X86::VPMAXUWZ256rr:
10021 case X86::VPMAXUWZrr:
10022 case X86::VPMINSBrr:
10023 case X86::VPMINSBYrr:
10024 case X86::VPMINSBZ128rr:
10025 case X86::VPMINSBZ256rr:
10026 case X86::VPMINSBZrr:
10027 case X86::VPMINSDrr:
10028 case X86::VPMINSDYrr:
10029 case X86::VPMINSDZ128rr:
10030 case X86::VPMINSDZ256rr:
10031 case X86::VPMINSDZrr:
10032 case X86::VPMINSQZ128rr:
10033 case X86::VPMINSQZ256rr:
10034 case X86::VPMINSQZrr:
10035 case X86::VPMINSWrr:
10036 case X86::VPMINSWYrr:
10037 case X86::VPMINSWZ128rr:
10038 case X86::VPMINSWZ256rr:
10039 case X86::VPMINSWZrr:
10040 case X86::VPMINUBrr:
10041 case X86::VPMINUBYrr:
10042 case X86::VPMINUBZ128rr:
10043 case X86::VPMINUBZ256rr:
10044 case X86::VPMINUBZrr:
10045 case X86::VPMINUDrr:
10046 case X86::VPMINUDYrr:
10047 case X86::VPMINUDZ128rr:
10048 case X86::VPMINUDZ256rr:
10049 case X86::VPMINUDZrr:
10050 case X86::VPMINUQZ128rr:
10051 case X86::VPMINUQZ256rr:
10052 case X86::VPMINUQZrr:
10053 case X86::VPMINUWrr:
10054 case X86::VPMINUWYrr:
10055 case X86::VPMINUWZ128rr:
10056 case X86::VPMINUWZ256rr:
10057 case X86::VPMINUWZrr:
10058 // Normal min/max instructions are not commutative because of NaN and signed
10059 // zero semantics, but these are. Thus, there's no need to check for global
10060 // relaxed math; the instructions themselves have the properties we need.
10061 case X86::MAXCPDrr:
10062 case X86::MAXCPSrr:
10063 case X86::MAXCSDrr:
10064 case X86::MAXCSSrr:
10065 case X86::MINCPDrr:
10066 case X86::MINCPSrr:
10067 case X86::MINCSDrr:
10068 case X86::MINCSSrr:
10069 case X86::VMAXCPDrr:
10070 case X86::VMAXCPSrr:
10071 case X86::VMAXCPDYrr:
10072 case X86::VMAXCPSYrr:
10073 case X86::VMAXCPDZ128rr:
10074 case X86::VMAXCPSZ128rr:
10075 case X86::VMAXCPDZ256rr:
10076 case X86::VMAXCPSZ256rr:
10077 case X86::VMAXCPDZrr:
10078 case X86::VMAXCPSZrr:
10079 case X86::VMAXCSDrr:
10080 case X86::VMAXCSSrr:
10081 case X86::VMAXCSDZrr:
10082 case X86::VMAXCSSZrr:
10083 case X86::VMINCPDrr:
10084 case X86::VMINCPSrr:
10085 case X86::VMINCPDYrr:
10086 case X86::VMINCPSYrr:
10087 case X86::VMINCPDZ128rr:
10088 case X86::VMINCPSZ128rr:
10089 case X86::VMINCPDZ256rr:
10090 case X86::VMINCPSZ256rr:
10091 case X86::VMINCPDZrr:
10092 case X86::VMINCPSZrr:
10093 case X86::VMINCSDrr:
10094 case X86::VMINCSSrr:
10095 case X86::VMINCSDZrr:
10096 case X86::VMINCSSZrr:
10097 case X86::VMAXCPHZ128rr:
10098 case X86::VMAXCPHZ256rr:
10099 case X86::VMAXCPHZrr:
10100 case X86::VMAXCSHZrr:
10101 case X86::VMINCPHZ128rr:
10102 case X86::VMINCPHZ256rr:
10103 case X86::VMINCPHZrr:
10104 case X86::VMINCSHZrr:
10105 return true;
10106 case X86::ADDPDrr:
10107 case X86::ADDPSrr:
10108 case X86::ADDSDrr:
10109 case X86::ADDSSrr:
10110 case X86::MULPDrr:
10111 case X86::MULPSrr:
10112 case X86::MULSDrr:
10113 case X86::MULSSrr:
10114 case X86::VADDPDrr:
10115 case X86::VADDPSrr:
10116 case X86::VADDPDYrr:
10117 case X86::VADDPSYrr:
10118 case X86::VADDPDZ128rr:
10119 case X86::VADDPSZ128rr:
10120 case X86::VADDPDZ256rr:
10121 case X86::VADDPSZ256rr:
10122 case X86::VADDPDZrr:
10123 case X86::VADDPSZrr:
10124 case X86::VADDSDrr:
10125 case X86::VADDSSrr:
10126 case X86::VADDSDZrr:
10127 case X86::VADDSSZrr:
10128 case X86::VMULPDrr:
10129 case X86::VMULPSrr:
10130 case X86::VMULPDYrr:
10131 case X86::VMULPSYrr:
10132 case X86::VMULPDZ128rr:
10133 case X86::VMULPSZ128rr:
10134 case X86::VMULPDZ256rr:
10135 case X86::VMULPSZ256rr:
10136 case X86::VMULPDZrr:
10137 case X86::VMULPSZrr:
10138 case X86::VMULSDrr:
10139 case X86::VMULSSrr:
10140 case X86::VMULSDZrr:
10141 case X86::VMULSSZrr:
10142 case X86::VADDPHZ128rr:
10143 case X86::VADDPHZ256rr:
10144 case X86::VADDPHZrr:
10145 case X86::VADDSHZrr:
10146 case X86::VMULPHZ128rr:
10147 case X86::VMULPHZ256rr:
10148 case X86::VMULPHZrr:
10149 case X86::VMULSHZrr:
10150     return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
10151            Inst.getFlag(MachineInstr::MIFlag::FmNsz);
10152 default:
10153 return false;
10154 }
10155}
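
// Illustrative note (editorial, not from the LLVM sources): the machine
// combiner uses this hook to rebalance chains such as
//
//   t = ((a + b) + c) + d        // serial, depth 3
//   t = (a + b) + (c + d)        // reassociated, depth 2
//
// which is also why plain FP add/mul only qualify when the reassoc and nsz
// fast-math flags are set, while the integer ops and the commutative
// MAXC*/MINC* forms above qualify unconditionally.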
10156
10157/// If \p DescribedReg overlaps with the MOVrr instruction's destination
10158/// register then, if possible, describe the value in terms of the source
10159/// register.
10160static std::optional<ParamLoadedValue>
10161 describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg,
10162 const TargetRegisterInfo *TRI) {
10163 Register DestReg = MI.getOperand(0).getReg();
10164 Register SrcReg = MI.getOperand(1).getReg();
10165
10166 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10167
10168 // If the described register is the destination, just return the source.
10169 if (DestReg == DescribedReg)
10170 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10171
10172 // If the described register is a sub-register of the destination register,
10173 // then pick out the source register's corresponding sub-register.
10174 if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
10175 Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
10176 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10177 }
10178
10179 // The remaining case to consider is when the described register is a
10180 // super-register of the destination register. MOV8rr and MOV16rr does not
10181 // write to any of the other bytes in the register, meaning that we'd have to
10182 // describe the value using a combination of the source register and the
10183 // non-overlapping bits in the described register, which is not currently
10184 // possible.
10185 if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
10186 !TRI->isSuperRegister(DestReg, DescribedReg))
10187 return std::nullopt;
10188
10189 assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
10190 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10191}
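
// Illustrative sketch (editorial, not from the LLVM sources): given
//   $ebx = MOV32rr $eax
// describing $ebx yields $eax directly, describing $bx picks out the
// matching sub-register $ax, and describing the super-register $rbx also
// yields $eax, which is sound because a 32-bit mov implicitly zeroes bits
// 63:32 of the destination.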
10192
10193std::optional<ParamLoadedValue>
10194 X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
10195 const MachineOperand *Op = nullptr;
10196 DIExpression *Expr = nullptr;
10197
10198   const TargetRegisterInfo *TRI = &getRegisterInfo();
10199
10200 switch (MI.getOpcode()) {
10201 case X86::LEA32r:
10202 case X86::LEA64r:
10203 case X86::LEA64_32r: {
10204 // We may need to describe a 64-bit parameter with a 32-bit LEA.
10205 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10206 return std::nullopt;
10207
10208     // Operand 4 could be a global address. For now we do not support
10209     // such situations.
10210 if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
10211 return std::nullopt;
10212
10213 const MachineOperand &Op1 = MI.getOperand(1);
10214 const MachineOperand &Op2 = MI.getOperand(3);
10215 assert(Op2.isReg() &&
10216 (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical()));
10217
10218 // Omit situations like:
10219 // %rsi = lea %rsi, 4, ...
10220 if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
10221 Op2.getReg() == MI.getOperand(0).getReg())
10222 return std::nullopt;
10223 else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
10224 TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
10225 (Op2.getReg() != X86::NoRegister &&
10226 TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
10227 return std::nullopt;
10228
10229 int64_t Coef = MI.getOperand(2).getImm();
10230 int64_t Offset = MI.getOperand(4).getImm();
10231     SmallVector<uint64_t, 8> Ops;
10232
10233 if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
10234 Op = &Op1;
10235 } else if (Op1.isFI())
10236 Op = &Op1;
10237
10238 if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
10239 Ops.push_back(dwarf::DW_OP_constu);
10240 Ops.push_back(Coef + 1);
10241 Ops.push_back(dwarf::DW_OP_mul);
10242 } else {
10243 if (Op && Op2.getReg() != X86::NoRegister) {
10244 int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
10245 if (dwarfReg < 0)
10246 return std::nullopt;
10247 else if (dwarfReg < 32) {
10248 Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
10249 Ops.push_back(0);
10250 } else {
10251 Ops.push_back(dwarf::DW_OP_bregx);
10252 Ops.push_back(dwarfReg);
10253 Ops.push_back(0);
10254 }
10255 } else if (!Op) {
10256 assert(Op2.getReg() != X86::NoRegister);
10257 Op = &Op2;
10258 }
10259
10260 if (Coef > 1) {
10261 assert(Op2.getReg() != X86::NoRegister);
10262 Ops.push_back(dwarf::DW_OP_constu);
10263 Ops.push_back(Coef);
10264 Ops.push_back(dwarf::DW_OP_mul);
10265 }
10266
10267 if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
10268 Op2.getReg() != X86::NoRegister) {
10269 Ops.push_back(dwarf::DW_OP_plus);
10270 }
10271 }
10272
10273     DIExpression::appendOffset(Ops, Offset);
10274 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
10275
10276 return ParamLoadedValue(*Op, Expr);
10277 }
10278 case X86::MOV8ri:
10279 case X86::MOV16ri:
10280 // TODO: Handle MOV8ri and MOV16ri.
10281 return std::nullopt;
10282 case X86::MOV32ri:
10283 case X86::MOV64ri:
10284 case X86::MOV64ri32:
10285 // MOV32ri may be used for producing zero-extended 32-bit immediates in
10286 // 64-bit parameters, so we need to consider super-registers.
10287 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10288 return std::nullopt;
10289 return ParamLoadedValue(MI.getOperand(1), Expr);
10290 case X86::MOV8rr:
10291 case X86::MOV16rr:
10292 case X86::MOV32rr:
10293 case X86::MOV64rr:
10294 return describeMOVrrLoadedValue(MI, Reg, TRI);
10295 case X86::XOR32rr: {
10296 // 64-bit parameters are zero-materialized using XOR32rr, so also consider
10297 // super-registers.
10298 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10299 return std::nullopt;
10300 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
10301       return ParamLoadedValue(MachineOperand::CreateImm(0), Expr);
10302 return std::nullopt;
10303 }
10304 case X86::MOVSX64rr32: {
10305 // We may need to describe the lower 32 bits of the MOVSX; for example, in
10306 // cases like this:
10307 //
10308 // $ebx = [...]
10309 // $rdi = MOVSX64rr32 $ebx
10310 // $esi = MOV32rr $edi
10311 if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
10312 return std::nullopt;
10313
10314 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10315
10316 // If the described register is the destination register we need to
10317 // sign-extend the source register from 32 bits. The other case we handle
10318 // is when the described register is the 32-bit sub-register of the
10319     // destination register, in which case we just need to return the source
10320 // register.
10321 if (Reg == MI.getOperand(0).getReg())
10322 Expr = DIExpression::appendExt(Expr, 32, 64, true);
10323 else
10324 assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
10325 "Unhandled sub-register case for MOVSX64rr32");
10326
10327 return ParamLoadedValue(MI.getOperand(1), Expr);
10328 }
10329 default:
10330 assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
10331     return TargetInstrInfo::describeLoadedValue(MI, Reg);
10332 }
10333}
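
// Illustrative worked example (editorial, not from the LLVM sources): for
//   $rdi = LEA64r $rbx, 2, $rbx, 4, $noreg
// base and index are the same register with Coef = 2 and Offset = 4, so the
// LEA case above emits DW_OP_constu 3, DW_OP_mul and then appends the
// offset, describing $rdi as rbx * 3 + 4.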
10334
10335/// This is an architecture-specific helper function of reassociateOps.
10336/// Set special operand attributes for new instructions after reassociation.
10337 void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
10338 MachineInstr &OldMI2,
10339 MachineInstr &NewMI1,
10340 MachineInstr &NewMI2) const {
10341 // Integer instructions may define an implicit EFLAGS dest register operand.
10342 MachineOperand *OldFlagDef1 =
10343 OldMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10344 MachineOperand *OldFlagDef2 =
10345 OldMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10346
10347 assert(!OldFlagDef1 == !OldFlagDef2 &&
10348 "Unexpected instruction type for reassociation");
10349
10350 if (!OldFlagDef1 || !OldFlagDef2)
10351 return;
10352
10353 assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
10354 "Must have dead EFLAGS operand in reassociable instruction");
10355
10356 MachineOperand *NewFlagDef1 =
10357 NewMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10358 MachineOperand *NewFlagDef2 =
10359 NewMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10360
10361 assert(NewFlagDef1 && NewFlagDef2 &&
10362 "Unexpected operand in reassociable instruction");
10363
10364 // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
10365 // of this pass or other passes. The EFLAGS operands must be dead in these new
10366 // instructions because the EFLAGS operands in the original instructions must
10367 // be dead in order for reassociation to occur.
10368 NewFlagDef1->setIsDead();
10369 NewFlagDef2->setIsDead();
10370}
10371
10372std::pair<unsigned, unsigned>
10373 X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10374 return std::make_pair(TF, 0u);
10375}
10376
10377 ArrayRef<std::pair<unsigned, const char *>>
10378 X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
10379 using namespace X86II;
10380 static const std::pair<unsigned, const char *> TargetFlags[] = {
10381 {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
10382 {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
10383 {MO_GOT, "x86-got"},
10384 {MO_GOTOFF, "x86-gotoff"},
10385 {MO_GOTPCREL, "x86-gotpcrel"},
10386 {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
10387 {MO_PLT, "x86-plt"},
10388 {MO_TLSGD, "x86-tlsgd"},
10389 {MO_TLSLD, "x86-tlsld"},
10390 {MO_TLSLDM, "x86-tlsldm"},
10391 {MO_GOTTPOFF, "x86-gottpoff"},
10392 {MO_INDNTPOFF, "x86-indntpoff"},
10393 {MO_TPOFF, "x86-tpoff"},
10394 {MO_DTPOFF, "x86-dtpoff"},
10395 {MO_NTPOFF, "x86-ntpoff"},
10396 {MO_GOTNTPOFF, "x86-gotntpoff"},
10397 {MO_DLLIMPORT, "x86-dllimport"},
10398 {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
10399 {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
10400 {MO_TLVP, "x86-tlvp"},
10401 {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
10402 {MO_SECREL, "x86-secrel"},
10403 {MO_COFFSTUB, "x86-coffstub"}};
10404 return ArrayRef(TargetFlags);
10405}
10406
10407/// Constants defining how certain sequences should be outlined.
10408///
10409/// \p MachineOutlinerDefault implies that the function is called with a call
10410/// instruction, and a return must be emitted for the outlined function frame.
10411///
10412/// That is,
10413///
10414/// I1 OUTLINED_FUNCTION:
10415/// I2 --> call OUTLINED_FUNCTION I1
10416/// I3 I2
10417/// I3
10418/// ret
10419///
10420/// * Call construction overhead: 1 (call instruction)
10421/// * Frame construction overhead: 1 (return instruction)
10422///
10423/// \p MachineOutlinerTailCall implies that the function is being tail called.
10424/// A jump is emitted instead of a call, and the return is already present in
10425/// the outlined sequence. That is,
10426///
10427/// I1 OUTLINED_FUNCTION:
10428/// I2 --> jmp OUTLINED_FUNCTION I1
10429/// ret I2
10430/// ret
10431///
10432/// * Call construction overhead: 1 (jump instruction)
10433/// * Frame construction overhead: 0 (don't need to return)
10434///
10435 enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall };
10436
10437std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10438 X86InstrInfo::getOutliningCandidateInfo(
10439 const MachineModuleInfo &MMI,
10440 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10441 unsigned MinRepeats) const {
10442 unsigned SequenceSize = 0;
10443 for (auto &MI : RepeatedSequenceLocs[0]) {
10444 // FIXME: x86 doesn't implement getInstSizeInBytes, so
10445 // we can't tell the cost. Just assume each instruction
10446 // is one byte.
10447 if (MI.isDebugInstr() || MI.isKill())
10448 continue;
10449 SequenceSize += 1;
10450 }
10451
10452   // We check to see if CFI instructions are present, and if they are,
10453   // we find the number of CFI instructions in the candidates.
10454 unsigned CFICount = 0;
10455 for (auto &I : RepeatedSequenceLocs[0]) {
10456 if (I.isCFIInstruction())
10457 CFICount++;
10458 }
10459
10460 // We compare the number of found CFI Instructions to the number of CFI
10461 // instructions in the parent function for each candidate. We must check this
10462 // since if we outline one of the CFI instructions in a function, we have to
10463 // outline them all for correctness. If we do not, the address offsets will be
10464 // incorrect between the two sections of the program.
10465 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10466 std::vector<MCCFIInstruction> CFIInstructions =
10467 C.getMF()->getFrameInstructions();
10468
10469 if (CFICount > 0 && CFICount != CFIInstructions.size())
10470 return std::nullopt;
10471 }
10472
10473 // FIXME: Use real size in bytes for call and ret instructions.
10474 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10475 for (outliner::Candidate &C : RepeatedSequenceLocs)
10476 C.setCallInfo(MachineOutlinerTailCall, 1);
10477
10478 return std::make_unique<outliner::OutlinedFunction>(
10479 RepeatedSequenceLocs, SequenceSize,
10480 0, // Number of bytes to emit frame.
10481 MachineOutlinerTailCall // Type of frame.
10482 );
10483 }
10484
10485 if (CFICount > 0)
10486 return std::nullopt;
10487
10488 for (outliner::Candidate &C : RepeatedSequenceLocs)
10489 C.setCallInfo(MachineOutlinerDefault, 1);
10490
10491 return std::make_unique<outliner::OutlinedFunction>(
10492 RepeatedSequenceLocs, SequenceSize, 1, MachineOutlinerDefault);
10493}
10494
10495 bool X86InstrInfo::isFunctionSafeToOutlineFrom(
10496 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10497 const Function &F = MF.getFunction();
10498
10499 // Does the function use a red zone? If it does, then we can't risk messing
10500 // with the stack.
10501 if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
10502 // It could have a red zone. If it does, then we don't want to touch it.
10503     const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
10504 if (!X86FI || X86FI->getUsesRedZone())
10505 return false;
10506 }
10507
10508 // If we *don't* want to outline from things that could potentially be deduped
10509 // then return false.
10510 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10511 return false;
10512
10513 // This function is viable for outlining, so return true.
10514 return true;
10515}
10516
10517 outliner::InstrType
10518 X86InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10519                                    MachineBasicBlock::iterator &MIT,
10520 unsigned Flags) const {
10521 MachineInstr &MI = *MIT;
10522
10523 // Is this a terminator for a basic block?
10524 if (MI.isTerminator())
10525 // TargetInstrInfo::getOutliningType has already filtered out anything
10526 // that would break this, so we can allow it here.
10527     return outliner::InstrType::Legal;
10528
10529 // Don't outline anything that modifies or reads from the stack pointer.
10530 //
10531 // FIXME: There are instructions which are being manually built without
10532 // explicit uses/defs so we also have to check the MCInstrDesc. We should be
10533 // able to remove the extra checks once those are fixed up. For example,
10534 // sometimes we might get something like %rax = POP64r 1. This won't be
10535 // caught by modifiesRegister or readsRegister even though the instruction
10536 // really ought to be formed so that modifiesRegister/readsRegister would
10537 // catch it.
10538 if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
10539 MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
10540 MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
10541     return outliner::InstrType::Illegal;
10542
10543 // Outlined calls change the instruction pointer, so don't read from it.
10544 if (MI.readsRegister(X86::RIP, &RI) ||
10545 MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
10546 MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
10547     return outliner::InstrType::Illegal;
10548
10549 // Don't outline CFI instructions.
10550 if (MI.isCFIInstruction())
10551     return outliner::InstrType::Illegal;
10552
10553   return outliner::InstrType::Legal;
10554}
10555
10556 void X86InstrInfo::buildOutlinedFrame(
10557     MachineBasicBlock &MBB, MachineFunction &MF,
10558 const outliner::OutlinedFunction &OF) const {
10559 // If we're a tail call, we already have a return, so don't do anything.
10560 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10561 return;
10562
10563 // We're a normal call, so our sequence doesn't have a return instruction.
10564 // Add it in.
10565 MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
10566 MBB.insert(MBB.end(), retq);
10567}
10568
10569 MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(
10570     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10571     MachineFunction &MF, outliner::Candidate &C) const {
10572 // Is it a tail call?
10573 if (C.CallConstructionID == MachineOutlinerTailCall) {
10574 // Yes, just insert a JMP.
10575 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
10576 .addGlobalAddress(M.getNamedValue(MF.getName())));
10577 } else {
10578 // No, insert a call.
10579 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
10580 .addGlobalAddress(M.getNamedValue(MF.getName())));
10581 }
10582
10583 return It;
10584}
10585
10586 void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10587                                       MachineBasicBlock::iterator Iter,
10588 DebugLoc &DL,
10589 bool AllowSideEffects) const {
10590 const MachineFunction &MF = *MBB.getParent();
10591 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
10592   const TargetRegisterInfo &TRI = getRegisterInfo();
10593
10594 if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
10595 // FIXME: Should we ignore MMX registers?
10596 return;
10597
10598 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10599 // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
10600 // upper bits of a 64-bit register automagically.
10601 Reg = getX86SubSuperRegister(Reg, 32);
10602
10603 if (!AllowSideEffects)
10604 // XOR affects flags, so use a MOV instead.
10605 BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
10606 else
10607 BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
10608 .addReg(Reg, RegState::Undef)
10609 .addReg(Reg, RegState::Undef);
10610 } else if (X86::VR128RegClass.contains(Reg)) {
10611 // XMM#
10612 if (!ST.hasSSE1())
10613 return;
10614
10615 BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
10616 } else if (X86::VR256RegClass.contains(Reg)) {
10617 // YMM#
10618 if (!ST.hasAVX())
10619 return;
10620
10621 BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
10622 } else if (X86::VR512RegClass.contains(Reg)) {
10623 // ZMM#
10624 if (!ST.hasAVX512())
10625 return;
10626
10627 BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
10628 } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
10629 X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
10630 X86::VK16RegClass.contains(Reg)) {
10631 if (!ST.hasVLX())
10632 return;
10633
10634 unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
10635 BuildMI(MBB, Iter, DL, get(Op), Reg);
10636 }
10637}
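
// Illustrative sketch (editorial, not from the LLVM sources): clearing
// $rax through this hook emits, in assembly terms,
//
//   xorl %eax, %eax   // AllowSideEffects == true (clobbers EFLAGS)
//   movl $0, %eax     // AllowSideEffects == false (flag-preserving)
//
// where the 32-bit forms implicitly zero the upper 32 bits of $rax.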
10638
10639 bool X86InstrInfo::getMachineCombinerPatterns(
10640 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
10641 bool DoRegPressureReduce) const {
10642 unsigned Opc = Root.getOpcode();
10643 switch (Opc) {
10644 case X86::VPDPWSSDrr:
10645 case X86::VPDPWSSDrm:
10646 case X86::VPDPWSSDYrr:
10647 case X86::VPDPWSSDYrm: {
10648 if (!Subtarget.hasFastDPWSSD()) {
10649     Patterns.push_back(X86MachineCombinerPattern::DPWSSD);
10650 return true;
10651 }
10652 break;
10653 }
10654 case X86::VPDPWSSDZ128rr:
10655 case X86::VPDPWSSDZ128rm:
10656 case X86::VPDPWSSDZ256rr:
10657 case X86::VPDPWSSDZ256rm:
10658 case X86::VPDPWSSDZrr:
10659 case X86::VPDPWSSDZrm: {
10660 if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
10661     Patterns.push_back(X86MachineCombinerPattern::DPWSSD);
10662 return true;
10663 }
10664 break;
10665 }
10666 }
10667   return TargetInstrInfo::getMachineCombinerPatterns(Root,
10668 Patterns, DoRegPressureReduce);
10669}
10670
10671static void
10672 genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII,
10673                              SmallVectorImpl<MachineInstr *> &InsInstrs,
10674                              SmallVectorImpl<MachineInstr *> &DelInstrs,
10675 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
10676 MachineFunction *MF = Root.getMF();
10677   MachineRegisterInfo &RegInfo = MF->getRegInfo();
10678
10679 unsigned Opc = Root.getOpcode();
10680 unsigned AddOpc = 0;
10681 unsigned MaddOpc = 0;
10682 switch (Opc) {
10683 default:
10684 assert(false && "It should not reach here");
10685 break;
10686 // vpdpwssd xmm2,xmm3,xmm1
10687 // -->
10688 // vpmaddwd xmm3,xmm3,xmm1
10689 // vpaddd xmm2,xmm2,xmm3
10690 case X86::VPDPWSSDrr:
10691 MaddOpc = X86::VPMADDWDrr;
10692 AddOpc = X86::VPADDDrr;
10693 break;
10694 case X86::VPDPWSSDrm:
10695 MaddOpc = X86::VPMADDWDrm;
10696 AddOpc = X86::VPADDDrr;
10697 break;
10698 case X86::VPDPWSSDZ128rr:
10699 MaddOpc = X86::VPMADDWDZ128rr;
10700 AddOpc = X86::VPADDDZ128rr;
10701 break;
10702 case X86::VPDPWSSDZ128rm:
10703 MaddOpc = X86::VPMADDWDZ128rm;
10704 AddOpc = X86::VPADDDZ128rr;
10705 break;
10706 // vpdpwssd ymm2,ymm3,ymm1
10707 // -->
10708 // vpmaddwd ymm3,ymm3,ymm1
10709 // vpaddd ymm2,ymm2,ymm3
10710 case X86::VPDPWSSDYrr:
10711 MaddOpc = X86::VPMADDWDYrr;
10712 AddOpc = X86::VPADDDYrr;
10713 break;
10714 case X86::VPDPWSSDYrm:
10715 MaddOpc = X86::VPMADDWDYrm;
10716 AddOpc = X86::VPADDDYrr;
10717 break;
10718 case X86::VPDPWSSDZ256rr:
10719 MaddOpc = X86::VPMADDWDZ256rr;
10720 AddOpc = X86::VPADDDZ256rr;
10721 break;
10722 case X86::VPDPWSSDZ256rm:
10723 MaddOpc = X86::VPMADDWDZ256rm;
10724 AddOpc = X86::VPADDDZ256rr;
10725 break;
10726 // vpdpwssd zmm2,zmm3,zmm1
10727 // -->
10728 // vpmaddwd zmm3,zmm3,zmm1
10729 // vpaddd zmm2,zmm2,zmm3
10730 case X86::VPDPWSSDZrr:
10731 MaddOpc = X86::VPMADDWDZrr;
10732 AddOpc = X86::VPADDDZrr;
10733 break;
10734 case X86::VPDPWSSDZrm:
10735 MaddOpc = X86::VPMADDWDZrm;
10736 AddOpc = X86::VPADDDZrr;
10737 break;
10738 }
10739 // Create vpmaddwd.
10740 const TargetRegisterClass *RC =
10741 RegInfo.getRegClass(Root.getOperand(0).getReg());
10742 Register NewReg = RegInfo.createVirtualRegister(RC);
10743 MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
10744 Madd->setDesc(TII.get(MaddOpc));
10745 Madd->untieRegOperand(1);
10746 Madd->removeOperand(1);
10747 Madd->getOperand(0).setReg(NewReg);
10748 InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
10749 // Create vpaddd.
10750 Register DstReg = Root.getOperand(0).getReg();
10751 bool IsKill = Root.getOperand(1).isKill();
10752 MachineInstr *Add =
10753 BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
10754 .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
10755 .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
10756 InsInstrs.push_back(Madd);
10757 InsInstrs.push_back(Add);
10758 DelInstrs.push_back(&Root);
10759}
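// A rough sketch of the rewrite above for the 128-bit register form (not
// exact MIR syntax):
//   %dst = VPDPWSSDrr %acc(tied), %a, %b
// becomes
//   %tmp = VPMADDWDrr %a, %b
//   %dst = VPADDDrr %acc, killed %tmp
// where %tmp is the fresh virtual register recorded in InstrIdxForVirtReg so
// the combiner can cost the new definition.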
10760
10761void X86InstrInfo::genAlternativeCodeSequence(
10762 MachineInstr &Root, unsigned Pattern,
10763 SmallVectorImpl<MachineInstr *> &InsInstrs,
10764 SmallVectorImpl<MachineInstr *> &DelInstrs,
10765 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
10766 switch (Pattern) {
10767 default:
10768 // Reassociate instructions.
10769 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
10770 DelInstrs, InstrIdxForVirtReg);
10771 return;
10772 case X86MachineCombinerPattern::DPWSSD:
10773 genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
10774 InstrIdxForVirtReg);
10775 return;
10776 }
10777}
10778
10779// See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
10780void X86InstrInfo::getFrameIndexOperands(SmallVectorImpl<MachineOperand> &Ops,
10781 int FI) const {
10782 X86AddressMode M;
10783 M.BaseType = X86AddressMode::FrameIndexBase;
10784 M.Base.FrameIndex = FI;
10785 M.getFullAddress(Ops);
10786}
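// getFullAddress appends the canonical five x86 memory operands (base,
// scale, index, displacement, segment). With FrameIndexBase the base is the
// frame index, the scale defaults to 1, and the index, displacement, and
// segment remain zero.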
10787
10788#define GET_INSTRINFO_HELPERS
10789#include "X86GenInstrInfo.inc"