LLVM 23.0.0git
X86InstrInfo.cpp
Go to the documentation of this file.
1//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the X86 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Module.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCInst.h"
43#include "llvm/Support/Debug.h"
47#include <atomic>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "x86-instr-info"
53
54#define GET_INSTRINFO_CTOR_DTOR
55#include "X86GenInstrInfo.inc"
56
58
59static cl::opt<bool>
60 NoFusing("disable-spill-fusing",
61 cl::desc("Disable fusing of spill code into instructions"),
63static cl::opt<bool>
64 PrintFailedFusing("print-failed-fuse-candidates",
65 cl::desc("Print instructions that the allocator wants to"
66 " fuse, but the X86 backend currently can't"),
68static cl::opt<bool>
69 ReMatPICStubLoad("remat-pic-stub-load",
70 cl::desc("Re-materialize load from stub in PIC mode"),
71 cl::init(false), cl::Hidden);
73 PartialRegUpdateClearance("partial-reg-update-clearance",
74 cl::desc("Clearance between two register writes "
75 "for inserting XOR to avoid partial "
76 "register update"),
77 cl::init(64), cl::Hidden);
79 "undef-reg-clearance",
80 cl::desc("How many idle instructions we would like before "
81 "certain undef register reads"),
82 cl::init(128), cl::Hidden);
83
84// Pin the vtable to this file.
85void X86InstrInfo::anchor() {}
86
88 : X86GenInstrInfo(STI, RI,
89 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
90 : X86::ADJCALLSTACKDOWN32),
91 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
92 : X86::ADJCALLSTACKUP32),
93 X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
94 Subtarget(STI), RI(STI.getTargetTriple()) {}
95
97 unsigned OpNum) const {
98 auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum);
99 // If the target does not have egpr, then r16-r31 will be resereved for all
100 // instructions.
101 if (!RC || !Subtarget.hasEGPR())
102 return RC;
103
105 return RC;
106
107 const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
108 return RI->constrainRegClassToNonRex2(RC);
109}
110
112 Register &SrcReg, Register &DstReg,
113 unsigned &SubIdx) const {
114 switch (MI.getOpcode()) {
115 default:
116 break;
117 case X86::MOVSX16rr8:
118 case X86::MOVZX16rr8:
119 case X86::MOVSX32rr8:
120 case X86::MOVZX32rr8:
121 case X86::MOVSX64rr8:
122 if (!Subtarget.is64Bit())
123 // It's not always legal to reference the low 8-bit of the larger
124 // register in 32-bit mode.
125 return false;
126 [[fallthrough]];
127 case X86::MOVSX32rr16:
128 case X86::MOVZX32rr16:
129 case X86::MOVSX64rr16:
130 case X86::MOVSX64rr32: {
131 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
132 // Be conservative.
133 return false;
134 SrcReg = MI.getOperand(1).getReg();
135 DstReg = MI.getOperand(0).getReg();
136 switch (MI.getOpcode()) {
137 default:
138 llvm_unreachable("Unreachable!");
139 case X86::MOVSX16rr8:
140 case X86::MOVZX16rr8:
141 case X86::MOVSX32rr8:
142 case X86::MOVZX32rr8:
143 case X86::MOVSX64rr8:
144 SubIdx = X86::sub_8bit;
145 break;
146 case X86::MOVSX32rr16:
147 case X86::MOVZX32rr16:
148 case X86::MOVSX64rr16:
149 SubIdx = X86::sub_16bit;
150 break;
151 case X86::MOVSX64rr32:
152 SubIdx = X86::sub_32bit;
153 break;
154 }
155 return true;
156 }
157 }
158 return false;
159}
160
162 if (MI.mayLoad() || MI.mayStore())
163 return false;
164
165 // Some target-independent operations that trivially lower to data-invariant
166 // instructions.
167 if (MI.isCopyLike() || MI.isInsertSubreg())
168 return true;
169
170 unsigned Opcode = MI.getOpcode();
171 using namespace X86;
172 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
173 // However, they set flags and are perhaps the most surprisingly constant
174 // time operations so we call them out here separately.
175 if (isIMUL(Opcode))
176 return true;
177 // Bit scanning and counting instructions that are somewhat surprisingly
178 // constant time as they scan across bits and do other fairly complex
179 // operations like popcnt, but are believed to be constant time on x86.
180 // However, these set flags.
181 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
182 isTZCNT(Opcode))
183 return true;
184 // Bit manipulation instructions are effectively combinations of basic
185 // arithmetic ops, and should still execute in constant time. These also
186 // set flags.
187 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
188 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
189 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
190 isTZMSK(Opcode))
191 return true;
192 // Bit extracting and clearing instructions should execute in constant time,
193 // and set flags.
194 if (isBEXTR(Opcode) || isBZHI(Opcode))
195 return true;
196 // Shift and rotate.
197 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
198 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
199 return true;
200 // Basic arithmetic is constant time on the input but does set flags.
201 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
202 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
203 return true;
204 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
205 if (isANDN(Opcode))
206 return true;
207 // Unary arithmetic operations.
208 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
209 return true;
210 // Unlike other arithmetic, NOT doesn't set EFLAGS.
211 if (isNOT(Opcode))
212 return true;
213 // Various move instructions used to zero or sign extend things. Note that we
214 // intentionally don't support the _NOREX variants as we can't handle that
215 // register constraint anyways.
216 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
217 return true;
218 // Arithmetic instructions that are both constant time and don't set flags.
219 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
220 return true;
221 // LEA doesn't actually access memory, and its arithmetic is constant time.
222 if (isLEA(Opcode))
223 return true;
224 // By default, assume that the instruction is not data invariant.
225 return false;
226}
227
229 switch (MI.getOpcode()) {
230 default:
231 // By default, assume that the load will immediately leak.
232 return false;
233
234 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
235 // However, they set flags and are perhaps the most surprisingly constant
236 // time operations so we call them out here separately.
237 case X86::IMUL16rm:
238 case X86::IMUL16rmi:
239 case X86::IMUL32rm:
240 case X86::IMUL32rmi:
241 case X86::IMUL64rm:
242 case X86::IMUL64rmi32:
243
244 // Bit scanning and counting instructions that are somewhat surprisingly
245 // constant time as they scan across bits and do other fairly complex
246 // operations like popcnt, but are believed to be constant time on x86.
247 // However, these set flags.
248 case X86::BSF16rm:
249 case X86::BSF32rm:
250 case X86::BSF64rm:
251 case X86::BSR16rm:
252 case X86::BSR32rm:
253 case X86::BSR64rm:
254 case X86::LZCNT16rm:
255 case X86::LZCNT32rm:
256 case X86::LZCNT64rm:
257 case X86::POPCNT16rm:
258 case X86::POPCNT32rm:
259 case X86::POPCNT64rm:
260 case X86::TZCNT16rm:
261 case X86::TZCNT32rm:
262 case X86::TZCNT64rm:
263
264 // Bit manipulation instructions are effectively combinations of basic
265 // arithmetic ops, and should still execute in constant time. These also
266 // set flags.
267 case X86::BLCFILL32rm:
268 case X86::BLCFILL64rm:
269 case X86::BLCI32rm:
270 case X86::BLCI64rm:
271 case X86::BLCIC32rm:
272 case X86::BLCIC64rm:
273 case X86::BLCMSK32rm:
274 case X86::BLCMSK64rm:
275 case X86::BLCS32rm:
276 case X86::BLCS64rm:
277 case X86::BLSFILL32rm:
278 case X86::BLSFILL64rm:
279 case X86::BLSI32rm:
280 case X86::BLSI64rm:
281 case X86::BLSIC32rm:
282 case X86::BLSIC64rm:
283 case X86::BLSMSK32rm:
284 case X86::BLSMSK64rm:
285 case X86::BLSR32rm:
286 case X86::BLSR64rm:
287 case X86::TZMSK32rm:
288 case X86::TZMSK64rm:
289
290 // Bit extracting and clearing instructions should execute in constant time,
291 // and set flags.
292 case X86::BEXTR32rm:
293 case X86::BEXTR64rm:
294 case X86::BEXTRI32mi:
295 case X86::BEXTRI64mi:
296 case X86::BZHI32rm:
297 case X86::BZHI64rm:
298
299 // Basic arithmetic is constant time on the input but does set flags.
300 case X86::ADC8rm:
301 case X86::ADC16rm:
302 case X86::ADC32rm:
303 case X86::ADC64rm:
304 case X86::ADD8rm:
305 case X86::ADD16rm:
306 case X86::ADD32rm:
307 case X86::ADD64rm:
308 case X86::AND8rm:
309 case X86::AND16rm:
310 case X86::AND32rm:
311 case X86::AND64rm:
312 case X86::ANDN32rm:
313 case X86::ANDN64rm:
314 case X86::OR8rm:
315 case X86::OR16rm:
316 case X86::OR32rm:
317 case X86::OR64rm:
318 case X86::SBB8rm:
319 case X86::SBB16rm:
320 case X86::SBB32rm:
321 case X86::SBB64rm:
322 case X86::SUB8rm:
323 case X86::SUB16rm:
324 case X86::SUB32rm:
325 case X86::SUB64rm:
326 case X86::XOR8rm:
327 case X86::XOR16rm:
328 case X86::XOR32rm:
329 case X86::XOR64rm:
330
331 // Integer multiply w/o affecting flags is still believed to be constant
332 // time on x86. Called out separately as this is among the most surprising
333 // instructions to exhibit that behavior.
334 case X86::MULX32rm:
335 case X86::MULX64rm:
336
337 // Arithmetic instructions that are both constant time and don't set flags.
338 case X86::RORX32mi:
339 case X86::RORX64mi:
340 case X86::SARX32rm:
341 case X86::SARX64rm:
342 case X86::SHLX32rm:
343 case X86::SHLX64rm:
344 case X86::SHRX32rm:
345 case X86::SHRX64rm:
346
347 // Conversions are believed to be constant time and don't set flags.
348 case X86::CVTTSD2SI64rm:
349 case X86::VCVTTSD2SI64rm:
350 case X86::VCVTTSD2SI64Zrm:
351 case X86::CVTTSD2SIrm:
352 case X86::VCVTTSD2SIrm:
353 case X86::VCVTTSD2SIZrm:
354 case X86::CVTTSS2SI64rm:
355 case X86::VCVTTSS2SI64rm:
356 case X86::VCVTTSS2SI64Zrm:
357 case X86::CVTTSS2SIrm:
358 case X86::VCVTTSS2SIrm:
359 case X86::VCVTTSS2SIZrm:
360 case X86::CVTSI2SDrm:
361 case X86::VCVTSI2SDrm:
362 case X86::VCVTSI2SDZrm:
363 case X86::CVTSI2SSrm:
364 case X86::VCVTSI2SSrm:
365 case X86::VCVTSI2SSZrm:
366 case X86::CVTSI642SDrm:
367 case X86::VCVTSI642SDrm:
368 case X86::VCVTSI642SDZrm:
369 case X86::CVTSI642SSrm:
370 case X86::VCVTSI642SSrm:
371 case X86::VCVTSI642SSZrm:
372 case X86::CVTSS2SDrm:
373 case X86::VCVTSS2SDrm:
374 case X86::VCVTSS2SDZrm:
375 case X86::CVTSD2SSrm:
376 case X86::VCVTSD2SSrm:
377 case X86::VCVTSD2SSZrm:
378 // AVX512 added unsigned integer conversions.
379 case X86::VCVTTSD2USI64Zrm:
380 case X86::VCVTTSD2USIZrm:
381 case X86::VCVTTSS2USI64Zrm:
382 case X86::VCVTTSS2USIZrm:
383 case X86::VCVTUSI2SDZrm:
384 case X86::VCVTUSI642SDZrm:
385 case X86::VCVTUSI2SSZrm:
386 case X86::VCVTUSI642SSZrm:
387
388 // Loads to register don't set flags.
389 case X86::MOV8rm:
390 case X86::MOV8rm_NOREX:
391 case X86::MOV16rm:
392 case X86::MOV32rm:
393 case X86::MOV64rm:
394 case X86::MOVSX16rm8:
395 case X86::MOVSX32rm16:
396 case X86::MOVSX32rm8:
397 case X86::MOVSX32rm8_NOREX:
398 case X86::MOVSX64rm16:
399 case X86::MOVSX64rm32:
400 case X86::MOVSX64rm8:
401 case X86::MOVZX16rm8:
402 case X86::MOVZX32rm16:
403 case X86::MOVZX32rm8:
404 case X86::MOVZX32rm8_NOREX:
405 case X86::MOVZX64rm16:
406 case X86::MOVZX64rm8:
407 return true;
408 }
409}
410
412 const MachineFunction *MF = MI.getParent()->getParent();
414
415 if (isFrameInstr(MI)) {
416 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
417 SPAdj -= getFrameAdjustment(MI);
418 if (!isFrameSetup(MI))
419 SPAdj = -SPAdj;
420 return SPAdj;
421 }
422
423 // To know whether a call adjusts the stack, we need information
424 // that is bound to the following ADJCALLSTACKUP pseudo.
425 // Look for the next ADJCALLSTACKUP that follows the call.
426 if (MI.isCall()) {
427 const MachineBasicBlock *MBB = MI.getParent();
429 for (auto E = MBB->end(); I != E; ++I) {
430 if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
431 break;
432 }
433
434 // If we could not find a frame destroy opcode, then it has already
435 // been simplified, so we don't care.
436 if (I->getOpcode() != getCallFrameDestroyOpcode())
437 return 0;
438
439 return -(I->getOperand(1).getImm());
440 }
441
442 // Currently handle only PUSHes we can reasonably expect to see
443 // in call sequences
444 switch (MI.getOpcode()) {
445 default:
446 return 0;
447 case X86::PUSH32r:
448 case X86::PUSH32rmm:
449 case X86::PUSH32rmr:
450 case X86::PUSH32i:
451 return 4;
452 case X86::PUSH64r:
453 case X86::PUSH64rmm:
454 case X86::PUSH64rmr:
455 case X86::PUSH64i32:
456 return 8;
457 }
458}
459
460/// Return true and the FrameIndex if the specified
461/// operand and follow operands form a reference to the stack frame.
462bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
463 int &FrameIndex) const {
464 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
465 MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
466 MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
467 MI.getOperand(Op + X86::AddrDisp).isImm() &&
468 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
469 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
470 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
471 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
472 return true;
473 }
474 return false;
475}
476
/// Classify \p Opcode as a plausible stack-slot reload. On a match, sets
/// \p MemBytes to the fixed number of bytes the instruction loads and returns
/// true; for any other opcode returns false and leaves \p MemBytes untouched.
/// Covers GPR/mask-register loads plus scalar and 128/256/512-bit vector
/// loads (SSE, AVX, AVX-512, and their _NOVLX/_EVEX aliases).
static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
  switch (Opcode) {
  default:
    return false;
  // 1-byte loads: GR8 and 8-bit mask-register moves.
  case X86::MOV8rm:
  case X86::KMOVBkm:
  case X86::KMOVBkm_EVEX:
    MemBytes = TypeSize::getFixed(1);
    return true;
  // 2-byte loads: GR16, 16-bit mask, and FP16 scalar moves.
  case X86::MOV16rm:
  case X86::KMOVWkm:
  case X86::KMOVWkm_EVEX:
  case X86::VMOVSHZrm:
  case X86::VMOVSHZrm_alt:
    MemBytes = TypeSize::getFixed(2);
    return true;
  // 4-byte loads: GR32, scalar float, and 32-bit mask moves.
  case X86::MOV32rm:
  case X86::MOVSSrm:
  case X86::MOVSSrm_alt:
  case X86::VMOVSSrm:
  case X86::VMOVSSrm_alt:
  case X86::VMOVSSZrm:
  case X86::VMOVSSZrm_alt:
  case X86::KMOVDkm:
  case X86::KMOVDkm_EVEX:
    MemBytes = TypeSize::getFixed(4);
    return true;
  // 8-byte loads: GR64, x87 f64, scalar double, MMX, and 64-bit mask moves.
  case X86::MOV64rm:
  case X86::LD_Fp64m:
  case X86::MOVSDrm:
  case X86::MOVSDrm_alt:
  case X86::VMOVSDrm:
  case X86::VMOVSDrm_alt:
  case X86::VMOVSDZrm:
  case X86::VMOVSDZrm_alt:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
  case X86::KMOVQkm:
  case X86::KMOVQkm_EVEX:
    MemBytes = TypeSize::getFixed(8);
    return true;
  // 16-byte loads: 128-bit vector moves (SSE/AVX/AVX-512 VL).
  case X86::MOVAPSrm:
  case X86::MOVUPSrm:
  case X86::MOVAPDrm:
  case X86::MOVUPDrm:
  case X86::MOVDQArm:
  case X86::MOVDQUrm:
  case X86::VMOVAPSrm:
  case X86::VMOVUPSrm:
  case X86::VMOVAPDrm:
  case X86::VMOVUPDrm:
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
  case X86::VMOVAPSZ128rm:
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm_NOVLX:
  case X86::VMOVUPSZ128rm_NOVLX:
  case X86::VMOVAPDZ128rm:
  case X86::VMOVUPDZ128rm:
  case X86::VMOVDQU8Z128rm:
  case X86::VMOVDQU16Z128rm:
  case X86::VMOVDQA32Z128rm:
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA64Z128rm:
  case X86::VMOVDQU64Z128rm:
    MemBytes = TypeSize::getFixed(16);
    return true;
  // 32-byte loads: 256-bit vector moves (AVX/AVX-512 VL).
  case X86::VMOVAPSYrm:
  case X86::VMOVUPSYrm:
  case X86::VMOVAPDYrm:
  case X86::VMOVUPDYrm:
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
  case X86::VMOVAPSZ256rm:
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm_NOVLX:
  case X86::VMOVUPSZ256rm_NOVLX:
  case X86::VMOVAPDZ256rm:
  case X86::VMOVUPDZ256rm:
  case X86::VMOVDQU8Z256rm:
  case X86::VMOVDQU16Z256rm:
  case X86::VMOVDQA32Z256rm:
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA64Z256rm:
  case X86::VMOVDQU64Z256rm:
    MemBytes = TypeSize::getFixed(32);
    return true;
  // 64-byte loads: 512-bit vector moves (AVX-512).
  case X86::VMOVAPSZrm:
  case X86::VMOVUPSZrm:
  case X86::VMOVAPDZrm:
  case X86::VMOVUPDZrm:
  case X86::VMOVDQU8Zrm:
  case X86::VMOVDQU16Zrm:
  case X86::VMOVDQA32Zrm:
  case X86::VMOVDQU32Zrm:
  case X86::VMOVDQA64Zrm:
  case X86::VMOVDQU64Zrm:
    MemBytes = TypeSize::getFixed(64);
    return true;
  }
}
578
579static bool isFrameStoreOpcode(int Opcode, TypeSize &MemBytes) {
580 switch (Opcode) {
581 default:
582 return false;
583 case X86::MOV8mr:
584 case X86::KMOVBmk:
585 case X86::KMOVBmk_EVEX:
586 MemBytes = TypeSize::getFixed(1);
587 return true;
588 case X86::MOV16mr:
589 case X86::KMOVWmk:
590 case X86::KMOVWmk_EVEX:
591 case X86::VMOVSHZmr:
592 MemBytes = TypeSize::getFixed(2);
593 return true;
594 case X86::MOV32mr:
595 case X86::MOVSSmr:
596 case X86::VMOVSSmr:
597 case X86::VMOVSSZmr:
598 case X86::KMOVDmk:
599 case X86::KMOVDmk_EVEX:
600 MemBytes = TypeSize::getFixed(4);
601 return true;
602 case X86::MOV64mr:
603 case X86::ST_FpP64m:
604 case X86::MOVSDmr:
605 case X86::VMOVSDmr:
606 case X86::VMOVSDZmr:
607 case X86::MMX_MOVD64mr:
608 case X86::MMX_MOVQ64mr:
609 case X86::MMX_MOVNTQmr:
610 case X86::KMOVQmk:
611 case X86::KMOVQmk_EVEX:
612 MemBytes = TypeSize::getFixed(8);
613 return true;
614 case X86::MOVAPSmr:
615 case X86::MOVUPSmr:
616 case X86::MOVAPDmr:
617 case X86::MOVUPDmr:
618 case X86::MOVDQAmr:
619 case X86::MOVDQUmr:
620 case X86::VMOVAPSmr:
621 case X86::VMOVUPSmr:
622 case X86::VMOVAPDmr:
623 case X86::VMOVUPDmr:
624 case X86::VMOVDQAmr:
625 case X86::VMOVDQUmr:
626 case X86::VMOVUPSZ128mr:
627 case X86::VMOVAPSZ128mr:
628 case X86::VMOVUPSZ128mr_NOVLX:
629 case X86::VMOVAPSZ128mr_NOVLX:
630 case X86::VMOVUPDZ128mr:
631 case X86::VMOVAPDZ128mr:
632 case X86::VMOVDQA32Z128mr:
633 case X86::VMOVDQU32Z128mr:
634 case X86::VMOVDQA64Z128mr:
635 case X86::VMOVDQU64Z128mr:
636 case X86::VMOVDQU8Z128mr:
637 case X86::VMOVDQU16Z128mr:
638 MemBytes = TypeSize::getFixed(16);
639 return true;
640 case X86::VMOVUPSYmr:
641 case X86::VMOVAPSYmr:
642 case X86::VMOVUPDYmr:
643 case X86::VMOVAPDYmr:
644 case X86::VMOVDQUYmr:
645 case X86::VMOVDQAYmr:
646 case X86::VMOVUPSZ256mr:
647 case X86::VMOVAPSZ256mr:
648 case X86::VMOVUPSZ256mr_NOVLX:
649 case X86::VMOVAPSZ256mr_NOVLX:
650 case X86::VMOVUPDZ256mr:
651 case X86::VMOVAPDZ256mr:
652 case X86::VMOVDQU8Z256mr:
653 case X86::VMOVDQU16Z256mr:
654 case X86::VMOVDQA32Z256mr:
655 case X86::VMOVDQU32Z256mr:
656 case X86::VMOVDQA64Z256mr:
657 case X86::VMOVDQU64Z256mr:
658 MemBytes = TypeSize::getFixed(32);
659 return true;
660 case X86::VMOVUPSZmr:
661 case X86::VMOVAPSZmr:
662 case X86::VMOVUPDZmr:
663 case X86::VMOVAPDZmr:
664 case X86::VMOVDQU8Zmr:
665 case X86::VMOVDQU16Zmr:
666 case X86::VMOVDQA32Zmr:
667 case X86::VMOVDQU32Zmr:
668 case X86::VMOVDQA64Zmr:
669 case X86::VMOVDQU64Zmr:
670 MemBytes = TypeSize::getFixed(64);
671 return true;
672 }
673 return false;
674}
675
677 int &FrameIndex) const {
678 TypeSize Dummy = TypeSize::getZero();
679 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
680}
681
683 int &FrameIndex,
684 TypeSize &MemBytes) const {
685 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
686 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
687 return MI.getOperand(0).getReg();
688 return Register();
689}
690
692 int &FrameIndex) const {
693 TypeSize Dummy = TypeSize::getZero();
694 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
695 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
696 return Reg;
697 // Check for post-frame index elimination operations
699 if (hasLoadFromStackSlot(MI, Accesses)) {
700 FrameIndex =
701 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
702 ->getFrameIndex();
703 return MI.getOperand(0).getReg();
704 }
705 }
706 return Register();
707}
708
710 int &FrameIndex) const {
711 TypeSize Dummy = TypeSize::getZero();
712 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
713}
714
716 int &FrameIndex,
717 TypeSize &MemBytes) const {
718 if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
719 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
720 isFrameOperand(MI, 0, FrameIndex))
721 return MI.getOperand(X86::AddrNumOperands).getReg();
722 return Register();
723}
724
726 int &FrameIndex) const {
727 TypeSize Dummy = TypeSize::getZero();
728 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
729 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
730 return Reg;
731 // Check for post-frame index elimination operations
733 if (hasStoreToStackSlot(MI, Accesses)) {
734 FrameIndex =
735 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
736 ->getFrameIndex();
737 return MI.getOperand(X86::AddrNumOperands).getReg();
738 }
739 }
740 return Register();
741}
742
743/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
744static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
745 // Don't waste compile time scanning use-def chains of physregs.
746 if (!BaseReg.isVirtual())
747 return false;
748 bool isPICBase = false;
749 for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
750 if (DefMI.getOpcode() != X86::MOVPC32r)
751 return false;
752 assert(!isPICBase && "More than one PIC base?");
753 isPICBase = true;
754 }
755 return isPICBase;
756}
757
759 const MachineInstr &MI) const {
760 switch (MI.getOpcode()) {
761 default:
762 // This function should only be called for opcodes with the ReMaterializable
763 // flag set.
764 llvm_unreachable("Unknown rematerializable operation!");
765 break;
766 case X86::IMPLICIT_DEF:
767 // Defer to generic logic.
768 break;
769 case X86::LOAD_STACK_GUARD:
770 case X86::LD_Fp032:
771 case X86::LD_Fp064:
772 case X86::LD_Fp080:
773 case X86::LD_Fp132:
774 case X86::LD_Fp164:
775 case X86::LD_Fp180:
776 case X86::AVX1_SETALLONES:
777 case X86::AVX2_SETALLONES:
778 case X86::AVX512_128_SET0:
779 case X86::AVX512_256_SET0:
780 case X86::AVX512_512_SET0:
781 case X86::AVX512_128_SETALLONES:
782 case X86::AVX512_256_SETALLONES:
783 case X86::AVX512_512_SETALLONES:
784 case X86::AVX512_FsFLD0SD:
785 case X86::AVX512_FsFLD0SH:
786 case X86::AVX512_FsFLD0SS:
787 case X86::AVX512_FsFLD0F128:
788 case X86::AVX_SET0:
789 case X86::FsFLD0SD:
790 case X86::FsFLD0SS:
791 case X86::FsFLD0SH:
792 case X86::FsFLD0F128:
793 case X86::KSET0B:
794 case X86::KSET0D:
795 case X86::KSET0Q:
796 case X86::KSET0W:
797 case X86::KSET1B:
798 case X86::KSET1D:
799 case X86::KSET1Q:
800 case X86::KSET1W:
801 case X86::MMX_SET0:
802 case X86::MOV32ImmSExti8:
803 case X86::MOV32r0:
804 case X86::MOV32r1:
805 case X86::MOV32r_1:
806 case X86::MOV32ri64:
807 case X86::MOV64ImmSExti8:
808 case X86::V_SET0:
809 case X86::V_SETALLONES:
810 case X86::MOV16ri:
811 case X86::MOV32ri:
812 case X86::MOV64ri:
813 case X86::MOV64ri32:
814 case X86::MOV8ri:
815 case X86::PTILEZEROV:
816 return true;
817
818 case X86::MOV8rm:
819 case X86::MOV8rm_NOREX:
820 case X86::MOV16rm:
821 case X86::MOV32rm:
822 case X86::MOV64rm:
823 case X86::MOVSSrm:
824 case X86::MOVSSrm_alt:
825 case X86::MOVSDrm:
826 case X86::MOVSDrm_alt:
827 case X86::MOVAPSrm:
828 case X86::MOVUPSrm:
829 case X86::MOVAPDrm:
830 case X86::MOVUPDrm:
831 case X86::MOVDQArm:
832 case X86::MOVDQUrm:
833 case X86::VMOVSSrm:
834 case X86::VMOVSSrm_alt:
835 case X86::VMOVSDrm:
836 case X86::VMOVSDrm_alt:
837 case X86::VMOVAPSrm:
838 case X86::VMOVUPSrm:
839 case X86::VMOVAPDrm:
840 case X86::VMOVUPDrm:
841 case X86::VMOVDQArm:
842 case X86::VMOVDQUrm:
843 case X86::VMOVAPSYrm:
844 case X86::VMOVUPSYrm:
845 case X86::VMOVAPDYrm:
846 case X86::VMOVUPDYrm:
847 case X86::VMOVDQAYrm:
848 case X86::VMOVDQUYrm:
849 case X86::MMX_MOVD64rm:
850 case X86::MMX_MOVQ64rm:
851 case X86::VBROADCASTSSrm:
852 case X86::VBROADCASTSSYrm:
853 case X86::VBROADCASTSDYrm:
854 // AVX-512
855 case X86::VPBROADCASTBZ128rm:
856 case X86::VPBROADCASTBZ256rm:
857 case X86::VPBROADCASTBZrm:
858 case X86::VBROADCASTF32X2Z256rm:
859 case X86::VBROADCASTF32X2Zrm:
860 case X86::VBROADCASTI32X2Z128rm:
861 case X86::VBROADCASTI32X2Z256rm:
862 case X86::VBROADCASTI32X2Zrm:
863 case X86::VPBROADCASTWZ128rm:
864 case X86::VPBROADCASTWZ256rm:
865 case X86::VPBROADCASTWZrm:
866 case X86::VPBROADCASTDZ128rm:
867 case X86::VPBROADCASTDZ256rm:
868 case X86::VPBROADCASTDZrm:
869 case X86::VBROADCASTSSZ128rm:
870 case X86::VBROADCASTSSZ256rm:
871 case X86::VBROADCASTSSZrm:
872 case X86::VPBROADCASTQZ128rm:
873 case X86::VPBROADCASTQZ256rm:
874 case X86::VPBROADCASTQZrm:
875 case X86::VBROADCASTSDZ256rm:
876 case X86::VBROADCASTSDZrm:
877 case X86::VMOVSSZrm:
878 case X86::VMOVSSZrm_alt:
879 case X86::VMOVSDZrm:
880 case X86::VMOVSDZrm_alt:
881 case X86::VMOVSHZrm:
882 case X86::VMOVSHZrm_alt:
883 case X86::VMOVAPDZ128rm:
884 case X86::VMOVAPDZ256rm:
885 case X86::VMOVAPDZrm:
886 case X86::VMOVAPSZ128rm:
887 case X86::VMOVAPSZ256rm:
888 case X86::VMOVAPSZ128rm_NOVLX:
889 case X86::VMOVAPSZ256rm_NOVLX:
890 case X86::VMOVAPSZrm:
891 case X86::VMOVDQA32Z128rm:
892 case X86::VMOVDQA32Z256rm:
893 case X86::VMOVDQA32Zrm:
894 case X86::VMOVDQA64Z128rm:
895 case X86::VMOVDQA64Z256rm:
896 case X86::VMOVDQA64Zrm:
897 case X86::VMOVDQU16Z128rm:
898 case X86::VMOVDQU16Z256rm:
899 case X86::VMOVDQU16Zrm:
900 case X86::VMOVDQU32Z128rm:
901 case X86::VMOVDQU32Z256rm:
902 case X86::VMOVDQU32Zrm:
903 case X86::VMOVDQU64Z128rm:
904 case X86::VMOVDQU64Z256rm:
905 case X86::VMOVDQU64Zrm:
906 case X86::VMOVDQU8Z128rm:
907 case X86::VMOVDQU8Z256rm:
908 case X86::VMOVDQU8Zrm:
909 case X86::VMOVUPDZ128rm:
910 case X86::VMOVUPDZ256rm:
911 case X86::VMOVUPDZrm:
912 case X86::VMOVUPSZ128rm:
913 case X86::VMOVUPSZ256rm:
914 case X86::VMOVUPSZ128rm_NOVLX:
915 case X86::VMOVUPSZ256rm_NOVLX:
916 case X86::VMOVUPSZrm: {
917 // Loads from constant pools are trivially rematerializable.
918 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
919 MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
920 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
921 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
922 MI.isDereferenceableInvariantLoad()) {
923 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
924 if (BaseReg == 0 || BaseReg == X86::RIP)
925 return true;
926 // Allow re-materialization of PIC load.
927 if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
928 const MachineFunction &MF = *MI.getParent()->getParent();
929 const MachineRegisterInfo &MRI = MF.getRegInfo();
930 if (regIsPICBase(BaseReg, MRI))
931 return true;
932 }
933 }
934 break;
935 }
936
937 case X86::LEA32r:
938 case X86::LEA64r: {
939 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
940 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
941 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
942 !MI.getOperand(1 + X86::AddrDisp).isReg()) {
943 // lea fi#, lea GV, etc. are all rematerializable.
944 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
945 return true;
946 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
947 if (BaseReg == 0)
948 return true;
949 // Allow re-materialization of lea PICBase + x.
950 const MachineFunction &MF = *MI.getParent()->getParent();
951 const MachineRegisterInfo &MRI = MF.getRegInfo();
952 if (regIsPICBase(BaseReg, MRI))
953 return true;
954 }
955 break;
956 }
957 }
959}
960
963 Register DestReg, unsigned SubIdx,
964 const MachineInstr &Orig,
965 LaneBitmask UsedLanes) const {
966 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
967 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
969 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
970 // effects.
971 int Value;
972 switch (Orig.getOpcode()) {
973 case X86::MOV32r0:
974 Value = 0;
975 break;
976 case X86::MOV32r1:
977 Value = 1;
978 break;
979 case X86::MOV32r_1:
980 Value = -1;
981 break;
982 default:
983 llvm_unreachable("Unexpected instruction!");
984 }
985
986 const DebugLoc &DL = Orig.getDebugLoc();
987 BuildMI(MBB, I, DL, get(X86::MOV32ri))
988 .add(Orig.getOperand(0))
989 .addImm(Value);
990 } else {
991 MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
992 MBB.insert(I, MI);
993 }
994
995 MachineInstr &NewMI = *std::prev(I);
996 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
997}
998
999/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
1001 for (const MachineOperand &MO : MI.operands()) {
1002 if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
1003 !MO.isDead()) {
1004 return true;
1005 }
1006 }
1007 return false;
1008}
1009
1010/// Check whether the shift count for a machine operand is non-zero.
1011inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1012 unsigned ShiftAmtOperandIdx) {
1013 // The shift count is six bits with the REX.W prefix and five bits without.
1014 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1015 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1016 return Imm & ShiftCountMask;
1017}
1018
/// Return true if a left shift by \p ShAmt can be folded into a LEA.
/// LEA encodes its scale factor in the 2-bit SIB.scale field, so only
/// scales of 2, 4, and 8 — i.e. shift amounts 1 through 3 — qualify
/// (a shift of 0 is not a useful LEA candidate).
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
  return ShAmt >= 1 && ShAmt <= 3;
}
1029
// Decide whether a TEST that re-checks the result of an AND is redundant.
//
// Two patterns are recognized (enforced by the opcode checks just below):
//   SUBREG_TO_REG + TEST64rr -- a 32-bit AND result zero-extended to 64 bits
//   COPY          + TEST16rr -- an AND32ri/AND64ri32 result whose immediate
//                               fits in 16 bits
// If no instruction between the AND and the compare clobbers EFLAGS, the
// AND's own flag update can stand in for the TEST: *AndInstr is set to the
// AND, NoSignFlag is set to true to "poison" SF (see the long comment near
// the end), and ClearsOverflowFlag reports that AND leaves OF cleared.
// Returns true iff the caller may treat the TEST as removable.
//
// NOTE(review): the first signature line (original line 1031, carrying the
// function name and the CmpInstr/CmpValDefInstr parameters) is missing from
// this listing.
1030 static bool
1032 const MachineRegisterInfo *MRI, MachineInstr **AndInstr,
1033 const TargetRegisterInfo *TRI, const X86Subtarget &ST,
1034 bool &NoSignFlag, bool &ClearsOverflowFlag) {
1035 if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
1036 CmpInstr.getOpcode() == X86::TEST64rr) &&
1037 !(CmpValDefInstr.getOpcode() == X86::COPY &&
1038 CmpInstr.getOpcode() == X86::TEST16rr))
1039 return false;
1040
1041 // CmpInstr is a TEST16rr/TEST64rr instruction, and
1042 // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two
1043 // registers are identical.
1044 assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
1045 "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
1046 "`X86InstrInfo::analyzeCompare` requires two reg operands are the"
1047 "same.");
1048
1049 // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
1050 // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
1051 // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
1052 // redundant.
1053 assert(
1054 (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
1055 "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
1056 "is a user of COPY sub16bit.");
// Walk from the COPY/SUBREG_TO_REG back to the instruction that actually
// produced the tested value; it is validated as an AND further below.
1057 MachineInstr *VregDefInstr = nullptr;
1058 if (CmpInstr.getOpcode() == X86::TEST16rr) {
1059 if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
1060 return false;
1061 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
1062 if (!VregDefInstr)
1063 return false;
1064 // We can only remove test when AND32ri or AND64ri32 whose imm can fit 16bit
1065 // size, others 32/64 bit ops would test higher bits which test16rr don't
1066 // want to.
1067 if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
1068 VregDefInstr->getOpcode() == X86::AND64ri32) &&
1069 isUInt<16>(VregDefInstr->getOperand(2).getImm())))
1070 return false;
1071 }
1072
1073 if (CmpInstr.getOpcode() == X86::TEST64rr) {
1074 // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1075 // sub_32bit or sub_xmm.
1076 if (CmpValDefInstr.getOperand(2).getImm() != X86::sub_32bit)
1077 return false;
1078
1079 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
1080 }
1081
1082 assert(VregDefInstr && "Must have a definition (SSA)");
1083
1084 // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1085 // to simplify the subsequent analysis.
1086 //
1087 // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1088 // `CmpValDefInstr.getParent()`, this could be handled.
1089 if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1090 return false;
1091
// When the subtarget has NF (no-flags) instruction forms, an AND opcode
// alone does not guarantee a flag write, so additionally require that this
// AND actually defines EFLAGS.
1092 if (X86::isAND(VregDefInstr->getOpcode()) &&
1093 (!ST.hasNF() || VregDefInstr->modifiesRegister(X86::EFLAGS, TRI))) {
1094 // Get a sequence of instructions like
1095 // %reg = and* ... // Set EFLAGS
1096 // ... // EFLAGS not changed
1097 // %extended_reg = subreg_to_reg %reg, %subreg.sub_32bit
1098 // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1099 // or
1100 // %reg = and32* ...
1101 // ... // EFLAGS not changed.
1102 // %src_reg = copy %reg.sub_16bit:gr32
1103 // test16rr %src_reg, %src_reg, implicit-def $eflags
1104 //
1105 // If subsequent readers use a subset of bits that don't change
1106 // after `and*` instructions, it's likely that the test64rr could
1107 // be optimized away.
1108 for (const MachineInstr &Instr :
1109 make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1110 MachineBasicBlock::iterator(CmpValDefInstr))) {
1111 // There are instructions between 'VregDefInstr' and
1112 // 'CmpValDefInstr' that modifies EFLAGS.
1113 if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1114 return false;
1115 }
1116
1117 *AndInstr = VregDefInstr;
1118
1119 // AND instruction will essentially update SF and clear OF, so
1120 // NoSignFlag should be false in the sense that SF is modified by `AND`.
1121 //
1122 // However, the implementation artifically sets `NoSignFlag` to true
1123 // to poison the SF bit; that is to say, if SF is looked at later, the
1124 // optimization (to erase TEST64rr) will be disabled.
1125 //
1126 // The reason to poison SF bit is that SF bit value could be different
1127 // in the `AND` and `TEST` operation; signed bit is not known for `AND`,
1128 // and is known to be 0 as a result of `TEST64rr`.
1129 //
1130 // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1131 // the AND instruction and using the static information to guide peephole
1132 // optimization if possible. For example, it's possible to fold a
1133 // conditional move into a copy if the relevant EFLAG bits could be deduced
1134 // from an immediate operand of and operation.
1135 //
1136 NoSignFlag = true;
1137 // ClearsOverflowFlag is true for AND operation (no surprise).
1138 ClearsOverflowFlag = true;
1139 return true;
1140 }
1141 return false;
1142}
1143
// X86InstrInfo::classifyLEAReg -- choose a source register that the given LEA
// opcode can legally use. Outputs: NewSrc/NewSrcSubReg (register to feed the
// LEA), isKill (whether the source dies here), and ImplicitOp (an implicit
// operand to append when a physical 32-bit register must be presented as its
// 64-bit super-register for LEA64_32r). Returns false when the source cannot
// be constrained to a LEA-compatible register class.
//
// NOTE(review): the signature's first line (original line 1144, carrying the
// function name and the MI/Src parameters) is missing from this listing.
1145 unsigned Opc, bool AllowSP, Register &NewSrc,
1146 unsigned &NewSrcSubReg, bool &isKill,
1147 MachineOperand &ImplicitOp, LiveVariables *LV,
1148 LiveIntervals *LIS) const {
1149 MachineFunction &MF = *MI.getParent()->getParent();
// Pick the register class: 64- vs 32-bit depending on the LEA opcode, and
// the _NOSP variant when the stack pointer is not an acceptable source.
1150 const TargetRegisterClass *RC;
1151 if (AllowSP) {
1152 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1153 } else {
1154 RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1155 }
1156 Register SrcReg = Src.getReg();
1157 unsigned SubReg = Src.getSubReg();
1158 isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr);
1159
1160 NewSrcSubReg = X86::NoSubRegister;
1161
1162 // For both LEA64 and LEA32 the register already has essentially the right
1163 // type (32-bit or 64-bit) we may just need to forbid SP.
1164 if (Opc != X86::LEA64_32r) {
1165 NewSrc = SrcReg;
1166 NewSrcSubReg = SubReg;
1167 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1168
1169 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1170 return false;
1171
1172 return true;
1173 }
1174
1175 // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1176 // another we need to add 64-bit registers to the final MI.
1177 if (SrcReg.isPhysical()) {
// Physical register: use its 64-bit super-register directly and keep the
// original 32-bit register as an implicit operand.
1178 ImplicitOp = Src;
1179 ImplicitOp.setImplicit();
1180
1181 NewSrc = getX86SubSuperRegister(SrcReg, 64);
1182 assert(!SubReg && "no superregister for source");
1183 assert(NewSrc.isValid() && "Invalid Operand");
1184 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1185 } else {
1186 // Virtual register of the wrong class, we have to create a temporary 64-bit
1187 // vreg to feed into the LEA.
1188 NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1189 NewSrcSubReg = X86::NoSubRegister;
1190 MachineInstr *Copy =
1191 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1192 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1193 .addReg(SrcReg, getKillRegState(isKill), SubReg);
1194
1195 // Which is obviously going to be dead after we're done with it.
1196 isKill = true;
1197
1198 if (LV)
1199 LV->replaceKillInstruction(SrcReg, MI, *Copy);
1200
// Keep LiveIntervals consistent: the inserted COPY becomes the new end of
// SrcReg's live segment that previously ended at MI.
1201 if (LIS) {
1202 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1203 SlotIndex Idx = LIS->getInstructionIndex(MI);
1204 LiveInterval &LI = LIS->getInterval(SrcReg);
// NOTE(review): the declaration of `S` (original line 1205, presumably the
// segment of LI containing Idx) is missing from this listing.
1206 if (S->end.getBaseIndex() == Idx)
1207 S->end = CopyIdx.getRegSlot();
1208 }
1209 }
1210
1211 // We've set all the parameters without issue.
1212 return true;
1213}
1214
// Lower an 8/16-bit two-address op (SHL/INC/DEC/ADD, register or immediate
// forms; the exact set is the switch below) into a LEA: widen the inputs into
// fresh 64-bit temporaries via IMPLICIT_DEF + sub-register COPY, perform the
// arithmetic with LEA64_32r, then COPY the low 8/16 bits back to the original
// destination. Bails out (returns nullptr) on 32-bit targets. Also repairs
// LiveVariables and LiveIntervals for the inserted/replaced instructions.
1215MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1217 LiveVariables *LV,
1218 LiveIntervals *LIS,
1219 bool Is8BitOp) const {
1220 // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1221 MachineBasicBlock &MBB = *MI.getParent();
1222 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1223 assert((Is8BitOp ||
1224 RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1225 *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1226 "Unexpected type for LEA transform");
1227
1228 // TODO: For a 32-bit target, we need to adjust the LEA variables with
1229 // something like this:
1230 // Opcode = X86::LEA32r;
1231 // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1232 // OutRegLEA =
1233 // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1234 // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1235 if (!Subtarget.is64Bit())
1236 return nullptr;
1237
1238 unsigned Opcode = X86::LEA64_32r;
1239 Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1240 Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1241 Register InRegLEA2;
1242
1243 // Build and insert into an implicit UNDEF value. This is OK because
1244 // we will be shifting and then extracting the lower 8/16-bits.
1245 // This has the potential to cause partial register stall. e.g.
1246 // movw (%rbp,%rcx,2), %dx
1247 // leal -65(%rdx), %esi
1248 // But testing has shown this *does* help performance in 64-bit mode (at
1249 // least on modern x86 machines).
1250 MachineBasicBlock::iterator MBBI = MI.getIterator();
1251 Register Dest = MI.getOperand(0).getReg();
1252 Register Src = MI.getOperand(1).getReg();
1253 unsigned SrcSubReg = MI.getOperand(1).getSubReg();
1254 Register Src2;
1255 unsigned Src2SubReg;
1256 bool IsDead = MI.getOperand(0).isDead();
1257 bool IsKill = MI.getOperand(1).isKill();
1258 unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1259 assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1260 MachineInstr *ImpDef =
1261 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1262 MachineInstr *InsMI =
1263 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1264 .addReg(InRegLEA, RegState::Define, SubReg)
1265 .addReg(Src, getKillRegState(IsKill), SrcSubReg);
1266 MachineInstr *ImpDef2 = nullptr;
1267 MachineInstr *InsMI2 = nullptr;
1268
// NOTE(review): the 'MachineInstrBuilder MIB =' declaration (original line
// 1269) is missing from this listing; the next line is its initializer.
1270 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
// CASE_NF expands to both the flag-setting and the _NF (no-flags) variant of
// an opcode; it is #undef'd after the convertToThreeAddress switch below.
1271#define CASE_NF(OP) \
1272 case X86::OP: \
1273 case X86::OP##_NF:
1274 switch (MIOpc) {
1275 default:
1276 llvm_unreachable("Unreachable!");
1277 CASE_NF(SHL8ri)
1278 CASE_NF(SHL16ri) {
// Shift-by-constant becomes a scaled index: lea (, InRegLEA, 1<<ShAmt).
1279 unsigned ShAmt = MI.getOperand(2).getImm();
1280 MIB.addReg(0)
1281 .addImm(1LL << ShAmt)
1282 .addReg(InRegLEA, RegState::Kill)
1283 .addImm(0)
1284 .addReg(0);
1285 break;
1286 }
1287 CASE_NF(INC8r)
1288 CASE_NF(INC16r)
1289 addRegOffset(MIB, InRegLEA, true, 1);
1290 break;
1291 CASE_NF(DEC8r)
1292 CASE_NF(DEC16r)
1293 addRegOffset(MIB, InRegLEA, true, -1);
1294 break;
1295 CASE_NF(ADD8ri)
1296 CASE_NF(ADD16ri)
1297 case X86::ADD8ri_DB:
1298 case X86::ADD16ri_DB:
1299 addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1300 break;
1301 CASE_NF(ADD8rr)
1302 CASE_NF(ADD16rr)
1303 case X86::ADD8rr_DB:
1304 case X86::ADD16rr_DB: {
1305 Src2 = MI.getOperand(2).getReg();
1306 Src2SubReg = MI.getOperand(2).getSubReg();
1307 bool IsKill2 = MI.getOperand(2).isKill();
1308 assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1309 if (Src == Src2) {
1310 // ADD8rr/ADD16rr killed %reg1028, %reg1028
1311 // just a single insert_subreg.
1312 addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA, false,
1313 X86::NoSubRegister);
1314 } else {
1315 if (Subtarget.is64Bit())
1316 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass)
1317 else
1318 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1319 // Build and insert into an implicit UNDEF value. This is OK because
1320 // we will be shifting and then extracting the lower 8/16-bits.
1321 ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1322 InRegLEA2);
1323 InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1324 .addReg(InRegLEA2, RegState::Define, SubReg)
1325 .addReg(Src2, getKillRegState(IsKill2), Src2SubReg);
1326 addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA2, true,
1327 X86::NoSubRegister);
1328 }
1329 if (LV && IsKill2 && InsMI2)
1330 LV->replaceKillInstruction(Src2, MI, *InsMI2);
1331 break;
1332 }
1333 }
1334
// Extract the low 8/16 bits of the LEA result back into Dest.
1335 MachineInstr *NewMI = MIB;
1336 MachineInstr *ExtMI =
1337 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
// NOTE(review): the .addReg of Dest (original line 1338) is missing from
// this listing.
1339 .addReg(OutRegLEA, RegState::Kill, SubReg);
1340
1341 if (LV) {
1342 // Update live variables.
1343 LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1344 if (InRegLEA2)
1345 LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
1346 LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1347 if (IsKill)
1348 LV->replaceKillInstruction(Src, MI, *InsMI);
1349 if (IsDead)
1350 LV->replaceKillInstruction(Dest, MI, *ExtMI);
1351 }
1352
1353 if (LIS) {
// Register every inserted instruction with the slot index maps, then patch
// the affected live ranges by hand.
1354 LIS->InsertMachineInstrInMaps(*ImpDef);
1355 SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1356 if (ImpDef2)
1357 LIS->InsertMachineInstrInMaps(*ImpDef2);
1358 SlotIndex Ins2Idx;
1359 if (InsMI2)
1360 Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1361 SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1362 SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
1363 LIS->getInterval(InRegLEA);
1364 LIS->getInterval(OutRegLEA);
1365 if (InRegLEA2)
1366 LIS->getInterval(InRegLEA2);
1367
1368 // Move the use of Src up to InsMI.
1369 LiveInterval &SrcLI = LIS->getInterval(Src);
1370 LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1371 if (SrcSeg->end == NewIdx.getRegSlot())
1372 SrcSeg->end = InsIdx.getRegSlot();
1373
1374 if (InsMI2) {
1375 // Move the use of Src2 up to InsMI2.
1376 LiveInterval &Src2LI = LIS->getInterval(Src2);
1377 LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1378 if (Src2Seg->end == NewIdx.getRegSlot())
1379 Src2Seg->end = Ins2Idx.getRegSlot();
1380 }
1381
1382 // Move the definition of Dest down to ExtMI.
1383 LiveInterval &DestLI = LIS->getInterval(Dest);
1384 LiveRange::Segment *DestSeg =
1385 DestLI.getSegmentContaining(NewIdx.getRegSlot());
1386 assert(DestSeg->start == NewIdx.getRegSlot() &&
1387 DestSeg->valno->def == NewIdx.getRegSlot());
1388 DestSeg->start = ExtIdx.getRegSlot();
1389 DestSeg->valno->def = ExtIdx.getRegSlot();
1390 }
1391
1392 return ExtMI;
1393}
1394
1395/// This method must be implemented by targets that
1396/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1397/// may be able to convert a two-address instruction into a true
1398/// three-address instruction on demand. This allows the X86 target (for
1399/// example) to convert ADD and SHL instructions into LEA instructions if they
1400/// would require register copies due to two-addressness.
1401///
1402/// This method returns a null pointer if the transformation cannot be
1403/// performed, otherwise it returns the new instruction.
1404///
// X86InstrInfo::convertToThreeAddress -- turn a two-address SHL/INC/DEC/ADD/
// SUB (including the _NF and _DB variants handled via CASE_NF) into an
// equivalent LEA, or rewrite a masked AVX-512 move (VMOV*rmk/rrk,
// V*BROADCAST*rmk) into the corresponding blend-with-mask instruction, so the
// destination no longer has to be tied to a source. Returns the inserted
// instruction, or nullptr when no conversion applies.
//
// NOTE(review): the signature's first line (original line 1405) is missing
// from this listing.
1406 LiveVariables *LV,
1407 LiveIntervals *LIS) const {
1408 // The following opcodes also sets the condition code register(s). Only
1409 // convert them to equivalent lea if the condition code register def's
1410 // are dead!
// NOTE(review): the guard condition (original line 1411) is missing from
// this listing; per the comment above it presumably rejects instructions
// whose condition-code defs are live.
1412 return nullptr;
1413
1414 MachineFunction &MF = *MI.getParent()->getParent();
1415 // All instructions input are two-addr instructions. Get the known operands.
1416 const MachineOperand &Dest = MI.getOperand(0);
1417 const MachineOperand &Src = MI.getOperand(1);
1418
1419 // Ideally, operations with undef should be folded before we get here, but we
1420 // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1421 // Without this, we have to forward undef state to new register operands to
1422 // avoid machine verifier errors.
1423 if (Src.isUndef())
1424 return nullptr;
1425 if (MI.getNumOperands() > 2)
1426 if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1427 return nullptr;
1428
1429 MachineInstr *NewMI = nullptr;
1430 Register SrcReg, SrcReg2;
1431 unsigned SrcSubReg, SrcSubReg2;
1432 bool Is64Bit = Subtarget.is64Bit();
1433
1434 bool Is8BitOp = false;
1435 unsigned NumRegOperands = 2;
1436 unsigned MIOpc = MI.getOpcode();
1437 switch (MIOpc) {
1438 default:
1439 llvm_unreachable("Unreachable!");
1440 CASE_NF(SHL64ri) {
1441 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1442 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1443 if (!isTruncatedShiftCountForLEA(ShAmt))
1444 return nullptr;
1445
1446 // LEA can't handle RSP.
1447 if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1448 Src.getReg(), &X86::GR64_NOSPRegClass))
1449 return nullptr;
1450
// Shift-by-constant becomes a scaled index: lea (, Src, 1<<ShAmt).
1451 NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1452 .add(Dest)
1453 .addReg(0)
1454 .addImm(1LL << ShAmt)
1455 .add(Src)
1456 .addImm(0)
1457 .addReg(0);
1458 break;
1459 }
1460 CASE_NF(SHL32ri) {
1461 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1462 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1463 if (!isTruncatedShiftCountForLEA(ShAmt))
1464 return nullptr;
1465
1466 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1467
1468 // LEA can't handle ESP.
1469 bool isKill;
1470 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1471 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1472 isKill, ImplicitOp, LV, LIS))
1473 return nullptr;
1474
// NOTE(review): the 'MachineInstrBuilder MIB =' declaration (original line
// 1475) is missing from this listing.
1476 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1477 .add(Dest)
1478 .addReg(0)
1479 .addImm(1LL << ShAmt)
1480 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
1481 .addImm(0)
1482 .addReg(0);
1483 if (ImplicitOp.getReg() != 0)
1484 MIB.add(ImplicitOp);
1485 NewMI = MIB;
1486
1487 // Add kills if classifyLEAReg created a new register.
1488 if (LV && SrcReg != Src.getReg())
1489 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1490 break;
1491 }
1492 CASE_NF(SHL8ri)
1493 Is8BitOp = true;
1494 [[fallthrough]];
1495 CASE_NF(SHL16ri) {
1496 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1497 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1498 if (!isTruncatedShiftCountForLEA(ShAmt))
1499 return nullptr;
// 8/16-bit ops need widened operands; delegate to the LEA-widening helper.
1500 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1501 }
1502 CASE_NF(INC64r)
1503 CASE_NF(INC32r) {
1504 assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1505 unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
1506 ? X86::LEA64r
1507 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1508 bool isKill;
1509 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1510 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1511 isKill, ImplicitOp, LV, LIS))
1512 return nullptr;
1513
1514 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1515 .add(Dest)
1516 .addReg(SrcReg, getKillRegState(isKill));
1517 if (ImplicitOp.getReg() != 0)
1518 MIB.add(ImplicitOp);
1519
1520 NewMI = addOffset(MIB, 1);
1521
1522 // Add kills if classifyLEAReg created a new register.
1523 if (LV && SrcReg != Src.getReg())
1524 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1525 break;
1526 }
1527 CASE_NF(DEC64r)
1528 CASE_NF(DEC32r) {
1529 assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1530 unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
1531 ? X86::LEA64r
1532 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1533
1534 bool isKill;
1535 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1536 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1537 isKill, ImplicitOp, LV, LIS))
1538 return nullptr;
1539
1540 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1541 .add(Dest)
1542 .addReg(SrcReg, getKillRegState(isKill));
1543 if (ImplicitOp.getReg() != 0)
1544 MIB.add(ImplicitOp);
1545
1546 NewMI = addOffset(MIB, -1);
1547
1548 // Add kills if classifyLEAReg created a new register.
1549 if (LV && SrcReg != Src.getReg())
1550 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1551 break;
1552 }
1553 CASE_NF(DEC8r)
1554 CASE_NF(INC8r)
1555 Is8BitOp = true;
1556 [[fallthrough]];
1557 CASE_NF(DEC16r)
1558 CASE_NF(INC16r)
1559 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1560 CASE_NF(ADD64rr)
1561 CASE_NF(ADD32rr)
1562 case X86::ADD64rr_DB:
1563 case X86::ADD32rr_DB: {
1564 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1565 unsigned Opc;
1566 if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_NF ||
1567 MIOpc == X86::ADD64rr_DB)
1568 Opc = X86::LEA64r;
1569 else
1570 Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1571
1572 const MachineOperand &Src2 = MI.getOperand(2);
1573 bool isKill2;
1574 MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1575 if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, SrcSubReg2,
1576 isKill2, ImplicitOp2, LV, LIS))
1577 return nullptr;
1578
1579 bool isKill;
1580 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1581 if (Src.getReg() == Src2.getReg()) {
1582 // Don't call classify LEAReg a second time on the same register, in case
1583 // the first call inserted a COPY from Src2 and marked it as killed.
1584 isKill = isKill2;
1585 SrcReg = SrcReg2;
1586 SrcSubReg = SrcSubReg2;
1587 } else {
1588 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1589 isKill, ImplicitOp, LV, LIS))
1590 return nullptr;
1591 }
1592
1593 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1594 if (ImplicitOp.getReg() != 0)
1595 MIB.add(ImplicitOp);
1596 if (ImplicitOp2.getReg() != 0)
1597 MIB.add(ImplicitOp2);
1598
1599 NewMI =
1600 addRegReg(MIB, SrcReg, isKill, SrcSubReg, SrcReg2, isKill2, SrcSubReg2);
1601
1602 // Add kills if classifyLEAReg created a new register.
1603 if (LV) {
1604 if (SrcReg2 != Src2.getReg())
1605 LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1606 if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1607 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1608 }
1609 NumRegOperands = 3;
1610 break;
1611 }
1612 CASE_NF(ADD8rr)
1613 case X86::ADD8rr_DB:
1614 Is8BitOp = true;
1615 [[fallthrough]];
1616 CASE_NF(ADD16rr)
1617 case X86::ADD16rr_DB:
1618 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1619 CASE_NF(ADD64ri32)
1620 case X86::ADD64ri32_DB:
1621 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1622 NewMI = addOffset(
1623 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1624 MI.getOperand(2));
1625 break;
1626 CASE_NF(ADD32ri)
1627 case X86::ADD32ri_DB: {
1628 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1629 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1630
1631 bool isKill;
1632 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1633 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1634 isKill, ImplicitOp, LV, LIS))
1635 return nullptr;
1636
// NOTE(review): the 'MachineInstrBuilder MIB =' declaration (original line
// 1637) is missing from this listing.
1638 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1639 .add(Dest)
1640 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
1641 if (ImplicitOp.getReg() != 0)
1642 MIB.add(ImplicitOp);
1643
1644 NewMI = addOffset(MIB, MI.getOperand(2));
1645
1646 // Add kills if classifyLEAReg created a new register.
1647 if (LV && SrcReg != Src.getReg())
1648 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1649 break;
1650 }
1651 CASE_NF(ADD8ri)
1652 case X86::ADD8ri_DB:
1653 Is8BitOp = true;
1654 [[fallthrough]];
1655 CASE_NF(ADD16ri)
1656 case X86::ADD16ri_DB:
1657 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1658 CASE_NF(SUB8ri)
1659 CASE_NF(SUB16ri)
1660 /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1661 return nullptr;
// SUB with immediate is handled as an ADD of the negated immediate, so the
// negation must still fit in the LEA's 32-bit signed displacement.
1662 CASE_NF(SUB32ri) {
1663 if (!MI.getOperand(2).isImm())
1664 return nullptr;
1665 int64_t Imm = MI.getOperand(2).getImm();
1666 if (!isInt<32>(-Imm))
1667 return nullptr;
1668
1669 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1670 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1671
1672 bool isKill;
1673 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1674 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1675 isKill, ImplicitOp, LV, LIS))
1676 return nullptr;
1677
// NOTE(review): the 'MachineInstrBuilder MIB =' declaration (original line
// 1678) is missing from this listing.
1679 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1680 .add(Dest)
1681 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
1682 if (ImplicitOp.getReg() != 0)
1683 MIB.add(ImplicitOp);
1684
1685 NewMI = addOffset(MIB, -Imm);
1686
1687 // Add kills if classifyLEAReg created a new register.
1688 if (LV && SrcReg != Src.getReg())
1689 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1690 break;
1691 }
1692
1693 CASE_NF(SUB64ri32) {
1694 if (!MI.getOperand(2).isImm())
1695 return nullptr;
1696 int64_t Imm = MI.getOperand(2).getImm();
1697 if (!isInt<32>(-Imm))
1698 return nullptr;
1699
1700 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1701
// NOTE(review): the 'MachineInstrBuilder MIB =' declaration (original line
// 1702) is missing from this listing.
1703 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
1704 NewMI = addOffset(MIB, -Imm);
1705 break;
1706 }
1707
// Masked AVX-512 loads/broadcasts with a passthru ("rmk" forms): rewrite as
// the matching blend-with-mask, turning the tied passthru into an ordinary
// source operand. The inner switch below maps each move to its blend opcode.
1708 case X86::VMOVDQU8Z128rmk:
1709 case X86::VMOVDQU8Z256rmk:
1710 case X86::VMOVDQU8Zrmk:
1711 case X86::VMOVDQU16Z128rmk:
1712 case X86::VMOVDQU16Z256rmk:
1713 case X86::VMOVDQU16Zrmk:
1714 case X86::VMOVDQU32Z128rmk:
1715 case X86::VMOVDQA32Z128rmk:
1716 case X86::VMOVDQU32Z256rmk:
1717 case X86::VMOVDQA32Z256rmk:
1718 case X86::VMOVDQU32Zrmk:
1719 case X86::VMOVDQA32Zrmk:
1720 case X86::VMOVDQU64Z128rmk:
1721 case X86::VMOVDQA64Z128rmk:
1722 case X86::VMOVDQU64Z256rmk:
1723 case X86::VMOVDQA64Z256rmk:
1724 case X86::VMOVDQU64Zrmk:
1725 case X86::VMOVDQA64Zrmk:
1726 case X86::VMOVUPDZ128rmk:
1727 case X86::VMOVAPDZ128rmk:
1728 case X86::VMOVUPDZ256rmk:
1729 case X86::VMOVAPDZ256rmk:
1730 case X86::VMOVUPDZrmk:
1731 case X86::VMOVAPDZrmk:
1732 case X86::VMOVUPSZ128rmk:
1733 case X86::VMOVAPSZ128rmk:
1734 case X86::VMOVUPSZ256rmk:
1735 case X86::VMOVAPSZ256rmk:
1736 case X86::VMOVUPSZrmk:
1737 case X86::VMOVAPSZrmk:
1738 case X86::VBROADCASTSDZ256rmk:
1739 case X86::VBROADCASTSDZrmk:
1740 case X86::VBROADCASTSSZ128rmk:
1741 case X86::VBROADCASTSSZ256rmk:
1742 case X86::VBROADCASTSSZrmk:
1743 case X86::VPBROADCASTDZ128rmk:
1744 case X86::VPBROADCASTDZ256rmk:
1745 case X86::VPBROADCASTDZrmk:
1746 case X86::VPBROADCASTQZ128rmk:
1747 case X86::VPBROADCASTQZ256rmk:
1748 case X86::VPBROADCASTQZrmk: {
1749 unsigned Opc;
1750 switch (MIOpc) {
1751 default:
1752 llvm_unreachable("Unreachable!");
1753 case X86::VMOVDQU8Z128rmk:
1754 Opc = X86::VPBLENDMBZ128rmk;
1755 break;
1756 case X86::VMOVDQU8Z256rmk:
1757 Opc = X86::VPBLENDMBZ256rmk;
1758 break;
1759 case X86::VMOVDQU8Zrmk:
1760 Opc = X86::VPBLENDMBZrmk;
1761 break;
1762 case X86::VMOVDQU16Z128rmk:
1763 Opc = X86::VPBLENDMWZ128rmk;
1764 break;
1765 case X86::VMOVDQU16Z256rmk:
1766 Opc = X86::VPBLENDMWZ256rmk;
1767 break;
1768 case X86::VMOVDQU16Zrmk:
1769 Opc = X86::VPBLENDMWZrmk;
1770 break;
1771 case X86::VMOVDQU32Z128rmk:
1772 Opc = X86::VPBLENDMDZ128rmk;
1773 break;
1774 case X86::VMOVDQU32Z256rmk:
1775 Opc = X86::VPBLENDMDZ256rmk;
1776 break;
1777 case X86::VMOVDQU32Zrmk:
1778 Opc = X86::VPBLENDMDZrmk;
1779 break;
1780 case X86::VMOVDQU64Z128rmk:
1781 Opc = X86::VPBLENDMQZ128rmk;
1782 break;
1783 case X86::VMOVDQU64Z256rmk:
1784 Opc = X86::VPBLENDMQZ256rmk;
1785 break;
1786 case X86::VMOVDQU64Zrmk:
1787 Opc = X86::VPBLENDMQZrmk;
1788 break;
1789 case X86::VMOVUPDZ128rmk:
1790 Opc = X86::VBLENDMPDZ128rmk;
1791 break;
1792 case X86::VMOVUPDZ256rmk:
1793 Opc = X86::VBLENDMPDZ256rmk;
1794 break;
1795 case X86::VMOVUPDZrmk:
1796 Opc = X86::VBLENDMPDZrmk;
1797 break;
1798 case X86::VMOVUPSZ128rmk:
1799 Opc = X86::VBLENDMPSZ128rmk;
1800 break;
1801 case X86::VMOVUPSZ256rmk:
1802 Opc = X86::VBLENDMPSZ256rmk;
1803 break;
1804 case X86::VMOVUPSZrmk:
1805 Opc = X86::VBLENDMPSZrmk;
1806 break;
1807 case X86::VMOVDQA32Z128rmk:
1808 Opc = X86::VPBLENDMDZ128rmk;
1809 break;
1810 case X86::VMOVDQA32Z256rmk:
1811 Opc = X86::VPBLENDMDZ256rmk;
1812 break;
1813 case X86::VMOVDQA32Zrmk:
1814 Opc = X86::VPBLENDMDZrmk;
1815 break;
1816 case X86::VMOVDQA64Z128rmk:
1817 Opc = X86::VPBLENDMQZ128rmk;
1818 break;
1819 case X86::VMOVDQA64Z256rmk:
1820 Opc = X86::VPBLENDMQZ256rmk;
1821 break;
1822 case X86::VMOVDQA64Zrmk:
1823 Opc = X86::VPBLENDMQZrmk;
1824 break;
1825 case X86::VMOVAPDZ128rmk:
1826 Opc = X86::VBLENDMPDZ128rmk;
1827 break;
1828 case X86::VMOVAPDZ256rmk:
1829 Opc = X86::VBLENDMPDZ256rmk;
1830 break;
1831 case X86::VMOVAPDZrmk:
1832 Opc = X86::VBLENDMPDZrmk;
1833 break;
1834 case X86::VMOVAPSZ128rmk:
1835 Opc = X86::VBLENDMPSZ128rmk;
1836 break;
1837 case X86::VMOVAPSZ256rmk:
1838 Opc = X86::VBLENDMPSZ256rmk;
1839 break;
1840 case X86::VMOVAPSZrmk:
1841 Opc = X86::VBLENDMPSZrmk;
1842 break;
1843 case X86::VBROADCASTSDZ256rmk:
1844 Opc = X86::VBLENDMPDZ256rmbk;
1845 break;
1846 case X86::VBROADCASTSDZrmk:
1847 Opc = X86::VBLENDMPDZrmbk;
1848 break;
1849 case X86::VBROADCASTSSZ128rmk:
1850 Opc = X86::VBLENDMPSZ128rmbk;
1851 break;
1852 case X86::VBROADCASTSSZ256rmk:
1853 Opc = X86::VBLENDMPSZ256rmbk;
1854 break;
1855 case X86::VBROADCASTSSZrmk:
1856 Opc = X86::VBLENDMPSZrmbk;
1857 break;
1858 case X86::VPBROADCASTDZ128rmk:
1859 Opc = X86::VPBLENDMDZ128rmbk;
1860 break;
1861 case X86::VPBROADCASTDZ256rmk:
1862 Opc = X86::VPBLENDMDZ256rmbk;
1863 break;
1864 case X86::VPBROADCASTDZrmk:
1865 Opc = X86::VPBLENDMDZrmbk;
1866 break;
1867 case X86::VPBROADCASTQZ128rmk:
1868 Opc = X86::VPBLENDMQZ128rmbk;
1869 break;
1870 case X86::VPBROADCASTQZ256rmk:
1871 Opc = X86::VPBLENDMQZ256rmbk;
1872 break;
1873 case X86::VPBROADCASTQZrmk:
1874 Opc = X86::VPBLENDMQZrmbk;
1875 break;
1876 }
1877
1878 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1879 .add(Dest)
1880 .add(MI.getOperand(2))
1881 .add(Src)
1882 .add(MI.getOperand(3))
1883 .add(MI.getOperand(4))
1884 .add(MI.getOperand(5))
1885 .add(MI.getOperand(6))
1886 .add(MI.getOperand(7));
1887 NumRegOperands = 4;
1888 break;
1889 }
1890
// Masked register-to-register moves ("rrk" forms): same rewrite as above but
// for the register-source variants.
1891 case X86::VMOVDQU8Z128rrk:
1892 case X86::VMOVDQU8Z256rrk:
1893 case X86::VMOVDQU8Zrrk:
1894 case X86::VMOVDQU16Z128rrk:
1895 case X86::VMOVDQU16Z256rrk:
1896 case X86::VMOVDQU16Zrrk:
1897 case X86::VMOVDQU32Z128rrk:
1898 case X86::VMOVDQA32Z128rrk:
1899 case X86::VMOVDQU32Z256rrk:
1900 case X86::VMOVDQA32Z256rrk:
1901 case X86::VMOVDQU32Zrrk:
1902 case X86::VMOVDQA32Zrrk:
1903 case X86::VMOVDQU64Z128rrk:
1904 case X86::VMOVDQA64Z128rrk:
1905 case X86::VMOVDQU64Z256rrk:
1906 case X86::VMOVDQA64Z256rrk:
1907 case X86::VMOVDQU64Zrrk:
1908 case X86::VMOVDQA64Zrrk:
1909 case X86::VMOVUPDZ128rrk:
1910 case X86::VMOVAPDZ128rrk:
1911 case X86::VMOVUPDZ256rrk:
1912 case X86::VMOVAPDZ256rrk:
1913 case X86::VMOVUPDZrrk:
1914 case X86::VMOVAPDZrrk:
1915 case X86::VMOVUPSZ128rrk:
1916 case X86::VMOVAPSZ128rrk:
1917 case X86::VMOVUPSZ256rrk:
1918 case X86::VMOVAPSZ256rrk:
1919 case X86::VMOVUPSZrrk:
1920 case X86::VMOVAPSZrrk: {
1921 unsigned Opc;
1922 switch (MIOpc) {
1923 default:
1924 llvm_unreachable("Unreachable!");
1925 case X86::VMOVDQU8Z128rrk:
1926 Opc = X86::VPBLENDMBZ128rrk;
1927 break;
1928 case X86::VMOVDQU8Z256rrk:
1929 Opc = X86::VPBLENDMBZ256rrk;
1930 break;
1931 case X86::VMOVDQU8Zrrk:
1932 Opc = X86::VPBLENDMBZrrk;
1933 break;
1934 case X86::VMOVDQU16Z128rrk:
1935 Opc = X86::VPBLENDMWZ128rrk;
1936 break;
1937 case X86::VMOVDQU16Z256rrk:
1938 Opc = X86::VPBLENDMWZ256rrk;
1939 break;
1940 case X86::VMOVDQU16Zrrk:
1941 Opc = X86::VPBLENDMWZrrk;
1942 break;
1943 case X86::VMOVDQU32Z128rrk:
1944 Opc = X86::VPBLENDMDZ128rrk;
1945 break;
1946 case X86::VMOVDQU32Z256rrk:
1947 Opc = X86::VPBLENDMDZ256rrk;
1948 break;
1949 case X86::VMOVDQU32Zrrk:
1950 Opc = X86::VPBLENDMDZrrk;
1951 break;
1952 case X86::VMOVDQU64Z128rrk:
1953 Opc = X86::VPBLENDMQZ128rrk;
1954 break;
1955 case X86::VMOVDQU64Z256rrk:
1956 Opc = X86::VPBLENDMQZ256rrk;
1957 break;
1958 case X86::VMOVDQU64Zrrk:
1959 Opc = X86::VPBLENDMQZrrk;
1960 break;
1961 case X86::VMOVUPDZ128rrk:
1962 Opc = X86::VBLENDMPDZ128rrk;
1963 break;
1964 case X86::VMOVUPDZ256rrk:
1965 Opc = X86::VBLENDMPDZ256rrk;
1966 break;
1967 case X86::VMOVUPDZrrk:
1968 Opc = X86::VBLENDMPDZrrk;
1969 break;
1970 case X86::VMOVUPSZ128rrk:
1971 Opc = X86::VBLENDMPSZ128rrk;
1972 break;
1973 case X86::VMOVUPSZ256rrk:
1974 Opc = X86::VBLENDMPSZ256rrk;
1975 break;
1976 case X86::VMOVUPSZrrk:
1977 Opc = X86::VBLENDMPSZrrk;
1978 break;
1979 case X86::VMOVDQA32Z128rrk:
1980 Opc = X86::VPBLENDMDZ128rrk;
1981 break;
1982 case X86::VMOVDQA32Z256rrk:
1983 Opc = X86::VPBLENDMDZ256rrk;
1984 break;
1985 case X86::VMOVDQA32Zrrk:
1986 Opc = X86::VPBLENDMDZrrk;
1987 break;
1988 case X86::VMOVDQA64Z128rrk:
1989 Opc = X86::VPBLENDMQZ128rrk;
1990 break;
1991 case X86::VMOVDQA64Z256rrk:
1992 Opc = X86::VPBLENDMQZ256rrk;
1993 break;
1994 case X86::VMOVDQA64Zrrk:
1995 Opc = X86::VPBLENDMQZrrk;
1996 break;
1997 case X86::VMOVAPDZ128rrk:
1998 Opc = X86::VBLENDMPDZ128rrk;
1999 break;
2000 case X86::VMOVAPDZ256rrk:
2001 Opc = X86::VBLENDMPDZ256rrk;
2002 break;
2003 case X86::VMOVAPDZrrk:
2004 Opc = X86::VBLENDMPDZrrk;
2005 break;
2006 case X86::VMOVAPSZ128rrk:
2007 Opc = X86::VBLENDMPSZ128rrk;
2008 break;
2009 case X86::VMOVAPSZ256rrk:
2010 Opc = X86::VBLENDMPSZ256rrk;
2011 break;
2012 case X86::VMOVAPSZrrk:
2013 Opc = X86::VBLENDMPSZrrk;
2014 break;
2015 }
2016
2017 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
2018 .add(Dest)
2019 .add(MI.getOperand(2))
2020 .add(Src)
2021 .add(MI.getOperand(3));
2022 NumRegOperands = 4;
2023 break;
2024 }
2025 }
2026#undef CASE_NF
2027
2028 if (!NewMI)
2029 return nullptr;
2030
2031 if (LV) { // Update live variables
2032 for (unsigned I = 0; I < NumRegOperands; ++I) {
2033 MachineOperand &Op = MI.getOperand(I);
2034 if (Op.isReg() && (Op.isDead() || Op.isKill()))
2035 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
2036 }
2037 }
2038
2039 MachineBasicBlock &MBB = *MI.getParent();
2040 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
2041
2042 if (LIS) {
2043 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
2044 if (SrcReg)
2045 LIS->getInterval(SrcReg);
2046 if (SrcReg2)
2047 LIS->getInterval(SrcReg2);
2048 }
2049
2050 return NewMI;
2051}
2052
2053/// This determines which of three possible cases of a three source commute
2054/// the source indexes correspond to taking into account any mask operands.
2055/// All prevents commuting a passthru operand. Returns -1 if the commute isn't
2056/// possible.
2057/// Case 0 - Possible to commute the first and second operands.
2058/// Case 1 - Possible to commute the first and third operands.
2059/// Case 2 - Possible to commute the second and third operands.
2060static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2061 unsigned SrcOpIdx2) {
2062 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
2063 if (SrcOpIdx1 > SrcOpIdx2)
2064 std::swap(SrcOpIdx1, SrcOpIdx2);
2065
2066 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2067 if (X86II::isKMasked(TSFlags)) {
2068 Op2++;
2069 Op3++;
2070 }
2071
2072 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2073 return 0;
2074 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2075 return 1;
2076 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2077 return 2;
2078 llvm_unreachable("Unknown three src commute case.");
2079}
2080
    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
    const X86InstrFMA3Group &FMA3Group) const {

  unsigned Opc = MI.getOpcode();

  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
  // analysis. The commute optimization is legal only if all users of FMA*_Int
  // use only the lowest element of the FMA*_Int instruction. Such analysis are
  // not implemented yet. So, just return 0 in that case.
  // When such analysis are available this place will be the right place for
  // calling it.
  assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
         "Intrinsic instructions can't commute operand 1");

  // Determine which case this commute is or if it can't be done.
  unsigned Case =
      getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
  assert(Case < 3 && "Unexpected case number!");

  // Define the FMA forms mapping array that helps to map input FMA form
  // to output FMA form to preserve the operation semantics after
  // commuting the operands.
  const unsigned Form132Index = 0;
  const unsigned Form213Index = 1;
  const unsigned Form231Index = 2;
  // FormMapping[Case][InFormIndex] is the form index the instruction must
  // switch to when the operand pair described by Case is commuted.
  static const unsigned FormMapping[][3] = {
      // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
      // FMA132 A, C, b; ==> FMA231 C, A, b;
      // FMA213 B, A, c; ==> FMA213 A, B, c;
      // FMA231 C, A, b; ==> FMA132 A, C, b;
      {Form231Index, Form213Index, Form132Index},
      // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
      // FMA132 A, c, B; ==> FMA132 B, c, A;
      // FMA213 B, a, C; ==> FMA231 C, a, B;
      // FMA231 C, a, B; ==> FMA213 B, a, C;
      {Form132Index, Form231Index, Form213Index},
      // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
      // FMA132 a, C, B; ==> FMA213 a, B, C;
      // FMA213 b, A, C; ==> FMA132 b, C, A;
      // FMA231 c, A, B; ==> FMA231 c, B, A;
      {Form213Index, Form132Index, Form231Index}};

  // FMAForms[I] holds this FMA3 group's opcode for the 132, 213 and 231
  // form respectively, indexed by the Form*Index constants above.
  unsigned FMAForms[3];
  FMAForms[0] = FMA3Group.get132Opcode();
  FMAForms[1] = FMA3Group.get213Opcode();
  FMAForms[2] = FMA3Group.get231Opcode();

  // Everything is ready, just adjust the FMA opcode and return it.
  for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
    if (Opc == FMAForms[FormIndex])
      return FMAForms[FormMapping[Case][FormIndex]];

  llvm_unreachable("Illegal FMA3 format");
}
2136
2137static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2138 unsigned SrcOpIdx2) {
2139 // Determine which case this commute is or if it can't be done.
2140 unsigned Case =
2141 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2142 assert(Case < 3 && "Unexpected case value!");
2143
2144 // For each case we need to swap two pairs of bits in the final immediate.
2145 static const uint8_t SwapMasks[3][4] = {
2146 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2147 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2148 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2149 };
2150
2151 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2152 // Clear out the bits we are swapping.
2153 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2154 SwapMasks[Case][2] | SwapMasks[Case][3]);
2155 // If the immediate had a bit of the pair set, then set the opposite bit.
2156 if (Imm & SwapMasks[Case][0])
2157 NewImm |= SwapMasks[Case][1];
2158 if (Imm & SwapMasks[Case][1])
2159 NewImm |= SwapMasks[Case][0];
2160 if (Imm & SwapMasks[Case][2])
2161 NewImm |= SwapMasks[Case][3];
2162 if (Imm & SwapMasks[Case][3])
2163 NewImm |= SwapMasks[Case][2];
2164 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2165}
2166
// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
// commuted.
// Note that only unmasked and zero-masked ("kz") register/memory variants
// appear in the case lists below; merge-masked ("k") forms are not treated
// as commutable here.
// Expands to the register, memory, and zero-masked forms of the
// VPERMI2/VPERMT2 opcodes for one element-type suffix.
#define VPERM_CASES(Suffix)                                                    \
  case X86::VPERMI2##Suffix##Z128rr:                                           \
  case X86::VPERMT2##Suffix##Z128rr:                                           \
  case X86::VPERMI2##Suffix##Z256rr:                                           \
  case X86::VPERMT2##Suffix##Z256rr:                                           \
  case X86::VPERMI2##Suffix##Zrr:                                              \
  case X86::VPERMT2##Suffix##Zrr:                                              \
  case X86::VPERMI2##Suffix##Z128rm:                                           \
  case X86::VPERMT2##Suffix##Z128rm:                                           \
  case X86::VPERMI2##Suffix##Z256rm:                                           \
  case X86::VPERMT2##Suffix##Z256rm:                                           \
  case X86::VPERMI2##Suffix##Zrm:                                              \
  case X86::VPERMT2##Suffix##Zrm:                                              \
  case X86::VPERMI2##Suffix##Z128rrkz:                                         \
  case X86::VPERMT2##Suffix##Z128rrkz:                                         \
  case X86::VPERMI2##Suffix##Z256rrkz:                                         \
  case X86::VPERMT2##Suffix##Z256rrkz:                                         \
  case X86::VPERMI2##Suffix##Zrrkz:                                            \
  case X86::VPERMT2##Suffix##Zrrkz:                                            \
  case X86::VPERMI2##Suffix##Z128rmkz:                                         \
  case X86::VPERMT2##Suffix##Z128rmkz:                                         \
  case X86::VPERMI2##Suffix##Z256rmkz:                                         \
  case X86::VPERMT2##Suffix##Z256rmkz:                                         \
  case X86::VPERMI2##Suffix##Zrmkz:                                            \
  case X86::VPERMT2##Suffix##Zrmkz:

// Same as VPERM_CASES plus the embedded-broadcast memory forms, for element
// types that support broadcast.
#define VPERM_CASES_BROADCAST(Suffix)                                          \
  VPERM_CASES(Suffix)                                                          \
  case X86::VPERMI2##Suffix##Z128rmb:                                          \
  case X86::VPERMT2##Suffix##Z128rmb:                                          \
  case X86::VPERMI2##Suffix##Z256rmb:                                          \
  case X86::VPERMT2##Suffix##Z256rmb:                                          \
  case X86::VPERMI2##Suffix##Zrmb:                                             \
  case X86::VPERMT2##Suffix##Zrmb:                                             \
  case X86::VPERMI2##Suffix##Z128rmbkz:                                        \
  case X86::VPERMT2##Suffix##Z128rmbkz:                                        \
  case X86::VPERMI2##Suffix##Z256rmbkz:                                        \
  case X86::VPERMT2##Suffix##Z256rmbkz:                                        \
  case X86::VPERMI2##Suffix##Zrmbkz:                                           \
  case X86::VPERMT2##Suffix##Zrmbkz:

  switch (Opcode) {
  default:
    return false;
  VPERM_CASES(B)
  VPERM_CASES(W)
    return true;
  }
#undef VPERM_CASES_BROADCAST
#undef VPERM_CASES
}
2225
2226// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
2227// from the I opcode to the T opcode and vice versa.
2228static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2229#define VPERM_CASES(Orig, New) \
2230 case X86::Orig##Z128rr: \
2231 return X86::New##Z128rr; \
2232 case X86::Orig##Z128rrkz: \
2233 return X86::New##Z128rrkz; \
2234 case X86::Orig##Z128rm: \
2235 return X86::New##Z128rm; \
2236 case X86::Orig##Z128rmkz: \
2237 return X86::New##Z128rmkz; \
2238 case X86::Orig##Z256rr: \
2239 return X86::New##Z256rr; \
2240 case X86::Orig##Z256rrkz: \
2241 return X86::New##Z256rrkz; \
2242 case X86::Orig##Z256rm: \
2243 return X86::New##Z256rm; \
2244 case X86::Orig##Z256rmkz: \
2245 return X86::New##Z256rmkz; \
2246 case X86::Orig##Zrr: \
2247 return X86::New##Zrr; \
2248 case X86::Orig##Zrrkz: \
2249 return X86::New##Zrrkz; \
2250 case X86::Orig##Zrm: \
2251 return X86::New##Zrm; \
2252 case X86::Orig##Zrmkz: \
2253 return X86::New##Zrmkz;
2254
2255#define VPERM_CASES_BROADCAST(Orig, New) \
2256 VPERM_CASES(Orig, New) \
2257 case X86::Orig##Z128rmb: \
2258 return X86::New##Z128rmb; \
2259 case X86::Orig##Z128rmbkz: \
2260 return X86::New##Z128rmbkz; \
2261 case X86::Orig##Z256rmb: \
2262 return X86::New##Z256rmb; \
2263 case X86::Orig##Z256rmbkz: \
2264 return X86::New##Z256rmbkz; \
2265 case X86::Orig##Zrmb: \
2266 return X86::New##Zrmb; \
2267 case X86::Orig##Zrmbkz: \
2268 return X86::New##Zrmbkz;
2269
2270 switch (Opcode) {
2271 VPERM_CASES(VPERMI2B, VPERMT2B)
2272 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2273 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2274 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2275 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2276 VPERM_CASES(VPERMI2W, VPERMT2W)
2277 VPERM_CASES(VPERMT2B, VPERMI2B)
2278 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2279 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2280 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2281 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2282 VPERM_CASES(VPERMT2W, VPERMI2W)
2283 }
2284
2285 llvm_unreachable("Unreachable!");
2286#undef VPERM_CASES_BROADCAST
2287#undef VPERM_CASES
2288}
2289
                                                   unsigned OpIdx1,
                                                   unsigned OpIdx2) const {
  // Clone MI the first time a writable instruction is needed when the caller
  // requested a new instruction (NewMI == true); otherwise mutate MI in
  // place. std::exchange guarantees at most one clone is made.
  auto CloneIfNew = [&](MachineInstr &MI) {
    return std::exchange(NewMI, false)
               ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
               : &MI;
  };
  MachineInstr *WorkingMI = nullptr;
  unsigned Opc = MI.getOpcode();

// Match both the legacy and the EVEX new-data-destination (_ND) flavor of an
// opcode with one case label.
#define CASE_ND(OP)                                                            \
  case X86::OP:                                                                \
  case X86::OP##_ND:

  switch (Opc) {
  // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
  CASE_ND(SHRD16rri8)
  CASE_ND(SHLD16rri8)
  CASE_ND(SHRD32rri8)
  CASE_ND(SHLD32rri8)
  CASE_ND(SHRD64rri8)
  CASE_ND(SHLD64rri8) {
    unsigned Size;
    switch (Opc) {
    default:
      llvm_unreachable("Unreachable!");
// Translate SHLD<->SHRD (preserving any _ND suffix) and record the operand
// bit width used to recompute the shift amount below.
#define FROM_TO_SIZE(A, B, S)                                                  \
  case X86::A:                                                                 \
    Opc = X86::B;                                                              \
    Size = S;                                                                  \
    break;                                                                     \
  case X86::A##_ND:                                                            \
    Opc = X86::B##_ND;                                                         \
    Size = S;                                                                  \
    break;                                                                     \
  case X86::B:                                                                 \
    Opc = X86::A;                                                              \
    Size = S;                                                                  \
    break;                                                                     \
  case X86::B##_ND:                                                            \
    Opc = X86::A##_ND;                                                         \
    Size = S;                                                                  \
    break;

      FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
      FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
      FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
#undef FROM_TO_SIZE
    }
    WorkingMI = CloneIfNew(MI);
    WorkingMI->setDesc(get(Opc));
    WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
    break;
  }
  case X86::PFSUBrr:
  case X86::PFSUBRrr:
    // PFSUB x, y: x = x - y
    // PFSUBR x, y: x = y - x
    WorkingMI = CloneIfNew(MI);
    WorkingMI->setDesc(
        get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
    break;
  // SSE/AVX blends: commuting the sources is done by inverting the used bits
  // of the blend-control immediate.
  case X86::BLENDPDrri:
  case X86::BLENDPSrri:
  case X86::PBLENDWrri:
  case X86::VBLENDPDrri:
  case X86::VBLENDPSrri:
  case X86::VBLENDPDYrri:
  case X86::VBLENDPSYrri:
  case X86::VPBLENDDrri:
  case X86::VPBLENDWrri:
  case X86::VPBLENDDYrri:
  case X86::VPBLENDWYrri: {
    // Mask of the immediate bits this particular blend opcode consults.
    int8_t Mask;
    switch (Opc) {
    default:
      llvm_unreachable("Unreachable!");
    case X86::BLENDPDrri:
      Mask = (int8_t)0x03;
      break;
    case X86::BLENDPSrri:
      Mask = (int8_t)0x0F;
      break;
    case X86::PBLENDWrri:
      Mask = (int8_t)0xFF;
      break;
    case X86::VBLENDPDrri:
      Mask = (int8_t)0x03;
      break;
    case X86::VBLENDPSrri:
      Mask = (int8_t)0x0F;
      break;
    case X86::VBLENDPDYrri:
      Mask = (int8_t)0x0F;
      break;
    case X86::VBLENDPSYrri:
      Mask = (int8_t)0xFF;
      break;
    case X86::VPBLENDDrri:
      Mask = (int8_t)0x0F;
      break;
    case X86::VPBLENDWrri:
      Mask = (int8_t)0xFF;
      break;
    case X86::VPBLENDDYrri:
      Mask = (int8_t)0xFF;
      break;
    case X86::VPBLENDWYrri:
      Mask = (int8_t)0xFF;
      break;
    }
    // Only the least significant bits of Imm are used.
    // Using int8_t to ensure it will be sign extended to the int64_t that
    // setImm takes in order to match isel behavior.
    int8_t Imm = MI.getOperand(3).getImm() & Mask;
    WorkingMI = CloneIfNew(MI);
    WorkingMI->getOperand(3).setImm(Mask ^ Imm);
    break;
  }
  case X86::INSERTPSrri:
  case X86::VINSERTPSrri:
  case X86::VINSERTPSZrri: {
    // Decompose the INSERTPS control byte: zmask[3:0], dst index, src index.
    unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
    unsigned ZMask = Imm & 15;
    unsigned DstIdx = (Imm >> 4) & 3;
    unsigned SrcIdx = (Imm >> 6) & 3;

    // We can commute insertps if we zero 2 of the elements, the insertion is
    // "inline" and we don't override the insertion with a zero.
    if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
        llvm::popcount(ZMask) == 2) {
      unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
      assert(AltIdx < 4 && "Illegal insertion index");
      unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
      WorkingMI = CloneIfNew(MI);
      WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
      break;
    }
    return nullptr;
  }
  case X86::MOVSDrr:
  case X86::MOVSSrr:
  case X86::VMOVSDrr:
  case X86::VMOVSSrr: {
    // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
    if (Subtarget.hasSSE41()) {
      unsigned Mask;
      switch (Opc) {
      default:
        llvm_unreachable("Unreachable!");
      case X86::MOVSDrr:
        Opc = X86::BLENDPDrri;
        Mask = 0x02;
        break;
      case X86::MOVSSrr:
        Opc = X86::BLENDPSrri;
        Mask = 0x0E;
        break;
      case X86::VMOVSDrr:
        Opc = X86::VBLENDPDrri;
        Mask = 0x02;
        break;
      case X86::VMOVSSrr:
        Opc = X86::VBLENDPSrri;
        Mask = 0x0E;
        break;
      }

      WorkingMI = CloneIfNew(MI);
      WorkingMI->setDesc(get(Opc));
      WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
      break;
    }

    // Pre-SSE4.1 fallback: only MOVSD can commute, by rewriting it as a
    // SHUFPD with an immediate of 0x02.
    assert(Opc == X86::MOVSDrr && "Only MOVSD can commute to SHUFPD");
    WorkingMI = CloneIfNew(MI);
    WorkingMI->setDesc(get(X86::SHUFPDrri));
    WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
    break;
  }
  case X86::SHUFPDrri: {
    // Commute to MOVSD.
    assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
    WorkingMI = CloneIfNew(MI);
    WorkingMI->setDesc(get(X86::MOVSDrr));
    WorkingMI->removeOperand(3);
    break;
  }
  case X86::PCLMULQDQrri:
  case X86::VPCLMULQDQrri:
  case X86::VPCLMULQDQYrri:
  case X86::VPCLMULQDQZrri:
  case X86::VPCLMULQDQZ128rri:
  case X86::VPCLMULQDQZ256rri: {
    // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
    // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
    // Commuting the sources therefore swaps immediate bit 0 with bit 4.
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned Src1Hi = Imm & 0x01;
    unsigned Src2Hi = Imm & 0x10;
    WorkingMI = CloneIfNew(MI);
    WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
    break;
  }
  // AVX-512 integer compares: the swap is absorbed by rewriting the
  // comparison-predicate immediate (low 3 bits).
  case X86::VPCMPBZ128rri:
  case X86::VPCMPUBZ128rri:
  case X86::VPCMPBZ256rri:
  case X86::VPCMPUBZ256rri:
  case X86::VPCMPBZrri:
  case X86::VPCMPUBZrri:
  case X86::VPCMPDZ128rri:
  case X86::VPCMPUDZ128rri:
  case X86::VPCMPDZ256rri:
  case X86::VPCMPUDZ256rri:
  case X86::VPCMPDZrri:
  case X86::VPCMPUDZrri:
  case X86::VPCMPQZ128rri:
  case X86::VPCMPUQZ128rri:
  case X86::VPCMPQZ256rri:
  case X86::VPCMPUQZ256rri:
  case X86::VPCMPQZrri:
  case X86::VPCMPUQZrri:
  case X86::VPCMPWZ128rri:
  case X86::VPCMPUWZ128rri:
  case X86::VPCMPWZ256rri:
  case X86::VPCMPUWZ256rri:
  case X86::VPCMPWZrri:
  case X86::VPCMPUWZrri:
  case X86::VPCMPBZ128rrik:
  case X86::VPCMPUBZ128rrik:
  case X86::VPCMPBZ256rrik:
  case X86::VPCMPUBZ256rrik:
  case X86::VPCMPBZrrik:
  case X86::VPCMPUBZrrik:
  case X86::VPCMPDZ128rrik:
  case X86::VPCMPUDZ128rrik:
  case X86::VPCMPDZ256rrik:
  case X86::VPCMPUDZ256rrik:
  case X86::VPCMPDZrrik:
  case X86::VPCMPUDZrrik:
  case X86::VPCMPQZ128rrik:
  case X86::VPCMPUQZ128rrik:
  case X86::VPCMPQZ256rrik:
  case X86::VPCMPUQZ256rrik:
  case X86::VPCMPQZrrik:
  case X86::VPCMPUQZrrik:
  case X86::VPCMPWZ128rrik:
  case X86::VPCMPUWZ128rrik:
  case X86::VPCMPWZ256rrik:
  case X86::VPCMPUWZ256rrik:
  case X86::VPCMPWZrrik:
  case X86::VPCMPUWZrrik:
    WorkingMI = CloneIfNew(MI);
    // Flip comparison mode immediate (if necessary).
    WorkingMI->getOperand(MI.getNumOperands() - 1)
            MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
    break;
  case X86::VPCOMBri:
  case X86::VPCOMUBri:
  case X86::VPCOMDri:
  case X86::VPCOMUDri:
  case X86::VPCOMQri:
  case X86::VPCOMUQri:
  case X86::VPCOMWri:
  case X86::VPCOMUWri:
    WorkingMI = CloneIfNew(MI);
    // Flip comparison mode immediate (if necessary).
    WorkingMI->getOperand(3).setImm(
        X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
    break;
  // AVX-512 FP compares: same idea, but the predicate occupies the low 5
  // bits of the immediate.
  case X86::VCMPSDZrri:
  case X86::VCMPSSZrri:
  case X86::VCMPPDZrri:
  case X86::VCMPPSZrri:
  case X86::VCMPSHZrri:
  case X86::VCMPPHZrri:
  case X86::VCMPPHZ128rri:
  case X86::VCMPPHZ256rri:
  case X86::VCMPPDZ128rri:
  case X86::VCMPPSZ128rri:
  case X86::VCMPPDZ256rri:
  case X86::VCMPPSZ256rri:
  case X86::VCMPPDZrrik:
  case X86::VCMPPSZrrik:
  case X86::VCMPPHZrrik:
  case X86::VCMPPDZ128rrik:
  case X86::VCMPPSZ128rrik:
  case X86::VCMPPHZ128rrik:
  case X86::VCMPPDZ256rrik:
  case X86::VCMPPSZ256rrik:
  case X86::VCMPPHZ256rrik:
    WorkingMI = CloneIfNew(MI);
    WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
            MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
    break;
  case X86::VPERM2F128rri:
  case X86::VPERM2I128rri:
    // Flip permute source immediate.
    // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
    // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
    WorkingMI = CloneIfNew(MI);
    WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
    break;
  // MOVHLPS and UNPCKHPD with swapped sources compute the same result, so
  // commuting is done by converting one opcode into the other.
  case X86::MOVHLPSrr:
  case X86::UNPCKHPDrr:
  case X86::VMOVHLPSrr:
  case X86::VUNPCKHPDrr:
  case X86::VMOVHLPSZrr:
  case X86::VUNPCKHPDZ128rr:
    assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");

    switch (Opc) {
    default:
      llvm_unreachable("Unreachable!");
    case X86::MOVHLPSrr:
      Opc = X86::UNPCKHPDrr;
      break;
    case X86::UNPCKHPDrr:
      Opc = X86::MOVHLPSrr;
      break;
    case X86::VMOVHLPSrr:
      Opc = X86::VUNPCKHPDrr;
      break;
    case X86::VUNPCKHPDrr:
      Opc = X86::VMOVHLPSrr;
      break;
    case X86::VMOVHLPSZrr:
      Opc = X86::VUNPCKHPDZ128rr;
      break;
    case X86::VUNPCKHPDZ128rr:
      Opc = X86::VMOVHLPSZrr;
      break;
    }
    WorkingMI = CloneIfNew(MI);
    WorkingMI->setDesc(get(Opc));
    break;
  // CMOV: the condition-code immediate (the last described operand) must be
  // adjusted when the two register sources change places.
  CASE_ND(CMOV16rr)
  CASE_ND(CMOV32rr)
  CASE_ND(CMOV64rr) {
    WorkingMI = CloneIfNew(MI);
    unsigned OpNo = MI.getDesc().getNumOperands() - 1;
    X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
    break;
  }
  // VPTERNLOG: commuting sources is folded entirely into the truth-table
  // immediate; see commuteVPTERNLOG.
  case X86::VPTERNLOGDZrri:
  case X86::VPTERNLOGDZrmi:
  case X86::VPTERNLOGDZ128rri:
  case X86::VPTERNLOGDZ128rmi:
  case X86::VPTERNLOGDZ256rri:
  case X86::VPTERNLOGDZ256rmi:
  case X86::VPTERNLOGQZrri:
  case X86::VPTERNLOGQZrmi:
  case X86::VPTERNLOGQZ128rri:
  case X86::VPTERNLOGQZ128rmi:
  case X86::VPTERNLOGQZ256rri:
  case X86::VPTERNLOGQZ256rmi:
  case X86::VPTERNLOGDZrrik:
  case X86::VPTERNLOGDZ128rrik:
  case X86::VPTERNLOGDZ256rrik:
  case X86::VPTERNLOGQZrrik:
  case X86::VPTERNLOGQZ128rrik:
  case X86::VPTERNLOGQZ256rrik:
  case X86::VPTERNLOGDZrrikz:
  case X86::VPTERNLOGDZrmikz:
  case X86::VPTERNLOGDZ128rrikz:
  case X86::VPTERNLOGDZ128rmikz:
  case X86::VPTERNLOGDZ256rrikz:
  case X86::VPTERNLOGDZ256rmikz:
  case X86::VPTERNLOGQZrrikz:
  case X86::VPTERNLOGQZrmikz:
  case X86::VPTERNLOGQZ128rrikz:
  case X86::VPTERNLOGQZ128rmikz:
  case X86::VPTERNLOGQZ256rrikz:
  case X86::VPTERNLOGQZ256rmikz:
  case X86::VPTERNLOGDZ128rmbi:
  case X86::VPTERNLOGDZ256rmbi:
  case X86::VPTERNLOGDZrmbi:
  case X86::VPTERNLOGQZ128rmbi:
  case X86::VPTERNLOGQZ256rmbi:
  case X86::VPTERNLOGQZrmbi:
  case X86::VPTERNLOGDZ128rmbikz:
  case X86::VPTERNLOGDZ256rmbikz:
  case X86::VPTERNLOGDZrmbikz:
  case X86::VPTERNLOGQZ128rmbikz:
  case X86::VPTERNLOGQZ256rmbikz:
  case X86::VPTERNLOGQZrmbikz: {
    WorkingMI = CloneIfNew(MI);
    commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
    break;
  }
  default:
    WorkingMI = CloneIfNew(MI);
    break;
  }

  // FMA3 fallback: commuting an FMA's operands changes which of the
  // 132/213/231 forms is required.
  if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
    WorkingMI = CloneIfNew(MI);
    WorkingMI->setDesc(
        get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
    break;
  }
  }
  // Let the target-independent implementation perform the actual operand
  // swap on the (possibly cloned) instruction.
  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
2699
2700bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2701 unsigned &SrcOpIdx1,
2702 unsigned &SrcOpIdx2,
2703 bool IsIntrinsic) const {
2704 uint64_t TSFlags = MI.getDesc().TSFlags;
2705
2706 unsigned FirstCommutableVecOp = 1;
2707 unsigned LastCommutableVecOp = 3;
2708 unsigned KMaskOp = -1U;
2709 if (X86II::isKMasked(TSFlags)) {
2710 // For k-zero-masked operations it is Ok to commute the first vector
2711 // operand. Unless this is an intrinsic instruction.
2712 // For regular k-masked operations a conservative choice is done as the
2713 // elements of the first vector operand, for which the corresponding bit
2714 // in the k-mask operand is set to 0, are copied to the result of the
2715 // instruction.
2716 // TODO/FIXME: The commute still may be legal if it is known that the
2717 // k-mask operand is set to either all ones or all zeroes.
2718 // It is also Ok to commute the 1st operand if all users of MI use only
2719 // the elements enabled by the k-mask operand. For example,
2720 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2721 // : v1[i];
2722 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2723 // // Ok, to commute v1 in FMADD213PSZrk.
2724
2725 // The k-mask operand has index = 2 for masked and zero-masked operations.
2726 KMaskOp = 2;
2727
2728 // The operand with index = 1 is used as a source for those elements for
2729 // which the corresponding bit in the k-mask is set to 0.
2730 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2731 FirstCommutableVecOp = 3;
2732
2733 LastCommutableVecOp++;
2734 } else if (IsIntrinsic) {
2735 // Commuting the first operand of an intrinsic instruction isn't possible
2736 // unless we can prove that only the lowest element of the result is used.
2737 FirstCommutableVecOp = 2;
2738 }
2739
2740 if (isMem(MI, LastCommutableVecOp))
2741 LastCommutableVecOp--;
2742
2743 // Only the first RegOpsNum operands are commutable.
2744 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2745 // that the operand is not specified/fixed.
2746 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2747 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2748 SrcOpIdx1 == KMaskOp))
2749 return false;
2750 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2751 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2752 SrcOpIdx2 == KMaskOp))
2753 return false;
2754
2755 // Look for two different register operands assumed to be commutable
2756 // regardless of the FMA opcode. The FMA opcode is adjusted later.
2757 if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2758 SrcOpIdx2 == CommuteAnyOperandIndex) {
2759 unsigned CommutableOpIdx2 = SrcOpIdx2;
2760
2761 // At least one of operands to be commuted is not specified and
2762 // this method is free to choose appropriate commutable operands.
2763 if (SrcOpIdx1 == SrcOpIdx2)
2764 // Both of operands are not fixed. By default set one of commutable
2765 // operands to the last register operand of the instruction.
2766 CommutableOpIdx2 = LastCommutableVecOp;
2767 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2768 // Only one of operands is not fixed.
2769 CommutableOpIdx2 = SrcOpIdx1;
2770
2771 // CommutableOpIdx2 is well defined now. Let's choose another commutable
2772 // operand and assign its index to CommutableOpIdx1.
2773 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2774
2775 unsigned CommutableOpIdx1;
2776 for (CommutableOpIdx1 = LastCommutableVecOp;
2777 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2778 // Just ignore and skip the k-mask operand.
2779 if (CommutableOpIdx1 == KMaskOp)
2780 continue;
2781
2782 // The commuted operands must have different registers.
2783 // Otherwise, the commute transformation does not change anything and
2784 // is useless then.
2785 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2786 break;
2787 }
2788
2789 // No appropriate commutable operands were found.
2790 if (CommutableOpIdx1 < FirstCommutableVecOp)
2791 return false;
2792
2793 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
2794 // to return those values.
2795 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
2796 CommutableOpIdx2))
2797 return false;
2798 }
2799
2800 return true;
2801}
2802
2804 unsigned &SrcOpIdx1,
2805 unsigned &SrcOpIdx2) const {
2806 const MCInstrDesc &Desc = MI.getDesc();
2807 if (!Desc.isCommutable())
2808 return false;
2809
2810 switch (MI.getOpcode()) {
2811 case X86::CMPSDrri:
2812 case X86::CMPSSrri:
2813 case X86::CMPPDrri:
2814 case X86::CMPPSrri:
2815 case X86::VCMPSDrri:
2816 case X86::VCMPSSrri:
2817 case X86::VCMPPDrri:
2818 case X86::VCMPPSrri:
2819 case X86::VCMPPDYrri:
2820 case X86::VCMPPSYrri:
2821 case X86::VCMPSDZrri:
2822 case X86::VCMPSSZrri:
2823 case X86::VCMPPDZrri:
2824 case X86::VCMPPSZrri:
2825 case X86::VCMPSHZrri:
2826 case X86::VCMPPHZrri:
2827 case X86::VCMPPHZ128rri:
2828 case X86::VCMPPHZ256rri:
2829 case X86::VCMPPDZ128rri:
2830 case X86::VCMPPSZ128rri:
2831 case X86::VCMPPDZ256rri:
2832 case X86::VCMPPSZ256rri:
2833 case X86::VCMPPDZrrik:
2834 case X86::VCMPPSZrrik:
2835 case X86::VCMPPHZrrik:
2836 case X86::VCMPPDZ128rrik:
2837 case X86::VCMPPSZ128rrik:
2838 case X86::VCMPPHZ128rrik:
2839 case X86::VCMPPDZ256rrik:
2840 case X86::VCMPPSZ256rrik:
2841 case X86::VCMPPHZ256rrik: {
2842 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2843
2844 // Float comparison can be safely commuted for
2845 // Ordered/Unordered/Equal/NotEqual tests
2846 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2847 switch (Imm) {
2848 default:
2849 // EVEX versions can be commuted.
2850 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2851 break;
2852 return false;
2853 case 0x00: // EQUAL
2854 case 0x03: // UNORDERED
2855 case 0x04: // NOT EQUAL
2856 case 0x07: // ORDERED
2857 break;
2858 }
2859
2860 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2861 // when masked).
2862 // Assign them to the returned operand indices here.
2863 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2864 2 + OpOffset);
2865 }
2866 case X86::MOVSSrr:
2867 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2868 // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2869 // AVX implies sse4.1.
2870 if (Subtarget.hasSSE41())
2871 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2872 return false;
2873 case X86::SHUFPDrri:
2874 // We can commute this to MOVSD.
2875 if (MI.getOperand(3).getImm() == 0x02)
2876 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2877 return false;
2878 case X86::MOVHLPSrr:
2879 case X86::UNPCKHPDrr:
2880 case X86::VMOVHLPSrr:
2881 case X86::VUNPCKHPDrr:
2882 case X86::VMOVHLPSZrr:
2883 case X86::VUNPCKHPDZ128rr:
2884 if (Subtarget.hasSSE2())
2885 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2886 return false;
2887 case X86::VPTERNLOGDZrri:
2888 case X86::VPTERNLOGDZrmi:
2889 case X86::VPTERNLOGDZ128rri:
2890 case X86::VPTERNLOGDZ128rmi:
2891 case X86::VPTERNLOGDZ256rri:
2892 case X86::VPTERNLOGDZ256rmi:
2893 case X86::VPTERNLOGQZrri:
2894 case X86::VPTERNLOGQZrmi:
2895 case X86::VPTERNLOGQZ128rri:
2896 case X86::VPTERNLOGQZ128rmi:
2897 case X86::VPTERNLOGQZ256rri:
2898 case X86::VPTERNLOGQZ256rmi:
2899 case X86::VPTERNLOGDZrrik:
2900 case X86::VPTERNLOGDZ128rrik:
2901 case X86::VPTERNLOGDZ256rrik:
2902 case X86::VPTERNLOGQZrrik:
2903 case X86::VPTERNLOGQZ128rrik:
2904 case X86::VPTERNLOGQZ256rrik:
2905 case X86::VPTERNLOGDZrrikz:
2906 case X86::VPTERNLOGDZrmikz:
2907 case X86::VPTERNLOGDZ128rrikz:
2908 case X86::VPTERNLOGDZ128rmikz:
2909 case X86::VPTERNLOGDZ256rrikz:
2910 case X86::VPTERNLOGDZ256rmikz:
2911 case X86::VPTERNLOGQZrrikz:
2912 case X86::VPTERNLOGQZrmikz:
2913 case X86::VPTERNLOGQZ128rrikz:
2914 case X86::VPTERNLOGQZ128rmikz:
2915 case X86::VPTERNLOGQZ256rrikz:
2916 case X86::VPTERNLOGQZ256rmikz:
2917 case X86::VPTERNLOGDZ128rmbi:
2918 case X86::VPTERNLOGDZ256rmbi:
2919 case X86::VPTERNLOGDZrmbi:
2920 case X86::VPTERNLOGQZ128rmbi:
2921 case X86::VPTERNLOGQZ256rmbi:
2922 case X86::VPTERNLOGQZrmbi:
2923 case X86::VPTERNLOGDZ128rmbikz:
2924 case X86::VPTERNLOGDZ256rmbikz:
2925 case X86::VPTERNLOGDZrmbikz:
2926 case X86::VPTERNLOGQZ128rmbikz:
2927 case X86::VPTERNLOGQZ256rmbikz:
2928 case X86::VPTERNLOGQZrmbikz:
2929 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2930 case X86::VPDPWSSDYrr:
2931 case X86::VPDPWSSDrr:
2932 case X86::VPDPWSSDSYrr:
2933 case X86::VPDPWSSDSrr:
2934 case X86::VPDPWUUDrr:
2935 case X86::VPDPWUUDYrr:
2936 case X86::VPDPWUUDSrr:
2937 case X86::VPDPWUUDSYrr:
2938 case X86::VPDPBSSDSrr:
2939 case X86::VPDPBSSDSYrr:
2940 case X86::VPDPBSSDrr:
2941 case X86::VPDPBSSDYrr:
2942 case X86::VPDPBUUDSrr:
2943 case X86::VPDPBUUDSYrr:
2944 case X86::VPDPBUUDrr:
2945 case X86::VPDPBUUDYrr:
2946 case X86::VPDPBSSDSZ128rr:
2947 case X86::VPDPBSSDSZ128rrk:
2948 case X86::VPDPBSSDSZ128rrkz:
2949 case X86::VPDPBSSDSZ256rr:
2950 case X86::VPDPBSSDSZ256rrk:
2951 case X86::VPDPBSSDSZ256rrkz:
2952 case X86::VPDPBSSDSZrr:
2953 case X86::VPDPBSSDSZrrk:
2954 case X86::VPDPBSSDSZrrkz:
2955 case X86::VPDPBSSDZ128rr:
2956 case X86::VPDPBSSDZ128rrk:
2957 case X86::VPDPBSSDZ128rrkz:
2958 case X86::VPDPBSSDZ256rr:
2959 case X86::VPDPBSSDZ256rrk:
2960 case X86::VPDPBSSDZ256rrkz:
2961 case X86::VPDPBSSDZrr:
2962 case X86::VPDPBSSDZrrk:
2963 case X86::VPDPBSSDZrrkz:
2964 case X86::VPDPBUUDSZ128rr:
2965 case X86::VPDPBUUDSZ128rrk:
2966 case X86::VPDPBUUDSZ128rrkz:
2967 case X86::VPDPBUUDSZ256rr:
2968 case X86::VPDPBUUDSZ256rrk:
2969 case X86::VPDPBUUDSZ256rrkz:
2970 case X86::VPDPBUUDSZrr:
2971 case X86::VPDPBUUDSZrrk:
2972 case X86::VPDPBUUDSZrrkz:
2973 case X86::VPDPBUUDZ128rr:
2974 case X86::VPDPBUUDZ128rrk:
2975 case X86::VPDPBUUDZ128rrkz:
2976 case X86::VPDPBUUDZ256rr:
2977 case X86::VPDPBUUDZ256rrk:
2978 case X86::VPDPBUUDZ256rrkz:
2979 case X86::VPDPBUUDZrr:
2980 case X86::VPDPBUUDZrrk:
2981 case X86::VPDPBUUDZrrkz:
2982 case X86::VPDPWSSDZ128rr:
2983 case X86::VPDPWSSDZ128rrk:
2984 case X86::VPDPWSSDZ128rrkz:
2985 case X86::VPDPWSSDZ256rr:
2986 case X86::VPDPWSSDZ256rrk:
2987 case X86::VPDPWSSDZ256rrkz:
2988 case X86::VPDPWSSDZrr:
2989 case X86::VPDPWSSDZrrk:
2990 case X86::VPDPWSSDZrrkz:
2991 case X86::VPDPWSSDSZ128rr:
2992 case X86::VPDPWSSDSZ128rrk:
2993 case X86::VPDPWSSDSZ128rrkz:
2994 case X86::VPDPWSSDSZ256rr:
2995 case X86::VPDPWSSDSZ256rrk:
2996 case X86::VPDPWSSDSZ256rrkz:
2997 case X86::VPDPWSSDSZrr:
2998 case X86::VPDPWSSDSZrrk:
2999 case X86::VPDPWSSDSZrrkz:
3000 case X86::VPDPWUUDZ128rr:
3001 case X86::VPDPWUUDZ128rrk:
3002 case X86::VPDPWUUDZ128rrkz:
3003 case X86::VPDPWUUDZ256rr:
3004 case X86::VPDPWUUDZ256rrk:
3005 case X86::VPDPWUUDZ256rrkz:
3006 case X86::VPDPWUUDZrr:
3007 case X86::VPDPWUUDZrrk:
3008 case X86::VPDPWUUDZrrkz:
3009 case X86::VPDPWUUDSZ128rr:
3010 case X86::VPDPWUUDSZ128rrk:
3011 case X86::VPDPWUUDSZ128rrkz:
3012 case X86::VPDPWUUDSZ256rr:
3013 case X86::VPDPWUUDSZ256rrk:
3014 case X86::VPDPWUUDSZ256rrkz:
3015 case X86::VPDPWUUDSZrr:
3016 case X86::VPDPWUUDSZrrk:
3017 case X86::VPDPWUUDSZrrkz:
3018 case X86::VPMADD52HUQrr:
3019 case X86::VPMADD52HUQYrr:
3020 case X86::VPMADD52HUQZ128r:
3021 case X86::VPMADD52HUQZ128rk:
3022 case X86::VPMADD52HUQZ128rkz:
3023 case X86::VPMADD52HUQZ256r:
3024 case X86::VPMADD52HUQZ256rk:
3025 case X86::VPMADD52HUQZ256rkz:
3026 case X86::VPMADD52HUQZr:
3027 case X86::VPMADD52HUQZrk:
3028 case X86::VPMADD52HUQZrkz:
3029 case X86::VPMADD52LUQrr:
3030 case X86::VPMADD52LUQYrr:
3031 case X86::VPMADD52LUQZ128r:
3032 case X86::VPMADD52LUQZ128rk:
3033 case X86::VPMADD52LUQZ128rkz:
3034 case X86::VPMADD52LUQZ256r:
3035 case X86::VPMADD52LUQZ256rk:
3036 case X86::VPMADD52LUQZ256rkz:
3037 case X86::VPMADD52LUQZr:
3038 case X86::VPMADD52LUQZrk:
3039 case X86::VPMADD52LUQZrkz:
3040 case X86::VFMADDCPHZr:
3041 case X86::VFMADDCPHZrk:
3042 case X86::VFMADDCPHZrkz:
3043 case X86::VFMADDCPHZ128r:
3044 case X86::VFMADDCPHZ128rk:
3045 case X86::VFMADDCPHZ128rkz:
3046 case X86::VFMADDCPHZ256r:
3047 case X86::VFMADDCPHZ256rk:
3048 case X86::VFMADDCPHZ256rkz:
3049 case X86::VFMADDCSHZr:
3050 case X86::VFMADDCSHZrk:
3051 case X86::VFMADDCSHZrkz: {
3052 unsigned CommutableOpIdx1 = 2;
3053 unsigned CommutableOpIdx2 = 3;
3054 if (X86II::isKMasked(Desc.TSFlags)) {
3055 // Skip the mask register.
3056 ++CommutableOpIdx1;
3057 ++CommutableOpIdx2;
3058 }
3059 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3060 CommutableOpIdx2))
3061 return false;
3062 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3063 // No idea.
3064 return false;
3065 return true;
3066 }
3067
3068 default:
3069 const X86InstrFMA3Group *FMA3Group =
3070 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3071 if (FMA3Group)
3072 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3073 FMA3Group->isIntrinsic());
3074
3075 // Handled masked instructions since we need to skip over the mask input
3076 // and the preserved input.
3077 if (X86II::isKMasked(Desc.TSFlags)) {
3078 // First assume that the first input is the mask operand and skip past it.
3079 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3080 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
3081 // Check if the first input is tied. If there isn't one then we only
3082 // need to skip the mask operand which we did above.
3083 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3084 MCOI::TIED_TO) != -1)) {
3085 // If this is zero masking instruction with a tied operand, we need to
3086 // move the first index back to the first input since this must
3087 // be a 3 input instruction and we want the first two non-mask inputs.
3088 // Otherwise this is a 2 input instruction with a preserved input and
3089 // mask, so we need to move the indices to skip one more input.
3090 if (X86II::isKMergeMasked(Desc.TSFlags)) {
3091 ++CommutableOpIdx1;
3092 ++CommutableOpIdx2;
3093 } else {
3094 --CommutableOpIdx1;
3095 }
3096 }
3097
3098 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3099 CommutableOpIdx2))
3100 return false;
3101
3102 if (!MI.getOperand(SrcOpIdx1).isReg() ||
3103 !MI.getOperand(SrcOpIdx2).isReg())
3104 // No idea.
3105 return false;
3106 return true;
3107 }
3108
3109 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3110 }
3111 return false;
3112}
3113
// Fragment of isConvertibleLEA (the signature line is missing from this
// extract): returns true only for an LEA whose address an ADD could also
// compute, so X86FixupLEAs can later rewrite it (see the next function).
3115 unsigned Opcode = MI->getOpcode();
// Only the three LEA opcodes are candidates at all.
3116 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3117 Opcode != X86::LEA64_32r)
3118 return false;
3119 
3120 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3121 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3122 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3123 
// Reject anything an ADD cannot express: a segment override, a non-zero or
// non-immediate displacement, or a scaled index.
3124 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3125 Scale.getImm() > 1)
3126 return false;
3127 
3128 return true;
3129 }
3130
// Fragment of hasCommutePreference (the signature line is missing from this
// extract): decides whether commuting the operands of an ADD32rr/ADD64rr is
// preferable, and reports the preference through the Commute out-parameter.
3132 // Currently we're interested in following sequence only.
3133 // r3 = lea r1, r2
3134 // r5 = add r3, r4
3135 // Both r3 and r4 are killed in add, we hope the add instruction has the
3136 // operand order
3137 // r5 = add r4, r3
3138 // So later in X86FixupLEAs the lea instruction can be rewritten as add.
3139 unsigned Opcode = MI.getOpcode();
3140 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3141 return false;
3142 
3143 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3144 Register Reg1 = MI.getOperand(1).getReg();
3145 Register Reg2 = MI.getOperand(2).getReg();
3146 
// If the first source comes from a convertible LEA in this block, commuting
// puts the LEA result in the second slot, which is what X86FixupLEAs wants.
3147 // Check if Reg1 comes from LEA in the same MBB.
3148 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3149 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3150 Commute = true;
3151 return true;
3152 }
3153 }
3154 
3155 // Check if Reg2 comes from LEA in the same MBB.
3156 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3157 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3158 Commute = false;
3159 return true;
3160 }
3161 }
3162 
3163 return false;
3164 }
3165
// Fragment of getCondSrcNoFromDesc (signature missing from this extract):
// for the condition-code-carrying instruction families below, returns the
// index of the condition-code operand among the USE operands; -1 otherwise.
3167 unsigned Opcode = MCID.getOpcode();
3168 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
3169 X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
3170 X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
3171 return -1;
3172 // Assume that condition code is always the last use operand.
3173 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3174 return NumUses - 1;
3175 }
3176
// Fragment of X86::getCondFromMI (signature missing from this extract):
// converts the use-relative index from getCondSrcNoFromDesc into an absolute
// operand index and reads the immediate as an X86::CondCode.
3178 const MCInstrDesc &MCID = MI.getDesc();
3179 int CondNo = getCondSrcNoFromDesc(MCID);
3180 if (CondNo < 0)
3181 return X86::COND_INVALID;
3182 CondNo += MCID.getNumDefs();
3183 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3185
// Fragment of a JCC condition accessor; the ternary's alternative arm is
// missing from this extract.
3187 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3189 }
3190
// Fragment of a SETCC/SETZUCC condition accessor; the remainder of the
// expression is missing from this extract.
3192 return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
3195 }
3196
// Fragment of a CMOV condition accessor; the ternary's alternative arm is
// missing from this extract.
3198 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3200 }
3201
// Fragment of a CFCMOV condition accessor; the ternary's alternative arm is
// missing from this extract.
3203 return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3205 }
3206
// Fragment of a CCMP/CTEST condition accessor; the remainder of the
// expression is missing from this extract.
3208 return X86::isCCMPCC(MI.getOpcode()) || X86::isCTESTCC(MI.getOpcode())
3211 }
3212
// Fragment (signature missing from this extract): maps an X86 condition code
// to the DCF flag mask a CCMP/CTEST should install when its SCC is false.
3214 // CCMP/CTEST has two conditional operands:
3215 // - SCC: source conditional code (same as CMOV)
3216 // - DCF: destination conditional flags, which has 4 valid bits
3217 //
3218 // +----+----+----+----+
3219 // | OF | SF | ZF | CF |
3220 // +----+----+----+----+
3221 //
3222 // If SCC (source conditional code) evaluates to false, CCMP/CTEST will update
3223 // the conditional flags as follows:
3224 //
3225 // OF = DCF.OF
3226 // SF = DCF.SF
3227 // ZF = DCF.ZF
3228 // CF = DCF.CF
3229 // PF = DCF.CF
3230 // AF = 0 (Auxiliary Carry Flag)
3231 //
3232 // Otherwise, the CMP or TEST is executed and it updates the
3233 // CSPAZO flags normally.
3234 //
3235 // NOTE:
3236 // If SCC = P, then SCC evaluates to true regardless of the CSPAZO value.
3237 // If SCC = NP, then SCC evaluates to false regardless of the CSPAZO value.
3238 
// PF aliases CF because the hardware copies DCF.CF into PF (see table above).
3239 enum { CF = 1, ZF = 2, SF = 4, OF = 8, PF = CF }; 
3240 
3241 switch (CC) {
3242 default:
3243 llvm_unreachable("Illegal condition code!");
3244 case X86::COND_NO:
3245 case X86::COND_NE:
3246 case X86::COND_GE:
3247 case X86::COND_G:
3248 case X86::COND_AE:
3249 case X86::COND_A:
3250 case X86::COND_NS:
3251 case X86::COND_NP:
3252 return 0;
3253 case X86::COND_O:
3254 return OF;
3255 case X86::COND_B:
3256 case X86::COND_BE:
3257 return CF;
// NOTE(review): this break is unreachable (it follows a return) and could be
// removed.
3258 break;
3259 case X86::COND_E:
3260 case X86::COND_LE:
3261 return ZF;
3262 case X86::COND_S:
3263 case X86::COND_L:
3264 return SF;
3265 case X86::COND_P:
3266 return PF;
3267 }
3268 }
3269
3270#define GET_X86_NF_TRANSFORM_TABLE
3271#define GET_X86_ND2NONND_TABLE
3272#include "X86GenInstrMapping.inc"
3273
// Fragment of getNewOpcFromTable (the first signature line is missing from
// this extract): binary-searches a sorted opcode-mapping table and returns
// the mapped opcode, or 0 when Opc has no entry.
3275 unsigned Opc) {
3276 const auto I = llvm::lower_bound(Table, Opc);
3277 return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
3278 }
3279unsigned X86::getNFVariant(unsigned Opc) {
3280#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3281 // Make sure the tables are sorted.
3282 static std::atomic<bool> NFTableChecked(false);
3283 if (!NFTableChecked.load(std::memory_order_relaxed)) {
3284 assert(llvm::is_sorted(X86NFTransformTable) &&
3285 "X86NFTransformTable is not sorted!");
3286 NFTableChecked.store(true, std::memory_order_relaxed);
3287 }
3288#endif
3289 return getNewOpcFromTable(X86NFTransformTable, Opc);
3290}
3291
3292unsigned X86::getNonNDVariant(unsigned Opc) {
3293#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3294 // Make sure the tables are sorted.
3295 static std::atomic<bool> NDTableChecked(false);
3296 if (!NDTableChecked.load(std::memory_order_relaxed)) {
3297 assert(llvm::is_sorted(X86ND2NonNDTable) &&
3298 "X86ND2NonNDTableis not sorted!");
3299 NDTableChecked.store(true, std::memory_order_relaxed);
3300 }
3301#endif
3302 return getNewOpcFromTable(X86ND2NonNDTable, Opc);
3303}
3304
3305/// Return the inverse of the specified condition,
3306/// e.g. turning COND_E to COND_NE.
3308 switch (CC) {
3309 default:
3310 llvm_unreachable("Illegal condition code!");
3311 case X86::COND_E:
3312 return X86::COND_NE;
3313 case X86::COND_NE:
3314 return X86::COND_E;
3315 case X86::COND_L:
3316 return X86::COND_GE;
3317 case X86::COND_LE:
3318 return X86::COND_G;
3319 case X86::COND_G:
3320 return X86::COND_LE;
3321 case X86::COND_GE:
3322 return X86::COND_L;
3323 case X86::COND_B:
3324 return X86::COND_AE;
3325 case X86::COND_BE:
3326 return X86::COND_A;
3327 case X86::COND_A:
3328 return X86::COND_BE;
3329 case X86::COND_AE:
3330 return X86::COND_B;
3331 case X86::COND_S:
3332 return X86::COND_NS;
3333 case X86::COND_NS:
3334 return X86::COND_S;
3335 case X86::COND_P:
3336 return X86::COND_NP;
3337 case X86::COND_NP:
3338 return X86::COND_P;
3339 case X86::COND_O:
3340 return X86::COND_NO;
3341 case X86::COND_NO:
3342 return X86::COND_O;
3343 case X86::COND_NE_OR_P:
3344 return X86::COND_E_AND_NP;
3345 case X86::COND_E_AND_NP:
3346 return X86::COND_NE_OR_P;
3347 }
3348}
3349
3350/// Assuming the flags are set by MI(a,b), return the condition code if we
3351/// modify the instructions such that flags are set by MI(b,a).
3353 switch (CC) {
3354 default:
3355 return X86::COND_INVALID;
3356 case X86::COND_E:
3357 return X86::COND_E;
3358 case X86::COND_NE:
3359 return X86::COND_NE;
3360 case X86::COND_L:
3361 return X86::COND_G;
3362 case X86::COND_LE:
3363 return X86::COND_GE;
3364 case X86::COND_G:
3365 return X86::COND_L;
3366 case X86::COND_GE:
3367 return X86::COND_LE;
3368 case X86::COND_B:
3369 return X86::COND_A;
3370 case X86::COND_BE:
3371 return X86::COND_AE;
3372 case X86::COND_A:
3373 return X86::COND_B;
3374 case X86::COND_AE:
3375 return X86::COND_BE;
3376 }
3377}
3378
// Fragment (the function-name line and the local CC initialization are
// missing from this extract): maps an IR CmpInst predicate to an X86
// condition code plus a flag saying whether the compare operands must be
// swapped to use that code.
std::pair<X86::CondCode, bool>
3382 bool NeedSwap = false;
3383 switch (Predicate) {
3384 default:
3385 break;
3386 // Floating-point Predicates
3387 case CmpInst::FCMP_UEQ:
3388 CC = X86::COND_E;
3389 break;
// For the ordered/unordered LT/LE family, one direction is handled by
// swapping the operands and reusing the opposite-direction code.
3390 case CmpInst::FCMP_OLT:
3391 NeedSwap = true;
3392 [[fallthrough]];
3393 case CmpInst::FCMP_OGT:
3394 CC = X86::COND_A;
3395 break;
3396 case CmpInst::FCMP_OLE:
3397 NeedSwap = true;
3398 [[fallthrough]];
3399 case CmpInst::FCMP_OGE:
3400 CC = X86::COND_AE;
3401 break;
3402 case CmpInst::FCMP_UGT:
3403 NeedSwap = true;
3404 [[fallthrough]];
3405 case CmpInst::FCMP_ULT:
3406 CC = X86::COND_B;
3407 break;
3408 case CmpInst::FCMP_UGE:
3409 NeedSwap = true;
3410 [[fallthrough]];
3411 case CmpInst::FCMP_ULE:
3412 CC = X86::COND_BE;
3413 break;
3414 case CmpInst::FCMP_ONE:
3415 CC = X86::COND_NE;
3416 break;
3417 case CmpInst::FCMP_UNO:
3418 CC = X86::COND_P;
3419 break;
3420 case CmpInst::FCMP_ORD:
3421 CC = X86::COND_NP;
3422 break;
// OEQ/UNE get COND_INVALID: no single flag condition covers them.
3423 case CmpInst::FCMP_OEQ:
3424 [[fallthrough]];
3425 case CmpInst::FCMP_UNE:
3426 CC = X86::COND_INVALID;
3427 break;
3428 
3429 // Integer Predicates
3430 case CmpInst::ICMP_EQ:
3431 CC = X86::COND_E;
3432 break;
3433 case CmpInst::ICMP_NE:
3434 CC = X86::COND_NE;
3435 break;
3436 case CmpInst::ICMP_UGT:
3437 CC = X86::COND_A;
3438 break;
3439 case CmpInst::ICMP_UGE:
3440 CC = X86::COND_AE;
3441 break;
3442 case CmpInst::ICMP_ULT:
3443 CC = X86::COND_B;
3444 break;
3445 case CmpInst::ICMP_ULE:
3446 CC = X86::COND_BE;
3447 break;
3448 case CmpInst::ICMP_SGT:
3449 CC = X86::COND_G;
3450 break;
3451 case CmpInst::ICMP_SGE:
3452 CC = X86::COND_GE;
3453 break;
3454 case CmpInst::ICMP_SLT:
3455 CC = X86::COND_L;
3456 break;
3457 case CmpInst::ICMP_SLE:
3458 CC = X86::COND_LE;
3459 break;
3460 }
3461 
3462 return std::make_pair(CC, NeedSwap);
3463 }
3464
3465/// Return a cmov opcode for the given register size in bytes, and operand type.
3466unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
3467 bool HasNDD) {
3468 switch (RegBytes) {
3469 default:
3470 llvm_unreachable("Illegal register size!");
3471#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
3472 case 2:
3473 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
3474 : GET_ND_IF_ENABLED(X86::CMOV16rr);
3475 case 4:
3476 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
3477 : GET_ND_IF_ENABLED(X86::CMOV32rr);
3478 case 8:
3479 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
3480 : GET_ND_IF_ENABLED(X86::CMOV64rr);
3481 }
3482}
3483
3484/// Get the VPCMP immediate for the given condition.
3486 switch (CC) {
3487 default:
3488 llvm_unreachable("Unexpected SETCC condition");
3489 case ISD::SETNE:
3490 return 4;
3491 case ISD::SETEQ:
3492 return 0;
3493 case ISD::SETULT:
3494 case ISD::SETLT:
3495 return 1;
3496 case ISD::SETUGT:
3497 case ISD::SETGT:
3498 return 6;
3499 case ISD::SETUGE:
3500 case ISD::SETGE:
3501 return 5;
3502 case ISD::SETULE:
3503 case ISD::SETLE:
3504 return 2;
3505 }
3506}
3507
3508/// Get the VPCMP immediate if the operands are swapped.
3509unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3510 switch (Imm) {
3511 default:
3512 llvm_unreachable("Unreachable!");
3513 case 0x01:
3514 Imm = 0x06;
3515 break; // LT -> NLE
3516 case 0x02:
3517 Imm = 0x05;
3518 break; // LE -> NLT
3519 case 0x05:
3520 Imm = 0x02;
3521 break; // NLT -> LE
3522 case 0x06:
3523 Imm = 0x01;
3524 break; // NLE -> LT
3525 case 0x00: // EQ
3526 case 0x03: // FALSE
3527 case 0x04: // NE
3528 case 0x07: // TRUE
3529 break;
3530 }
3531
3532 return Imm;
3533}
3534
3535/// Get the VPCOM immediate if the operands are swapped.
3536unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3537 switch (Imm) {
3538 default:
3539 llvm_unreachable("Unreachable!");
3540 case 0x00:
3541 Imm = 0x02;
3542 break; // LT -> GT
3543 case 0x01:
3544 Imm = 0x03;
3545 break; // LE -> GE
3546 case 0x02:
3547 Imm = 0x00;
3548 break; // GT -> LT
3549 case 0x03:
3550 Imm = 0x01;
3551 break; // GE -> LE
3552 case 0x04: // EQ
3553 case 0x05: // NE
3554 case 0x06: // FALSE
3555 case 0x07: // TRUE
3556 break;
3557 }
3558
3559 return Imm;
3560}
3561
3562/// Get the VCMP immediate if the operands are swapped.
3563unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3564 // Only need the lower 2 bits to distinquish.
3565 switch (Imm & 0x3) {
3566 default:
3567 llvm_unreachable("Unreachable!");
3568 case 0x00:
3569 case 0x03:
3570 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3571 break;
3572 case 0x01:
3573 case 0x02:
3574 // Need to toggle bits 3:0. Bit 4 stays the same.
3575 Imm ^= 0xf;
3576 break;
3577 }
3578
3579 return Imm;
3580}
3581
// Fragment (signature missing from this extract): returns the bit width of
// an X86 vector register class (VR128/VR256/VR512 and their X variants).
3583 if (Info.RegClass == X86::VR128RegClassID ||
3584 Info.RegClass == X86::VR128XRegClassID)
3585 return 128;
3586 if (Info.RegClass == X86::VR256RegClassID ||
3587 Info.RegClass == X86::VR256XRegClassID)
3588 return 256;
3589 if (Info.RegClass == X86::VR512RegClassID)
3590 return 512;
3591 llvm_unreachable("Unknown register class!");
3592 }
3593
3594/// Return true if the Reg is X87 register.
3595static bool isX87Reg(Register Reg) {
3596 return (Reg == X86::FPCW || Reg == X86::FPSW ||
3597 (Reg >= X86::ST0 && Reg <= X86::ST7));
3598}
3599
3600/// check if the instruction is X87 instruction
3602 // Call and inlineasm defs X87 register, so we special case it here because
3603 // otherwise calls are incorrectly flagged as x87 instructions
3604 // as a result.
3605 if (MI.isCall() || MI.isInlineAsm())
3606 return false;
3607 for (const MachineOperand &MO : MI.operands()) {
3608 if (!MO.isReg())
3609 continue;
3610 if (isX87Reg(MO.getReg()))
3611 return true;
3612 }
3613 return false;
3614}
3615
// Fragment (signature missing from this extract): locates the index of the
// first operand of a 5-operand X86 memory reference, or -1 if there is none.
// Real instructions use the fast MC-layer lookup; pseudos are scanned for
// OPERAND_MEMORY operand types.
3617 auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3618 return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3619 };
3620 
3621 const MCInstrDesc &Desc = MI.getDesc();
3622 
3623 // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3624 // instructions (fast case).
3625 if (!X86II::isPseudo(Desc.TSFlags)) {
3626 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
3627 if (MemRefIdx >= 0)
3628 return MemRefIdx + X86II::getOperandBias(Desc);
3629 #ifdef EXPENSIVE_CHECKS
3630 assert(none_of(Desc.operands(), IsMemOp) &&
3631 "Got false negative from X86II::getMemoryOperandNo()!");
3632 #endif
3633 return -1;
3634 }
3635 
3636 // Otherwise, handle pseudo instructions by examining the type of their
3637 // operands (slow case). An instruction cannot have a memory reference if it
3638 // has fewer than AddrNumOperands (= 5) explicit operands.
3639 unsigned NumOps = Desc.getNumOperands();
3641 #ifdef EXPENSIVE_CHECKS
3642 assert(none_of(Desc.operands(), IsMemOp) &&
3643 "Expected no operands to have OPERAND_MEMORY type!");
3644 #endif
3645 return -1;
3646 }
3647 
3648 // The first operand with type OPERAND_MEMORY indicates the start of a memory
3649 // reference. We expect the following AddrNumOperand-1 operands to also have
3650 // OPERAND_MEMORY type.
3651 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3652 if (IsMemOp(Desc.operands()[I])) {
3653 #ifdef EXPENSIVE_CHECKS
3654 assert(std::all_of(Desc.operands().begin() + I,
3655 Desc.operands().begin() + I + X86::AddrNumOperands,
3656 IsMemOp) &&
3657 "Expected all five operands in the memory reference to have "
3658 "OPERAND_MEMORY type!");
3659 #endif
3660 return I;
3661 }
3662 }
3663 
3664 return -1;
3665 }
3666
// Fragment (partial signature; the first line is missing from this extract):
// given the start of a memory operand at OpNo, returns the IR Constant held
// by the referenced constant-pool entry, or nullptr when the address is not
// a plain, offset-free constant-pool reference.
3668 unsigned OpNo) {
3669 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3670 "Unexpected number of operands!");
3671 
// Any index register would change the loaded value; bail out.
3672 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3673 if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3674 return nullptr;
3675 
3676 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3677 if (!Disp.isCPI() || Disp.getOffset() != 0)
3678 return nullptr;
3679 
3681 MI.getParent()->getParent()->getConstantPool()->getConstants();
3682 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3683 
3684 // Bail if this is a machine constant pool entry, we won't be able to dig out
3685 // anything useful.
3686 if (ConstantEntry.isMachineConstantPoolEntry())
3687 return nullptr;
3688 
3689 return ConstantEntry.Val.ConstVal;
3690 }
3691
// Fragment (signature missing from this extract): returns true for the
// TCRETURN* tail-call pseudo opcodes.
3693 switch (MI.getOpcode()) {
3694 case X86::TCRETURNdi:
3695 case X86::TCRETURNri:
3696 case X86::TCRETURNmi:
3697 case X86::TCRETURNdi64:
3698 case X86::TCRETURNri64:
3699 case X86::TCRETURNri64_ImpCall:
3700 case X86::TCRETURNmi64:
3701 return true;
3702 default:
3703 return false;
3704 }
3705 }
3706
// Fragment of canMakeTailCallConditional (leading signature lines are
// missing from this extract): decides whether a tail call may be predicated
// on a branch condition.
3709 const MachineInstr &TailCall) const {
3710 
3711 const MachineFunction *MF = TailCall.getMF();
3712 
3713 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3714 // Kernel patches thunk calls in runtime, these should never be conditional.
3715 const MachineOperand &Target = TailCall.getOperand(0);
3716 if (Target.isSymbol()) {
3717 StringRef Symbol(Target.getSymbolName());
3718 // this is currently only relevant to r11/kernel indirect thunk.
3719 if (Symbol == "__x86_indirect_thunk_r11")
3720 return false;
3721 }
3722 }
3723 
3724 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3725 TailCall.getOpcode() != X86::TCRETURNdi64) {
3726 // Only direct calls can be done with a conditional branch.
3727 return false;
3728 }
3729 
3730 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3731 // Conditional tail calls confuse the Win64 unwinder.
3732 return false;
3733 }
3734 
3735 assert(BranchCond.size() == 1);
3736 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3737 // Can't make a conditional tail call with this condition.
3738 return false;
3739 }
3740 
3742 if (X86FI->getTCReturnAddrDelta() != 0 ||
3743 TailCall.getOperand(1).getImm() != 0) {
3744 // A conditional tail call cannot do any stack adjustment.
3745 return false;
3746 }
3747 
3748 return true;
3749 }
3750
// Fragment of replaceBranchWithTailCall (several declaration lines are
// missing from this extract): finds the conditional branch matching
// BranchCond and replaces it with a conditional tail-call pseudo.
3753 const MachineInstr &TailCall) const {
3754 assert(canMakeTailCallConditional(BranchCond, TailCall));
3755 
// Walk backwards over debug instructions to locate the branch to replace.
3757 while (I != MBB.begin()) {
3758 --I;
3759 if (I->isDebugInstr())
3760 continue;
3761 if (!I->isBranch())
3762 assert(0 && "Can't find the branch to replace!");
3763 
3765 assert(BranchCond.size() == 1);
3766 if (CC != BranchCond[0].getImm())
3767 continue;
3768 
3769 break;
3770 }
3771 
3772 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3773 : X86::TCRETURNdi64cc;
3774 
3775 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3776 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3777 MIB.addImm(0); // Stack offset (not used).
3778 MIB->addOperand(BranchCond[0]); // Condition.
3779 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3780 
3781 // Add implicit uses and defs of all live regs potentially clobbered by the
3782 // call. This way they still appear live across the call.
3784 LiveRegs.addLiveOuts(MBB);
3786 LiveRegs.stepForward(*MIB, Clobbers);
3787 for (const auto &C : Clobbers) {
3788 MIB.addReg(C.first, RegState::Implicit);
3790 }
3791 
// The original branch is no longer needed.
3792 I->eraseFromParent();
3793 }
3794
3795 // Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3796 // not be a fallthrough MBB now due to layout changes). Return nullptr if the
3797 // fallthrough MBB cannot be identified.
// (The signature lines are missing from this extract.)
3800 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3801 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3802 // and fallthrough MBB. If we find more than one, we cannot identify the
3803 // fallthrough MBB and should return nullptr.
3804 MachineBasicBlock *FallthroughBB = nullptr;
3805 for (MachineBasicBlock *Succ : MBB->successors()) {
3806 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3807 continue;
3808 // Return a nullptr if we found more than one fallthrough successor.
3809 if (FallthroughBB && FallthroughBB != TBB)
3810 return nullptr;
3811 FallthroughBB = Succ;
3812 }
3813 return FallthroughBB;
3814 }
3815
// Core branch analysis: walks the block's terminators bottom-up, filling in
// TBB/FBB/Cond and collecting the conditional branch instructions. Returns
// true when the terminators cannot be analyzed. (Some parameter lines are
// missing from this extract.)
3816 bool X86InstrInfo::analyzeBranchImpl(
3819 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3820 
3821 // Start from the bottom of the block and work up, examining the
3822 // terminator instructions.
3824 MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3825 while (I != MBB.begin()) {
3826 --I;
3827 if (I->isDebugInstr())
3828 continue;
3829 
3830 // Working from the bottom, when we see a non-terminator instruction, we're
3831 // done.
3832 if (!isUnpredicatedTerminator(*I))
3833 break;
3834 
3835 // A terminator that isn't a branch can't easily be handled by this
3836 // analysis.
3837 if (!I->isBranch())
3838 return true;
3839 
3840 // Handle unconditional branches.
3841 if (I->getOpcode() == X86::JMP_1) {
3842 UnCondBrIter = I;
3843 
3844 if (!AllowModify) {
3845 TBB = I->getOperand(0).getMBB();
3846 continue;
3847 }
3848 
3849 // If the block has any instructions after a JMP, delete them.
3850 MBB.erase(std::next(I), MBB.end());
3851 
3852 Cond.clear();
3853 FBB = nullptr;
3854 
3855 // Delete the JMP if it's equivalent to a fall-through.
3856 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3857 TBB = nullptr;
3858 I->eraseFromParent();
3859 I = MBB.end();
3860 UnCondBrIter = MBB.end();
3861 continue;
3862 }
3863 
3864 // TBB is used to indicate the unconditional destination.
3865 TBB = I->getOperand(0).getMBB();
3866 continue;
3867 }
3868 
3869 // Handle conditional branches.
3870 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3871 if (BranchCode == X86::COND_INVALID)
3872 return true; // Can't handle indirect branch.
3873 
3874 // In practice we should never have an undef eflags operand, if we do
3875 // abort here as we are not prepared to preserve the flag.
3876 if (I->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->isUndef())
3877 return true;
3878 
3879 // Working from the bottom, handle the first conditional branch.
3880 if (Cond.empty()) {
3881 FBB = TBB;
3882 TBB = I->getOperand(0).getMBB();
3884 CondBranches.push_back(&*I);
3885 continue;
3886 }
3887 
3888 // Handle subsequent conditional branches. Only handle the case where all
3889 // conditional branches branch to the same destination and their condition
3890 // opcodes fit one of the special multi-branch idioms.
3891 assert(Cond.size() == 1);
3892 assert(TBB);
3893 
3894 // If the conditions are the same, we can leave them alone.
3895 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3896 auto NewTBB = I->getOperand(0).getMBB();
3897 if (OldBranchCode == BranchCode && TBB == NewTBB)
3898 continue;
3899 
3900 // If they differ, see if they fit one of the known patterns. Theoretically,
3901 // we could handle more patterns here, but we shouldn't expect to see them
3902 // if instruction selection has done a reasonable job.
3903 if (TBB == NewTBB &&
3904 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3905 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3906 BranchCode = X86::COND_NE_OR_P;
3907 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3908 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3909 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3910 return true;
3911 
3912 // X86::COND_E_AND_NP usually has two different branch destinations.
3913 //
3914 // JP B1
3915 // JE B2
3916 // JMP B1
3917 // B1:
3918 // B2:
3919 //
3920 // Here this condition branches to B2 only if NP && E. It has another
3921 // equivalent form:
3922 //
3923 // JNE B1
3924 // JNP B2
3925 // JMP B1
3926 // B1:
3927 // B2:
3928 //
3929 // Similarly it branches to B2 only if E && NP. That is why this condition
3930 // is named with COND_E_AND_NP.
3931 BranchCode = X86::COND_E_AND_NP;
3932 } else
3933 return true;
3934 
3935 // Update the MachineOperand.
3936 Cond[0].setImm(BranchCode);
3937 CondBranches.push_back(&*I);
3938 }
3939 
3940 return false;
3941 }
3942
// Fragment of X86InstrInfo::analyzeBranch (leading parameter lines are
// missing from this extract): thin wrapper over analyzeBranchImpl that
// discards the collected conditional-branch instructions.
3945 MachineBasicBlock *&FBB,
3947 bool AllowModify) const {
3948 SmallVector<MachineInstr *, 4> CondBranches;
3949 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3950 }
3951
// Fragment (signature missing from this extract): extracts the jump-table
// index from an instruction's memory displacement operand, or returns -1 if
// the displacement is not a jump-table reference.
3953 const MCInstrDesc &Desc = MI.getDesc();
3954 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3955 assert(MemRefBegin >= 0 && "instr should have memory operand");
3956 MemRefBegin += X86II::getOperandBias(Desc);
3957 
3958 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3959 if (!MO.isJTI())
3960 return -1;
3961 
3962 return MO.getIndex();
3963 }
3964
// Fragment (partial signature; a def-lookup line and the final return are
// missing from this extract): follows a virtual register back to a defining
// LEA and asks it for a jump-table index; -1 when the chain doesn't match.
3966 Register Reg) {
3967 if (!Reg.isVirtual())
3968 return -1;
3970 if (MI == nullptr)
3971 return -1;
3972 unsigned Opcode = MI->getOpcode();
3973 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3974 return -1;
3976 }
3977
// Fragment (signature missing from this extract): recognizes the two
// switch-jump lowerings (direct JMPnm through a jump table, and the PIC
// LEA+ADD+JMPnr sequence) and returns the jump-table index, or -1.
3979 unsigned Opcode = MI.getOpcode();
3980 // Switch-jump pattern for non-PIC code looks like:
3981 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3982 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3984 }
3985 // The pattern for PIC code looks like:
3986 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3987 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
3988 // %2 = ADD64rr %1, %0
3989 // JMP64r %2
3990 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
3991 Register Reg = MI.getOperand(0).getReg();
3992 if (!Reg.isVirtual())
3993 return -1;
3994 const MachineFunction &MF = *MI.getParent()->getParent();
3995 const MachineRegisterInfo &MRI = MF.getRegInfo();
3996 MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
3997 if (Add == nullptr)
3998 return -1;
3999 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
4000 return -1;
// Either ADD operand may be the LEA that materializes the table address.
4001 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
4002 if (JTI1 >= 0)
4003 return JTI1;
4004 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
4005 if (JTI2 >= 0)
4006 return JTI2;
4007 }
4008 return -1;
4009 }
4010
// Fragment of analyzeBranchPredicate (leading signature lines are missing
// from this extract): recognizes the `test %reg,%reg; je/jne` idiom and
// fills in the MachineBranchPredicate. Returns true on failure.
4012 MachineBranchPredicate &MBP,
4013 bool AllowModify) const {
4014 using namespace std::placeholders;
4015 
4017 SmallVector<MachineInstr *, 4> CondBranches;
4018 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
4019 AllowModify))
4020 return true;
4021 
4022 if (Cond.size() != 1)
4023 return true;
4024 
4025 assert(MBP.TrueDest && "expected!");
4026 
4027 if (!MBP.FalseDest)
4028 MBP.FalseDest = MBB.getNextNode();
4029 
4031 
// Find the last EFLAGS def in the block; any read after it means the
// condition has more than one consumer.
4032 MachineInstr *ConditionDef = nullptr;
4033 bool SingleUseCondition = true;
4034 
4036 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
4037 ConditionDef = &MI;
4038 break;
4039 }
4040 
4041 if (MI.readsRegister(X86::EFLAGS, TRI))
4042 SingleUseCondition = false;
4043 }
4044 
4045 if (!ConditionDef)
4046 return true;
4047 
4048 if (SingleUseCondition) {
4049 for (auto *Succ : MBB.successors())
4050 if (Succ->isLiveIn(X86::EFLAGS))
4051 SingleUseCondition = false;
4052 }
4053 
4054 MBP.ConditionDef = ConditionDef;
4055 MBP.SingleUseCondition = SingleUseCondition;
4056 
4057 // Currently we only recognize the simple pattern:
4058 //
4059 // test %reg, %reg
4060 // je %label
4061 //
4062 const unsigned TestOpcode =
4063 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
4064 
4065 if (ConditionDef->getOpcode() == TestOpcode &&
4066 ConditionDef->getNumOperands() == 3 &&
4067 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
4068 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
4069 MBP.LHS = ConditionDef->getOperand(0);
4070 MBP.RHS = MachineOperand::CreateImm(0);
4071 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
4072 ? MachineBranchPredicate::PRED_NE
4073 : MachineBranchPredicate::PRED_EQ;
4074 return false;
4075 }
4076 
4077 return true;
4078 }
4079
// Fragment of removeBranch (leading signature lines and some conditions are
// missing from this extract): erases trailing branch terminators from the
// block and returns how many were removed.
4081 int *BytesRemoved) const {
4082 assert(!BytesRemoved && "code size not handled");
4083 
4085 unsigned Count = 0;
4086 
4087 while (I != MBB.begin()) {
4088 --I;
4089 if (I->isDebugInstr())
4090 continue;
4091 if (I->getOpcode() != X86::JMP_1 &&
4093 break;
4094 // Remove the branch.
4095 I->eraseFromParent();
4096 I = MBB.end();
4097 ++Count;
4098 }
4099 
4100 return Count;
4101 }
4102
// Fragment of insertBranch (leading signature lines and the CC extraction
// are missing from this extract): emits JMP_1/JCC_1 instructions for the
// requested condition, synthesizing the composite NE_OR_P and E_AND_NP
// conditions with two conditional branches. Returns the number inserted.
4104 MachineBasicBlock *FBB,
4106 const DebugLoc &DL, int *BytesAdded) const {
4107 // Shouldn't be a fall through.
4108 assert(TBB && "insertBranch must not be told to insert a fallthrough");
4109 assert((Cond.size() == 1 || Cond.size() == 0) &&
4110 "X86 branch conditions have one component!");
4111 assert(!BytesAdded && "code size not handled");
4112 
4113 if (Cond.empty()) {
4114 // Unconditional branch?
4115 assert(!FBB && "Unconditional branch with multiple successors!");
4116 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
4117 return 1;
4118 }
4119 
4120 // If FBB is null, it is implied to be a fall-through block.
4121 bool FallThru = FBB == nullptr;
4122 
4123 // Conditional branch.
4124 unsigned Count = 0;
4126 switch (CC) {
4127 case X86::COND_NE_OR_P:
4128 // Synthesize NE_OR_P with two branches.
4129 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
4130 ++Count;
4131 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
4132 ++Count;
4133 break;
4134 case X86::COND_E_AND_NP:
4135 // Use the next block of MBB as FBB if it is null.
4136 if (FBB == nullptr) {
4137 FBB = getFallThroughMBB(&MBB, TBB);
4138 assert(FBB && "MBB cannot be the last block in function when the false "
4139 "body is a fall-through.");
4140 }
4141 // Synthesize COND_E_AND_NP with two branches.
4142 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
4143 ++Count;
4144 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
4145 ++Count;
4146 break;
4147 default: {
4148 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
4149 ++Count;
4150 }
4151 }
4152 if (!FallThru) {
4153 // Two-way Conditional branch. Insert the second branch.
4154 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
4155 ++Count;
4156 }
4157 return Count;
4158 }
4160
// Reports whether a select (predicated move) between TrueReg and FalseReg
// under Cond can be lowered with CMOV, and if so fills in the latency
// estimates for the condition and the two operands.
// NOTE(review): the opening signature lines of canInsertSelect were elided
// by extraction; the visible text starts at the register parameters.
4163 Register DstReg, Register TrueReg,
4164 Register FalseReg, int &CondCycles,
4165 int &TrueCycles, int &FalseCycles) const {
4166 // Not all subtargets have cmov instructions.
4167 if (!Subtarget.canUseCMOV())
4168 return false;
4169 if (Cond.size() != 1)
4170 return false;
4171 // We cannot do the composite conditions, at least not in SSA form.
// NOTE(review): the actual check line is elided here; presumably it
// rejects X86::COND_NE_OR_P / X86::COND_E_AND_NP condition codes --
// confirm against the upstream source.
4173 return false;
4174
4175 // Check register classes.
4176 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Both operands must share a register class for a single CMOV to copy
// either one into DstReg.
4177 const TargetRegisterClass *RC =
4178 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4179 if (!RC)
4180 return false;
4181
4182 // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
4183 if (X86::GR16RegClass.hasSubClassEq(RC) ||
4184 X86::GR32RegClass.hasSubClassEq(RC) ||
4185 X86::GR64RegClass.hasSubClassEq(RC)) {
4186 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4187 // Bridge. Probably Ivy Bridge as well.
4188 CondCycles = 2;
4189 TrueCycles = 2;
4190 FalseCycles = 2;
4191 return true;
4192 }
4193
4194 // Can't do vectors.
4195 return false;
4196}
4197
// Emits a CMOV implementing `DstReg = Cond ? TrueReg : FalseReg` before I.
// NOTE(review): the opening signature lines of insertSelect were elided by
// extraction; the visible text starts at the DebugLoc parameter.
4200 const DebugLoc &DL, Register DstReg,
4202 Register FalseReg) const {
4203 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// NOTE(review): a line is elided here; presumably the declaration of TRI
// (the target register info used below) -- confirm against upstream.
4205 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4206 assert(Cond.size() == 1 && "Invalid Cond array");
// Pick a CMOV opcode sized to the destination class (NDD form when the
// subtarget supports APX new-data-destination encodings).
4207 unsigned Opc =
4208 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4209 false /*HasMemoryOperand*/, Subtarget.hasNDD());
// Note the operand order: CMOV keeps the first source when the condition
// is false, so FalseReg comes first and TrueReg second.
4210 BuildMI(MBB, I, DL, get(Opc), DstReg)
4211 .addReg(FalseReg)
4212 .addReg(TrueReg)
4213 .addImm(Cond[0].getImm());
4214}
4215
4216/// Test if the given register is a physical h register.
4217static bool isHReg(Register Reg) {
// The GR8_ABCD_H class holds exactly the legacy high-byte registers
// (AH/BH/CH/DH), which need special (no-REX) encoding treatment.
4218 return X86::GR8_ABCD_HRegClass.contains(Reg);
4219}
4220
4221// Try and copy between VR128/VR64 and GR64 registers.
// Returns the single-instruction opcode that copies SrcReg to DestReg when
// the two registers live in different (asymmetric) register classes, or 0
// if no single instruction exists for the pair.
4222static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg,
4223 const X86Subtarget &Subtarget) {
4224 bool HasAVX = Subtarget.hasAVX();
4225 bool HasAVX512 = Subtarget.hasAVX512();
4226 bool HasEGPR = Subtarget.hasEGPR();
4227
4228 // SrcReg(MaskReg) -> DestReg(GR64)
4229 // SrcReg(MaskReg) -> DestReg(GR32)
4230
4231 // All KMASK RegClasses hold the same k registers, can be tested against
4232 // anyone.
4233 if (X86::VK16RegClass.contains(SrcReg)) {
// 64-bit mask moves require AVX512BW (KMOVQ).
4234 if (X86::GR64RegClass.contains(DestReg)) {
4235 assert(Subtarget.hasBWI());
4236 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4237 }
// Without BWI only the 16-bit KMOVW form exists; EGPR (APX) selects the
// extended EVEX encodings.
4238 if (X86::GR32RegClass.contains(DestReg))
4239 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4240 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4241 }
4242
4243 // SrcReg(GR64) -> DestReg(MaskReg)
4244 // SrcReg(GR32) -> DestReg(MaskReg)
4245
4246 // All KMASK RegClasses hold the same k registers, can be tested against
4247 // anyone.
4248 if (X86::VK16RegClass.contains(DestReg)) {
4249 if (X86::GR64RegClass.contains(SrcReg)) {
4250 assert(Subtarget.hasBWI());
4251 return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4252 }
4253 if (X86::GR32RegClass.contains(SrcReg))
4254 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4255 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4256 }
4257
4258 // SrcReg(VR128) -> DestReg(GR64)
4259 // SrcReg(VR64) -> DestReg(GR64)
4260 // SrcReg(GR64) -> DestReg(VR128)
4261 // SrcReg(GR64) -> DestReg(VR64)
4262
4263 if (X86::GR64RegClass.contains(DestReg)) {
4264 if (X86::VR128XRegClass.contains(SrcReg))
4265 // Copy from a VR128 register to a GR64 register.
4266 return HasAVX512 ? X86::VMOVPQIto64Zrr
4267 : HasAVX ? X86::VMOVPQIto64rr
4268 : X86::MOVPQIto64rr;
4269 if (X86::VR64RegClass.contains(SrcReg))
4270 // Copy from a VR64 register to a GR64 register.
4271 return X86::MMX_MOVD64from64rr;
4272 } else if (X86::GR64RegClass.contains(SrcReg)) {
4273 // Copy from a GR64 register to a VR128 register.
4274 if (X86::VR128XRegClass.contains(DestReg))
4275 return HasAVX512 ? X86::VMOV64toPQIZrr
4276 : HasAVX ? X86::VMOV64toPQIrr
4277 : X86::MOV64toPQIrr;
4278 // Copy from a GR64 register to a VR64 register.
4279 if (X86::VR64RegClass.contains(DestReg))
4280 return X86::MMX_MOVD64to64rr;
4281 }
4282
4283 // SrcReg(VR128) -> DestReg(GR32)
4284 // SrcReg(GR32) -> DestReg(VR128)
4285
4286 if (X86::GR32RegClass.contains(DestReg) &&
4287 X86::VR128XRegClass.contains(SrcReg))
4288 // Copy from a VR128 register to a GR32 register.
4289 return HasAVX512 ? X86::VMOVPDI2DIZrr
4290 : HasAVX ? X86::VMOVPDI2DIrr
4291 : X86::MOVPDI2DIrr;
4292
4293 if (X86::VR128XRegClass.contains(DestReg) &&
4294 X86::GR32RegClass.contains(SrcReg))
4295 // Copy from a GR32 register to a VR128 register.
4296 return HasAVX512 ? X86::VMOVDI2PDIZrr
4297 : HasAVX ? X86::VMOVDI2PDIrr
4298 : X86::MOVDI2PDIrr;
4299
// No single instruction covers this register-class pair.
4300 return 0;
4301}
4302
// Emits a register-to-register copy from SrcReg to DestReg before MI,
// choosing the opcode from the (common) register class of the pair; falls
// back to CopyToFromAsymmetricReg for cross-class copies and reports a
// fatal error when no copy instruction exists.
// NOTE(review): the opening signature lines of copyPhysReg were elided by
// extraction; the visible text starts at the DebugLoc parameter.
4305 const DebugLoc &DL, Register DestReg,
4306 Register SrcReg, bool KillSrc,
4307 bool RenamableDest, bool RenamableSrc) const {
4308 // First deal with the normal symmetric copies.
4309 bool HasAVX = Subtarget.hasAVX();
4310 bool HasVLX = Subtarget.hasVLX();
4311 bool HasEGPR = Subtarget.hasEGPR();
4312 unsigned Opc = 0;
4313 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4314 Opc = X86::MOV64rr;
4315 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4316 Opc = X86::MOV32rr;
4317 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4318 Opc = X86::MOV16rr;
4319 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4320 // Copying to or from a physical H register on x86-64 requires a NOREX
4321 // move. Otherwise use a normal move.
4322 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4323 Opc = X86::MOV8rr_NOREX;
4324 // Both operands must be encodable without an REX prefix.
4325 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4326 "8-bit H register can not be copied outside GR8_NOREX");
4327 } else
4328 Opc = X86::MOV8rr;
4329 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4330 Opc = X86::MMX_MOVQ64rr;
4331 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4332 if (HasVLX)
4333 Opc = X86::VMOVAPSZ128rr;
4334 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4335 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4336 else {
4337 // If this an extended register and we don't have VLX we need to use a
4338 // 512-bit move.
4339 Opc = X86::VMOVAPSZrr;
// NOTE(review): a line is elided here; presumably the declaration of TRI
// used by the getMatchingSuperReg calls below -- confirm against upstream.
4341 DestReg =
4342 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4343 SrcReg =
4344 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4345 }
4346 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4347 if (HasVLX)
4348 Opc = X86::VMOVAPSZ256rr;
4349 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4350 Opc = X86::VMOVAPSYrr;
4351 else {
4352 // If this an extended register and we don't have VLX we need to use a
4353 // 512-bit move.
4354 Opc = X86::VMOVAPSZrr;
// NOTE(review): a line is elided here; presumably the TRI declaration as
// in the VR128X case above -- confirm against upstream.
4356 DestReg =
4357 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4358 SrcReg =
4359 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4360 }
4361 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4362 Opc = X86::VMOVAPSZrr;
4363 // All KMASK RegClasses hold the same k registers, can be tested against
4364 // anyone.
4365 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
// NOTE(review): the no-BWI EGPR branch selects KMOVQkk_EVEX here; the
// upstream source uses KMOVWkk_EVEX in that position (KMOVQ requires BWI)
// -- this looks like an extraction error or a real bug; verify.
4366 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4367 : (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVWkk);
4368
// No symmetric opcode matched: try the cross-class (asymmetric) table.
4369 if (!Opc)
4370 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4371
4372 if (Opc) {
4373 BuildMI(MBB, MI, DL, get(Opc), DestReg)
4374 .addReg(SrcReg, getKillRegState(KillSrc));
4375 return;
4376 }
4377
4378 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4379 // FIXME: We use a fatal error here because historically LLVM has tried
4380 // lower some of these physreg copies and we want to ensure we get
4381 // reasonable bug reports if someone encounters a case no other testing
4382 // found. This path should be removed after the LLVM 7 release.
4383 report_fatal_error("Unable to copy EFLAGS physical register!");
4384 }
4385
4386 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4387 << RI.getName(DestReg) << '\n');
4388 report_fatal_error("Cannot emit physreg copy instruction");
4389}
4390
// Identifies MI as a plain register move and returns its destination/source
// operand pair, or nullopt when MI is not a simple copy.
// NOTE(review): the line carrying the function name
// (X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const) was elided
// by extraction.
4391std::optional<DestSourcePair>
4393 if (MI.isMoveReg()) {
4394 // FIXME: Dirty hack for apparent invariant that doesn't hold when
4395 // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4396 // were asserted as 0 are now undef.
4397 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4398 return std::nullopt;
4399
4400 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4401 }
4402 return std::nullopt;
4403}
4404
4405static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4406 if (STI.hasFP16())
4407 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4408 if (Load)
4409 return X86::MOVSHPrm;
4410 return X86::MOVSHPmr;
4411}
4412
// Selects the opcode used to spill (Load == false) or reload (Load == true)
// a register of class RC to/from a stack slot, keyed on the class's spill
// size and the available ISA extensions. IsStackAligned selects aligned
// vector moves when the slot is known to be sufficiently aligned.
// NOTE(review): the opening signature line of getLoadStoreRegOpcode was
// elided by extraction; the visible text starts at the RC parameter.
4414 const TargetRegisterClass *RC,
4415 bool IsStackAligned,
4416 const X86Subtarget &STI, bool Load) {
4417 bool HasAVX = STI.hasAVX();
4418 bool HasAVX512 = STI.hasAVX512();
4419 bool HasVLX = STI.hasVLX();
4420 bool HasEGPR = STI.hasEGPR();
4421
4422 assert(RC != nullptr && "Invalid target register class");
4423 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4424 default:
4425 llvm_unreachable("Unknown spill size");
4426 case 1:
4427 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
4428 if (STI.is64Bit())
4429 // Copying to or from a physical H register on x86-64 requires a NOREX
4430 // move. Otherwise use a normal move.
4431 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4432 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4433 return Load ? X86::MOV8rm : X86::MOV8mr;
4434 case 2:
4435 if (X86::VK16RegClass.hasSubClassEq(RC))
4436 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4437 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4438 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4439 return Load ? X86::MOV16rm : X86::MOV16mr;
4440 case 4:
4441 if (X86::GR32RegClass.hasSubClassEq(RC))
4442 return Load ? X86::MOV32rm : X86::MOV32mr;
4443 if (X86::FR32XRegClass.hasSubClassEq(RC))
4444 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4445 : HasAVX ? X86::VMOVSSrm_alt
4446 : X86::MOVSSrm_alt)
4447 : (HasAVX512 ? X86::VMOVSSZmr
4448 : HasAVX ? X86::VMOVSSmr
4449 : X86::MOVSSmr);
4450 if (X86::RFP32RegClass.hasSubClassEq(RC))
4451 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4452 if (X86::VK32RegClass.hasSubClassEq(RC)) {
4453 assert(STI.hasBWI() && "KMOVD requires BWI");
4454 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4455 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4456 }
4457 // All of these mask pair classes have the same spill size, the same kind
4458 // of kmov instructions can be used with all of them.
4459 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4460 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4461 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4462 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4463 X86::VK16PAIRRegClass.hasSubClassEq(RC))
4464 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
// 16-bit FP values spill into a 4-byte slot; delegate to the FP16 helper.
4465 if (X86::FR16RegClass.hasSubClassEq(RC) ||
4466 X86::FR16XRegClass.hasSubClassEq(RC))
4467 return getLoadStoreOpcodeForFP16(Load, STI);
4468 llvm_unreachable("Unknown 4-byte regclass");
4469 case 8:
4470 if (X86::GR64RegClass.hasSubClassEq(RC))
4471 return Load ? X86::MOV64rm : X86::MOV64mr;
4472 if (X86::FR64XRegClass.hasSubClassEq(RC))
4473 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4474 : HasAVX ? X86::VMOVSDrm_alt
4475 : X86::MOVSDrm_alt)
4476 : (HasAVX512 ? X86::VMOVSDZmr
4477 : HasAVX ? X86::VMOVSDmr
4478 : X86::MOVSDmr);
4479 if (X86::VR64RegClass.hasSubClassEq(RC))
4480 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4481 if (X86::RFP64RegClass.hasSubClassEq(RC))
4482 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4483 if (X86::VK64RegClass.hasSubClassEq(RC)) {
4484 assert(STI.hasBWI() && "KMOVQ requires BWI");
4485 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4486 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4487 }
4488 llvm_unreachable("Unknown 8-byte regclass");
4489 case 10:
4490 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4491 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4492 case 16: {
4493 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4494 // If stack is realigned we can use aligned stores.
4495 if (IsStackAligned)
4496 return Load ? (HasVLX ? X86::VMOVAPSZ128rm
4497 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4498 : HasAVX ? X86::VMOVAPSrm
4499 : X86::MOVAPSrm)
4500 : (HasVLX ? X86::VMOVAPSZ128mr
4501 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4502 : HasAVX ? X86::VMOVAPSmr
4503 : X86::MOVAPSmr);
4504 else
4505 return Load ? (HasVLX ? X86::VMOVUPSZ128rm
4506 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4507 : HasAVX ? X86::VMOVUPSrm
4508 : X86::MOVUPSrm)
4509 : (HasVLX ? X86::VMOVUPSZ128mr
4510 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4511 : HasAVX ? X86::VMOVUPSmr
4512 : X86::MOVUPSmr);
4513 }
4514 llvm_unreachable("Unknown 16-byte regclass");
4515 }
4516 case 32:
4517 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4518 // If stack is realigned we can use aligned stores.
4519 if (IsStackAligned)
4520 return Load ? (HasVLX ? X86::VMOVAPSZ256rm
4521 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4522 : X86::VMOVAPSYrm)
4523 : (HasVLX ? X86::VMOVAPSZ256mr
4524 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4525 : X86::VMOVAPSYmr);
4526 else
4527 return Load ? (HasVLX ? X86::VMOVUPSZ256rm
4528 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4529 : X86::VMOVUPSYrm)
4530 : (HasVLX ? X86::VMOVUPSZ256mr
4531 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4532 : X86::VMOVUPSYmr);
4533 case 64:
4534 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4535 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4536 if (IsStackAligned)
4537 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4538 else
4539 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4540 case 1024:
4541 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4542 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
// AMX tiles use the TILELOADD/TILESTORED pseudos, with EVEX forms under
// APX extended GPRs (the caller expands these via loadStoreTileReg).
4543#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4544 return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4545 : GET_EGPR_IF_ENABLED(X86::TILESTORED);
4546#undef GET_EGPR_IF_ENABLED
4547 }
4548}
4549
// Decodes MemI's memory operands (base, scale, index, displacement) into an
// ExtAddrMode, or nullopt when the instruction has no memory reference, a
// frame-index base, or a symbolic displacement.
// NOTE(review): the line carrying the function name
// (X86InstrInfo::getAddrModeFromMemoryOp) was elided by extraction.
4550std::optional<ExtAddrMode>
4552 const TargetRegisterInfo *TRI) const {
4553 const MCInstrDesc &Desc = MemI.getDesc();
4554 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4555 if (MemRefBegin < 0)
4556 return std::nullopt;
4557
// Skip any leading tied/implicit operands before the memory reference.
4558 MemRefBegin += X86II::getOperandBias(Desc);
4559
4560 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4561 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4562 return std::nullopt;
4563
4564 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4565 // Displacement can be symbolic
4566 if (!DispMO.isImm())
4567 return std::nullopt;
4568
4569 ExtAddrMode AM;
4570 AM.BaseReg = BaseOp.getReg();
4571 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4572 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4573 AM.Displacement = DispMO.getImm();
4574 return AM;
4575}
4576
// Machine verifier hook: checks that any memory operand of MI uses a legal
// x86 addressing mode (scale in {1,2,4,8}, 32-bit signed displacement).
// Returns true when the instruction verifies; on failure sets ErrInfo and
// returns false.
// NOTE(review): the opening signature line of verifyInstruction was elided
// by extraction; the visible text starts at the ErrInfo parameter.
4578 StringRef &ErrInfo) const {
4579 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
// Instructions without a decodable memory operand have nothing to verify.
4580 if (!AMOrNone)
4581 return true;
4582
4583 ExtAddrMode AM = *AMOrNone;
// NOTE(review): a line is elided here; presumably an assert on AM.Form --
// confirm against the upstream source.
4585 if (AM.ScaledReg != X86::NoRegister) {
4586 switch (AM.Scale) {
4587 case 1:
4588 case 2:
4589 case 4:
4590 case 8:
4591 break;
4592 default:
4593 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4594 return false;
4595 }
4596 }
4597 if (!isInt<32>(AM.Displacement)) {
4598 ErrInfo = "Displacement in address must fit into 32-bit signed "
4599 "integer";
4600 return false;
4601 }
4602
4603 return true;
4604}
4605
// Determines whether MI defines Reg with a constant immediate; on success
// stores the constant into ImmVal and returns true. Understands MOV32r0,
// the MOV*ri immediate moves, and the SUBREG_TO_REG pattern used to build
// 64-bit immediates from 32-bit moves.
// NOTE(review): the opening signature lines of getConstValDefinedInReg were
// elided by extraction; the visible text starts at the Reg parameter.
4607 const Register Reg,
4608 int64_t &ImmVal) const {
4609 Register MovReg = Reg;
4610 const MachineInstr *MovMI = &MI;
4611
4612 // Follow use-def for SUBREG_TO_REG to find the real move immediate
4613 // instruction. It is quite common for x86-64.
4614 if (MI.isSubregToReg()) {
4615 // We use following pattern to setup 64b immediate.
4616 // %8:gr32 = MOV32r0 implicit-def dead $eflags
4617 // %6:gr64 = SUBREG_TO_REG killed %8:gr32, %subreg.sub_32bit
4618 unsigned SubIdx = MI.getOperand(2).getImm();
4619 MovReg = MI.getOperand(1).getReg();
// Only the sub_32bit insertion guarantees the upper bits are zero.
4620 if (SubIdx != X86::sub_32bit)
4621 return false;
4622 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4623 MovMI = MRI.getUniqueVRegDef(MovReg);
4624 if (!MovMI)
4625 return false;
4626 }
4627
// MOV32r0 is the idiomatic zeroing instruction (xor-based pseudo).
4628 if (MovMI->getOpcode() == X86::MOV32r0 &&
4629 MovMI->getOperand(0).getReg() == MovReg) {
4630 ImmVal = 0;
4631 return true;
4632 }
4633
4634 if (MovMI->getOpcode() != X86::MOV32ri &&
4635 MovMI->getOpcode() != X86::MOV64ri &&
4636 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4637 return false;
4638 // Mov Src can be a global address.
4639 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4640 return false;
4641 ImmVal = MovMI->getOperand(1).getImm();
4642 return true;
4643}
4644
// Returns true when executing MI cannot change the value of NullValueReg,
// assuming NullValueReg currently holds zero: either MI does not write the
// register at all, or it is an operation (shift of zero by an immediate,
// self zero-extending MOV32rr) that maps zero back to zero.
// NOTE(review): the opening signature line of preservesZeroValueInReg was
// elided by extraction; the visible text starts at the MI parameter.
4646 const MachineInstr *MI, const Register NullValueReg,
4647 const TargetRegisterInfo *TRI) const {
4648 if (!MI->modifiesRegister(NullValueReg, TRI))
4649 return true;
4650 switch (MI->getOpcode()) {
4651 // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax
4652 // X.
4653 case X86::SHR64ri:
4654 case X86::SHR32ri:
4655 case X86::SHL64ri:
4656 case X86::SHL32ri:
4657 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4658 "expected for shift opcode!");
4659 return MI->getOperand(0).getReg() == NullValueReg &&
4660 MI->getOperand(1).getReg() == NullValueReg;
4661 // Zero extend of a sub-reg of NullValueReg into itself does not change the
4662 // null value.
4663 case X86::MOV32rr:
// Both the def and the use must lie within NullValueReg for the zero
// value to be preserved.
4664 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4665 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4666 });
// Conservatively assume any other writer may install a non-zero value.
4667 default:
4668 return false;
4669 }
4670 llvm_unreachable("Should be handled above!");
4671}
4672
// Decomposes MemOp's address into a single base operand plus an immediate
// offset (for load/store clustering). Only handles the simple form with
// scale 1 and no index register; returns false for anything more complex.
// NOTE(review): the opening signature lines of getMemOperandsWithOffsetWidth
// were elided by extraction; the visible text starts at the Offset
// parameter.
4675 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
4676 const TargetRegisterInfo *TRI) const {
4677 const MCInstrDesc &Desc = MemOp.getDesc();
4678 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4679 if (MemRefBegin < 0)
4680 return false;
4681
4682 MemRefBegin += X86II::getOperandBias(Desc);
4683
4684 const MachineOperand *BaseOp =
4685 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4686 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4687 return false;
4688
// Only base + displacement addressing is representable: reject a scale
// other than 1 or the presence of an index register.
4689 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4690 return false;
4691
4692 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4693 X86::NoRegister)
4694 return false;
4695
4696 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4697
4698 // Displacement can be symbolic
4699 if (!DispMO.isImm())
4700 return false;
4701
4702 Offset = DispMO.getImm();
4703
4704 if (!BaseOp->isReg())
4705 return false;
4706
// x86 addresses are byte offsets; never scalable.
4707 OffsetIsScalable = false;
4708 // FIXME: Relying on memoperands() may not be right thing to do here. Check
4709 // with X86 maintainers, and fix it accordingly. For now, it is ok, since
4710 // there is no use of `Width` for X86 back-end at the moment.
// NOTE(review): the else-branch line of this conditional expression is
// elided; presumably `: LocationSize::beforeOrAfterPointer();` -- confirm
// against the upstream source.
4711 Width = !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize()
4713 BaseOps.push_back(BaseOp);
4714 return true;
4715}
4716
4717static unsigned getStoreRegOpcode(Register SrcReg,
4718 const TargetRegisterClass *RC,
4719 bool IsStackAligned,
4720 const X86Subtarget &STI) {
4721 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4722}
4723
4724static unsigned getLoadRegOpcode(Register DestReg,
4725 const TargetRegisterClass *RC,
4726 bool IsStackAligned, const X86Subtarget &STI) {
4727 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4728}
4729
4730static bool isAMXOpcode(unsigned Opc) {
4731 switch (Opc) {
4732 default:
4733 return false;
4734 case X86::TILELOADD:
4735 case X86::TILESTORED:
4736 case X86::TILELOADD_EVEX:
4737 case X86::TILESTORED_EVEX:
4738 return true;
4739 }
4740}
4741
// Expands an AMX tile spill/reload: materializes the row stride (64 bytes)
// into a fresh virtual GR64 register and emits the TILELOADD/TILESTORED
// with a frame reference whose index register is that stride register.
// NOTE(review): the opening signature lines of loadStoreTileReg were elided
// by extraction; the visible text starts at the Opc parameter.
4744 unsigned Opc, Register Reg, int FrameIdx,
4745 bool isKill) const {
4746 switch (Opc) {
4747 default:
4748 llvm_unreachable("Unexpected special opcode!");
4749 case X86::TILESTORED:
4750 case X86::TILESTORED_EVEX: {
4751 // tilestored %tmm, (%sp, %idx)
4752 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4753 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
// The tile row stride is a full cache line: 64 bytes.
4754 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4755 MachineInstr *NewMI =
4756 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4757 .addReg(Reg, getKillRegState(isKill));
// NOTE(review): a line is elided here; presumably it fetches the index
// operand of NewMI into MO (e.g. getOperand(1 + X86::AddrIndexReg)) --
// confirm against the upstream source.
4759 MO.setReg(VirtReg);
4760 MO.setIsKill(true);
4761 break;
4762 }
4763 case X86::TILELOADD:
4764 case X86::TILELOADD_EVEX: {
4765 // tileloadd (%sp, %idx), %tmm
4766 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4767 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4768 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
// NOTE(review): a line is elided here; presumably
// `MachineInstr *NewMI = addFrameReference(` -- confirm against upstream.
4770 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
// NOTE(review): a line is elided here; presumably it fetches the index
// operand of NewMI into MO -- confirm against the upstream source.
4772 MO.setReg(VirtReg);
4773 MO.setIsKill(true);
4774 break;
4775 }
4776 }
4777}
4778
// Spills SrcReg to stack slot FrameIdx, picking an (aligned when possible)
// store opcode for the register class; AMX tiles are routed through
// loadStoreTileReg for the special stride-register expansion.
// NOTE(review): the opening signature lines of storeRegToStackSlot were
// elided by extraction; the visible text starts at the isKill parameter.
4781 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4782
4783 Register VReg, MachineInstr::MIFlag Flags) const {
4784 const MachineFunction &MF = *MBB.getParent();
4785 const MachineFrameInfo &MFI = MF.getFrameInfo();
4786 assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
4787 "Stack slot too small for store");
4788
// Aligned vector moves are usable when the frame guarantees at least the
// spill size (capped at 16) or the stack can be realigned for this slot.
4789 unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
4790 bool isAligned =
4791 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4792 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4793
4794 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
4795 if (isAMXOpcode(Opc))
4796 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4797 else
4798 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4799 .addReg(SrcReg, getKillRegState(isKill))
4800 .setMIFlag(Flags);
4801}
4802
// Reloads DestReg from stack slot FrameIdx, mirroring storeRegToStackSlot:
// picks an (aligned when possible) load opcode for the register class and
// routes AMX tiles through loadStoreTileReg.
// NOTE(review): the opening signature lines of loadRegFromStackSlot were
// elided by extraction; the visible text starts at the DestReg parameter.
4805 Register DestReg, int FrameIdx,
4806 const TargetRegisterClass *RC,
4807 Register VReg, unsigned SubReg,
4808 MachineInstr::MIFlag Flags) const {
4809 const MachineFunction &MF = *MBB.getParent();
4810 const MachineFrameInfo &MFI = MF.getFrameInfo();
4811 assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
4812 "Load size exceeds stack slot");
// Same alignment computation as the store path (see storeRegToStackSlot).
4813 unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
4814 bool isAligned =
4815 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4816 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4817
4818 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
4819 if (isAMXOpcode(Opc))
4820 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4821 else
4822 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx)
4823 .setMIFlag(Flags);
4824}
4825
// Decodes a compare-like instruction (CMP/TEST/SUB) into its two source
// registers and an optional immediate. CmpMask == ~0 marks a valid
// register-vs-immediate compare (CmpValue holds the immediate);
// CmpMask == 0 with SrcReg2 == 0 marks forms where only an identical
// instruction can be considered redundant. Returns false for anything that
// is not a recognized compare.
// NOTE(review): the opening signature line of analyzeCompare was elided by
// extraction; the visible text starts at the SrcReg2 parameter.
4827 Register &SrcReg2, int64_t &CmpMask,
4828 int64_t &CmpValue) const {
4829 switch (MI.getOpcode()) {
4830 default:
4831 break;
4832 case X86::CMP64ri32:
4833 case X86::CMP32ri:
4834 case X86::CMP16ri:
4835 case X86::CMP8ri:
4836 SrcReg = MI.getOperand(0).getReg();
4837 SrcReg2 = 0;
4838 if (MI.getOperand(1).isImm()) {
4839 CmpMask = ~0;
4840 CmpValue = MI.getOperand(1).getImm();
4841 } else {
// Symbolic immediate (e.g. a global); can only match identically.
4842 CmpMask = CmpValue = 0;
4843 }
4844 return true;
4845 // A SUB can be used to perform comparison.
// SUB reg, mem: the memory operand cannot be tracked, so no immediate.
4846 CASE_ND(SUB64rm)
4847 CASE_ND(SUB32rm)
4848 CASE_ND(SUB16rm)
4849 CASE_ND(SUB8rm)
4850 SrcReg = MI.getOperand(1).getReg();
4851 SrcReg2 = 0;
4852 CmpMask = 0;
4853 CmpValue = 0;
4854 return true;
4855 CASE_ND(SUB64rr)
4856 CASE_ND(SUB32rr)
4857 CASE_ND(SUB16rr)
4858 CASE_ND(SUB8rr)
4859 SrcReg = MI.getOperand(1).getReg();
4860 SrcReg2 = MI.getOperand(2).getReg();
4861 CmpMask = 0;
4862 CmpValue = 0;
4863 return true;
4864 CASE_ND(SUB64ri32)
4865 CASE_ND(SUB32ri)
4866 CASE_ND(SUB16ri)
4867 CASE_ND(SUB8ri)
4868 SrcReg = MI.getOperand(1).getReg();
4869 SrcReg2 = 0;
4870 if (MI.getOperand(2).isImm()) {
4871 CmpMask = ~0;
4872 CmpValue = MI.getOperand(2).getImm();
4873 } else {
4874 CmpMask = CmpValue = 0;
4875 }
4876 return true;
4877 case X86::CMP64rr:
4878 case X86::CMP32rr:
4879 case X86::CMP16rr:
4880 case X86::CMP8rr:
4881 SrcReg = MI.getOperand(0).getReg();
4882 SrcReg2 = MI.getOperand(1).getReg();
4883 CmpMask = 0;
4884 CmpValue = 0;
4885 return true;
4886 case X86::TEST8rr:
4887 case X86::TEST16rr:
4888 case X86::TEST32rr:
4889 case X86::TEST64rr:
4890 SrcReg = MI.getOperand(0).getReg();
// Only handle TEST reg, reg with identical operands, which is the
// idiomatic compare-against-zero.
4891 if (MI.getOperand(1).getReg() != SrcReg)
4892 return false;
4893 // Compare against zero.
4894 SrcReg2 = 0;
4895 CmpMask = ~0;
4896 CmpValue = 0;
4897 return true;
4898 case X86::TEST64ri32:
4899 case X86::TEST32ri:
4900 case X86::TEST16ri:
4901 case X86::TEST8ri:
4902 SrcReg = MI.getOperand(0).getReg();
4903 SrcReg2 = 0;
4904 // Force identical compare.
4905 CmpMask = 0;
4906 CmpValue = 0;
4907 return true;
4908 }
4909 return false;
4910}
4911
4912bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4913 Register SrcReg, Register SrcReg2,
4914 int64_t ImmMask, int64_t ImmValue,
4915 const MachineInstr &OI, bool *IsSwapped,
4916 int64_t *ImmDelta) const {
4917 switch (OI.getOpcode()) {
4918 case X86::CMP64rr:
4919 case X86::CMP32rr:
4920 case X86::CMP16rr:
4921 case X86::CMP8rr:
4922 CASE_ND(SUB64rr)
4923 CASE_ND(SUB32rr)
4924 CASE_ND(SUB16rr)
4925 CASE_ND(SUB8rr) {
4926 Register OISrcReg;
4927 Register OISrcReg2;
4928 int64_t OIMask;
4929 int64_t OIValue;
4930 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4931 OIMask != ImmMask || OIValue != ImmValue)
4932 return false;
4933 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4934 *IsSwapped = false;
4935 return true;
4936 }
4937 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4938 *IsSwapped = true;
4939 return true;
4940 }
4941 return false;
4942 }
4943 case X86::CMP64ri32:
4944 case X86::CMP32ri:
4945 case X86::CMP16ri:
4946 case X86::CMP8ri:
4947 case X86::TEST64ri32:
4948 case X86::TEST32ri:
4949 case X86::TEST16ri:
4950 case X86::TEST8ri:
4951 CASE_ND(SUB64ri32)
4952 CASE_ND(SUB32ri)
4953 CASE_ND(SUB16ri)
4954 CASE_ND(SUB8ri)
4955 case X86::TEST64rr:
4956 case X86::TEST32rr:
4957 case X86::TEST16rr:
4958 case X86::TEST8rr: {
4959 if (ImmMask != 0) {
4960 Register OISrcReg;
4961 Register OISrcReg2;
4962 int64_t OIMask;
4963 int64_t OIValue;
4964 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4965 SrcReg == OISrcReg && ImmMask == OIMask) {
4966 if (OIValue == ImmValue) {
4967 *ImmDelta = 0;
4968 return true;
4969 } else if (static_cast<uint64_t>(ImmValue) ==
4970 static_cast<uint64_t>(OIValue) - 1) {
4971 *ImmDelta = -1;
4972 return true;
4973 } else if (static_cast<uint64_t>(ImmValue) ==
4974 static_cast<uint64_t>(OIValue) + 1) {
4975 *ImmDelta = 1;
4976 return true;
4977 } else {
4978 return false;
4979 }
4980 }
4981 }
4982 return FlagI.isIdenticalTo(OI);
4983 }
4984 default:
4985 return false;
4986 }
4987}
4988
4989/// Check whether the definition can be converted
4990/// to remove a comparison against zero.
4991inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4992 bool &ClearsOverflowFlag) {
4993 NoSignFlag = false;
4994 ClearsOverflowFlag = false;
4995
4996 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
4997 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
4998 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
4999 // on the EFLAGS modification of ADD actually happening in the final binary.
5000 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
5001 unsigned Flags = MI.getOperand(5).getTargetFlags();
5002 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
5003 Flags == X86II::MO_GOTNTPOFF)
5004 return false;
5005 }
5006
5007 switch (MI.getOpcode()) {
5008 default:
5009 return false;
5010
5011 // The shift instructions only modify ZF if their shift count is non-zero.
5012 // N.B.: The processor truncates the shift count depending on the encoding.
5013 CASE_ND(SAR8ri)
5014 CASE_ND(SAR16ri)
5015 CASE_ND(SAR32ri)
5016 CASE_ND(SAR64ri)
5017 CASE_ND(SHR8ri)
5018 CASE_ND(SHR16ri)
5019 CASE_ND(SHR32ri)
5020 CASE_ND(SHR64ri)
5021 return getTruncatedShiftCount(MI, 2) != 0;
5022
5023 // Some left shift instructions can be turned into LEA instructions but only
5024 // if their flags aren't used. Avoid transforming such instructions.
5025 CASE_ND(SHL8ri)
5026 CASE_ND(SHL16ri)
5027 CASE_ND(SHL32ri)
5028 CASE_ND(SHL64ri) {
5029 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
5030 if (isTruncatedShiftCountForLEA(ShAmt))
5031 return false;
5032 return ShAmt != 0;
5033 }
5034
5035 CASE_ND(SHRD16rri8)
5036 CASE_ND(SHRD32rri8)
5037 CASE_ND(SHRD64rri8)
5038 CASE_ND(SHLD16rri8)
5039 CASE_ND(SHLD32rri8)
5040 CASE_ND(SHLD64rri8)
5041 return getTruncatedShiftCount(MI, 3) != 0;
5042
5043 CASE_ND(SUB64ri32)
5044 CASE_ND(SUB32ri)
5045 CASE_ND(SUB16ri)
5046 CASE_ND(SUB8ri)
5047 CASE_ND(SUB64rr)
5048 CASE_ND(SUB32rr)
5049 CASE_ND(SUB16rr)
5050 CASE_ND(SUB8rr)
5051 CASE_ND(SUB64rm)
5052 CASE_ND(SUB32rm)
5053 CASE_ND(SUB16rm)
5054 CASE_ND(SUB8rm)
5055 CASE_ND(DEC64r)
5056 CASE_ND(DEC32r)
5057 CASE_ND(DEC16r)
5058 CASE_ND(DEC8r)
5059 CASE_ND(ADD64ri32)
5060 CASE_ND(ADD32ri)
5061 CASE_ND(ADD16ri)
5062 CASE_ND(ADD8ri)
5063 CASE_ND(ADD64rr)
5064 CASE_ND(ADD32rr)
5065 CASE_ND(ADD16rr)
5066 CASE_ND(ADD8rr)
5067 CASE_ND(ADD64rm)
5068 CASE_ND(ADD32rm)
5069 CASE_ND(ADD16rm)
5070 CASE_ND(ADD8rm)
5071 CASE_ND(INC64r)
5072 CASE_ND(INC32r)
5073 CASE_ND(INC16r)
5074 CASE_ND(INC8r)
5075 CASE_ND(ADC64ri32)
5076 CASE_ND(ADC32ri)
5077 CASE_ND(ADC16ri)
5078 CASE_ND(ADC8ri)
5079 CASE_ND(ADC64rr)
5080 CASE_ND(ADC32rr)
5081 CASE_ND(ADC16rr)
5082 CASE_ND(ADC8rr)
5083 CASE_ND(ADC64rm)
5084 CASE_ND(ADC32rm)
5085 CASE_ND(ADC16rm)
5086 CASE_ND(ADC8rm)
5087 CASE_ND(SBB64ri32)
5088 CASE_ND(SBB32ri)
5089 CASE_ND(SBB16ri)
5090 CASE_ND(SBB8ri)
5091 CASE_ND(SBB64rr)
5092 CASE_ND(SBB32rr)
5093 CASE_ND(SBB16rr)
5094 CASE_ND(SBB8rr)
5095 CASE_ND(SBB64rm)
5096 CASE_ND(SBB32rm)
5097 CASE_ND(SBB16rm)
5098 CASE_ND(SBB8rm)
5099 CASE_ND(NEG8r)
5100 CASE_ND(NEG16r)
5101 CASE_ND(NEG32r)
5102 CASE_ND(NEG64r)
5103 case X86::LZCNT16rr:
5104 case X86::LZCNT16rm:
5105 case X86::LZCNT32rr:
5106 case X86::LZCNT32rm:
5107 case X86::LZCNT64rr:
5108 case X86::LZCNT64rm:
5109 case X86::POPCNT16rr:
5110 case X86::POPCNT16rm:
5111 case X86::POPCNT32rr:
5112 case X86::POPCNT32rm:
5113 case X86::POPCNT64rr:
5114 case X86::POPCNT64rm:
5115 case X86::TZCNT16rr:
5116 case X86::TZCNT16rm:
5117 case X86::TZCNT32rr:
5118 case X86::TZCNT32rm:
5119 case X86::TZCNT64rr:
5120 case X86::TZCNT64rm:
5121 return true;
5122 CASE_ND(AND64ri32)
5123 CASE_ND(AND32ri)
5124 CASE_ND(AND16ri)
5125 CASE_ND(AND8ri)
5126 CASE_ND(AND64rr)
5127 CASE_ND(AND32rr)
5128 CASE_ND(AND16rr)
5129 CASE_ND(AND8rr)
5130 CASE_ND(AND64rm)
5131 CASE_ND(AND32rm)
5132 CASE_ND(AND16rm)
5133 CASE_ND(AND8rm)
5134 CASE_ND(XOR64ri32)
5135 CASE_ND(XOR32ri)
5136 CASE_ND(XOR16ri)
5137 CASE_ND(XOR8ri)
5138 CASE_ND(XOR64rr)
5139 CASE_ND(XOR32rr)
5140 CASE_ND(XOR16rr)
5141 CASE_ND(XOR8rr)
5142 CASE_ND(XOR64rm)
5143 CASE_ND(XOR32rm)
5144 CASE_ND(XOR16rm)
5145 CASE_ND(XOR8rm)
5146 CASE_ND(OR64ri32)
5147 CASE_ND(OR32ri)
5148 CASE_ND(OR16ri)
5149 CASE_ND(OR8ri)
5150 CASE_ND(OR64rr)
5151 CASE_ND(OR32rr)
5152 CASE_ND(OR16rr)
5153 CASE_ND(OR8rr)
5154 CASE_ND(OR64rm)
5155 CASE_ND(OR32rm)
5156 CASE_ND(OR16rm)
5157 CASE_ND(OR8rm)
5158 case X86::ANDN32rr:
5159 case X86::ANDN32rm:
5160 case X86::ANDN64rr:
5161 case X86::ANDN64rm:
5162 case X86::BLSI32rr:
5163 case X86::BLSI32rm:
5164 case X86::BLSI64rr:
5165 case X86::BLSI64rm:
5166 case X86::BLSMSK32rr:
5167 case X86::BLSMSK32rm:
5168 case X86::BLSMSK64rr:
5169 case X86::BLSMSK64rm:
5170 case X86::BLSR32rr:
5171 case X86::BLSR32rm:
5172 case X86::BLSR64rr:
5173 case X86::BLSR64rm:
5174 case X86::BLCFILL32rr:
5175 case X86::BLCFILL32rm:
5176 case X86::BLCFILL64rr:
5177 case X86::BLCFILL64rm:
5178 case X86::BLCI32rr:
5179 case X86::BLCI32rm:
5180 case X86::BLCI64rr:
5181 case X86::BLCI64rm:
5182 case X86::BLCIC32rr:
5183 case X86::BLCIC32rm:
5184 case X86::BLCIC64rr:
5185 case X86::BLCIC64rm:
5186 case X86::BLCMSK32rr:
5187 case X86::BLCMSK32rm:
5188 case X86::BLCMSK64rr:
5189 case X86::BLCMSK64rm:
5190 case X86::BLCS32rr:
5191 case X86::BLCS32rm:
5192 case X86::BLCS64rr:
5193 case X86::BLCS64rm:
5194 case X86::BLSFILL32rr:
5195 case X86::BLSFILL32rm:
5196 case X86::BLSFILL64rr:
5197 case X86::BLSFILL64rm:
5198 case X86::BLSIC32rr:
5199 case X86::BLSIC32rm:
5200 case X86::BLSIC64rr:
5201 case X86::BLSIC64rm:
5202 case X86::BZHI32rr:
5203 case X86::BZHI32rm:
5204 case X86::BZHI64rr:
5205 case X86::BZHI64rm:
5206 case X86::T1MSKC32rr:
5207 case X86::T1MSKC32rm:
5208 case X86::T1MSKC64rr:
5209 case X86::T1MSKC64rm:
5210 case X86::TZMSK32rr:
5211 case X86::TZMSK32rm:
5212 case X86::TZMSK64rr:
5213 case X86::TZMSK64rm:
5214 // These instructions clear the overflow flag just like TEST.
5215 // FIXME: These are not the only instructions in this switch that clear the
5216 // overflow flag.
5217 ClearsOverflowFlag = true;
5218 return true;
5219 case X86::BEXTR32rr:
5220 case X86::BEXTR64rr:
5221 case X86::BEXTR32rm:
5222 case X86::BEXTR64rm:
5223 case X86::BEXTRI32ri:
5224 case X86::BEXTRI32mi:
5225 case X86::BEXTRI64ri:
5226 case X86::BEXTRI64mi:
5227 // BEXTR doesn't update the sign flag so we can't use it. It does clear
5228 // the overflow flag, but that's not useful without the sign flag.
5229 NoSignFlag = true;
5230 return true;
5231 }
5232}
5233
5234/// Check whether the use can be converted to remove a comparison against zero.
5235/// Returns the EFLAGS condition and the operand that we are comparing against zero.
5236static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
5237 switch (MI.getOpcode()) {
5238 default:
5239 return std::make_pair(X86::COND_INVALID, ~0U);
5240 CASE_ND(NEG8r)
5241 CASE_ND(NEG16r)
5242 CASE_ND(NEG32r)
5243 CASE_ND(NEG64r)
5244 return std::make_pair(X86::COND_AE, 1U);
5245 case X86::LZCNT16rr:
5246 case X86::LZCNT32rr:
5247 case X86::LZCNT64rr:
5248 return std::make_pair(X86::COND_B, 1U);
5249 case X86::POPCNT16rr:
5250 case X86::POPCNT32rr:
5251 case X86::POPCNT64rr:
5252 return std::make_pair(X86::COND_E, 1U);
5253 case X86::TZCNT16rr:
5254 case X86::TZCNT32rr:
5255 case X86::TZCNT64rr:
5256 return std::make_pair(X86::COND_B, 1U);
5257 case X86::BSF16rr:
5258 case X86::BSF32rr:
5259 case X86::BSF64rr:
5260 case X86::BSR16rr:
5261 case X86::BSR32rr:
5262 case X86::BSR64rr:
5263 return std::make_pair(X86::COND_E, 2U);
5264 case X86::BLSI32rr:
5265 case X86::BLSI64rr:
5266 return std::make_pair(X86::COND_AE, 1U);
5267 case X86::BLSR32rr:
5268 case X86::BLSR64rr:
5269 case X86::BLSMSK32rr:
5270 case X86::BLSMSK64rr:
5271 return std::make_pair(X86::COND_B, 1U);
5272 // TODO: TBM instructions.
5273 }
5274}
5275
/// Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
///
/// \p CmpInstr compares \p SrcReg (and possibly \p SrcReg2) against
/// \p CmpValue under \p CmpMask.  On success the compare is erased and any
/// instructions consuming EFLAGS have their condition codes rewritten as
/// required.
                                        Register SrcReg2, int64_t CmpMask,
                                        int64_t CmpValue,
                                        const MachineRegisterInfo *MRI) const {
  // Check whether we can replace SUB with CMP.
  switch (CmpInstr.getOpcode()) {
  default:
    break;
  CASE_ND(SUB64ri32)
  CASE_ND(SUB32ri)
  CASE_ND(SUB16ri)
  CASE_ND(SUB8ri)
  CASE_ND(SUB64rm)
  CASE_ND(SUB32rm)
  CASE_ND(SUB16rm)
  CASE_ND(SUB8rm)
  CASE_ND(SUB64rr)
  CASE_ND(SUB32rr)
  CASE_ND(SUB16rr)
  CASE_ND(SUB8rr) {
    // SUB sets EFLAGS exactly like CMP; the rewrite is only valid when the
    // SUB result itself is unused.
    if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
      return false;
    // There is no use of the destination register, we can replace SUB with CMP.
    unsigned NewOpcode = 0;
#define FROM_TO(A, B)                                                          \
  CASE_ND(A) NewOpcode = X86::B;                                               \
  break;
    switch (CmpInstr.getOpcode()) {
    default:
      llvm_unreachable("Unreachable!");
      FROM_TO(SUB64rm, CMP64rm)
      FROM_TO(SUB32rm, CMP32rm)
      FROM_TO(SUB16rm, CMP16rm)
      FROM_TO(SUB8rm, CMP8rm)
      FROM_TO(SUB64rr, CMP64rr)
      FROM_TO(SUB32rr, CMP32rr)
      FROM_TO(SUB16rr, CMP16rr)
      FROM_TO(SUB8rr, CMP8rr)
      FROM_TO(SUB64ri32, CMP64ri32)
      FROM_TO(SUB32ri, CMP32ri)
      FROM_TO(SUB16ri, CMP16ri)
      FROM_TO(SUB8ri, CMP8ri)
    }
#undef FROM_TO
    CmpInstr.setDesc(get(NewOpcode));
    CmpInstr.removeOperand(0);
    // Mutating this instruction invalidates any debug data associated with it.
    CmpInstr.dropDebugNumber();
    // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
    if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
        NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
      return false;
  }
  }

  // The following code tries to remove the comparison by re-using EFLAGS
  // from earlier instructions.

  // A compare against zero can reuse flags produced by the instruction that
  // defined SrcReg, if that instruction sets them the same way TEST would.
  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);

  // Transformation currently requires SSA values.
  if (SrcReg2.isPhysical())
    return false;
  MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
  assert(SrcRegDef && "Must have a definition (SSA)");

  MachineInstr *MI = nullptr;
  MachineInstr *Sub = nullptr;
  MachineInstr *Movr0Inst = nullptr;
  bool NoSignFlag = false;
  bool ClearsOverflowFlag = false;
  bool ShouldUpdateCC = false;
  bool IsSwapped = false;
  bool HasNF = Subtarget.hasNF();
  unsigned OpNo = 0;
  int64_t ImmDelta = 0;

  // Search backward from CmpInstr for the next instruction defining EFLAGS.
  MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
  // NOTE(review): the declaration line of the reverse iterator 'From'
  // appears to be truncated in this listing; only its initializer remains.
      std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
  for (MachineBasicBlock *MBB = &CmpMBB;;) {
    for (MachineInstr &Inst : make_range(From, MBB->rend())) {
      // Try to use EFLAGS from the instruction defining %SrcReg. Example:
      //   %eax = addl ...
      //   ...  // EFLAGS not changed
      //   testl %eax, %eax // <-- can be removed
      if (&Inst == SrcRegDef) {
        if (IsCmpZero &&
            isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
          MI = &Inst;
          break;
        }

        // Look back for the following pattern, in which case the
        // test16rr/test64rr instruction could be erased.
        //
        // Example for test16rr:
        //  %reg = and32ri %in_reg, 5
        //  ...                         // EFLAGS not changed.
        //  %src_reg = copy %reg.sub_16bit:gr32
        //  test16rr %src_reg, %src_reg, implicit-def $eflags
        // Example for test64rr:
        //  %reg = and32ri %in_reg, 5
        //  ...                         // EFLAGS not changed.
        //  %src_reg = subreg_to_reg %reg, %subreg.sub_index
        //  test64rr %src_reg, %src_reg, implicit-def $eflags
        MachineInstr *AndInstr = nullptr;
        if (IsCmpZero &&
            findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
                                   Subtarget, NoSignFlag, ClearsOverflowFlag)) {
          assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
          MI = AndInstr;
          break;
        }
        // Cannot find other candidates before definition of SrcReg.
        return false;
      }

      if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
        // Try to use EFLAGS produced by an instruction reading %SrcReg.
        // Example:
        //  %eax = ...
        //  ...
        //  popcntl %eax
        //  ...                 // EFLAGS not changed
        //  testl %eax, %eax    // <-- can be removed
        if (IsCmpZero) {
          std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
          if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
              Inst.getOperand(OpNo).getReg() == SrcReg) {
            ShouldUpdateCC = true;
            MI = &Inst;
            break;
          }
        }

        // Try to use EFLAGS from an instruction with similar flag results.
        // Example:
        //   sub x, y  or  cmp x, y
        //   ...           // EFLAGS not changed
        //   cmp x, y      // <-- can be removed
        if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
                                 Inst, &IsSwapped, &ImmDelta)) {
          Sub = &Inst;
          break;
        }

        // MOV32r0 is implemented with xor which clobbers condition code. It is
        // safe to move up, if the definition to EFLAGS is dead and earlier
        // instructions do not read or write EFLAGS.
        if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
            Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
          Movr0Inst = &Inst;
          continue;
        }

        // For the instructions are ADDrm/ADDmr with relocation, we'll skip the
        // optimization for replacing non-NF with NF. This is to keep backward
        // compatiblity with old version of linkers without APX relocation type
        // support on Linux OS.
        // NOTE(review): the ':' arm of this conditional expression appears
        // to be missing from this listing.
        bool IsWithReloc = X86EnableAPXForRelocation
                               ? false

        // Try to replace non-NF with NF instructions.
        if (HasNF && Inst.registerDefIsDead(X86::EFLAGS, TRI) && !IsWithReloc) {
          unsigned NewOp = X86::getNFVariant(Inst.getOpcode());
          if (!NewOp)
            return false;

          InstsToUpdate.push_back(std::make_pair(&Inst, NewOp));
          continue;
        }

        // Cannot do anything for any other EFLAG changes.
        return false;
      }
    }

    if (MI || Sub)
      break;

    // Reached begin of basic block. Continue in predecessor if there is
    // exactly one.
    if (MBB->pred_size() != 1)
      return false;
    MBB = *MBB->pred_begin();
    From = MBB->rbegin();
  }

  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
  // If we are done with the basic block, we need to check whether EFLAGS is
  // live-out.
  bool FlagsMayLiveOut = true;
  MachineBasicBlock::iterator AfterCmpInstr =
      std::next(MachineBasicBlock::iterator(CmpInstr));
  for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
    bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
    bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
    // We should check the usage if this instruction uses and updates EFLAGS.
    if (!UseEFLAGS && ModifyEFLAGS) {
      // It is safe to remove CmpInstr if EFLAGS is updated again.
      FlagsMayLiveOut = false;
      break;
    }
    if (!UseEFLAGS && !ModifyEFLAGS)
      continue;

    // EFLAGS is used by this instruction.
    X86::CondCode OldCC = X86::getCondFromMI(Instr);
    if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
      return false;

    X86::CondCode ReplacementCC = X86::COND_INVALID;
    if (MI) {
      switch (OldCC) {
      default:
        break;
      case X86::COND_A:
      case X86::COND_AE:
      case X86::COND_B:
      case X86::COND_BE:
        // CF is used, we can't perform this optimization.
        return false;
      case X86::COND_G:
      case X86::COND_GE:
      case X86::COND_L:
      case X86::COND_LE:
        // If SF is used, but the instruction doesn't update the SF, then we
        // can't do the optimization.
        if (NoSignFlag)
          return false;
        [[fallthrough]];
      case X86::COND_O:
      case X86::COND_NO:
        // If OF is used, the instruction needs to clear it like CmpZero does.
        if (!ClearsOverflowFlag)
          return false;
        break;
      case X86::COND_S:
      case X86::COND_NS:
        // If SF is used, but the instruction doesn't update the SF, then we
        // can't do the optimization.
        if (NoSignFlag)
          return false;
        break;
      }

      // If we're updating the condition code check if we have to reverse the
      // condition.
      if (ShouldUpdateCC)
        switch (OldCC) {
        default:
          return false;
        case X86::COND_E:
          ReplacementCC = NewCC;
          break;
        case X86::COND_NE:
          ReplacementCC = GetOppositeBranchCondition(NewCC);
          break;
        }
    } else if (IsSwapped) {
      // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
      // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
      // We swap the condition code and synthesize the new opcode.
      ReplacementCC = getSwappedCondition(OldCC);
      if (ReplacementCC == X86::COND_INVALID)
        return false;
      ShouldUpdateCC = true;
    } else if (ImmDelta != 0) {
      unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg));
      // Shift amount for min/max constants to adjust for 8/16/32 instruction
      // sizes.
      switch (OldCC) {
      case X86::COND_L: // x <s (C + 1)  -->  x <=s C
        if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
          return false;
        ReplacementCC = X86::COND_LE;
        break;
      case X86::COND_B: // x <u (C + 1)  -->  x <=u C
        if (ImmDelta != 1 || CmpValue == 0)
          return false;
        ReplacementCC = X86::COND_BE;
        break;
      case X86::COND_GE: // x >=s (C + 1)  -->  x >s C
        if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
          return false;
        ReplacementCC = X86::COND_G;
        break;
      case X86::COND_AE: // x >=u (C + 1)  -->  x >u C
        if (ImmDelta != 1 || CmpValue == 0)
          return false;
        ReplacementCC = X86::COND_A;
        break;
      case X86::COND_G: // x >s (C - 1)  -->  x >=s C
        if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
          return false;
        ReplacementCC = X86::COND_GE;
        break;
      case X86::COND_A: // x >u (C - 1)  -->  x >=u C
        if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
          return false;
        ReplacementCC = X86::COND_AE;
        break;
      case X86::COND_LE: // x <=s (C - 1)  -->  x <s C
        if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
          return false;
        ReplacementCC = X86::COND_L;
        break;
      case X86::COND_BE: // x <=u (C - 1)  -->  x <u C
        if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
          return false;
        ReplacementCC = X86::COND_B;
        break;
      default:
        return false;
      }
      ShouldUpdateCC = true;
    }

    if (ShouldUpdateCC && ReplacementCC != OldCC) {
      // Push the MachineInstr to OpsToUpdate.
      // If it is safe to remove CmpInstr, the condition code of these
      // instructions will be modified.
      OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
    }
    if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
      // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
      FlagsMayLiveOut = false;
      break;
    }
  }

  // If we have to update users but EFLAGS is live-out abort, since we cannot
  // easily find all of the users.
  if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
    for (MachineBasicBlock *Successor : CmpMBB.successors())
      if (Successor->isLiveIn(X86::EFLAGS))
        return false;
  }

  // The instruction to be updated is either Sub or MI.
  assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
  Sub = MI != nullptr ? MI : Sub;
  MachineBasicBlock *SubBB = Sub->getParent();
  // Move Movr0Inst to the appropriate place before Sub.
  if (Movr0Inst) {
    // Only move within the same block so we don't accidentally move to a
    // block with higher execution frequency.
    if (&CmpMBB != SubBB)
      return false;
    // Look backwards until we find a def that doesn't use the current EFLAGS.
    // NOTE(review): the first half of the iterator declaration ('InsertI')
    // appears to be truncated in this listing.
        InsertE = Sub->getParent()->rend();
    for (; InsertI != InsertE; ++InsertI) {
      MachineInstr *Instr = &*InsertI;
      if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
          Instr->modifiesRegister(X86::EFLAGS, TRI)) {
        Movr0Inst->getParent()->remove(Movr0Inst);
        Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
                                   Movr0Inst);
        break;
      }
    }
    if (InsertI == InsertE)
      return false;
  }

  // Replace non-NF with NF instructions.
  for (auto &Inst : InstsToUpdate) {
    Inst.first->setDesc(get(Inst.second));
    Inst.first->removeOperand(
        Inst.first->findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
  }

  // Make sure Sub instruction defines EFLAGS and mark the def live.
  MachineOperand *FlagDef =
      Sub->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
  assert(FlagDef && "Unable to locate a def EFLAGS operand");
  FlagDef->setIsDead(false);

  CmpInstr.eraseFromParent();

  // Modify the condition code of instructions in OpsToUpdate.
  for (auto &Op : OpsToUpdate) {
    Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
        .setImm(Op.second);
  }
  // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
  for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
       MBB = *MBB->pred_begin()) {
    assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
    if (!MBB->isLiveIn(X86::EFLAGS))
      MBB->addLiveIn(X86::EFLAGS);
  }
  return true;
}
5682
5683/// \returns true if the instruction can be changed to COPY when imm is 0.
5684static bool canConvert2Copy(unsigned Opc) {
5685 switch (Opc) {
5686 default:
5687 return false;
5688 CASE_ND(ADD64ri32)
5689 CASE_ND(SUB64ri32)
5690 CASE_ND(OR64ri32)
5691 CASE_ND(XOR64ri32)
5692 CASE_ND(ADD32ri)
5693 CASE_ND(SUB32ri)
5694 CASE_ND(OR32ri)
5695 CASE_ND(XOR32ri)
5696 return true;
5697 }
5698}
5699
5700/// Convert an ALUrr opcode to corresponding ALUri opcode. Such as
5701/// ADD32rr ==> ADD32ri
5702static unsigned convertALUrr2ALUri(unsigned Opc, bool HasNDDI) {
5703 switch (Opc) {
5704 default:
5705 return 0;
5706#define FROM_TO(FROM, TO) \
5707 case X86::FROM: \
5708 return X86::TO; \
5709 case X86::FROM##_ND: \
5710 return X86::TO##_ND;
5711 FROM_TO(ADC64rr, ADC64ri32)
5712 FROM_TO(SBB64rr, SBB64ri32)
5713 FROM_TO(AND64rr, AND64ri32)
5714 FROM_TO(OR64rr, OR64ri32)
5715 FROM_TO(XOR64rr, XOR64ri32)
5716 FROM_TO(SHR64rCL, SHR64ri)
5717 FROM_TO(SHL64rCL, SHL64ri)
5718 FROM_TO(SAR64rCL, SAR64ri)
5719 FROM_TO(ROL64rCL, ROL64ri)
5720 FROM_TO(ROR64rCL, ROR64ri)
5721 FROM_TO(RCL64rCL, RCL64ri)
5722 FROM_TO(RCR64rCL, RCR64ri)
5723 FROM_TO(ADD32rr, ADD32ri)
5724 FROM_TO(ADC32rr, ADC32ri)
5725 FROM_TO(SUB32rr, SUB32ri)
5726 FROM_TO(SBB32rr, SBB32ri)
5727 FROM_TO(AND32rr, AND32ri)
5728 FROM_TO(OR32rr, OR32ri)
5729 FROM_TO(XOR32rr, XOR32ri)
5730 FROM_TO(SHR32rCL, SHR32ri)
5731 FROM_TO(SHL32rCL, SHL32ri)
5732 FROM_TO(SAR32rCL, SAR32ri)
5733 FROM_TO(ROL32rCL, ROL32ri)
5734 FROM_TO(ROR32rCL, ROR32ri)
5735 FROM_TO(RCL32rCL, RCL32ri)
5736 FROM_TO(RCR32rCL, RCR32ri)
5737#undef FROM_TO
5738#define FROM_TO(FROM, TO) \
5739 case X86::FROM: \
5740 return X86::TO;
5741 FROM_TO(ADD64rr, ADD64ri32)
5742 FROM_TO(SUB64rr, SUB64ri32)
5743 FROM_TO(TEST64rr, TEST64ri32)
5744 FROM_TO(CTEST64rr, CTEST64ri32)
5745 FROM_TO(CMP64rr, CMP64ri32)
5746 FROM_TO(CCMP64rr, CCMP64ri32)
5747 FROM_TO(TEST32rr, TEST32ri)
5748 FROM_TO(CTEST32rr, CTEST32ri)
5749 FROM_TO(CMP32rr, CMP32ri)
5750 FROM_TO(CCMP32rr, CCMP32ri)
5751#undef FROM_TO
5752 case X86::ADD64rr_ND:
5753 return HasNDDI ? X86::ADD64ri32_ND : 0;
5754 case X86::SUB64rr_ND:
5755 return HasNDDI ? X86::SUB64ri32_ND : 0;
5756 }
5757}
5758
/// Reg is assigned ImmVal in DefMI, and is used in UseMI.
/// If MakeChange is true, this function tries to replace Reg by ImmVal in
/// UseMI. If MakeChange is false, just check if folding is possible.
//
/// \returns true if folding is successful or possible.
bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
                                     Register Reg, int64_t ImmVal,
                                     bool MakeChange) const {
  bool Modified = false;

  // 64 bit operations accept sign extended 32 bit immediates.
  // 32 bit operations accept all 32 bit immediates, so we don't need to check
  // them.
  const TargetRegisterClass *RC = nullptr;
  if (Reg.isVirtual())
    RC = MRI->getRegClass(Reg);
  if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) ||
      (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) {
    if (!isInt<32>(ImmVal))
      return false;
  }

  // Folding into a subregister use would change the value UseMI observes, so
  // reject it.
  if (UseMI.findRegisterUseOperand(Reg, /*TRI=*/nullptr)->getSubReg())
    return false;
  // Immediate has larger code size than register. So avoid folding the
  // immediate if it has more than 1 use and we are optimizing for size.
  if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() &&
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI.getOpcode();
  unsigned NewOpc;
  if (Opc == TargetOpcode::COPY) {
    // COPY of a constant becomes a MOVri of the appropriate width.
    Register ToReg = UseMI.getOperand(0).getReg();
    const TargetRegisterClass *RC = nullptr;
    if (ToReg.isVirtual())
      RC = MRI->getRegClass(ToReg);
    bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
                   (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
    bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
                   (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
    bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
                  (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));

    if (ImmVal == 0) {
      // We have MOV32r0 only.
      if (!GR32Reg)
        return false;
    }

    if (GR64Reg) {
      // Prefer the shorter zero-extending 32-bit encoding when the value
      // fits in an unsigned 32-bit immediate.
      if (isUInt<32>(ImmVal))
        NewOpc = X86::MOV32ri64;
      else
        NewOpc = X86::MOV64ri;
    } else if (GR32Reg) {
      NewOpc = X86::MOV32ri;
      if (ImmVal == 0) {
        // MOV32r0 clobbers EFLAGS.
        const TargetRegisterInfo *TRI = &getRegisterInfo();
        if (UseMI.getParent()->computeRegisterLiveness(
                TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
          return false;

        // MOV32r0 is different than other cases because it doesn't encode the
        // immediate in the instruction. So we directly modify it here.
        if (!MakeChange)
          return true;
        UseMI.setDesc(get(X86::MOV32r0));
        UseMI.removeOperand(
            UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
        UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
                                                   /*isImp=*/true,
                                                   /*isKill=*/false,
                                                   /*isDead=*/true));
        Modified = true;
      }
    } else if (GR8Reg)
      NewOpc = X86::MOV8ri;
    else
      return false;
  } else
    NewOpc = convertALUrr2ALUri(Opc, Subtarget.hasNDDI());

  if (!NewOpc)
    return false;

  // For SUB instructions the immediate can only be the second source operand.
  if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri ||
       NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri ||
       NewOpc == X86::SUB64ri32_ND || NewOpc == X86::SUB32ri_ND ||
       NewOpc == X86::SBB64ri32_ND || NewOpc == X86::SBB32ri_ND) &&
      UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 2)
    return false;
  // For CMP instructions the immediate can only be at index 1.
  if (((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) ||
       (NewOpc == X86::CCMP64ri32 || NewOpc == X86::CCMP32ri)) &&
      UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 1)
    return false;

  using namespace X86;
  // Shift/rotate by CL: the immediate replaces the CL operand directly and
  // must fit in 8 bits.
  if (isSHL(Opc) || isSHR(Opc) || isSAR(Opc) || isROL(Opc) || isROR(Opc) ||
      isRCL(Opc) || isRCR(Opc)) {
    unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr);
    if (RegIdx < 2)
      return false;
    if (!isInt<8>(ImmVal))
      return false;
    assert(Reg == X86::CL);

    if (!MakeChange)
      return true;
    UseMI.setDesc(get(NewOpc));
    UseMI.removeOperand(RegIdx);
    UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
    // Reg is physical register $cl, so we don't know if DefMI is dead through
    // MRI. Let the caller handle it, or pass dead-mi-elimination can delete
    // the dead physical register define instruction.
    return true;
  }

  if (!MakeChange)
    return true;

  if (!Modified) {
    // Modify the instruction.
    if (ImmVal == 0 && canConvert2Copy(NewOpc) &&
        UseMI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) {
      //  %100 = add %101, 0
      //    ==>
      //  %100 = COPY %101
      UseMI.setDesc(get(TargetOpcode::COPY));
      UseMI.removeOperand(
          UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
      UseMI.removeOperand(
          UseMI.findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
      UseMI.untieRegOperand(0);
      // NOTE(review): one or more statements appear to be dropped from this
      // listing here — verify this path against upstream.
    } else {
      unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
      unsigned ImmOpNum = 2;
      if (!UseMI.getOperand(0).isDef()) {
        Op1 = 0; // TEST, CMP, CTEST, CCMP
        ImmOpNum = 1;
      }
      if (Opc == TargetOpcode::COPY)
        ImmOpNum = 1;
      // Commute first when Reg currently sits in the slot that must stay a
      // register, so the immediate lands in the encodable position.
      if (findCommutedOpIndices(UseMI, Op1, Op2) &&
          UseMI.getOperand(Op1).getReg() == Reg)
        commuteInstruction(UseMI);

      assert(UseMI.getOperand(ImmOpNum).getReg() == Reg);
      UseMI.setDesc(get(NewOpc));
      UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal);
    }
  }

  // NOTE(review): this listing appears to drop the statement guarded by the
  // following 'if' (likely erasing the now-unused DefMI) — verify upstream.
  if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))

  return true;
}
5923
5924/// foldImmediate - 'Reg' is known to be defined by a move immediate
5925/// instruction, try to fold the immediate into the use instruction.
5927 Register Reg, MachineRegisterInfo *MRI) const {
5928 int64_t ImmVal;
5929 if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
5930 return false;
5931
5932 return foldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
5933}
5934
5935/// Expand a single-def pseudo instruction to a two-addr
5936/// instruction with two undef reads of the register being defined.
5937/// This is used for mapping:
5938/// %xmm4 = V_SET0
5939/// to:
5940/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
5941///
5943 const MCInstrDesc &Desc) {
5944 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5945 Register Reg = MIB.getReg(0);
5946 MIB->setDesc(Desc);
5947
5948 // MachineInstr::addOperand() will insert explicit operands before any
5949 // implicit operands.
5951 // But we don't trust that.
5952 assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
5953 return true;
5954}
5955
5956/// Expand a single-def pseudo instruction to a two-addr
5957/// instruction with two %k0 reads.
5958/// This is used for mapping:
5959/// %k4 = K_SET1
5960/// to:
5961/// %k4 = KXNORrr %k0, %k0
5963 Register Reg) {
5964 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5965 MIB->setDesc(Desc);
5967 return true;
5968}
5969
5971 bool MinusOne) {
5972 MachineBasicBlock &MBB = *MIB->getParent();
5973 const DebugLoc &DL = MIB->getDebugLoc();
5974 Register Reg = MIB.getReg(0);
5975
5976 // Insert the XOR.
5977 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
5980
5981 // Turn the pseudo into an INC or DEC.
5982 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
5983 MIB.addReg(Reg);
5984
5985 return true;
5986}
5987
5989 const TargetInstrInfo &TII,
5990 const X86Subtarget &Subtarget) {
5991 MachineBasicBlock &MBB = *MIB->getParent();
5992 const DebugLoc &DL = MIB->getDebugLoc();
5993 int64_t Imm = MIB->getOperand(1).getImm();
5994 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
5996
5997 int StackAdjustment;
5998
5999 if (Subtarget.is64Bit()) {
6000 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
6001 MIB->getOpcode() == X86::MOV32ImmSExti8);
6002
6003 // Can't use push/pop lowering if the function might write to the red zone.
6004 X86MachineFunctionInfo *X86FI =
6005 MBB.getParent()->getInfo<X86MachineFunctionInfo>();
6006 if (X86FI->getUsesRedZone()) {
6007 MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
6008 ? X86::MOV32ri
6009 : X86::MOV64ri));
6010 return true;
6011 }
6012
6013 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
6014 // widen the register if necessary.
6015 StackAdjustment = 8;
6016 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
6017 MIB->setDesc(TII.get(X86::POP64r));
6018 MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
6019 } else {
6020 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
6021 StackAdjustment = 4;
6022 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm);
6023 MIB->setDesc(TII.get(X86::POP32r));
6024 }
6025 MIB->removeOperand(1);
6026 MIB->addImplicitDefUseOperands(*MBB.getParent());
6027
6028 // Build CFI if necessary.
6029 MachineFunction &MF = *MBB.getParent();
6030 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
6031 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
6032 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
6033 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
6034 if (EmitCFI) {
6035 TFL->BuildCFI(
6036 MBB, I, DL,
6037 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
6038 TFL->BuildCFI(
6039 MBB, std::next(I), DL,
6040 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
6041 }
6042
6043 return true;
6044}
6045
6046// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
6047// code sequence is needed for other targets.
6049 const TargetInstrInfo &TII) {
6050 MachineBasicBlock &MBB = *MIB->getParent();
6051 const DebugLoc &DL = MIB->getDebugLoc();
6052 Register Reg = MIB.getReg(0);
6053 const GlobalValue *GV =
6054 cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
6055 auto Flags = MachineMemOperand::MOLoad |
6058 MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
6059 MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
6061
6062 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
6063 .addReg(X86::RIP)
6064 .addImm(1)
6065 .addReg(0)
6067 .addReg(0)
6068 .addMemOperand(MMO);
6069 MIB->setDebugLoc(DL);
6070 MIB->setDesc(TII.get(X86::MOV64rm));
6072}
6073
6075 MachineBasicBlock &MBB = *MIB->getParent();
6076 MachineFunction &MF = *MBB.getParent();
6077 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
6078 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
6079 unsigned XorOp =
6080 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
6081 MIB->setDesc(TII.get(XorOp));
6082 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
6083 return true;
6084}
6085
6086// This is used to handle spills for 128/256-bit registers when we have AVX512,
6087// but not VLX. If it uses an extended register we need to use an instruction
6088// that loads the lower 128/256-bit, but is available with only AVX512F.
6090 const TargetRegisterInfo *TRI,
6091 const MCInstrDesc &LoadDesc,
6092 const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
6093 Register DestReg = MIB.getReg(0);
6094 // Check if DestReg is XMM16-31 or YMM16-31.
6095 if (TRI->getEncodingValue(DestReg) < 16) {
6096 // We can use a normal VEX encoded load.
6097 MIB->setDesc(LoadDesc);
6098 } else {
6099 // Use a 128/256-bit VBROADCAST instruction.
6100 MIB->setDesc(BroadcastDesc);
6101 // Change the destination to a 512-bit register.
6102 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
6103 MIB->getOperand(0).setReg(DestReg);
6104 }
6105 return true;
6106}
6107
6108// This is used to handle spills for 128/256-bit registers when we have AVX512,
6109// but not VLX. If it uses an extended register we need to use an instruction
6110// that stores the lower 128/256-bit, but is available with only AVX512F.
                             const TargetRegisterInfo *TRI,
                             const MCInstrDesc &StoreDesc,
                             const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
  // The stored register follows the 5 address operands.
  Register SrcReg = MIB.getReg(X86::AddrNumOperands);
  // Check if SrcReg is XMM16-31 or YMM16-31 (encoding value >= 16 means an
  // extended register that only EVEX/AVX512F instructions can reach).
  if (TRI->getEncodingValue(SrcReg) < 16) {
    // We can use a normal VEX encoded store.
    MIB->setDesc(StoreDesc);
  } else {
    // Use a VEXTRACTF instruction.
    MIB->setDesc(ExtractDesc);
    // Change the source to a 512-bit register.
    SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
    // NOTE(review): the line writing SrcReg back into the stored-register
    // operand was dropped by the source rendering here — verify upstream.
    MIB.addImm(0x0); // Append immediate to extract from the lower bits.
  }

  return true;
}
6131
  // Body of the SHLDROT/SHRDROT pseudo expansion (the signature line was
  // dropped by the source rendering). Rewrites the rotate pseudo as the
  // corresponding double-shift with the same register supplied twice; see
  // the SHLDROT*/SHRDROT* cases in expandPostRAPseudo for the opcodes used.
  MIB->setDesc(Desc);
  int64_t ShiftAmt = MIB->getOperand(2).getImm();
  // Temporarily remove the immediate so we can add another source register.
  MIB->removeOperand(2);
  // Add the register. Don't copy the kill flag if there is one.
  MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
  // Add back the immediate.
  MIB.addImm(ShiftAmt);
  return true;
}
6143
                         const TargetInstrInfo &TII, bool HasAVX) {
  // Lower the MOVSHPrm/MOVSHPmr pseudos to 32-bit scalar moves: SSE or AVX
  // forms normally, or the EVEX (Z) form when the register is XMM16-31,
  // which only EVEX can encode.
  unsigned NewOpc;
  if (MI.getOpcode() == X86::MOVSHPrm) {
    NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
    Register Reg = MI.getOperand(0).getReg();
    if (Reg > X86::XMM15)
      NewOpc = X86::VMOVSSZrm;
  } else {
    NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
    // Operand 5 is the stored register (operands 0-4 are the address).
    Register Reg = MI.getOperand(5).getReg();
    if (Reg > X86::XMM15)
      NewOpc = X86::VMOVSSZmr;
  }

  MIB->setDesc(TII.get(NewOpc));
  return true;
}
6162
  // Body of X86InstrInfo::expandPostRAPseudo (the signature line was dropped
  // by the source rendering). Expands X86 post-register-allocation pseudos
  // into real machine instructions; returns true if MI was expanded.
  bool HasAVX = Subtarget.hasAVX();
  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
  switch (MI.getOpcode()) {
  case X86::MOV32r0:
    // Zeroing idiom: xor the register with itself (inputs marked undef).
    return Expand2AddrUndef(MIB, get(X86::XOR32rr));
  case X86::MOV32r1:
    return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
  case X86::MOV32r_1:
    return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
  case X86::MOV32ImmSExti8:
  case X86::MOV64ImmSExti8:
    return ExpandMOVImmSExti8(MIB, *this, Subtarget);
  case X86::SETB_C32r:
    return Expand2AddrUndef(MIB, get(X86::SBB32rr));
  case X86::SETB_C64r:
    return Expand2AddrUndef(MIB, get(X86::SBB64rr));
  case X86::MMX_SET0:
    return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
  case X86::V_SET0:
  case X86::FsFLD0SS:
  case X86::FsFLD0SD:
  case X86::FsFLD0SH:
  case X86::FsFLD0F128:
    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
  case X86::AVX_SET0: {
    assert(HasAVX && "AVX not supported");
    // NOTE(review): a line declaring TRI (presumably `const
    // TargetRegisterInfo *TRI = &getRegisterInfo();`) was dropped by the
    // source rendering here — verify upstream.
    Register SrcReg = MIB.getReg(0);
    Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
    // XOR only the low xmm; the implicit-def added below marks the whole
    // ymm register as defined.
    MIB->getOperand(0).setReg(XReg);
    Expand2AddrUndef(MIB, get(X86::VXORPSrr));
    MIB.addReg(SrcReg, RegState::ImplicitDefine);
    return true;
  }
  case X86::AVX512_128_SET0:
  case X86::AVX512_FsFLD0SH:
  case X86::AVX512_FsFLD0SS:
  case X86::AVX512_FsFLD0SD:
  case X86::AVX512_FsFLD0F128: {
    bool HasVLX = Subtarget.hasVLX();
    Register SrcReg = MIB.getReg(0);
    // NOTE(review): a line declaring TRI was dropped by the source rendering
    // here — verify upstream.
    if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
      return Expand2AddrUndef(MIB,
                              get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
    // Extended register without VLX. Use a larger XOR.
    SrcReg =
        TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
    MIB->getOperand(0).setReg(SrcReg);
    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
  }
  case X86::AVX512_256_SET0:
  case X86::AVX512_512_SET0: {
    bool HasVLX = Subtarget.hasVLX();
    Register SrcReg = MIB.getReg(0);
    // NOTE(review): a line declaring TRI was dropped by the source rendering
    // here — verify upstream.
    if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
      // XOR the low xmm and implicitly define the full register.
      Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
      MIB->getOperand(0).setReg(XReg);
      Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
      MIB.addReg(SrcReg, RegState::ImplicitDefine);
      return true;
    }
    if (MI.getOpcode() == X86::AVX512_256_SET0) {
      // No VLX so we must reference a zmm.
      MCRegister ZReg =
          TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
      MIB->getOperand(0).setReg(ZReg);
    }
    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
  }
  case X86::MOVSHPmr:
  case X86::MOVSHPrm:
    return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
  case X86::V_SETALLONES:
    // All-ones idiom: compare a register for equality with itself.
    return Expand2AddrUndef(MIB,
                            get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
  case X86::AVX2_SETALLONES:
    return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
  case X86::AVX1_SETALLONES: {
    Register Reg = MIB.getReg(0);
    // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
    MIB->setDesc(get(X86::VCMPPSYrri));
    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
    return true;
  }
  case X86::AVX512_128_SETALLONES:
  case X86::AVX512_256_SETALLONES:
  case X86::AVX512_512_SETALLONES: {
    Register Reg = MIB.getReg(0);
    unsigned Opc;
    switch (MI.getOpcode()) {
    case X86::AVX512_128_SETALLONES: {
      // Non-extended registers can use the shorter VEX-encoded compare.
      if (X86::VR128RegClass.contains(Reg))
        return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr));

      Opc = X86::VPTERNLOGDZ128rri;
      break;
    }
    case X86::AVX512_256_SETALLONES: {
      if (X86::VR256RegClass.contains(Reg))
        return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));

      Opc = X86::VPTERNLOGDZ256rri;
      break;
    }
    case X86::AVX512_512_SETALLONES:
      Opc = X86::VPTERNLOGDZrri;
      break;
    }
    MIB->setDesc(get(Opc));
    // VPTERNLOGD needs 3 register inputs and an immediate.
    // 0xff will return 1s for any input.
    MIB.addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef)
        .addImm(0xff);
    return true;
  }
  case X86::AVX512_512_SEXT_MASK_32:
  case X86::AVX512_512_SEXT_MASK_64: {
    Register Reg = MIB.getReg(0);
    Register MaskReg = MIB.getReg(1);
    RegState MaskState = getRegState(MIB->getOperand(1));
    unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
                       ? X86::VPTERNLOGQZrrikz
                       : X86::VPTERNLOGDZrrikz;
    MI.removeOperand(1);
    MIB->setDesc(get(Opc));
    // VPTERNLOG needs 3 register inputs and an immediate.
    // 0xff will return 1s for any input.
    MIB.addReg(Reg, RegState::Undef)
        .addReg(MaskReg, MaskState)
        .addReg(Reg, RegState::Undef)
        .addReg(Reg, RegState::Undef)
        .addImm(0xff);
    return true;
  }
  case X86::VMOVAPSZ128rm_NOVLX:
    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
                           get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
  case X86::VMOVUPSZ128rm_NOVLX:
    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
                           get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
  case X86::VMOVAPSZ256rm_NOVLX:
    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
                           get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
  case X86::VMOVUPSZ256rm_NOVLX:
    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
                           get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
  case X86::VMOVAPSZ128mr_NOVLX:
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
                            get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
  case X86::VMOVUPSZ128mr_NOVLX:
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
                            get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
  case X86::VMOVAPSZ256mr_NOVLX:
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
                            get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
  case X86::VMOVUPSZ256mr_NOVLX:
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
                            get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
  case X86::MOV32ri64: {
    // Rewrite as a plain 32-bit immediate move into the 32-bit sub-register.
    Register Reg = MIB.getReg(0);
    Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
    MI.setDesc(get(X86::MOV32ri));
    MIB->getOperand(0).setReg(Reg32);
    // NOTE(review): a trailing line (presumably adding an implicit-def of
    // the full 64-bit register) was dropped by the source rendering here —
    // verify upstream.
    return true;
  }

  case X86::RDFLAGS32:
  case X86::RDFLAGS64: {
    unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
    MachineBasicBlock &MBB = *MIB->getParent();

    MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
                                  get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
                              .getInstr();

    // Permit reads of the EFLAGS and DF registers without them being defined.
    // This intrinsic exists to read external processor state in flags, such as
    // the trap flag, interrupt flag, and direction flag, none of which are
    // modeled by the backend.
    assert(NewMI->getOperand(2).getReg() == X86::EFLAGS &&
           "Unexpected register in operand! Should be EFLAGS.");
    NewMI->getOperand(2).setIsUndef();
    assert(NewMI->getOperand(3).getReg() == X86::DF &&
           "Unexpected register in operand! Should be DF.");
    NewMI->getOperand(3).setIsUndef();

    MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r));
    return true;
  }

  case X86::WRFLAGS32:
  case X86::WRFLAGS64: {
    // Write flags by pushing the source register and popping into FLAGS.
    unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64;
    MachineBasicBlock &MBB = *MIB->getParent();

    BuildMI(MBB, MI, MIB->getDebugLoc(),
            get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
        .addReg(MI.getOperand(0).getReg());
    BuildMI(MBB, MI, MIB->getDebugLoc(),
            get(Is64Bit ? X86::POPF64 : X86::POPF32));
    MI.eraseFromParent();
    return true;
  }

  // KNL does not recognize dependency-breaking idioms for mask registers,
  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
  // Using %k0 as the undef input register is a performance heuristic based
  // on the assumption that %k0 is used less frequently than the other mask
  // registers, since it is not usable as a write mask.
  // FIXME: A more advanced approach would be to choose the best input mask
  // register based on context.
  case X86::KSET0B:
    return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
  case X86::KSET0W:
    return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
  case X86::KSET0D:
    return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
  case X86::KSET0Q:
    return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
  case X86::KSET1B:
    return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
  case X86::KSET1W:
    return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
  case X86::KSET1D:
    return Expand2AddrKreg(MIB, get(X86::KXNORDkk), X86::K0);
  case X86::KSET1Q:
    return Expand2AddrKreg(MIB, get(X86::KXNORQkk), X86::K0);
  case TargetOpcode::LOAD_STACK_GUARD:
    expandLoadStackGuard(MIB, *this);
    return true;
  case X86::XOR64_FP:
  case X86::XOR32_FP:
    return expandXorFP(MIB, *this);
  case X86::SHLDROT32ri:
    return expandSHXDROT(MIB, get(X86::SHLD32rri8));
  case X86::SHLDROT64ri:
    return expandSHXDROT(MIB, get(X86::SHLD64rri8));
  case X86::SHRDROT32ri:
    return expandSHXDROT(MIB, get(X86::SHRD32rri8));
  case X86::SHRDROT64ri:
    return expandSHXDROT(MIB, get(X86::SHRD64rri8));
  // The _DB pseudos are rewritten as ORs; per the suffix these ADDs are
  // presumably known to have disjoint operand bits, making OR equivalent —
  // TODO(review) confirm against the pseudo definitions.
  case X86::ADD8rr_DB:
    MIB->setDesc(get(X86::OR8rr));
    break;
  case X86::ADD16rr_DB:
    MIB->setDesc(get(X86::OR16rr));
    break;
  case X86::ADD32rr_DB:
    MIB->setDesc(get(X86::OR32rr));
    break;
  case X86::ADD64rr_DB:
    MIB->setDesc(get(X86::OR64rr));
    break;
  case X86::ADD8ri_DB:
    MIB->setDesc(get(X86::OR8ri));
    break;
  case X86::ADD16ri_DB:
    MIB->setDesc(get(X86::OR16ri));
    break;
  case X86::ADD32ri_DB:
    MIB->setDesc(get(X86::OR32ri));
    break;
  case X86::ADD64ri32_DB:
    MIB->setDesc(get(X86::OR64ri32));
    break;
  }
  return false;
}
6437
6438/// Return true for all instructions that only update
6439/// the first 32 or 64-bits of the destination register and leave the rest
6440/// unmodified. This can be used to avoid folding loads if the instructions
6441/// only update part of the destination register, and the non-updated part is
6442/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
6443/// instructions breaks the partial register dependency and it can improve
6444/// performance. e.g.:
6445///
6446/// movss (%rdi), %xmm0
6447/// cvtss2sd %xmm0, %xmm0
6448///
6449/// Instead of
6450/// cvtss2sd (%rdi), %xmm0
6451///
6452/// FIXME: This should be turned into a TSFlags.
6453///
static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
                                bool ForLoadFold = false) {
  // When ForLoadFold is true the caller is asking whether unfolding matters
  // for load folding; opcodes whose partial update is unaffected by folding
  // (GPR inputs) return false in that mode.
  switch (Opcode) {
  case X86::CVTSI2SSrr:
  case X86::CVTSI2SSrm:
  case X86::CVTSI642SSrr:
  case X86::CVTSI642SSrm:
  case X86::CVTSI2SDrr:
  case X86::CVTSI2SDrm:
  case X86::CVTSI642SDrr:
  case X86::CVTSI642SDrm:
    // Load folding won't affect the undef register update since the input is
    // a GPR.
    return !ForLoadFold;
  case X86::CVTSD2SSrr:
  case X86::CVTSD2SSrm:
  case X86::CVTSS2SDrr:
  case X86::CVTSS2SDrm:
  case X86::MOVHPDrm:
  case X86::MOVHPSrm:
  case X86::MOVLPDrm:
  case X86::MOVLPSrm:
  case X86::RCPSSr:
  case X86::RCPSSm:
  case X86::RCPSSr_Int:
  case X86::RCPSSm_Int:
  case X86::ROUNDSDri:
  case X86::ROUNDSDmi:
  case X86::ROUNDSSri:
  case X86::ROUNDSSmi:
  case X86::RSQRTSSr:
  case X86::RSQRTSSm:
  case X86::RSQRTSSr_Int:
  case X86::RSQRTSSm_Int:
  case X86::SQRTSSr:
  case X86::SQRTSSm:
  case X86::SQRTSSr_Int:
  case X86::SQRTSSm_Int:
  case X86::SQRTSDr:
  case X86::SQRTSDm:
  case X86::SQRTSDr_Int:
  case X86::SQRTSDm_Int:
    return true;
  // The groups below are only partial updates on CPUs with the matching
  // false-dependency tuning flag.
  case X86::VFCMULCPHZ128rm:
  case X86::VFCMULCPHZ128rmb:
  case X86::VFCMULCPHZ128rmbkz:
  case X86::VFCMULCPHZ128rmkz:
  case X86::VFCMULCPHZ128rr:
  case X86::VFCMULCPHZ128rrkz:
  case X86::VFCMULCPHZ256rm:
  case X86::VFCMULCPHZ256rmb:
  case X86::VFCMULCPHZ256rmbkz:
  case X86::VFCMULCPHZ256rmkz:
  case X86::VFCMULCPHZ256rr:
  case X86::VFCMULCPHZ256rrkz:
  case X86::VFCMULCPHZrm:
  case X86::VFCMULCPHZrmb:
  case X86::VFCMULCPHZrmbkz:
  case X86::VFCMULCPHZrmkz:
  case X86::VFCMULCPHZrr:
  case X86::VFCMULCPHZrrb:
  case X86::VFCMULCPHZrrbkz:
  case X86::VFCMULCPHZrrkz:
  case X86::VFMULCPHZ128rm:
  case X86::VFMULCPHZ128rmb:
  case X86::VFMULCPHZ128rmbkz:
  case X86::VFMULCPHZ128rmkz:
  case X86::VFMULCPHZ128rr:
  case X86::VFMULCPHZ128rrkz:
  case X86::VFMULCPHZ256rm:
  case X86::VFMULCPHZ256rmb:
  case X86::VFMULCPHZ256rmbkz:
  case X86::VFMULCPHZ256rmkz:
  case X86::VFMULCPHZ256rr:
  case X86::VFMULCPHZ256rrkz:
  case X86::VFMULCPHZrm:
  case X86::VFMULCPHZrmb:
  case X86::VFMULCPHZrmbkz:
  case X86::VFMULCPHZrmkz:
  case X86::VFMULCPHZrr:
  case X86::VFMULCPHZrrb:
  case X86::VFMULCPHZrrbkz:
  case X86::VFMULCPHZrrkz:
  case X86::VFCMULCSHZrm:
  case X86::VFCMULCSHZrmkz:
  case X86::VFCMULCSHZrr:
  case X86::VFCMULCSHZrrb:
  case X86::VFCMULCSHZrrbkz:
  case X86::VFCMULCSHZrrkz:
  case X86::VFMULCSHZrm:
  case X86::VFMULCSHZrmkz:
  case X86::VFMULCSHZrr:
  case X86::VFMULCSHZrrb:
  case X86::VFMULCSHZrrbkz:
  case X86::VFMULCSHZrrkz:
    return Subtarget.hasMULCFalseDeps();
  case X86::VPERMDYrm:
  case X86::VPERMDYrr:
  case X86::VPERMQYmi:
  case X86::VPERMQYri:
  case X86::VPERMPSYrm:
  case X86::VPERMPSYrr:
  case X86::VPERMPDYmi:
  case X86::VPERMPDYri:
  case X86::VPERMDZ256rm:
  case X86::VPERMDZ256rmb:
  case X86::VPERMDZ256rmbkz:
  case X86::VPERMDZ256rmkz:
  case X86::VPERMDZ256rr:
  case X86::VPERMDZ256rrkz:
  case X86::VPERMDZrm:
  case X86::VPERMDZrmb:
  case X86::VPERMDZrmbkz:
  case X86::VPERMDZrmkz:
  case X86::VPERMDZrr:
  case X86::VPERMDZrrkz:
  case X86::VPERMQZ256mbi:
  case X86::VPERMQZ256mbikz:
  case X86::VPERMQZ256mi:
  case X86::VPERMQZ256mikz:
  case X86::VPERMQZ256ri:
  case X86::VPERMQZ256rikz:
  case X86::VPERMQZ256rm:
  case X86::VPERMQZ256rmb:
  case X86::VPERMQZ256rmbkz:
  case X86::VPERMQZ256rmkz:
  case X86::VPERMQZ256rr:
  case X86::VPERMQZ256rrkz:
  case X86::VPERMQZmbi:
  case X86::VPERMQZmbikz:
  case X86::VPERMQZmi:
  case X86::VPERMQZmikz:
  case X86::VPERMQZri:
  case X86::VPERMQZrikz:
  case X86::VPERMQZrm:
  case X86::VPERMQZrmb:
  case X86::VPERMQZrmbkz:
  case X86::VPERMQZrmkz:
  case X86::VPERMQZrr:
  case X86::VPERMQZrrkz:
  case X86::VPERMPSZ256rm:
  case X86::VPERMPSZ256rmb:
  case X86::VPERMPSZ256rmbkz:
  case X86::VPERMPSZ256rmkz:
  case X86::VPERMPSZ256rr:
  case X86::VPERMPSZ256rrkz:
  case X86::VPERMPSZrm:
  case X86::VPERMPSZrmb:
  case X86::VPERMPSZrmbkz:
  case X86::VPERMPSZrmkz:
  case X86::VPERMPSZrr:
  case X86::VPERMPSZrrkz:
  case X86::VPERMPDZ256mbi:
  case X86::VPERMPDZ256mbikz:
  case X86::VPERMPDZ256mi:
  case X86::VPERMPDZ256mikz:
  case X86::VPERMPDZ256ri:
  case X86::VPERMPDZ256rikz:
  case X86::VPERMPDZ256rm:
  case X86::VPERMPDZ256rmb:
  case X86::VPERMPDZ256rmbkz:
  case X86::VPERMPDZ256rmkz:
  case X86::VPERMPDZ256rr:
  case X86::VPERMPDZ256rrkz:
  case X86::VPERMPDZmbi:
  case X86::VPERMPDZmbikz:
  case X86::VPERMPDZmi:
  case X86::VPERMPDZmikz:
  case X86::VPERMPDZri:
  case X86::VPERMPDZrikz:
  case X86::VPERMPDZrm:
  case X86::VPERMPDZrmb:
  case X86::VPERMPDZrmbkz:
  case X86::VPERMPDZrmkz:
  case X86::VPERMPDZrr:
  case X86::VPERMPDZrrkz:
    return Subtarget.hasPERMFalseDeps();
  case X86::VRANGEPDZ128rmbi:
  case X86::VRANGEPDZ128rmbikz:
  case X86::VRANGEPDZ128rmi:
  case X86::VRANGEPDZ128rmikz:
  case X86::VRANGEPDZ128rri:
  case X86::VRANGEPDZ128rrikz:
  case X86::VRANGEPDZ256rmbi:
  case X86::VRANGEPDZ256rmbikz:
  case X86::VRANGEPDZ256rmi:
  case X86::VRANGEPDZ256rmikz:
  case X86::VRANGEPDZ256rri:
  case X86::VRANGEPDZ256rrikz:
  case X86::VRANGEPDZrmbi:
  case X86::VRANGEPDZrmbikz:
  case X86::VRANGEPDZrmi:
  case X86::VRANGEPDZrmikz:
  case X86::VRANGEPDZrri:
  case X86::VRANGEPDZrrib:
  case X86::VRANGEPDZrribkz:
  case X86::VRANGEPDZrrikz:
  case X86::VRANGEPSZ128rmbi:
  case X86::VRANGEPSZ128rmbikz:
  case X86::VRANGEPSZ128rmi:
  case X86::VRANGEPSZ128rmikz:
  case X86::VRANGEPSZ128rri:
  case X86::VRANGEPSZ128rrikz:
  case X86::VRANGEPSZ256rmbi:
  case X86::VRANGEPSZ256rmbikz:
  case X86::VRANGEPSZ256rmi:
  case X86::VRANGEPSZ256rmikz:
  case X86::VRANGEPSZ256rri:
  case X86::VRANGEPSZ256rrikz:
  case X86::VRANGEPSZrmbi:
  case X86::VRANGEPSZrmbikz:
  case X86::VRANGEPSZrmi:
  case X86::VRANGEPSZrmikz:
  case X86::VRANGEPSZrri:
  case X86::VRANGEPSZrrib:
  case X86::VRANGEPSZrribkz:
  case X86::VRANGEPSZrrikz:
  case X86::VRANGESDZrmi:
  case X86::VRANGESDZrmikz:
  case X86::VRANGESDZrri:
  case X86::VRANGESDZrrib:
  case X86::VRANGESDZrribkz:
  case X86::VRANGESDZrrikz:
  case X86::VRANGESSZrmi:
  case X86::VRANGESSZrmikz:
  case X86::VRANGESSZrri:
  case X86::VRANGESSZrrib:
  case X86::VRANGESSZrribkz:
  case X86::VRANGESSZrrikz:
    return Subtarget.hasRANGEFalseDeps();
  case X86::VGETMANTSSZrmi:
  case X86::VGETMANTSSZrmikz:
  case X86::VGETMANTSSZrri:
  case X86::VGETMANTSSZrrib:
  case X86::VGETMANTSSZrribkz:
  case X86::VGETMANTSSZrrikz:
  case X86::VGETMANTSDZrmi:
  case X86::VGETMANTSDZrmikz:
  case X86::VGETMANTSDZrri:
  case X86::VGETMANTSDZrrib:
  case X86::VGETMANTSDZrribkz:
  case X86::VGETMANTSDZrrikz:
  case X86::VGETMANTSHZrmi:
  case X86::VGETMANTSHZrmikz:
  case X86::VGETMANTSHZrri:
  case X86::VGETMANTSHZrrib:
  case X86::VGETMANTSHZrribkz:
  case X86::VGETMANTSHZrrikz:
  case X86::VGETMANTPSZ128rmbi:
  case X86::VGETMANTPSZ128rmbikz:
  case X86::VGETMANTPSZ128rmi:
  case X86::VGETMANTPSZ128rmikz:
  case X86::VGETMANTPSZ256rmbi:
  case X86::VGETMANTPSZ256rmbikz:
  case X86::VGETMANTPSZ256rmi:
  case X86::VGETMANTPSZ256rmikz:
  case X86::VGETMANTPSZrmbi:
  case X86::VGETMANTPSZrmbikz:
  case X86::VGETMANTPSZrmi:
  case X86::VGETMANTPSZrmikz:
  case X86::VGETMANTPDZ128rmbi:
  case X86::VGETMANTPDZ128rmbikz:
  case X86::VGETMANTPDZ128rmi:
  case X86::VGETMANTPDZ128rmikz:
  case X86::VGETMANTPDZ256rmbi:
  case X86::VGETMANTPDZ256rmbikz:
  case X86::VGETMANTPDZ256rmi:
  case X86::VGETMANTPDZ256rmikz:
  case X86::VGETMANTPDZrmbi:
  case X86::VGETMANTPDZrmbikz:
  case X86::VGETMANTPDZrmi:
  case X86::VGETMANTPDZrmikz:
    return Subtarget.hasGETMANTFalseDeps();
  case X86::VPMULLQZ128rm:
  case X86::VPMULLQZ128rmb:
  case X86::VPMULLQZ128rmbkz:
  case X86::VPMULLQZ128rmkz:
  case X86::VPMULLQZ128rr:
  case X86::VPMULLQZ128rrkz:
  case X86::VPMULLQZ256rm:
  case X86::VPMULLQZ256rmb:
  case X86::VPMULLQZ256rmbkz:
  case X86::VPMULLQZ256rmkz:
  case X86::VPMULLQZ256rr:
  case X86::VPMULLQZ256rrkz:
  case X86::VPMULLQZrm:
  case X86::VPMULLQZrmb:
  case X86::VPMULLQZrmbkz:
  case X86::VPMULLQZrmkz:
  case X86::VPMULLQZrr:
  case X86::VPMULLQZrrkz:
    return Subtarget.hasMULLQFalseDeps();
  // GPR
  case X86::POPCNT32rm:
  case X86::POPCNT32rr:
  case X86::POPCNT64rm:
  case X86::POPCNT64rr:
    return Subtarget.hasPOPCNTFalseDeps();
  case X86::LZCNT32rm:
  case X86::LZCNT32rr:
  case X86::LZCNT64rm:
  case X86::LZCNT64rr:
  case X86::TZCNT32rm:
  case X86::TZCNT32rr:
  case X86::TZCNT64rm:
  case X86::TZCNT64rr:
    return Subtarget.hasLZCNTFalseDeps();
  }

  return false;
}
6765
/// Inform the BreakFalseDeps pass how many idle
/// instructions we would like before a partial register update.
/// (The `unsigned X86InstrInfo::getPartialRegUpdateClearance(` signature
/// line was dropped by the source rendering.)
                                     const MachineInstr &MI, unsigned OpNum,
                                     const TargetRegisterInfo *TRI) const {

  // Only the destination operand (operand 0) can be a partial update.
  if (OpNum != 0)
    return 0;

  // NDD ops with 8/16b results may appear to be partial register
  // updates after register allocation.
  bool HasNDDPartialWrite = false;
  if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
    Register Reg = MI.getOperand(0).getReg();
    if (!Reg.isVirtual())
      HasNDDPartialWrite =
          X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
  }

  if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
    return 0;

  // Check if the result register is also used as a source.
  // For non-NDD ops, this means a partial update is wanted, hence we return 0.
  // For NDD ops, this means it is possible to compress the instruction
  // to a legacy form in CompressEVEX, which would create an unwanted partial
  // update, so we return the clearance.
  const MachineOperand &MO = MI.getOperand(0);
  Register Reg = MO.getReg();
  bool ReadsReg = false;
  if (Reg.isVirtual())
    ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
  else
    ReadsReg = MI.readsRegister(Reg, TRI);
  if (ReadsReg != HasNDDPartialWrite)
    return 0;

  // If any instructions in the clearance range are reading Reg, insert a
  // dependency breaking instruction, which is inexpensive and is likely to
  // be hidden in other instruction's cycles.
  // NOTE(review): the final `return PartialRegUpdateClearance;` line appears
  // to have been dropped by the source rendering here — verify upstream.
}
6808
// Return true for any instruction that copies the high bits of the first
// source operand into the unused high bits of the destination operand.
// Also returns true for instructions that have two inputs where one may
// be undef and we want it to use the same register as the other input.
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
                              bool ForLoadFold = false) {
  // OpNum identifies which operand the caller is asking about; each group
  // below returns true only for its relevant source operand(s).
  switch (Opcode) {
  case X86::MMX_PUNPCKHBWrr:
  case X86::MMX_PUNPCKHWDrr:
  case X86::MMX_PUNPCKHDQrr:
  case X86::MMX_PUNPCKLBWrr:
  case X86::MMX_PUNPCKLWDrr:
  case X86::MMX_PUNPCKLDQrr:
  case X86::MOVHLPSrr:
  case X86::PACKSSWBrr:
  case X86::PACKUSWBrr:
  case X86::PACKSSDWrr:
  case X86::PACKUSDWrr:
  case X86::PUNPCKHBWrr:
  case X86::PUNPCKLBWrr:
  case X86::PUNPCKHWDrr:
  case X86::PUNPCKLWDrr:
  case X86::PUNPCKHDQrr:
  case X86::PUNPCKLDQrr:
  case X86::PUNPCKHQDQrr:
  case X86::PUNPCKLQDQrr:
  case X86::SHUFPDrri:
  case X86::SHUFPSrri:
    // These instructions are sometimes used with an undef first or second
    // source. Return true here so BreakFalseDeps will assign this source to the
    // same register as the first source to avoid a false dependency.
    // Operand 1 of these instructions is tied so they're separate from their
    // VEX counterparts.
    return OpNum == 2 && !ForLoadFold;

  case X86::VMOVLHPSrr:
  case X86::VMOVLHPSZrr:
  case X86::VPACKSSWBrr:
  case X86::VPACKUSWBrr:
  case X86::VPACKSSDWrr:
  case X86::VPACKUSDWrr:
  case X86::VPACKSSWBZ128rr:
  case X86::VPACKUSWBZ128rr:
  case X86::VPACKSSDWZ128rr:
  case X86::VPACKUSDWZ128rr:
  case X86::VPERM2F128rri:
  case X86::VPERM2I128rri:
  case X86::VSHUFF32X4Z256rri:
  case X86::VSHUFF32X4Zrri:
  case X86::VSHUFF64X2Z256rri:
  case X86::VSHUFF64X2Zrri:
  case X86::VSHUFI32X4Z256rri:
  case X86::VSHUFI32X4Zrri:
  case X86::VSHUFI64X2Z256rri:
  case X86::VSHUFI64X2Zrri:
  case X86::VPUNPCKHBWrr:
  case X86::VPUNPCKLBWrr:
  case X86::VPUNPCKHBWYrr:
  case X86::VPUNPCKLBWYrr:
  case X86::VPUNPCKHBWZ128rr:
  case X86::VPUNPCKLBWZ128rr:
  case X86::VPUNPCKHBWZ256rr:
  case X86::VPUNPCKLBWZ256rr:
  case X86::VPUNPCKHBWZrr:
  case X86::VPUNPCKLBWZrr:
  case X86::VPUNPCKHWDrr:
  case X86::VPUNPCKLWDrr:
  case X86::VPUNPCKHWDYrr:
  case X86::VPUNPCKLWDYrr:
  case X86::VPUNPCKHWDZ128rr:
  case X86::VPUNPCKLWDZ128rr:
  case X86::VPUNPCKHWDZ256rr:
  case X86::VPUNPCKLWDZ256rr:
  case X86::VPUNPCKHWDZrr:
  case X86::VPUNPCKLWDZrr:
  case X86::VPUNPCKHDQrr:
  case X86::VPUNPCKLDQrr:
  case X86::VPUNPCKHDQYrr:
  case X86::VPUNPCKLDQYrr:
  case X86::VPUNPCKHDQZ128rr:
  case X86::VPUNPCKLDQZ128rr:
  case X86::VPUNPCKHDQZ256rr:
  case X86::VPUNPCKLDQZ256rr:
  case X86::VPUNPCKHDQZrr:
  case X86::VPUNPCKLDQZrr:
  case X86::VPUNPCKHQDQrr:
  case X86::VPUNPCKLQDQrr:
  case X86::VPUNPCKHQDQYrr:
  case X86::VPUNPCKLQDQYrr:
  case X86::VPUNPCKHQDQZ128rr:
  case X86::VPUNPCKLQDQZ128rr:
  case X86::VPUNPCKHQDQZ256rr:
  case X86::VPUNPCKLQDQZ256rr:
  case X86::VPUNPCKHQDQZrr:
  case X86::VPUNPCKLQDQZrr:
    // These instructions are sometimes used with an undef first or second
    // source. Return true here so BreakFalseDeps will assign this source to the
    // same register as the first source to avoid a false dependency.
    return (OpNum == 1 || OpNum == 2) && !ForLoadFold;

  case X86::VCVTSI2SSrr:
  case X86::VCVTSI2SSrm:
  case X86::VCVTSI2SSrr_Int:
  case X86::VCVTSI2SSrm_Int:
  case X86::VCVTSI642SSrr:
  case X86::VCVTSI642SSrm:
  case X86::VCVTSI642SSrr_Int:
  case X86::VCVTSI642SSrm_Int:
  case X86::VCVTSI2SDrr:
  case X86::VCVTSI2SDrm:
  case X86::VCVTSI2SDrr_Int:
  case X86::VCVTSI2SDrm_Int:
  case X86::VCVTSI642SDrr:
  case X86::VCVTSI642SDrm:
  case X86::VCVTSI642SDrr_Int:
  case X86::VCVTSI642SDrm_Int:
  // AVX-512
  case X86::VCVTSI2SSZrr:
  case X86::VCVTSI2SSZrm:
  case X86::VCVTSI2SSZrr_Int:
  case X86::VCVTSI2SSZrrb_Int:
  case X86::VCVTSI2SSZrm_Int:
  case X86::VCVTSI642SSZrr:
  case X86::VCVTSI642SSZrm:
  case X86::VCVTSI642SSZrr_Int:
  case X86::VCVTSI642SSZrrb_Int:
  case X86::VCVTSI642SSZrm_Int:
  case X86::VCVTSI2SDZrr:
  case X86::VCVTSI2SDZrm:
  case X86::VCVTSI2SDZrr_Int:
  case X86::VCVTSI2SDZrm_Int:
  case X86::VCVTSI642SDZrr:
  case X86::VCVTSI642SDZrm:
  case X86::VCVTSI642SDZrr_Int:
  case X86::VCVTSI642SDZrrb_Int:
  case X86::VCVTSI642SDZrm_Int:
  case X86::VCVTUSI2SSZrr:
  case X86::VCVTUSI2SSZrm:
  case X86::VCVTUSI2SSZrr_Int:
  case X86::VCVTUSI2SSZrrb_Int:
  case X86::VCVTUSI2SSZrm_Int:
  case X86::VCVTUSI642SSZrr:
  case X86::VCVTUSI642SSZrm:
  case X86::VCVTUSI642SSZrr_Int:
  case X86::VCVTUSI642SSZrrb_Int:
  case X86::VCVTUSI642SSZrm_Int:
  case X86::VCVTUSI2SDZrr:
  case X86::VCVTUSI2SDZrm:
  case X86::VCVTUSI2SDZrr_Int:
  case X86::VCVTUSI2SDZrm_Int:
  case X86::VCVTUSI642SDZrr:
  case X86::VCVTUSI642SDZrm:
  case X86::VCVTUSI642SDZrr_Int:
  case X86::VCVTUSI642SDZrrb_Int:
  case X86::VCVTUSI642SDZrm_Int:
  case X86::VCVTSI2SHZrr:
  case X86::VCVTSI2SHZrm:
  case X86::VCVTSI2SHZrr_Int:
  case X86::VCVTSI2SHZrrb_Int:
  case X86::VCVTSI2SHZrm_Int:
  case X86::VCVTSI642SHZrr:
  case X86::VCVTSI642SHZrm:
  case X86::VCVTSI642SHZrr_Int:
  case X86::VCVTSI642SHZrrb_Int:
  case X86::VCVTSI642SHZrm_Int:
  case X86::VCVTUSI2SHZrr:
  case X86::VCVTUSI2SHZrm:
  case X86::VCVTUSI2SHZrr_Int:
  case X86::VCVTUSI2SHZrrb_Int:
  case X86::VCVTUSI2SHZrm_Int:
  case X86::VCVTUSI642SHZrr:
  case X86::VCVTUSI642SHZrm:
  case X86::VCVTUSI642SHZrr_Int:
  case X86::VCVTUSI642SHZrrb_Int:
  case X86::VCVTUSI642SHZrm_Int:
    // Load folding won't affect the undef register update since the input is
    // a GPR.
    return OpNum == 1 && !ForLoadFold;
  case X86::VCVTSD2SSrr:
  case X86::VCVTSD2SSrm:
  case X86::VCVTSD2SSrr_Int:
  case X86::VCVTSD2SSrm_Int:
  case X86::VCVTSS2SDrr:
  case X86::VCVTSS2SDrm:
  case X86::VCVTSS2SDrr_Int:
  case X86::VCVTSS2SDrm_Int:
  case X86::VRCPSSr:
  case X86::VRCPSSr_Int:
  case X86::VRCPSSm:
  case X86::VRCPSSm_Int:
  case X86::VROUNDSDri:
  case X86::VROUNDSDmi:
  case X86::VROUNDSDri_Int:
  case X86::VROUNDSDmi_Int:
  case X86::VROUNDSSri:
  case X86::VROUNDSSmi:
  case X86::VROUNDSSri_Int:
  case X86::VROUNDSSmi_Int:
  case X86::VRSQRTSSr:
  case X86::VRSQRTSSr_Int:
  case X86::VRSQRTSSm:
  case X86::VRSQRTSSm_Int:
  case X86::VSQRTSSr:
  case X86::VSQRTSSr_Int:
  case X86::VSQRTSSm:
  case X86::VSQRTSSm_Int:
  case X86::VSQRTSDr:
  case X86::VSQRTSDr_Int:
  case X86::VSQRTSDm:
  case X86::VSQRTSDm_Int:
  // AVX-512
  case X86::VCVTSD2SSZrr:
  case X86::VCVTSD2SSZrr_Int:
  case X86::VCVTSD2SSZrrb_Int:
  case X86::VCVTSD2SSZrm:
  case X86::VCVTSD2SSZrm_Int:
  case X86::VCVTSS2SDZrr:
  case X86::VCVTSS2SDZrr_Int:
  case X86::VCVTSS2SDZrrb_Int:
  case X86::VCVTSS2SDZrm:
  case X86::VCVTSS2SDZrm_Int:
  case X86::VGETEXPSDZr:
  case X86::VGETEXPSDZrb:
  case X86::VGETEXPSDZm:
  case X86::VGETEXPSSZr:
  case X86::VGETEXPSSZrb:
  case X86::VGETEXPSSZm:
  case X86::VGETMANTSDZrri:
  case X86::VGETMANTSDZrrib:
  case X86::VGETMANTSDZrmi:
  case X86::VGETMANTSSZrri:
  case X86::VGETMANTSSZrrib:
  case X86::VGETMANTSSZrmi:
  case X86::VRNDSCALESDZrri:
  case X86::VRNDSCALESDZrri_Int:
  case X86::VRNDSCALESDZrrib_Int:
  case X86::VRNDSCALESDZrmi:
  case X86::VRNDSCALESDZrmi_Int:
  case X86::VRNDSCALESSZrri:
  case X86::VRNDSCALESSZrri_Int:
  case X86::VRNDSCALESSZrrib_Int:
  case X86::VRNDSCALESSZrmi:
  case X86::VRNDSCALESSZrmi_Int:
  case X86::VRCP14SDZrr:
  case X86::VRCP14SDZrm:
  case X86::VRCP14SSZrr:
  case X86::VRCP14SSZrm:
  case X86::VRCPSHZrr:
  case X86::VRCPSHZrm:
  case X86::VRSQRTSHZrr:
  case X86::VRSQRTSHZrm:
  case X86::VREDUCESHZrmi:
  case X86::VREDUCESHZrri:
  case X86::VREDUCESHZrrib:
  case X86::VGETEXPSHZr:
  case X86::VGETEXPSHZrb:
  case X86::VGETEXPSHZm:
  case X86::VGETMANTSHZrri:
  case X86::VGETMANTSHZrrib:
  case X86::VGETMANTSHZrmi:
  case X86::VRNDSCALESHZrri:
  case X86::VRNDSCALESHZrri_Int:
  case X86::VRNDSCALESHZrrib_Int:
  case X86::VRNDSCALESHZrmi:
  case X86::VRNDSCALESHZrmi_Int:
  case X86::VSQRTSHZr:
  case X86::VSQRTSHZr_Int:
  case X86::VSQRTSHZrb_Int:
  case X86::VSQRTSHZm:
  case X86::VSQRTSHZm_Int:
  case X86::VRCP28SDZr:
  case X86::VRCP28SDZrb:
  case X86::VRCP28SDZm:
  case X86::VRCP28SSZr:
  case X86::VRCP28SSZrb:
  case X86::VRCP28SSZm:
  case X86::VREDUCESSZrmi:
  case X86::VREDUCESSZrri:
  case X86::VREDUCESSZrrib:
  case X86::VRSQRT14SDZrr:
  case X86::VRSQRT14SDZrm:
  case X86::VRSQRT14SSZrr:
  case X86::VRSQRT14SSZrm:
  case X86::VRSQRT28SDZr:
  case X86::VRSQRT28SDZrb:
  case X86::VRSQRT28SDZm:
  case X86::VRSQRT28SSZr:
  case X86::VRSQRT28SSZrb:
  case X86::VRSQRT28SSZm:
  case X86::VSQRTSSZr:
  case X86::VSQRTSSZr_Int:
  case X86::VSQRTSSZrb_Int:
  case X86::VSQRTSSZm:
  case X86::VSQRTSSZm_Int:
  case X86::VSQRTSDZr:
  case X86::VSQRTSDZr_Int:
  case X86::VSQRTSDZrb_Int:
  case X86::VSQRTSDZm:
  case X86::VSQRTSDZm_Int:
  case X86::VCVTSD2SHZrr:
  case X86::VCVTSD2SHZrr_Int:
  case X86::VCVTSD2SHZrrb_Int:
  case X86::VCVTSD2SHZrm:
  case X86::VCVTSD2SHZrm_Int:
  case X86::VCVTSS2SHZrr:
  case X86::VCVTSS2SHZrr_Int:
  case X86::VCVTSS2SHZrrb_Int:
  case X86::VCVTSS2SHZrm:
  case X86::VCVTSS2SHZrm_Int:
  case X86::VCVTSH2SDZrr:
  case X86::VCVTSH2SDZrr_Int:
  case X86::VCVTSH2SDZrrb_Int:
  case X86::VCVTSH2SDZrm:
  case X86::VCVTSH2SDZrm_Int:
  case X86::VCVTSH2SSZrr:
  case X86::VCVTSH2SSZrr_Int:
  case X86::VCVTSH2SSZrrb_Int:
  case X86::VCVTSH2SSZrm:
  case X86::VCVTSH2SSZrm_Int:
    return OpNum == 1;
  case X86::VMOVSSZrrk:
  case X86::VMOVSDZrrk:
    return OpNum == 3 && !ForLoadFold;
  case X86::VMOVSSZrrkz:
  case X86::VMOVSDZrrkz:
    return OpNum == 2 && !ForLoadFold;
  }

  return false;
}
7140
7141/// Inform the BreakFalseDeps pass how many idle instructions we would like
7142/// before certain undef register reads.
7143///
7144/// This catches the VCVTSI2SD family of instructions:
7145///
7146/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
7147///
/// We should be careful *not* to catch VXOR idioms which are presumably
7149/// handled specially in the pipeline:
7150///
7151/// vxorps undef %xmm1, undef %xmm1, %xmm1
7152///
7153/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
7154/// high bits that are passed-through are not live.
7155unsigned
7157 const TargetRegisterInfo *TRI) const {
7158 const MachineOperand &MO = MI.getOperand(OpNum);
7159 if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum))
7160 return UndefRegClearance;
7161
7162 return 0;
7163}
7164
7166 MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
7167 Register Reg = MI.getOperand(OpNum).getReg();
7168 // If MI kills this register, the false dependence is already broken.
7169 if (MI.killsRegister(Reg, TRI))
7170 return;
7171
7172 if (X86::VR128RegClass.contains(Reg)) {
7173 // These instructions are all floating point domain, so xorps is the best
7174 // choice.
7175 unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
7176 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
7177 .addReg(Reg, RegState::Undef)
7178 .addReg(Reg, RegState::Undef);
7179 MI.addRegisterKilled(Reg, TRI, true);
7180 } else if (X86::VR256RegClass.contains(Reg)) {
7181 // Use vxorps to clear the full ymm register.
7182 // It wants to read and write the xmm sub-register.
7183 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7184 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
7185 .addReg(XReg, RegState::Undef)
7186 .addReg(XReg, RegState::Undef)
7188 MI.addRegisterKilled(Reg, TRI, true);
7189 } else if (X86::VR128XRegClass.contains(Reg)) {
7190 // Only handle VLX targets.
7191 if (!Subtarget.hasVLX())
7192 return;
7193 // Since vxorps requires AVX512DQ, vpxord should be the best choice.
7194 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
7195 .addReg(Reg, RegState::Undef)
7196 .addReg(Reg, RegState::Undef);
7197 MI.addRegisterKilled(Reg, TRI, true);
7198 } else if (X86::VR256XRegClass.contains(Reg) ||
7199 X86::VR512RegClass.contains(Reg)) {
7200 // Only handle VLX targets.
7201 if (!Subtarget.hasVLX())
7202 return;
7203 // Use vpxord to clear the full ymm/zmm register.
7204 // It wants to read and write the xmm sub-register.
7205 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7206 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
7207 .addReg(XReg, RegState::Undef)
7208 .addReg(XReg, RegState::Undef)
7210 MI.addRegisterKilled(Reg, TRI, true);
7211 } else if (X86::GR64RegClass.contains(Reg)) {
    // Using XOR32rr because it has a shorter encoding and zeroes the upper
    // bits as well.
7214 Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
7215 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
7216 .addReg(XReg, RegState::Undef)
7217 .addReg(XReg, RegState::Undef)
7219 MI.addRegisterKilled(Reg, TRI, true);
7220 } else if (X86::GR32RegClass.contains(Reg)) {
7221 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7222 .addReg(Reg, RegState::Undef)
7223 .addReg(Reg, RegState::Undef);
7224 MI.addRegisterKilled(Reg, TRI, true);
7225 } else if ((X86::GR16RegClass.contains(Reg) ||
7226 X86::GR8RegClass.contains(Reg)) &&
7227 X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
7228 // This case is only expected for NDD ops which appear to be partial
7229 // writes, but are not due to the zeroing of the upper part. Here
    // we add an implicit def of the superregister, which prevents
7231 // CompressEVEX from converting this to a legacy form.
7232 Register SuperReg = getX86SubSuperRegister(Reg, 64);
7233 MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
7234 if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
7235 BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
7236 }
7237}
7238
7240 int PtrOffset = 0) {
7241 unsigned NumAddrOps = MOs.size();
7242
7243 if (NumAddrOps < 4) {
    // FrameIndex only - add an immediate offset (whether it's zero or not).
7245 for (unsigned i = 0; i != NumAddrOps; ++i)
7246 MIB.add(MOs[i]);
7247 addOffset(MIB, PtrOffset);
7248 } else {
7249 // General Memory Addressing - we need to add any offset to an existing
7250 // offset.
7251 assert(MOs.size() == 5 && "Unexpected memory operand list length");
7252 for (unsigned i = 0; i != NumAddrOps; ++i) {
7253 const MachineOperand &MO = MOs[i];
7254 if (i == 3 && PtrOffset != 0) {
7255 MIB.addDisp(MO, PtrOffset);
7256 } else {
7257 MIB.add(MO);
7258 }
7259 }
7260 }
7261}
7262
7264 MachineInstr &NewMI,
7265 const TargetInstrInfo &TII) {
7266 MachineRegisterInfo &MRI = MF.getRegInfo();
7267
7268 for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7269 MachineOperand &MO = NewMI.getOperand(Idx);
7270 // We only need to update constraints on virtual register operands.
7271 if (!MO.isReg())
7272 continue;
7273 Register Reg = MO.getReg();
7274 if (!Reg.isVirtual())
7275 continue;
7276
7277 auto *NewRC =
7278 MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx));
7279 if (!NewRC) {
7280 LLVM_DEBUG(
7281 dbgs() << "WARNING: Unable to update register constraint for operand "
7282 << Idx << " of instruction:\n";
7283 NewMI.dump(); dbgs() << "\n");
7284 }
7285 }
7286}
7287
7288static MachineInstr *fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
7292 const TargetInstrInfo &TII) {
7293 // Create the base instruction with the memory operand as the first part.
7294 // Omit the implicit operands, something BuildMI can't do.
7295 MachineInstr *NewMI =
7296 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7297 MachineInstrBuilder MIB(MF, NewMI);
7298 addOperands(MIB, MOs);
7299
7300 // Loop over the rest of the ri operands, converting them over.
7301 unsigned NumOps = MI.getDesc().getNumOperands() - 2;
7302 for (unsigned i = 0; i != NumOps; ++i) {
7303 MachineOperand &MO = MI.getOperand(i + 2);
7304 MIB.add(MO);
7305 }
7306 for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2))
7307 MIB.add(MO);
7308
7309 updateOperandRegConstraints(MF, *NewMI, TII);
7310
7311 MachineBasicBlock *MBB = InsertPt->getParent();
7312 MBB->insert(InsertPt, NewMI);
7313
7314 return MIB;
7315}
7316
7317static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode,
7318 unsigned OpNo, ArrayRef<MachineOperand> MOs,
7321 int PtrOffset = 0) {
7322 // Omit the implicit operands, something BuildMI can't do.
7323 MachineInstr *NewMI =
7324 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7325 MachineInstrBuilder MIB(MF, NewMI);
7326
7327 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
7328 MachineOperand &MO = MI.getOperand(i);
7329 if (i == OpNo) {
7330 assert(MO.isReg() && "Expected to fold into reg operand!");
7331 addOperands(MIB, MOs, PtrOffset);
7332 } else {
7333 MIB.add(MO);
7334 }
7335 }
7336
7337 updateOperandRegConstraints(MF, *NewMI, TII);
7338
7339 // Copy the NoFPExcept flag from the instruction we're fusing.
7342
7343 MachineBasicBlock *MBB = InsertPt->getParent();
7344 MBB->insert(InsertPt, NewMI);
7345
7346 return MIB;
7347}
7348
7349static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
7352 MachineInstr &MI) {
7353 MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
7354 MI.getDebugLoc(), TII.get(Opcode));
7355 addOperands(MIB, MOs);
7356 return MIB.addImm(0);
7357}
7358
7359MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
7360 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7362 unsigned Size, Align Alignment) const {
7363 switch (MI.getOpcode()) {
7364 case X86::INSERTPSrri:
7365 case X86::VINSERTPSrri:
7366 case X86::VINSERTPSZrri:
7367 // Attempt to convert the load of inserted vector into a fold load
7368 // of a single float.
7369 if (OpNum == 2) {
7370 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
7371 unsigned ZMask = Imm & 15;
7372 unsigned DstIdx = (Imm >> 4) & 3;
7373 unsigned SrcIdx = (Imm >> 6) & 3;
7374
7375 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7376 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7377 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7378 if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
7379 (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) {
7380 int PtrOffset = SrcIdx * 4;
7381 unsigned NewImm = (DstIdx << 4) | ZMask;
7382 unsigned NewOpCode =
7383 (MI.getOpcode() == X86::VINSERTPSZrri) ? X86::VINSERTPSZrmi
7384 : (MI.getOpcode() == X86::VINSERTPSrri) ? X86::VINSERTPSrmi
7385 : X86::INSERTPSrmi;
7386 MachineInstr *NewMI =
7387 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7388 NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
7389 return NewMI;
7390 }
7391 }
7392 break;
7393 case X86::MOVHLPSrr:
7394 case X86::VMOVHLPSrr:
7395 case X86::VMOVHLPSZrr:
7396 // Move the upper 64-bits of the second operand to the lower 64-bits.
7397 // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
7398 // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
7399 if (OpNum == 2) {
7400 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7401 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7402 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7403 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7404 unsigned NewOpCode =
7405 (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm
7406 : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7407 : X86::MOVLPSrm;
7408 MachineInstr *NewMI =
7409 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7410 return NewMI;
7411 }
7412 }
7413 break;
7414 case X86::UNPCKLPDrr:
7415 // If we won't be able to fold this to the memory form of UNPCKL, use
7416 // MOVHPD instead. Done as custom because we can't have this in the load
7417 // table twice.
7418 if (OpNum == 2) {
7419 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7420 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7421 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7422 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7423 MachineInstr *NewMI =
7424 fuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7425 return NewMI;
7426 }
7427 }
7428 break;
7429 case X86::MOV32r0:
7430 if (auto *NewMI =
7431 makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,
7432 InsertPt, MI))
7433 return NewMI;
7434 break;
7435 }
7436
7437 return nullptr;
7438}
7439
7441 MachineInstr &MI) {
7442 if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7443 !MI.getOperand(1).isReg())
7444 return false;
7445
  // There are two cases we need to handle depending on where in the pipeline
7447 // the folding attempt is being made.
7448 // -Register has the undef flag set.
7449 // -Register is produced by the IMPLICIT_DEF instruction.
7450
7451 if (MI.getOperand(1).isUndef())
7452 return true;
7453
7455 MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7456 return VRegDef && VRegDef->isImplicitDef();
7457}
7458
7459unsigned X86InstrInfo::commuteOperandsForFold(MachineInstr &MI,
7460 unsigned Idx1) const {
7461 unsigned Idx2 = CommuteAnyOperandIndex;
7462 if (!findCommutedOpIndices(MI, Idx1, Idx2))
7463 return Idx1;
7464
7465 bool HasDef = MI.getDesc().getNumDefs();
7466 Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7467 Register Reg1 = MI.getOperand(Idx1).getReg();
7468 Register Reg2 = MI.getOperand(Idx2).getReg();
7469 bool Tied1 = 0 == MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO);
7470 bool Tied2 = 0 == MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO);
7471
7472 // If either of the commutable operands are tied to the destination
7473 // then we can not commute + fold.
7474 if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2))
7475 return Idx1;
7476
7477 return commuteInstruction(MI, false, Idx1, Idx2) ? Idx2 : Idx1;
7478}
7479
7480static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx) {
7481 if (PrintFailedFusing && !MI.isCopy())
7482 dbgs() << "We failed to fuse operand " << Idx << " in " << MI;
7483}
7484
7486 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7488 unsigned Size, Align Alignment, bool AllowCommute, MachineInstr *&CopyMI,
7489 VirtRegMap *VRM) const {
7490 bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
7491 unsigned Opc = MI.getOpcode();
7492
7493 // For CPUs that favor the register form of a call or push,
7494 // do not fold loads into calls or pushes, unless optimizing for size
7495 // aggressively.
7496 if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
7497 (Opc == X86::CALL32r || Opc == X86::CALL64r ||
7498 Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
7499 Opc == X86::PUSH32r || Opc == X86::PUSH64r))
7500 return nullptr;
7501
7502 // Avoid partial and undef register update stalls unless optimizing for size.
7503 if (!MF.getFunction().hasOptSize() &&
7504 (hasPartialRegUpdate(Opc, Subtarget, /*ForLoadFold*/ true) ||
7506 return nullptr;
7507
7508 unsigned NumOps = MI.getDesc().getNumOperands();
7509 bool IsTwoAddr = NumOps > 1 && OpNum < 2 && MI.getOperand(0).isReg() &&
7510 MI.getOperand(1).isReg() &&
7511 MI.getOperand(0).getReg() == MI.getOperand(1).getReg();
7512
7513 // FIXME: AsmPrinter doesn't know how to handle
7514 // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
7515 if (Opc == X86::ADD32ri &&
7516 MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
7517 return nullptr;
7518
7519 // GOTTPOFF relocation loads can only be folded into add instructions.
7520 // FIXME: Need to exclude other relocations that only support specific
7521 // instructions.
7522 if (MOs.size() == X86::AddrNumOperands &&
7523 MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
7524 Opc != X86::ADD64rr)
7525 return nullptr;
7526
7527 // Don't fold loads into indirect calls that need a KCFI check as we'll
7528 // have to unfold these in X86TargetLowering::EmitKCFICheck anyway.
7529 if (MI.isCall() && MI.getCFIType())
7530 return nullptr;
7531
7532 // Attempt to fold any custom cases we have.
7533 if (auto *CustomMI = foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt,
7534 Size, Alignment))
7535 return CustomMI;
7536
7537 // Folding a memory location into the two-address part of a two-address
7538 // instruction is different than folding it other places. It requires
7539 // replacing the *two* registers with the memory location.
7540 //
7541 // Utilize the mapping NonNDD -> RMW for the NDD variant.
7542 unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U;
7543 // Utilize the mapping NonNDD if NDD memory variant is not preferred.
7544 bool NoNDDM = NonNDOpc && !Subtarget.hasNDDM();
7545
7546 MachineRegisterInfo &MRI = MF.getRegInfo();
7547 if (NoNDDM && !IsTwoAddr && !MRI.isSSA()) {
7548 // Bail out if dst has subreg. It happens during register-coalescer from
7549 // 704B %19:gr32 = SUB32rr_ND killed %0:gr32, killed %7:gr32, ...
7550 // 752B undef %23.sub_32bit:gr64 = COPY killed %19:gr32
7551 // 768B %25:gr32 = LEA64_32r killed %23:gr64, 1, killed %21:gr64_nosp, ...
7552 // to
7553 // 704B undef %23.sub_32bit:gr64_with_sub_8bit = SUB32rr_ND %0:gr32, ...
7554 // 768B %25:gr32 = LEA64_32r %23:gr64_with_sub_8bit, 1, %21:gr64_nosp, ...
7555 // Machine verifier fails if we try to tie %23 to the source.
7556 if (MI.getOperand(0).getSubReg())
7557 return nullptr;
7558
7559 // Bail out if dst has been assigned a physical register. Otherwise, we
7560 // cannot update LiveRegMatrix properly.
7561 if (VRM && VRM->getPhys(MI.getOperand(0).getReg()) &&
7562 MI.getOperand(0).getReg() != MI.getOperand(1).getReg())
7563 return nullptr;
7564 }
7565
7566 const X86FoldTableEntry *I =
7567 IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
7568 : lookupFoldTable(NoNDDM ? NonNDOpc : Opc, OpNum);
7569
7570 MachineInstr *NewMI = nullptr;
7571 if (I) {
7572 unsigned Opcode = I->DstOp;
7573 if (Alignment <
7574 Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
7575 return nullptr;
7576 bool NarrowToMOV32rm = false;
7577 if (Size) {
7579 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7580 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7581 // Check if it's safe to fold the load. If the size of the object is
7582 // narrower than the load width, then it's not.
7583 // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7584 if ((I->Flags & TB_FOLDED_LOAD) && Size < RCSize) {
7585 // If this is a 64-bit load, but the spill slot is 32, then we can do
7586 // a 32-bit load which is implicitly zero-extended. This likely is
7587 // due to live interval analysis remat'ing a load from stack slot.
7588 if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7589 return nullptr;
7590 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7591 return nullptr;
7592 Opcode = X86::MOV32rm;
7593 NarrowToMOV32rm = true;
7594 }
7595 // For stores, make sure the size of the object is equal to the size of
7596 // the store. If the object is larger, the extra bits would be garbage. If
7597 // the object is smaller we might overwrite another object or fault.
7598 if ((I->Flags & TB_FOLDED_STORE) && Size != RCSize)
7599 return nullptr;
7600 }
7601
7602 NewMI = IsTwoAddr ? fuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this)
7603 : fuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7604
7605 if (NarrowToMOV32rm) {
7606 // If this is the special case where we use a MOV32rm to load a 32-bit
7607 // value and zero-extend the top bits. Change the destination register
7608 // to a 32-bit one.
7609 Register DstReg = NewMI->getOperand(0).getReg();
7610 if (DstReg.isPhysical())
7611 NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7612 else
7613 NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7614 }
7615
7616 if (NoNDDM && !IsTwoAddr) {
7617 Register SrcReg = MI.getOperand(1).getReg();
7618 unsigned SrcSub = MI.getOperand(1).getSubReg();
7619 if (MI.killsRegister(SrcReg, /*TRI=*/nullptr) ||
7620 MI.getOperand(0).getReg() == SrcReg)
7621 return NewMI;
7622
7623 const TargetRegisterClass &RC = *MF.getRegInfo().getRegClass(SrcReg);
7624 Register NewSrc = MRI.isSSA() ? MRI.createVirtualRegister(&RC)
7625 : MI.getOperand(0).getReg();
7626
7627 CopyMI = BuildMI(*NewMI->getParent(), *NewMI, MI.getDebugLoc(),
7628 get(TargetOpcode::COPY))
7629 .addDef(NewSrc)
7630 .addReg(SrcReg, {}, SrcSub);
7631 NewMI->getOperand(1).setReg(NewSrc);
7632 }
7633 return NewMI;
7634 }
7635
7636 if (AllowCommute) {
7637 // If the instruction and target operand are commutable, commute the
7638 // instruction and try again.
7639 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
7640 if (CommuteOpIdx2 == OpNum) {
7641 printFailMsgforFold(MI, OpNum);
7642 return nullptr;
7643 }
7644 // Attempt to fold with the commuted version of the instruction.
7645 NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7646 Alignment, /*AllowCommute=*/false, CopyMI);
7647 if (NewMI)
7648 return NewMI;
7649 // Folding failed again - undo the commute before returning.
7650 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
7651 }
7652
7653 printFailMsgforFold(MI, OpNum);
7654 return nullptr;
7655}
7656
7659 MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr *&CopyMI,
7660 LiveIntervals *LIS, VirtRegMap *VRM) const {
7661 // Check switch flag
7662 if (NoFusing)
7663 return nullptr;
7664
7665 // Avoid partial and undef register update stalls unless optimizing for size.
7666 if (!MF.getFunction().hasOptSize() &&
7667 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7669 return nullptr;
7670
7671 // Don't fold subreg spills, or reloads that use a high subreg.
7672 for (auto Op : Ops) {
7673 MachineOperand &MO = MI.getOperand(Op);
7674 auto SubReg = MO.getSubReg();
7675 // MOV32r0 is special b/c it's used to clear a 64-bit register too.
7676 // (See patterns for MOV32r0 in TD files).
7677 if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit)
7678 continue;
7679 if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
7680 return nullptr;
7681 }
7682
7683 const MachineFrameInfo &MFI = MF.getFrameInfo();
7684 unsigned Size = MFI.getObjectSize(FrameIndex);
7685 Align Alignment = MFI.getObjectAlign(FrameIndex);
7686 // If the function stack isn't realigned we don't want to fold instructions
7687 // that need increased alignment.
7688 if (!RI.hasStackRealignment(MF))
7689 Alignment =
7690 std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
7691
7692 auto Impl = [&]() {
7693 return foldMemoryOperandImpl(
7694 MF, MI, Ops[0], MachineOperand::CreateFI(FrameIndex), InsertPt, Size,
7695 Alignment, /*AllowCommute=*/true, CopyMI, VRM);
7696 };
7697 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7698 unsigned NewOpc = 0;
7699 unsigned RCSize = 0;
7700 unsigned Opc = MI.getOpcode();
7701 switch (Opc) {
7702 default:
7703 // NDD can be folded into RMW though its Op0 and Op1 are not tied.
7704 return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl()
7705 : nullptr;
7706 case X86::TEST8rr:
7707 NewOpc = X86::CMP8ri;
7708 RCSize = 1;
7709 break;
7710 case X86::TEST16rr:
7711 NewOpc = X86::CMP16ri;
7712 RCSize = 2;
7713 break;
7714 case X86::TEST32rr:
7715 NewOpc = X86::CMP32ri;
7716 RCSize = 4;
7717 break;
7718 case X86::TEST64rr:
7719 NewOpc = X86::CMP64ri32;
7720 RCSize = 8;
7721 break;
7722 }
7723 // Check if it's safe to fold the load. If the size of the object is
7724 // narrower than the load width, then it's not.
7725 if (Size < RCSize)
7726 return nullptr;
7727 // Change to CMPXXri r, 0 first.
7728 MI.setDesc(get(NewOpc));
7729 MI.getOperand(1).ChangeToImmediate(0);
7730 } else if (Ops.size() != 1)
7731 return nullptr;
7732
7733 return Impl();
7734}
7735
7736/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
7737/// because the latter uses contents that wouldn't be defined in the folded
7738/// version. For instance, this transformation isn't legal:
7739/// movss (%rdi), %xmm0
7740/// addps %xmm0, %xmm0
7741/// ->
7742/// addps (%rdi), %xmm0
7743///
7744/// But this one is:
7745/// movss (%rdi), %xmm0
7746/// addss %xmm0, %xmm0
7747/// ->
7748/// addss (%rdi), %xmm0
7749///
7751 const MachineInstr &UserMI,
7752 const MachineFunction &MF) {
7753 unsigned Opc = LoadMI.getOpcode();
7754 unsigned UserOpc = UserMI.getOpcode();
7756 const TargetRegisterClass *RC =
7757 MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
7758 unsigned RegSize = TRI.getRegSizeInBits(*RC);
7759
7760 if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
7761 Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
7762 Opc == X86::VMOVSSZrm_alt) &&
7763 RegSize > 32) {
7764 // These instructions only load 32 bits, we can't fold them if the
7765 // destination register is wider than 32 bits (4 bytes), and its user
7766 // instruction isn't scalar (SS).
7767 switch (UserOpc) {
7768 case X86::CVTSS2SDrr_Int:
7769 case X86::VCVTSS2SDrr_Int:
7770 case X86::VCVTSS2SDZrr_Int:
7771 case X86::VCVTSS2SDZrrk_Int:
7772 case X86::VCVTSS2SDZrrkz_Int:
7773 case X86::CVTSS2SIrr_Int:
7774 case X86::CVTSS2SI64rr_Int:
7775 case X86::VCVTSS2SIrr_Int:
7776 case X86::VCVTSS2SI64rr_Int:
7777 case X86::VCVTSS2SIZrr_Int:
7778 case X86::VCVTSS2SI64Zrr_Int:
7779 case X86::CVTTSS2SIrr_Int:
7780 case X86::CVTTSS2SI64rr_Int:
7781 case X86::VCVTTSS2SIrr_Int:
7782 case X86::VCVTTSS2SI64rr_Int:
7783 case X86::VCVTTSS2SIZrr_Int:
7784 case X86::VCVTTSS2SI64Zrr_Int:
7785 case X86::VCVTSS2USIZrr_Int:
7786 case X86::VCVTSS2USI64Zrr_Int:
7787 case X86::VCVTTSS2USIZrr_Int:
7788 case X86::VCVTTSS2USI64Zrr_Int:
7789 case X86::RCPSSr_Int:
7790 case X86::VRCPSSr_Int:
7791 case X86::RSQRTSSr_Int:
7792 case X86::VRSQRTSSr_Int:
7793 case X86::ROUNDSSri_Int:
7794 case X86::VROUNDSSri_Int:
7795 case X86::COMISSrr_Int:
7796 case X86::VCOMISSrr_Int:
7797 case X86::VCOMISSZrr_Int:
7798 case X86::UCOMISSrr_Int:
7799 case X86::VUCOMISSrr_Int:
7800 case X86::VUCOMISSZrr_Int:
7801 case X86::ADDSSrr_Int:
7802 case X86::VADDSSrr_Int:
7803 case X86::VADDSSZrr_Int:
7804 case X86::CMPSSrri_Int:
7805 case X86::VCMPSSrri_Int:
7806 case X86::VCMPSSZrri_Int:
7807 case X86::DIVSSrr_Int:
7808 case X86::VDIVSSrr_Int:
7809 case X86::VDIVSSZrr_Int:
7810 case X86::MAXSSrr_Int:
7811 case X86::VMAXSSrr_Int:
7812 case X86::VMAXSSZrr_Int:
7813 case X86::MINSSrr_Int:
7814 case X86::VMINSSrr_Int:
7815 case X86::VMINSSZrr_Int:
7816 case X86::MULSSrr_Int:
7817 case X86::VMULSSrr_Int:
7818 case X86::VMULSSZrr_Int:
7819 case X86::SQRTSSr_Int:
7820 case X86::VSQRTSSr_Int:
7821 case X86::VSQRTSSZr_Int:
7822 case X86::SUBSSrr_Int:
7823 case X86::VSUBSSrr_Int:
7824 case X86::VSUBSSZrr_Int:
7825 case X86::VADDSSZrrk_Int:
7826 case X86::VADDSSZrrkz_Int:
7827 case X86::VCMPSSZrrik_Int:
7828 case X86::VDIVSSZrrk_Int:
7829 case X86::VDIVSSZrrkz_Int:
7830 case X86::VMAXSSZrrk_Int:
7831 case X86::VMAXSSZrrkz_Int:
7832 case X86::VMINSSZrrk_Int:
7833 case X86::VMINSSZrrkz_Int:
7834 case X86::VMULSSZrrk_Int:
7835 case X86::VMULSSZrrkz_Int:
7836 case X86::VSQRTSSZrk_Int:
7837 case X86::VSQRTSSZrkz_Int:
7838 case X86::VSUBSSZrrk_Int:
7839 case X86::VSUBSSZrrkz_Int:
7840 case X86::VFMADDSS4rr_Int:
7841 case X86::VFNMADDSS4rr_Int:
7842 case X86::VFMSUBSS4rr_Int:
7843 case X86::VFNMSUBSS4rr_Int:
7844 case X86::VFMADD132SSr_Int:
7845 case X86::VFNMADD132SSr_Int:
7846 case X86::VFMADD213SSr_Int:
7847 case X86::VFNMADD213SSr_Int:
7848 case X86::VFMADD231SSr_Int:
7849 case X86::VFNMADD231SSr_Int:
7850 case X86::VFMSUB132SSr_Int:
7851 case X86::VFNMSUB132SSr_Int:
7852 case X86::VFMSUB213SSr_Int:
7853 case X86::VFNMSUB213SSr_Int:
7854 case X86::VFMSUB231SSr_Int:
7855 case X86::VFNMSUB231SSr_Int:
7856 case X86::VFMADD132SSZr_Int:
7857 case X86::VFNMADD132SSZr_Int:
7858 case X86::VFMADD213SSZr_Int:
7859 case X86::VFNMADD213SSZr_Int:
7860 case X86::VFMADD231SSZr_Int:
7861 case X86::VFNMADD231SSZr_Int:
7862 case X86::VFMSUB132SSZr_Int:
7863 case X86::VFNMSUB132SSZr_Int:
7864 case X86::VFMSUB213SSZr_Int:
7865 case X86::VFNMSUB213SSZr_Int:
7866 case X86::VFMSUB231SSZr_Int:
7867 case X86::VFNMSUB231SSZr_Int:
7868 case X86::VFMADD132SSZrk_Int:
7869 case X86::VFNMADD132SSZrk_Int:
7870 case X86::VFMADD213SSZrk_Int:
7871 case X86::VFNMADD213SSZrk_Int:
7872 case X86::VFMADD231SSZrk_Int:
7873 case X86::VFNMADD231SSZrk_Int:
7874 case X86::VFMSUB132SSZrk_Int:
7875 case X86::VFNMSUB132SSZrk_Int:
7876 case X86::VFMSUB213SSZrk_Int:
7877 case X86::VFNMSUB213SSZrk_Int:
7878 case X86::VFMSUB231SSZrk_Int:
7879 case X86::VFNMSUB231SSZrk_Int:
7880 case X86::VFMADD132SSZrkz_Int:
7881 case X86::VFNMADD132SSZrkz_Int:
7882 case X86::VFMADD213SSZrkz_Int:
7883 case X86::VFNMADD213SSZrkz_Int:
7884 case X86::VFMADD231SSZrkz_Int:
7885 case X86::VFNMADD231SSZrkz_Int:
7886 case X86::VFMSUB132SSZrkz_Int:
7887 case X86::VFNMSUB132SSZrkz_Int:
7888 case X86::VFMSUB213SSZrkz_Int:
7889 case X86::VFNMSUB213SSZrkz_Int:
7890 case X86::VFMSUB231SSZrkz_Int:
7891 case X86::VFNMSUB231SSZrkz_Int:
7892 case X86::VFIXUPIMMSSZrri:
7893 case X86::VFIXUPIMMSSZrrik:
7894 case X86::VFIXUPIMMSSZrrikz:
7895 case X86::VFPCLASSSSZri:
7896 case X86::VFPCLASSSSZrik:
7897 case X86::VGETEXPSSZr:
7898 case X86::VGETEXPSSZrk:
7899 case X86::VGETEXPSSZrkz:
7900 case X86::VGETMANTSSZrri:
7901 case X86::VGETMANTSSZrrik:
7902 case X86::VGETMANTSSZrrikz:
7903 case X86::VRANGESSZrri:
7904 case X86::VRANGESSZrrik:
7905 case X86::VRANGESSZrrikz:
7906 case X86::VRCP14SSZrr:
7907 case X86::VRCP14SSZrrk:
7908 case X86::VRCP14SSZrrkz:
7909 case X86::VRCP28SSZr:
7910 case X86::VRCP28SSZrk:
7911 case X86::VRCP28SSZrkz:
7912 case X86::VREDUCESSZrri:
7913 case X86::VREDUCESSZrrik:
7914 case X86::VREDUCESSZrrikz:
7915 case X86::VRNDSCALESSZrri_Int:
7916 case X86::VRNDSCALESSZrrik_Int:
7917 case X86::VRNDSCALESSZrrikz_Int:
7918 case X86::VRSQRT14SSZrr:
7919 case X86::VRSQRT14SSZrrk:
7920 case X86::VRSQRT14SSZrrkz:
7921 case X86::VRSQRT28SSZr:
7922 case X86::VRSQRT28SSZrk:
7923 case X86::VRSQRT28SSZrkz:
7924 case X86::VSCALEFSSZrr:
7925 case X86::VSCALEFSSZrrk:
7926 case X86::VSCALEFSSZrrkz:
7927 return false;
7928 default:
7929 return true;
7930 }
7931 }
7932
7933 if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
7934 Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
7935 Opc == X86::VMOVSDZrm_alt) &&
7936 RegSize > 64) {
7937 // These instructions only load 64 bits, we can't fold them if the
7938 // destination register is wider than 64 bits (8 bytes), and its user
7939 // instruction isn't scalar (SD).
7940 switch (UserOpc) {
7941 case X86::CVTSD2SSrr_Int:
7942 case X86::VCVTSD2SSrr_Int:
7943 case X86::VCVTSD2SSZrr_Int:
7944 case X86::VCVTSD2SSZrrk_Int:
7945 case X86::VCVTSD2SSZrrkz_Int:
7946 case X86::CVTSD2SIrr_Int:
7947 case X86::CVTSD2SI64rr_Int:
7948 case X86::VCVTSD2SIrr_Int:
7949 case X86::VCVTSD2SI64rr_Int:
7950 case X86::VCVTSD2SIZrr_Int:
7951 case X86::VCVTSD2SI64Zrr_Int:
7952 case X86::CVTTSD2SIrr_Int:
7953 case X86::CVTTSD2SI64rr_Int:
7954 case X86::VCVTTSD2SIrr_Int:
7955 case X86::VCVTTSD2SI64rr_Int:
7956 case X86::VCVTTSD2SIZrr_Int:
7957 case X86::VCVTTSD2SI64Zrr_Int:
7958 case X86::VCVTSD2USIZrr_Int:
7959 case X86::VCVTSD2USI64Zrr_Int:
7960 case X86::VCVTTSD2USIZrr_Int:
7961 case X86::VCVTTSD2USI64Zrr_Int:
7962 case X86::ROUNDSDri_Int:
7963 case X86::VROUNDSDri_Int:
7964 case X86::COMISDrr_Int:
7965 case X86::VCOMISDrr_Int:
7966 case X86::VCOMISDZrr_Int:
7967 case X86::UCOMISDrr_Int:
7968 case X86::VUCOMISDrr_Int:
7969 case X86::VUCOMISDZrr_Int:
7970 case X86::ADDSDrr_Int:
7971 case X86::VADDSDrr_Int:
7972 case X86::VADDSDZrr_Int:
7973 case X86::CMPSDrri_Int:
7974 case X86::VCMPSDrri_Int:
7975 case X86::VCMPSDZrri_Int:
7976 case X86::DIVSDrr_Int:
7977 case X86::VDIVSDrr_Int:
7978 case X86::VDIVSDZrr_Int:
7979 case X86::MAXSDrr_Int:
7980 case X86::VMAXSDrr_Int:
7981 case X86::VMAXSDZrr_Int:
7982 case X86::MINSDrr_Int:
7983 case X86::VMINSDrr_Int:
7984 case X86::VMINSDZrr_Int:
7985 case X86::MULSDrr_Int:
7986 case X86::VMULSDrr_Int:
7987 case X86::VMULSDZrr_Int:
7988 case X86::SQRTSDr_Int:
7989 case X86::VSQRTSDr_Int:
7990 case X86::VSQRTSDZr_Int:
7991 case X86::SUBSDrr_Int:
7992 case X86::VSUBSDrr_Int:
7993 case X86::VSUBSDZrr_Int:
7994 case X86::VADDSDZrrk_Int:
7995 case X86::VADDSDZrrkz_Int:
7996 case X86::VCMPSDZrrik_Int:
7997 case X86::VDIVSDZrrk_Int:
7998 case X86::VDIVSDZrrkz_Int:
7999 case X86::VMAXSDZrrk_Int:
8000 case X86::VMAXSDZrrkz_Int:
8001 case X86::VMINSDZrrk_Int:
8002 case X86::VMINSDZrrkz_Int:
8003 case X86::VMULSDZrrk_Int:
8004 case X86::VMULSDZrrkz_Int:
8005 case X86::VSQRTSDZrk_Int:
8006 case X86::VSQRTSDZrkz_Int:
8007 case X86::VSUBSDZrrk_Int:
8008 case X86::VSUBSDZrrkz_Int:
8009 case X86::VFMADDSD4rr_Int:
8010 case X86::VFNMADDSD4rr_Int:
8011 case X86::VFMSUBSD4rr_Int:
8012 case X86::VFNMSUBSD4rr_Int:
8013 case X86::VFMADD132SDr_Int:
8014 case X86::VFNMADD132SDr_Int:
8015 case X86::VFMADD213SDr_Int:
8016 case X86::VFNMADD213SDr_Int:
8017 case X86::VFMADD231SDr_Int:
8018 case X86::VFNMADD231SDr_Int:
8019 case X86::VFMSUB132SDr_Int:
8020 case X86::VFNMSUB132SDr_Int:
8021 case X86::VFMSUB213SDr_Int:
8022 case X86::VFNMSUB213SDr_Int:
8023 case X86::VFMSUB231SDr_Int:
8024 case X86::VFNMSUB231SDr_Int:
8025 case X86::VFMADD132SDZr_Int:
8026 case X86::VFNMADD132SDZr_Int:
8027 case X86::VFMADD213SDZr_Int:
8028 case X86::VFNMADD213SDZr_Int:
8029 case X86::VFMADD231SDZr_Int:
8030 case X86::VFNMADD231SDZr_Int:
8031 case X86::VFMSUB132SDZr_Int:
8032 case X86::VFNMSUB132SDZr_Int:
8033 case X86::VFMSUB213SDZr_Int:
8034 case X86::VFNMSUB213SDZr_Int:
8035 case X86::VFMSUB231SDZr_Int:
8036 case X86::VFNMSUB231SDZr_Int:
8037 case X86::VFMADD132SDZrk_Int:
8038 case X86::VFNMADD132SDZrk_Int:
8039 case X86::VFMADD213SDZrk_Int:
8040 case X86::VFNMADD213SDZrk_Int:
8041 case X86::VFMADD231SDZrk_Int:
8042 case X86::VFNMADD231SDZrk_Int:
8043 case X86::VFMSUB132SDZrk_Int:
8044 case X86::VFNMSUB132SDZrk_Int:
8045 case X86::VFMSUB213SDZrk_Int:
8046 case X86::VFNMSUB213SDZrk_Int:
8047 case X86::VFMSUB231SDZrk_Int:
8048 case X86::VFNMSUB231SDZrk_Int:
8049 case X86::VFMADD132SDZrkz_Int:
8050 case X86::VFNMADD132SDZrkz_Int:
8051 case X86::VFMADD213SDZrkz_Int:
8052 case X86::VFNMADD213SDZrkz_Int:
8053 case X86::VFMADD231SDZrkz_Int:
8054 case X86::VFNMADD231SDZrkz_Int:
8055 case X86::VFMSUB132SDZrkz_Int:
8056 case X86::VFNMSUB132SDZrkz_Int:
8057 case X86::VFMSUB213SDZrkz_Int:
8058 case X86::VFNMSUB213SDZrkz_Int:
8059 case X86::VFMSUB231SDZrkz_Int:
8060 case X86::VFNMSUB231SDZrkz_Int:
8061 case X86::VFIXUPIMMSDZrri:
8062 case X86::VFIXUPIMMSDZrrik:
8063 case X86::VFIXUPIMMSDZrrikz:
8064 case X86::VFPCLASSSDZri:
8065 case X86::VFPCLASSSDZrik:
8066 case X86::VGETEXPSDZr:
8067 case X86::VGETEXPSDZrk:
8068 case X86::VGETEXPSDZrkz:
8069 case X86::VGETMANTSDZrri:
8070 case X86::VGETMANTSDZrrik:
8071 case X86::VGETMANTSDZrrikz:
8072 case X86::VRANGESDZrri:
8073 case X86::VRANGESDZrrik:
8074 case X86::VRANGESDZrrikz:
8075 case X86::VRCP14SDZrr:
8076 case X86::VRCP14SDZrrk:
8077 case X86::VRCP14SDZrrkz:
8078 case X86::VRCP28SDZr:
8079 case X86::VRCP28SDZrk:
8080 case X86::VRCP28SDZrkz:
8081 case X86::VREDUCESDZrri:
8082 case X86::VREDUCESDZrrik:
8083 case X86::VREDUCESDZrrikz:
8084 case X86::VRNDSCALESDZrri_Int:
8085 case X86::VRNDSCALESDZrrik_Int:
8086 case X86::VRNDSCALESDZrrikz_Int:
8087 case X86::VRSQRT14SDZrr:
8088 case X86::VRSQRT14SDZrrk:
8089 case X86::VRSQRT14SDZrrkz:
8090 case X86::VRSQRT28SDZr:
8091 case X86::VRSQRT28SDZrk:
8092 case X86::VRSQRT28SDZrkz:
8093 case X86::VSCALEFSDZrr:
8094 case X86::VSCALEFSDZrrk:
8095 case X86::VSCALEFSDZrrkz:
8096 return false;
8097 default:
8098 return true;
8099 }
8100 }
8101
8102 if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
8103 // These instructions only load 16 bits, we can't fold them if the
8104 // destination register is wider than 16 bits (2 bytes), and its user
8105 // instruction isn't scalar (SH).
8106 switch (UserOpc) {
8107 case X86::VADDSHZrr_Int:
8108 case X86::VCMPSHZrri_Int:
8109 case X86::VDIVSHZrr_Int:
8110 case X86::VMAXSHZrr_Int:
8111 case X86::VMINSHZrr_Int:
8112 case X86::VMULSHZrr_Int:
8113 case X86::VSUBSHZrr_Int:
8114 case X86::VADDSHZrrk_Int:
8115 case X86::VADDSHZrrkz_Int:
8116 case X86::VCMPSHZrrik_Int:
8117 case X86::VDIVSHZrrk_Int:
8118 case X86::VDIVSHZrrkz_Int:
8119 case X86::VMAXSHZrrk_Int:
8120 case X86::VMAXSHZrrkz_Int:
8121 case X86::VMINSHZrrk_Int:
8122 case X86::VMINSHZrrkz_Int:
8123 case X86::VMULSHZrrk_Int:
8124 case X86::VMULSHZrrkz_Int:
8125 case X86::VSUBSHZrrk_Int:
8126 case X86::VSUBSHZrrkz_Int:
8127 case X86::VFMADD132SHZr_Int:
8128 case X86::VFNMADD132SHZr_Int:
8129 case X86::VFMADD213SHZr_Int:
8130 case X86::VFNMADD213SHZr_Int:
8131 case X86::VFMADD231SHZr_Int:
8132 case X86::VFNMADD231SHZr_Int:
8133 case X86::VFMSUB132SHZr_Int:
8134 case X86::VFNMSUB132SHZr_Int:
8135 case X86::VFMSUB213SHZr_Int:
8136 case X86::VFNMSUB213SHZr_Int:
8137 case X86::VFMSUB231SHZr_Int:
8138 case X86::VFNMSUB231SHZr_Int:
8139 case X86::VFMADD132SHZrk_Int:
8140 case X86::VFNMADD132SHZrk_Int:
8141 case X86::VFMADD213SHZrk_Int:
8142 case X86::VFNMADD213SHZrk_Int:
8143 case X86::VFMADD231SHZrk_Int:
8144 case X86::VFNMADD231SHZrk_Int:
8145 case X86::VFMSUB132SHZrk_Int:
8146 case X86::VFNMSUB132SHZrk_Int:
8147 case X86::VFMSUB213SHZrk_Int:
8148 case X86::VFNMSUB213SHZrk_Int:
8149 case X86::VFMSUB231SHZrk_Int:
8150 case X86::VFNMSUB231SHZrk_Int:
8151 case X86::VFMADD132SHZrkz_Int:
8152 case X86::VFNMADD132SHZrkz_Int:
8153 case X86::VFMADD213SHZrkz_Int:
8154 case X86::VFNMADD213SHZrkz_Int:
8155 case X86::VFMADD231SHZrkz_Int:
8156 case X86::VFNMADD231SHZrkz_Int:
8157 case X86::VFMSUB132SHZrkz_Int:
8158 case X86::VFNMSUB132SHZrkz_Int:
8159 case X86::VFMSUB213SHZrkz_Int:
8160 case X86::VFNMSUB213SHZrkz_Int:
8161 case X86::VFMSUB231SHZrkz_Int:
8162 case X86::VFNMSUB231SHZrkz_Int:
8163 return false;
8164 default:
8165 return true;
8166 }
8167 }
8168
8169 return false;
8170}
8171
8174 MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
8175 MachineInstr *&CopyMI, LiveIntervals *LIS) const {
8176
8177 // If LoadMI is a masked load, check MI having the same mask.
8178 const MCInstrDesc &MCID = get(LoadMI.getOpcode());
8179 unsigned NumOps = MCID.getNumOperands();
8180 if (NumOps >= 3) {
8181 Register MaskReg;
8182 const MachineOperand &Op1 = LoadMI.getOperand(1);
8183 const MachineOperand &Op2 = LoadMI.getOperand(2);
8184
8185 auto IsVKWMClass = [](const TargetRegisterClass *RC) {
8186 return RC == &X86::VK2WMRegClass || RC == &X86::VK4WMRegClass ||
8187 RC == &X86::VK8WMRegClass || RC == &X86::VK16WMRegClass ||
8188 RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass;
8189 };
8190
8191 if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1)))
8192 MaskReg = Op1.getReg();
8193 else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2)))
8194 MaskReg = Op2.getReg();
8195
8196 if (MaskReg) {
8197 // Some instructions are invalid to fold into even with the same mask.
8198 // Folding is unsafe if an active destination element may read from a
8199 // source element that is masked off.
8200 if (isNonFoldableWithSameMask(MI.getOpcode()))
8201 return nullptr;
8202 bool HasSameMask = false;
8203 for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) {
8204 const MachineOperand &Op = MI.getOperand(I);
8205 if (Op.isReg() && Op.getReg() == MaskReg) {
8206 HasSameMask = true;
8207 break;
8208 }
8209 }
8210 if (!HasSameMask)
8211 return nullptr;
8212 }
8213 }
8214
8215 // TODO: Support the case where LoadMI loads a wide register, but MI
8216 // only uses a subreg.
8217 for (auto Op : Ops) {
8218 if (MI.getOperand(Op).getSubReg())
8219 return nullptr;
8220 }
8221
8222 // If loading from a FrameIndex, fold directly from the FrameIndex.
8223 int FrameIndex;
8224 if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
8225 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8226 return nullptr;
8227 return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, CopyMI,
8228 LIS);
8229 }
8230
8231 // Check switch flag
8232 if (NoFusing)
8233 return nullptr;
8234
8235 // Avoid partial and undef register update stalls unless optimizing for size.
8236 if (!MF.getFunction().hasOptSize() &&
8237 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
8239 return nullptr;
8240
8241 // Do not fold a NDD instruction and a memory instruction with relocation to
8242 // avoid emit APX relocation when the flag is disabled for backward
8243 // compatibility.
8244 uint64_t TSFlags = MI.getDesc().TSFlags;
8246 X86II::hasNewDataDest(TSFlags))
8247 return nullptr;
8248
8249 // Determine the alignment of the load.
8250 Align Alignment;
8251 unsigned LoadOpc = LoadMI.getOpcode();
8252 if (LoadMI.hasOneMemOperand())
8253 Alignment = (*LoadMI.memoperands_begin())->getAlign();
8254 else
8255 switch (LoadOpc) {
8256 case X86::AVX512_512_SET0:
8257 case X86::AVX512_512_SETALLONES:
8258 Alignment = Align(64);
8259 break;
8260 case X86::AVX2_SETALLONES:
8261 case X86::AVX1_SETALLONES:
8262 case X86::AVX_SET0:
8263 case X86::AVX512_256_SET0:
8264 case X86::AVX512_256_SETALLONES:
8265 Alignment = Align(32);
8266 break;
8267 case X86::V_SET0:
8268 case X86::V_SETALLONES:
8269 case X86::AVX512_128_SET0:
8270 case X86::FsFLD0F128:
8271 case X86::AVX512_FsFLD0F128:
8272 case X86::AVX512_128_SETALLONES:
8273 Alignment = Align(16);
8274 break;
8275 case X86::MMX_SET0:
8276 case X86::FsFLD0SD:
8277 case X86::AVX512_FsFLD0SD:
8278 Alignment = Align(8);
8279 break;
8280 case X86::FsFLD0SS:
8281 case X86::AVX512_FsFLD0SS:
8282 Alignment = Align(4);
8283 break;
8284 case X86::FsFLD0SH:
8285 case X86::AVX512_FsFLD0SH:
8286 Alignment = Align(2);
8287 break;
8288 default:
8289 return nullptr;
8290 }
8291 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
8292 unsigned NewOpc = 0;
8293 switch (MI.getOpcode()) {
8294 default:
8295 return nullptr;
8296 case X86::TEST8rr:
8297 NewOpc = X86::CMP8ri;
8298 break;
8299 case X86::TEST16rr:
8300 NewOpc = X86::CMP16ri;
8301 break;
8302 case X86::TEST32rr:
8303 NewOpc = X86::CMP32ri;
8304 break;
8305 case X86::TEST64rr:
8306 NewOpc = X86::CMP64ri32;
8307 break;
8308 }
8309 // Change to CMPXXri r, 0 first.
8310 MI.setDesc(get(NewOpc));
8311 MI.getOperand(1).ChangeToImmediate(0);
8312 } else if (Ops.size() != 1)
8313 return nullptr;
8314
8315 // Make sure the subregisters match.
8316 // Otherwise we risk changing the size of the load.
8317 if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
8318 return nullptr;
8319
8321 switch (LoadOpc) {
8322 case X86::MMX_SET0:
8323 case X86::V_SET0:
8324 case X86::V_SETALLONES:
8325 case X86::AVX2_SETALLONES:
8326 case X86::AVX1_SETALLONES:
8327 case X86::AVX_SET0:
8328 case X86::AVX512_128_SET0:
8329 case X86::AVX512_256_SET0:
8330 case X86::AVX512_512_SET0:
8331 case X86::AVX512_128_SETALLONES:
8332 case X86::AVX512_256_SETALLONES:
8333 case X86::AVX512_512_SETALLONES:
8334 case X86::FsFLD0SH:
8335 case X86::AVX512_FsFLD0SH:
8336 case X86::FsFLD0SD:
8337 case X86::AVX512_FsFLD0SD:
8338 case X86::FsFLD0SS:
8339 case X86::AVX512_FsFLD0SS:
8340 case X86::FsFLD0F128:
8341 case X86::AVX512_FsFLD0F128: {
8342 // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
8343 // Create a constant-pool entry and operands to load from it.
8344
8345 // Large code model can't fold loads this way.
8347 return nullptr;
8348
8349 // x86-32 PIC requires a PIC base register for constant pools.
8350 unsigned PICBase = 0;
8351 // Since we're using Small or Kernel code model, we can always use
8352 // RIP-relative addressing for a smaller encoding.
8353 if (Subtarget.is64Bit()) {
8354 PICBase = X86::RIP;
8355 } else if (MF.getTarget().isPositionIndependent()) {
8356 // FIXME: PICBase = getGlobalBaseReg(&MF);
8357 // This doesn't work for several reasons.
8358 // 1. GlobalBaseReg may have been spilled.
8359 // 2. It may not be live at MI.
8360 return nullptr;
8361 }
8362
8363 // Create a constant-pool entry.
8365 Type *Ty;
8366 bool IsAllOnes = false;
8367 switch (LoadOpc) {
8368 case X86::FsFLD0SS:
8369 case X86::AVX512_FsFLD0SS:
8371 break;
8372 case X86::FsFLD0SD:
8373 case X86::AVX512_FsFLD0SD:
8375 break;
8376 case X86::FsFLD0F128:
8377 case X86::AVX512_FsFLD0F128:
8379 break;
8380 case X86::FsFLD0SH:
8381 case X86::AVX512_FsFLD0SH:
8383 break;
8384 case X86::AVX512_512_SETALLONES:
8385 IsAllOnes = true;
8386 [[fallthrough]];
8387 case X86::AVX512_512_SET0:
8389 16);
8390 break;
8391 case X86::AVX1_SETALLONES:
8392 case X86::AVX2_SETALLONES:
8393 case X86::AVX512_256_SETALLONES:
8394 IsAllOnes = true;
8395 [[fallthrough]];
8396 case X86::AVX512_256_SET0:
8397 case X86::AVX_SET0:
8399 8);
8400
8401 break;
8402 case X86::MMX_SET0:
8404 2);
8405 break;
8406 case X86::V_SETALLONES:
8407 case X86::AVX512_128_SETALLONES:
8408 IsAllOnes = true;
8409 [[fallthrough]];
8410 case X86::V_SET0:
8411 case X86::AVX512_128_SET0:
8413 4);
8414 break;
8415 }
8416
8417 const Constant *C =
8419 unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
8420
8421 // Create operands to load from the constant pool entry.
8422 MOs.push_back(MachineOperand::CreateReg(PICBase, false));
8424 MOs.push_back(MachineOperand::CreateReg(0, false));
8426 MOs.push_back(MachineOperand::CreateReg(0, false));
8427 break;
8428 }
8429 case X86::VPBROADCASTBZ128rm:
8430 case X86::VPBROADCASTBZ256rm:
8431 case X86::VPBROADCASTBZrm:
8432 case X86::VBROADCASTF32X2Z256rm:
8433 case X86::VBROADCASTF32X2Zrm:
8434 case X86::VBROADCASTI32X2Z128rm:
8435 case X86::VBROADCASTI32X2Z256rm:
8436 case X86::VBROADCASTI32X2Zrm:
8437 // No instructions currently fuse with 8bits or 32bits x 2.
8438 return nullptr;
8439
8440#define FOLD_BROADCAST(SIZE) \
8441 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
8442 LoadMI.operands_begin() + NumOps); \
8443 return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
8444 /*AllowCommute=*/true);
8445 case X86::VPBROADCASTWZ128rm:
8446 case X86::VPBROADCASTWZ256rm:
8447 case X86::VPBROADCASTWZrm:
8448 FOLD_BROADCAST(16);
8449 case X86::VPBROADCASTDZ128rm:
8450 case X86::VPBROADCASTDZ256rm:
8451 case X86::VPBROADCASTDZrm:
8452 case X86::VBROADCASTSSZ128rm:
8453 case X86::VBROADCASTSSZ256rm:
8454 case X86::VBROADCASTSSZrm:
8455 FOLD_BROADCAST(32);
8456 case X86::VPBROADCASTQZ128rm:
8457 case X86::VPBROADCASTQZ256rm:
8458 case X86::VPBROADCASTQZrm:
8459 case X86::VBROADCASTSDZ256rm:
8460 case X86::VBROADCASTSDZrm:
8461 FOLD_BROADCAST(64);
8462 default: {
8463 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8464 return nullptr;
8465
8466 // Folding a normal load. Just copy the load's address operands.
8468 LoadMI.operands_begin() + NumOps);
8469 break;
8470 }
8471 }
8472 return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
8473 /*Size=*/0, Alignment, /*AllowCommute=*/true,
8474 CopyMI);
8475}
8476
8478X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
8479 unsigned OpNum, ArrayRef<MachineOperand> MOs,
8481 unsigned BitsSize, bool AllowCommute) const {
8482
8483 if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
8484 return matchBroadcastSize(*I, BitsSize)
8485 ? fuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
8486 : nullptr;
8487
8488 if (AllowCommute) {
8489 // If the instruction and target operand are commutable, commute the
8490 // instruction and try again.
8491 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
8492 if (CommuteOpIdx2 == OpNum) {
8493 printFailMsgforFold(MI, OpNum);
8494 return nullptr;
8495 }
8496 MachineInstr *NewMI =
8497 foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize,
8498 /*AllowCommute=*/false);
8499 if (NewMI)
8500 return NewMI;
8501 // Folding failed again - undo the commute before returning.
8502 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
8503 }
8504
8505 printFailMsgforFold(MI, OpNum);
8506 return nullptr;
8507}
8508
8512
8513 for (MachineMemOperand *MMO : MMOs) {
8514 if (!MMO->isLoad())
8515 continue;
8516
8517 if (!MMO->isStore()) {
8518 // Reuse the MMO.
8519 LoadMMOs.push_back(MMO);
8520 } else {
8521 // Clone the MMO and unset the store flag.
8522 LoadMMOs.push_back(MF.getMachineMemOperand(
8523 MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
8524 }
8525 }
8526
8527 return LoadMMOs;
8528}
8529
8533
8534 for (MachineMemOperand *MMO : MMOs) {
8535 if (!MMO->isStore())
8536 continue;
8537
8538 if (!MMO->isLoad()) {
8539 // Reuse the MMO.
8540 StoreMMOs.push_back(MMO);
8541 } else {
8542 // Clone the MMO and unset the load flag.
8543 StoreMMOs.push_back(MF.getMachineMemOperand(
8544 MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
8545 }
8546 }
8547
8548 return StoreMMOs;
8549}
8550
8552 const TargetRegisterClass *RC,
8553 const X86Subtarget &STI) {
8554 assert(STI.hasAVX512() && "Expected at least AVX512!");
8555 unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
8556 assert((SpillSize == 64 || STI.hasVLX()) &&
8557 "Can't broadcast less than 64 bytes without AVX512VL!");
8558
8559#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64) \
8560 case TYPE: \
8561 switch (SpillSize) { \
8562 default: \
8563 llvm_unreachable("Unknown spill size"); \
8564 case 16: \
8565 return X86::OP16; \
8566 case 32: \
8567 return X86::OP32; \
8568 case 64: \
8569 return X86::OP64; \
8570 } \
8571 break;
8572
8573 switch (I->Flags & TB_BCAST_MASK) {
8574 default:
8575 llvm_unreachable("Unexpected broadcast type!");
8576 CASE_BCAST_TYPE_OPC(TB_BCAST_W, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8577 VPBROADCASTWZrm)
8578 CASE_BCAST_TYPE_OPC(TB_BCAST_D, VPBROADCASTDZ128rm, VPBROADCASTDZ256rm,
8579 VPBROADCASTDZrm)
8580 CASE_BCAST_TYPE_OPC(TB_BCAST_Q, VPBROADCASTQZ128rm, VPBROADCASTQZ256rm,
8581 VPBROADCASTQZrm)
8582 CASE_BCAST_TYPE_OPC(TB_BCAST_SH, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8583 VPBROADCASTWZrm)
8584 CASE_BCAST_TYPE_OPC(TB_BCAST_SS, VBROADCASTSSZ128rm, VBROADCASTSSZ256rm,
8585 VBROADCASTSSZrm)
8586 CASE_BCAST_TYPE_OPC(TB_BCAST_SD, VMOVDDUPZ128rm, VBROADCASTSDZ256rm,
8587 VBROADCASTSDZrm)
8588 }
8589}
8590
8592 MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad,
8593 bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
8594 const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
8595 if (I == nullptr)
8596 return false;
8597 unsigned Opc = I->DstOp;
8598 unsigned Index = I->Flags & TB_INDEX_MASK;
8599 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8600 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8601 if (UnfoldLoad && !FoldedLoad)
8602 return false;
8603 UnfoldLoad &= FoldedLoad;
8604 if (UnfoldStore && !FoldedStore)
8605 return false;
8606 UnfoldStore &= FoldedStore;
8607
8608 const MCInstrDesc &MCID = get(Opc);
8609
8610 const TargetRegisterClass *RC = getRegClass(MCID, Index);
8612 // TODO: Check if 32-byte or greater accesses are slow too?
8613 if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
8614 Subtarget.isUnalignedMem16Slow())
8615 // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
8616 // conservatively assume the address is unaligned. That's bad for
8617 // performance.
8618 return false;
8623 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
8624 MachineOperand &Op = MI.getOperand(i);
8625 if (i >= Index && i < Index + X86::AddrNumOperands)
8626 AddrOps.push_back(Op);
8627 else if (Op.isReg() && Op.isImplicit())
8628 ImpOps.push_back(Op);
8629 else if (i < Index)
8630 BeforeOps.push_back(Op);
8631 else if (i > Index)
8632 AfterOps.push_back(Op);
8633 }
8634
8635 // Emit the load or broadcast instruction.
8636 if (UnfoldLoad) {
8637 auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
8638
8639 unsigned Opc;
8640 if (I->Flags & TB_BCAST_MASK) {
8641 Opc = getBroadcastOpcode(I, RC, Subtarget);
8642 } else {
8643 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8644 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8645 Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
8646 }
8647
8648 DebugLoc DL;
8649 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
8650 for (const MachineOperand &AddrOp : AddrOps)
8651 MIB.add(AddrOp);
8652 MIB.setMemRefs(MMOs);
8653 NewMIs.push_back(MIB);
8654
8655 if (UnfoldStore) {
8656 // Address operands cannot be marked isKill.
8657 for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
8658 MachineOperand &MO = NewMIs[0]->getOperand(i);
8659 if (MO.isReg())
8660 MO.setIsKill(false);
8661 }
8662 }
8663 }
8664
8665 // Emit the data processing instruction.
8666 MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
8667 MachineInstrBuilder MIB(MF, DataMI);
8668
8669 if (FoldedStore)
8670 MIB.addReg(Reg, RegState::Define);
8671 for (MachineOperand &BeforeOp : BeforeOps)
8672 MIB.add(BeforeOp);
8673 if (FoldedLoad)
8674 MIB.addReg(Reg);
8675 for (MachineOperand &AfterOp : AfterOps)
8676 MIB.add(AfterOp);
8677 for (MachineOperand &ImpOp : ImpOps) {
8678 MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
8680 getKillRegState(ImpOp.isKill()) |
8681 getDeadRegState(ImpOp.isDead()) |
8682 getUndefRegState(ImpOp.isUndef()));
8683 }
8684 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8685 switch (DataMI->getOpcode()) {
8686 default:
8687 break;
8688 case X86::CMP64ri32:
8689 case X86::CMP32ri:
8690 case X86::CMP16ri:
8691 case X86::CMP8ri: {
8692 MachineOperand &MO0 = DataMI->getOperand(0);
8693 MachineOperand &MO1 = DataMI->getOperand(1);
8694 if (MO1.isImm() && MO1.getImm() == 0) {
8695 unsigned NewOpc;
8696 switch (DataMI->getOpcode()) {
8697 default:
8698 llvm_unreachable("Unreachable!");
8699 case X86::CMP64ri32:
8700 NewOpc = X86::TEST64rr;
8701 break;
8702 case X86::CMP32ri:
8703 NewOpc = X86::TEST32rr;
8704 break;
8705 case X86::CMP16ri:
8706 NewOpc = X86::TEST16rr;
8707 break;
8708 case X86::CMP8ri:
8709 NewOpc = X86::TEST8rr;
8710 break;
8711 }
8712 DataMI->setDesc(get(NewOpc));
8713 MO1.ChangeToRegister(MO0.getReg(), false);
8714 }
8715 }
8716 }
8717 NewMIs.push_back(DataMI);
8718
8719 // Emit the store instruction.
8720 if (UnfoldStore) {
8721 const TargetRegisterClass *DstRC = getRegClass(MCID, 0);
8722 auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
8723 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
8724 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8725 unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
8726 DebugLoc DL;
8727 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
8728 for (const MachineOperand &AddrOp : AddrOps)
8729 MIB.add(AddrOp);
8730 MIB.addReg(Reg, RegState::Kill);
8731 MIB.setMemRefs(MMOs);
8732 NewMIs.push_back(MIB);
8733 }
8734
8735 return true;
8736}
8737
8739 SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
8740 if (!N->isMachineOpcode())
8741 return false;
8742
8743 const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
8744 if (I == nullptr)
8745 return false;
8746 unsigned Opc = I->DstOp;
8747 unsigned Index = I->Flags & TB_INDEX_MASK;
8748 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8749 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8750 const MCInstrDesc &MCID = get(Opc);
8753 const TargetRegisterClass *RC = getRegClass(MCID, Index);
8754 unsigned NumDefs = MCID.NumDefs;
8755 std::vector<SDValue> AddrOps;
8756 std::vector<SDValue> BeforeOps;
8757 std::vector<SDValue> AfterOps;
8758 SDLoc dl(N);
8759 unsigned NumOps = N->getNumOperands();
8760 for (unsigned i = 0; i != NumOps - 1; ++i) {
8761 SDValue Op = N->getOperand(i);
8762 if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
8763 AddrOps.push_back(Op);
8764 else if (i < Index - NumDefs)
8765 BeforeOps.push_back(Op);
8766 else if (i > Index - NumDefs)
8767 AfterOps.push_back(Op);
8768 }
8769 SDValue Chain = N->getOperand(NumOps - 1);
8770 AddrOps.push_back(Chain);
8771
8772 // Emit the load instruction.
8773 SDNode *Load = nullptr;
8774 if (FoldedLoad) {
8775 EVT VT = *TRI.legalclasstypes_begin(*RC);
8776 auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8777 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8778 Subtarget.isUnalignedMem16Slow())
8779 // Do not introduce a slow unaligned load.
8780 return false;
8781 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8782 // memory access is slow above.
8783
8784 unsigned Opc;
8785 if (I->Flags & TB_BCAST_MASK) {
8786 Opc = getBroadcastOpcode(I, RC, Subtarget);
8787 } else {
8788 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8789 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8790 Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
8791 }
8792
8793 Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
8794 NewNodes.push_back(Load);
8795
8796 // Preserve memory reference information.
8797 DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
8798 }
8799
8800 // Emit the data processing instruction.
8801 std::vector<EVT> VTs;
8802 const TargetRegisterClass *DstRC = nullptr;
8803 if (MCID.getNumDefs() > 0) {
8804 DstRC = getRegClass(MCID, 0);
8805 VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
8806 }
8807 for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
8808 EVT VT = N->getValueType(i);
8809 if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
8810 VTs.push_back(VT);
8811 }
8812 if (Load)
8813 BeforeOps.push_back(SDValue(Load, 0));
8814 llvm::append_range(BeforeOps, AfterOps);
8815 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8816 switch (Opc) {
8817 default:
8818 break;
8819 case X86::CMP64ri32:
8820 case X86::CMP32ri:
8821 case X86::CMP16ri:
8822 case X86::CMP8ri:
8823 if (isNullConstant(BeforeOps[1])) {
8824 switch (Opc) {
8825 default:
8826 llvm_unreachable("Unreachable!");
8827 case X86::CMP64ri32:
8828 Opc = X86::TEST64rr;
8829 break;
8830 case X86::CMP32ri:
8831 Opc = X86::TEST32rr;
8832 break;
8833 case X86::CMP16ri:
8834 Opc = X86::TEST16rr;
8835 break;
8836 case X86::CMP8ri:
8837 Opc = X86::TEST8rr;
8838 break;
8839 }
8840 BeforeOps[1] = BeforeOps[0];
8841 }
8842 }
8843 SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
8844 NewNodes.push_back(NewNode);
8845
8846 // Emit the store instruction.
8847 if (FoldedStore) {
8848 AddrOps.pop_back();
8849 AddrOps.push_back(SDValue(NewNode, 0));
8850 AddrOps.push_back(Chain);
8851 auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8852 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8853 Subtarget.isUnalignedMem16Slow())
8854 // Do not introduce a slow unaligned store.
8855 return false;
8856 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8857 // memory access is slow above.
8858 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8859 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8860 SDNode *Store =
8861 DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
8862 dl, MVT::Other, AddrOps);
8863 NewNodes.push_back(Store);
8864
8865 // Preserve memory reference information.
8866 DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
8867 }
8868
8869 return true;
8870}
8871
8872unsigned
8874 bool UnfoldStore,
8875 unsigned *LoadRegIndex) const {
8877 if (I == nullptr)
8878 return 0;
8879 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8880 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8881 if (UnfoldLoad && !FoldedLoad)
8882 return 0;
8883 if (UnfoldStore && !FoldedStore)
8884 return 0;
8885 if (LoadRegIndex)
8886 *LoadRegIndex = I->Flags & TB_INDEX_MASK;
8887 return I->DstOp;
8888}
8889
8891 int64_t &Offset1,
8892 int64_t &Offset2) const {
8893 if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
8894 return false;
8895
8896 auto IsLoadOpcode = [&](unsigned Opcode) {
8897 switch (Opcode) {
8898 default:
8899 return false;
8900 case X86::MOV8rm:
8901 case X86::MOV16rm:
8902 case X86::MOV32rm:
8903 case X86::MOV64rm:
8904 case X86::LD_Fp32m:
8905 case X86::LD_Fp64m:
8906 case X86::LD_Fp80m:
8907 case X86::MOVSSrm:
8908 case X86::MOVSSrm_alt:
8909 case X86::MOVSDrm:
8910 case X86::MOVSDrm_alt:
8911 case X86::MMX_MOVD64rm:
8912 case X86::MMX_MOVQ64rm:
8913 case X86::MOVAPSrm:
8914 case X86::MOVUPSrm:
8915 case X86::MOVAPDrm:
8916 case X86::MOVUPDrm:
8917 case X86::MOVDQArm:
8918 case X86::MOVDQUrm:
8919 // AVX load instructions
8920 case X86::VMOVSSrm:
8921 case X86::VMOVSSrm_alt:
8922 case X86::VMOVSDrm:
8923 case X86::VMOVSDrm_alt:
8924 case X86::VMOVAPSrm:
8925 case X86::VMOVUPSrm:
8926 case X86::VMOVAPDrm:
8927 case X86::VMOVUPDrm:
8928 case X86::VMOVDQArm:
8929 case X86::VMOVDQUrm:
8930 case X86::VMOVAPSYrm:
8931 case X86::VMOVUPSYrm:
8932 case X86::VMOVAPDYrm:
8933 case X86::VMOVUPDYrm:
8934 case X86::VMOVDQAYrm:
8935 case X86::VMOVDQUYrm:
8936 // AVX512 load instructions
8937 case X86::VMOVSSZrm:
8938 case X86::VMOVSSZrm_alt:
8939 case X86::VMOVSDZrm:
8940 case X86::VMOVSDZrm_alt:
8941 case X86::VMOVAPSZ128rm:
8942 case X86::VMOVUPSZ128rm:
8943 case X86::VMOVAPSZ128rm_NOVLX:
8944 case X86::VMOVUPSZ128rm_NOVLX:
8945 case X86::VMOVAPDZ128rm:
8946 case X86::VMOVUPDZ128rm:
8947 case X86::VMOVDQU8Z128rm:
8948 case X86::VMOVDQU16Z128rm:
8949 case X86::VMOVDQA32Z128rm:
8950 case X86::VMOVDQU32Z128rm:
8951 case X86::VMOVDQA64Z128rm:
8952 case X86::VMOVDQU64Z128rm:
8953 case X86::VMOVAPSZ256rm:
8954 case X86::VMOVUPSZ256rm:
8955 case X86::VMOVAPSZ256rm_NOVLX:
8956 case X86::VMOVUPSZ256rm_NOVLX:
8957 case X86::VMOVAPDZ256rm:
8958 case X86::VMOVUPDZ256rm:
8959 case X86::VMOVDQU8Z256rm:
8960 case X86::VMOVDQU16Z256rm:
8961 case X86::VMOVDQA32Z256rm:
8962 case X86::VMOVDQU32Z256rm:
8963 case X86::VMOVDQA64Z256rm:
8964 case X86::VMOVDQU64Z256rm:
8965 case X86::VMOVAPSZrm:
8966 case X86::VMOVUPSZrm:
8967 case X86::VMOVAPDZrm:
8968 case X86::VMOVUPDZrm:
8969 case X86::VMOVDQU8Zrm:
8970 case X86::VMOVDQU16Zrm:
8971 case X86::VMOVDQA32Zrm:
8972 case X86::VMOVDQU32Zrm:
8973 case X86::VMOVDQA64Zrm:
8974 case X86::VMOVDQU64Zrm:
8975 case X86::KMOVBkm:
8976 case X86::KMOVBkm_EVEX:
8977 case X86::KMOVWkm:
8978 case X86::KMOVWkm_EVEX:
8979 case X86::KMOVDkm:
8980 case X86::KMOVDkm_EVEX:
8981 case X86::KMOVQkm:
8982 case X86::KMOVQkm_EVEX:
8983 return true;
8984 }
8985 };
8986
8987 if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
8988 !IsLoadOpcode(Load2->getMachineOpcode()))
8989 return false;
8990
8991 // Lambda to check if both the loads have the same value for an operand index.
8992 auto HasSameOp = [&](int I) {
8993 return Load1->getOperand(I) == Load2->getOperand(I);
8994 };
8995
8996 // All operands except the displacement should match.
8997 if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
8998 !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
8999 return false;
9000
9001 // Chain Operand must be the same.
9002 if (!HasSameOp(5))
9003 return false;
9004
9005 // Now let's examine if the displacements are constants.
9008 if (!Disp1 || !Disp2)
9009 return false;
9010
9011 Offset1 = Disp1->getSExtValue();
9012 Offset2 = Disp2->getSExtValue();
9013 return true;
9014}
9015
// Scheduler heuristic: given two loads already known to share a base pointer
// (with constant displacements Offset1 < Offset2), decide whether they should
// be scheduled near each other. Rejects pairs that are too far apart, rejects
// x87/MMX loads, and caps the cluster size based on the value type.
// NOTE(review): the extraction dropped line 9016, which carried the start of
// the X86InstrInfo::shouldScheduleLoadsNear signature — confirm against the
// original source.
9017 int64_t Offset1, int64_t Offset2,
9018 unsigned NumLoads) const {
9019 assert(Offset2 > Offset1);
// Reject loads whose displacements are too far apart to be worth clustering.
9020 if ((Offset2 - Offset1) / 8 > 64)
9021 return false;
9022
// Only cluster loads with identical opcodes.
9023 unsigned Opc1 = Load1->getMachineOpcode();
9024 unsigned Opc2 = Load2->getMachineOpcode();
9025 if (Opc1 != Opc2)
9026 return false; // FIXME: overly conservative?
9027
// x87 stack loads and MMX loads are never clustered.
9028 switch (Opc1) {
9029 default:
9030 break;
9031 case X86::LD_Fp32m:
9032 case X86::LD_Fp64m:
9033 case X86::LD_Fp80m:
9034 case X86::MMX_MOVD64rm:
9035 case X86::MMX_MOVQ64rm:
9036 return false;
9037 }
9038
// Cap the number of clustered loads based on the value type (register
// pressure): scalar types allow at most a pair; XMM allows up to 3 in
// 64-bit mode, where 16 registers are available.
9039 EVT VT = Load1->getValueType(0);
9040 switch (VT.getSimpleVT().SimpleTy) {
9041 default:
9042 // XMM registers. In 64-bit mode we can be a bit more aggressive since we
9043 // have 16 of them to play with.
9044 if (Subtarget.is64Bit()) {
9045 if (NumLoads >= 3)
9046 return false;
9047 } else if (NumLoads) {
9048 return false;
9049 }
9050 break;
9051 case MVT::i8:
9052 case MVT::i16:
9053 case MVT::i32:
9054 case MVT::i64:
9055 case MVT::f32:
9056 case MVT::f64:
9057 if (NumLoads)
9058 return false;
9059 break;
9060 }
9061
9062 return true;
9063}
9064
// Returns true if MI must act as a scheduling boundary (the scheduler may
// not move instructions across it).
// NOTE(review): the extraction dropped line 9065 (start of the
// X86InstrInfo::isSchedulingBoundary signature), line 9077 (presumably the
// MachineInstr::FrameDestroy half of the flag check), and line 9080
// (presumably the fall-through result, likely delegating to the base class)
// — confirm against the original source.
9066 const MachineBasicBlock *MBB,
9067 const MachineFunction &MF) const {
9068
9069 // ENDBR instructions should not be scheduled around.
9070 unsigned Opcode = MI.getOpcode();
9071 if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
9072 Opcode == X86::PLDTILECFGV)
9073 return true;
9074
9075 // Frame setup and destroy can't be scheduled around.
9076 if (MI.getFlag(MachineInstr::FrameSetup) ||
9078 return true;
9079
9081}
9082
// Invert the single X86 condition code stored in Cond in place.
// Returns false to indicate the condition was successfully reversed.
// NOTE(review): the extraction dropped lines 9083-9084 carrying the
// X86InstrInfo::reverseBranchCondition signature — confirm against the
// original source.
9085 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
9086 X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
9087 Cond[0].setImm(GetOppositeBranchCondition(CC));
9088 return false;
9089}
9090
// Returns true unless RC is the EFLAGS/DF class or one of the x87 stack
// register classes, whose defs must not be moved.
// NOTE(review): the extraction dropped line 9091 carrying the start of the
// X86InstrInfo::isSafeToMoveRegClassDefs signature — confirm against the
// original source.
9092 const TargetRegisterClass *RC) const {
9093 // FIXME: Return false for x87 stack register classes for now. We can't
9094 // allow any loads of these registers before FpGet_ST0_80.
9095 return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
9096 RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
9097 RC == &X86::RFP80RegClass);
9098}
9099
9100/// Return a virtual register initialized with the
9101/// global base register value. Output instructions required to
9102/// initialize the register in the function entry block, if necessary.
9103///
9104/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
9105///
// NOTE(review): the extraction dropped lines 9106-9107, which carried the
// X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) signature and the
// X86MachineFunctionInfo lookup that initializes X86FI — confirm against the
// original source.
9108 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
// Reuse the cached base register if one was already created.
9109 if (GlobalBaseReg)
9110 return GlobalBaseReg;
9111
9112 // Create the register. The code to initialize it is inserted
9113 // later, by the CGBR pass (below).
9114 MachineRegisterInfo &RegInfo = MF->getRegInfo();
9115 GlobalBaseReg = RegInfo.createVirtualRegister(
9116 Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
9117 X86FI->setGlobalBaseReg(GlobalBaseReg);
9118 return GlobalBaseReg;
9119}
9120
9121// FIXME: Some shuffle and unpack instructions have equivalents in different
9122// domains, but they require a bit more work than just switching opcodes.
9123
9124static const uint16_t *lookup(unsigned opcode, unsigned domain,
9125 ArrayRef<uint16_t[3]> Table) {
9126 for (const uint16_t(&Row)[3] : Table)
9127 if (Row[domain - 1] == opcode)
9128 return Row;
9129 return nullptr;
9130}
9131
9132static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
9133 ArrayRef<uint16_t[4]> Table) {
9134 // If this is the integer domain make sure to check both integer columns.
9135 for (const uint16_t(&Row)[4] : Table)
9136 if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
9137 return Row;
9138 return nullptr;
9139}
9140
/// Attempt to rescale a blend immediate from \p OldWidth lanes to
/// \p NewWidth lanes (one of the widths must divide the other).
/// Narrowing succeeds only when each group of old bits is uniformly all-ones
/// or all-zeros; widening always succeeds by replicating each old bit.
/// On success, the rescaled mask is stored through \p pNewMask (if non-null)
/// and true is returned; on failure returns false.
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
                            unsigned NewWidth, unsigned *pNewMask = nullptr) {
  assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
         "Illegal blend mask scale");

  unsigned Result = 0;
  if ((OldWidth % NewWidth) == 0) {
    // Narrowing (or identity): each group of Scale old bits collapses into
    // one new bit, and must be uniform for the mask to be representable.
    unsigned Scale = OldWidth / NewWidth;
    unsigned GroupMask = (1u << Scale) - 1;
    for (unsigned Lane = 0; Lane != NewWidth; ++Lane) {
      unsigned Group = (OldMask >> (Lane * Scale)) & GroupMask;
      if (Group == GroupMask)
        Result |= (1u << Lane);
      else if (Group != 0)
        return false; // Mixed group - not representable at the new width.
    }
  } else {
    // Widening: replicate each old bit across Scale new bits.
    unsigned Scale = NewWidth / OldWidth;
    unsigned GroupMask = (1u << Scale) - 1;
    for (unsigned Lane = 0; Lane != OldWidth; ++Lane)
      if (OldMask & (1u << Lane))
        Result |= (GroupMask << (Lane * Scale));
  }

  if (pNewMask)
    *pNewMask = Result;
  return true;
}
9172
// Custom execution-domain analysis for instructions the generic tables can't
// describe. Returns a bitmask of valid domains (0x2 = PackedSingle,
// 0x4 = PackedDouble, 0x8 = PackedInt per the constants used below), or 0 if
// this instruction has no custom handling.
// NOTE(review): the extraction dropped line 9173 carrying the
// X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) signature —
// confirm against the original source.
9174 unsigned Opcode = MI.getOpcode();
9175 unsigned NumOperands = MI.getDesc().getNumOperands();
9176
// For blends, a domain is valid iff the immediate mask can be rescaled to
// that domain's lane width (see AdjustBlendMask above).
9177 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
9178 uint16_t validDomains = 0;
9179 if (MI.getOperand(NumOperands - 1).isImm()) {
9180 unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
9181 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
9182 validDomains |= 0x2; // PackedSingle
9183 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
9184 validDomains |= 0x4; // PackedDouble
9185 if (!Is256 || Subtarget.hasAVX2())
9186 validDomains |= 0x8; // PackedInt
9187 }
9188 return validDomains;
9189 };
9190
9191 switch (Opcode) {
9192 case X86::BLENDPDrmi:
9193 case X86::BLENDPDrri:
9194 case X86::VBLENDPDrmi:
9195 case X86::VBLENDPDrri:
9196 return GetBlendDomains(2, false);
9197 case X86::VBLENDPDYrmi:
9198 case X86::VBLENDPDYrri:
9199 return GetBlendDomains(4, true);
9200 case X86::BLENDPSrmi:
9201 case X86::BLENDPSrri:
9202 case X86::VBLENDPSrmi:
9203 case X86::VBLENDPSrri:
9204 case X86::VPBLENDDrmi:
9205 case X86::VPBLENDDrri:
9206 return GetBlendDomains(4, false);
9207 case X86::VBLENDPSYrmi:
9208 case X86::VBLENDPSYrri:
9209 case X86::VPBLENDDYrmi:
9210 case X86::VPBLENDDYrri:
9211 return GetBlendDomains(8, true);
9212 case X86::PBLENDWrmi:
9213 case X86::PBLENDWrri:
9214 case X86::VPBLENDWrmi:
9215 case X86::VPBLENDWrri:
9216 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
9217 case X86::VPBLENDWYrmi:
9218 case X86::VPBLENDWYrri:
9219 return GetBlendDomains(8, false);
9220 case X86::VPANDDZ128rr:
9221 case X86::VPANDDZ128rm:
9222 case X86::VPANDDZ256rr:
9223 case X86::VPANDDZ256rm:
9224 case X86::VPANDQZ128rr:
9225 case X86::VPANDQZ128rm:
9226 case X86::VPANDQZ256rr:
9227 case X86::VPANDQZ256rm:
9228 case X86::VPANDNDZ128rr:
9229 case X86::VPANDNDZ128rm:
9230 case X86::VPANDNDZ256rr:
9231 case X86::VPANDNDZ256rm:
9232 case X86::VPANDNQZ128rr:
9233 case X86::VPANDNQZ128rm:
9234 case X86::VPANDNQZ256rr:
9235 case X86::VPANDNQZ256rm:
9236 case X86::VPORDZ128rr:
9237 case X86::VPORDZ128rm:
9238 case X86::VPORDZ256rr:
9239 case X86::VPORDZ256rm:
9240 case X86::VPORQZ128rr:
9241 case X86::VPORQZ128rm:
9242 case X86::VPORQZ256rr:
9243 case X86::VPORQZ256rm:
9244 case X86::VPXORDZ128rr:
9245 case X86::VPXORDZ128rm:
9246 case X86::VPXORDZ256rr:
9247 case X86::VPXORDZ256rm:
9248 case X86::VPXORQZ128rr:
9249 case X86::VPXORQZ128rm:
9250 case X86::VPXORQZ256rr:
9251 case X86::VPXORQZ256rm:
9252 // If we don't have DQI see if we can still switch from an EVEX integer
9253 // instruction to a VEX floating point instruction.
9254 if (Subtarget.hasDQI())
9255 return 0;
9256
// VEX encodings can only address registers 0-15, so all operands must
// have encodings below 16 for the EVEX->VEX domain switch to be legal.
9257 if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
9258 return 0;
9259 if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
9260 return 0;
9261 // Register forms will have 3 operands. Memory form will have more.
9262 if (NumOperands == 3 &&
9263 RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
9264 return 0;
9265
9266 // All domains are valid.
9267 return 0xe;
9268 case X86::MOVHLPSrr:
9269 // We can swap domains when both inputs are the same register.
9270 // FIXME: This doesn't catch all the cases we would like. If the input
9271 // register isn't KILLed by the instruction, the two address instruction
9272 // pass puts a COPY on one input. The other input uses the original
9273 // register. This prevents the same physical register from being used by
9274 // both inputs.
9275 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9276 MI.getOperand(0).getSubReg() == 0 &&
9277 MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
9278 return 0x6;
9279 return 0;
9280 case X86::SHUFPDrri:
9281 return 0x6;
9282 }
9283 return 0;
9284}
9285
9286#include "X86ReplaceableInstrs.def"
9287
// Custom execution-domain switching for the instructions reported by
// getExecutionDomainCustom. Rewrites MI in place (opcode and, for blends,
// the immediate mask) to the requested Domain. Returns true if the
// instruction was handled here.
// NOTE(review): the extraction dropped line 9288 carrying the start of the
// X86InstrInfo::setExecutionDomainCustom signature — confirm against the
// original source.
9289 unsigned Domain) const {
9290 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9291 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9292 assert(dom && "Not an SSE instruction");
9293
9294 unsigned Opcode = MI.getOpcode();
9295 unsigned NumOperands = MI.getDesc().getNumOperands();
9296
// Rescale the blend immediate to the target domain's lane width and swap
// in the replacement opcode from the blend tables.
9297 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
9298 if (MI.getOperand(NumOperands - 1).isImm()) {
9299 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
// VPBLENDWY repeats the 8-bit mask for the lo/hi halves; widen to 16 bits.
9300 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
9301 unsigned NewImm = Imm;
9302
9303 const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
9304 if (!table)
9305 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9306
9307 if (Domain == 1) { // PackedSingle
9308 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9309 } else if (Domain == 2) { // PackedDouble
9310 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
9311 } else if (Domain == 3) { // PackedInt
9312 if (Subtarget.hasAVX2()) {
9313 // If we are already VPBLENDW use that, else use VPBLENDD.
9314 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
9315 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9316 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9317 }
9318 } else {
9319 assert(!Is256 && "128-bit vector expected");
9320 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
9321 }
9322 }
9323
9324 assert(table && table[Domain - 1] && "Unknown domain op");
9325 MI.setDesc(get(table[Domain - 1]));
9326 MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
9327 }
9328 return true;
9329 };
9330
9331 switch (Opcode) {
9332 case X86::BLENDPDrmi:
9333 case X86::BLENDPDrri:
9334 case X86::VBLENDPDrmi:
9335 case X86::VBLENDPDrri:
9336 return SetBlendDomain(2, false);
9337 case X86::VBLENDPDYrmi:
9338 case X86::VBLENDPDYrri:
9339 return SetBlendDomain(4, true);
9340 case X86::BLENDPSrmi:
9341 case X86::BLENDPSrri:
9342 case X86::VBLENDPSrmi:
9343 case X86::VBLENDPSrri:
9344 case X86::VPBLENDDrmi:
9345 case X86::VPBLENDDrri:
9346 return SetBlendDomain(4, false);
9347 case X86::VBLENDPSYrmi:
9348 case X86::VBLENDPSYrri:
9349 case X86::VPBLENDDYrmi:
9350 case X86::VPBLENDDYrri:
9351 return SetBlendDomain(8, true);
9352 case X86::PBLENDWrmi:
9353 case X86::PBLENDWrri:
9354 case X86::VPBLENDWrmi:
9355 case X86::VPBLENDWrri:
9356 return SetBlendDomain(8, false);
9357 case X86::VPBLENDWYrmi:
9358 case X86::VPBLENDWYrri:
9359 return SetBlendDomain(16, true);
9360 case X86::VPANDDZ128rr:
9361 case X86::VPANDDZ128rm:
9362 case X86::VPANDDZ256rr:
9363 case X86::VPANDDZ256rm:
9364 case X86::VPANDQZ128rr:
9365 case X86::VPANDQZ128rm:
9366 case X86::VPANDQZ256rr:
9367 case X86::VPANDQZ256rm:
9368 case X86::VPANDNDZ128rr:
9369 case X86::VPANDNDZ128rm:
9370 case X86::VPANDNDZ256rr:
9371 case X86::VPANDNDZ256rm:
9372 case X86::VPANDNQZ128rr:
9373 case X86::VPANDNQZ128rm:
9374 case X86::VPANDNQZ256rr:
9375 case X86::VPANDNQZ256rm:
9376 case X86::VPORDZ128rr:
9377 case X86::VPORDZ128rm:
9378 case X86::VPORDZ256rr:
9379 case X86::VPORDZ256rm:
9380 case X86::VPORQZ128rr:
9381 case X86::VPORQZ128rm:
9382 case X86::VPORQZ256rr:
9383 case X86::VPORQZ256rm:
9384 case X86::VPXORDZ128rr:
9385 case X86::VPXORDZ128rm:
9386 case X86::VPXORDZ256rr:
9387 case X86::VPXORDZ256rm:
9388 case X86::VPXORQZ128rr:
9389 case X86::VPXORQZ128rm:
9390 case X86::VPXORQZ256rr:
9391 case X86::VPXORQZ256rm: {
9392 // Without DQI, convert EVEX instructions to VEX instructions.
9393 if (Subtarget.hasDQI())
9394 return false;
9395
9396 const uint16_t *table =
9397 lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
9398 assert(table && "Instruction not found in table?");
9399 // Don't change integer Q instructions to D instructions and
9400 // use D instructions if we started with a PS instruction.
9401 if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9402 Domain = 4;
9403 MI.setDesc(get(table[Domain - 1]));
9404 return true;
9405 }
9406 case X86::UNPCKHPDrr:
9407 case X86::MOVHLPSrr:
9408 // We just need to commute the instruction which will switch the domains.
9409 if (Domain != dom && Domain != 3 &&
9410 MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9411 MI.getOperand(0).getSubReg() == 0 &&
9412 MI.getOperand(1).getSubReg() == 0 &&
9413 MI.getOperand(2).getSubReg() == 0) {
9414 commuteInstruction(MI, false);
9415 return true;
9416 }
9417 // We must always return true for MOVHLPSrr.
9418 if (Opcode == X86::MOVHLPSrr)
9419 return true;
9420 break;
9421 case X86::SHUFPDrri: {
// SHUFPD imm has one selector bit per 64-bit lane; expand each bit into
// the two 32-bit selector pairs of the equivalent SHUFPS immediate.
9422 if (Domain == 1) {
9423 unsigned Imm = MI.getOperand(3).getImm();
9424 unsigned NewImm = 0x44;
9425 if (Imm & 1)
9426 NewImm |= 0x0a;
9427 if (Imm & 2)
9428 NewImm |= 0xa0;
9429 MI.getOperand(3).setImm(NewImm);
9430 MI.setDesc(get(X86::SHUFPSrri));
9431 }
9432 return true;
9433 }
9434 }
9435 return false;
9436}
9437
// Returns {current domain, bitmask of equivalent domains} for MI, consulting
// the custom handler first and then the generated replaceable-instruction
// tables. A zero mask means the instruction cannot change domain.
// NOTE(review): the extraction dropped line 9439 carrying the
// X86InstrInfo::getExecutionDomain(const MachineInstr &MI) signature line —
// confirm against the original source.
9438std::pair<uint16_t, uint16_t>
9440 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9441 unsigned opcode = MI.getOpcode();
9442 uint16_t validDomains = 0;
9443 if (domain) {
9444 // Attempt to match for custom instructions.
9445 validDomains = getExecutionDomainCustom(MI);
9446 if (validDomains)
9447 return std::make_pair(domain, validDomains);
9448
9449 if (lookup(opcode, domain, ReplaceableInstrs)) {
9450 validDomains = 0xe;
9451 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9452 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
9453 } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
9454 validDomains = 0x6;
9455 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
9456 // Insert/extract instructions should only effect domain if AVX2
9457 // is enabled.
9458 if (!Subtarget.hasAVX2())
9459 return std::make_pair(0, 0);
9460 validDomains = 0xe;
9461 } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
9462 validDomains = 0xe;
9463 } else if (Subtarget.hasDQI() &&
9464 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
9465 validDomains = 0xe;
9466 } else if (Subtarget.hasDQI()) {
9467 if (const uint16_t *table =
9468 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
// Masked DQ ops: only the column matching the current D/Q flavor is
// interchangeable, hence the narrower 0xa/0xc masks.
9469 if (domain == 1 || (domain == 3 && table[3] == opcode))
9470 validDomains = 0xa;
9471 else
9472 validDomains = 0xc;
9473 }
9474 }
9475 }
9476 return std::make_pair(domain, validDomains);
9477}
9478
// Rewrite MI's opcode so it executes in the requested Domain, using the
// custom handler first and then falling through the generated tables in
// order (base, AVX2, FP, insert/extract, AVX512, AVX512DQ, AVX512DQ-masked).
// NOTE(review): the extraction dropped line 9479 (the
// X86InstrInfo::setExecutionDomain signature) and line 9485 (presumably the
// guarded call to setExecutionDomainCustom preceding the early return on
// 9486) — confirm against the original source.
9480 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9481 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9482 assert(dom && "Not an SSE instruction");
9483
9484 // Attempt to match for custom instructions.
9486 return;
9487
9488 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
9489 if (!table) { // try the other table
9490 assert((Subtarget.hasAVX2() || Domain < 3) &&
9491 "256-bit vector operations only available in AVX2");
9492 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
9493 }
9494 if (!table) { // try the FP table
9495 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
9496 assert((!table || Domain < 3) &&
9497 "Can only select PackedSingle or PackedDouble");
9498 }
9499 if (!table) { // try the other table
9500 assert(Subtarget.hasAVX2() &&
9501 "256-bit insert/extract only available in AVX2");
9502 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
9503 }
9504 if (!table) { // try the AVX512 table
9505 assert(Subtarget.hasAVX512() && "Requires AVX-512");
9506 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
9507 // Don't change integer Q instructions to D instructions.
9508 if (table && Domain == 3 && table[3] == MI.getOpcode())
9509 Domain = 4;
9510 }
9511 if (!table) { // try the AVX512DQ table
9512 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9513 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
9514 // Don't change integer Q instructions to D instructions and
9515 // use D instructions if we started with a PS instruction.
9516 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9517 Domain = 4;
9518 }
9519 if (!table) { // try the AVX512DQMasked table
9520 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9521 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
9522 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9523 Domain = 4;
9524 }
9525 assert(table && "Cannot change domain");
9526 MI.setDesc(get(table[Domain - 1]));
9527}
9528
9534
9535/// Return the noop instruction to use for a noop.
// Builds and returns an MCInst with the X86 NOOP opcode.
// NOTE(review): the extraction dropped line 9536 carrying the
// X86InstrInfo::getNop signature — confirm against the original source.
9537 MCInst Nop;
9538 Nop.setOpcode(X86::NOOP);
9539 return Nop;
9540}
9541
// Returns true for opcodes with unusually high latency: divides, square
// roots (SSE/AVX/AVX-512 variants), and gather/scatter/gather-prefetch
// instructions. Everything else is considered normal latency.
// NOTE(review): the extraction dropped line 9542 carrying the
// X86InstrInfo::isHighLatencyDef(int opc) signature — confirm against the
// original source.
9543 switch (opc) {
9544 default:
9545 return false;
9546 case X86::DIVPDrm:
9547 case X86::DIVPDrr:
9548 case X86::DIVPSrm:
9549 case X86::DIVPSrr:
9550 case X86::DIVSDrm:
9551 case X86::DIVSDrm_Int:
9552 case X86::DIVSDrr:
9553 case X86::DIVSDrr_Int:
9554 case X86::DIVSSrm:
9555 case X86::DIVSSrm_Int:
9556 case X86::DIVSSrr:
9557 case X86::DIVSSrr_Int:
9558 case X86::SQRTPDm:
9559 case X86::SQRTPDr:
9560 case X86::SQRTPSm:
9561 case X86::SQRTPSr:
9562 case X86::SQRTSDm:
9563 case X86::SQRTSDm_Int:
9564 case X86::SQRTSDr:
9565 case X86::SQRTSDr_Int:
9566 case X86::SQRTSSm:
9567 case X86::SQRTSSm_Int:
9568 case X86::SQRTSSr:
9569 case X86::SQRTSSr_Int:
9570 // AVX instructions with high latency
9571 case X86::VDIVPDrm:
9572 case X86::VDIVPDrr:
9573 case X86::VDIVPDYrm:
9574 case X86::VDIVPDYrr:
9575 case X86::VDIVPSrm:
9576 case X86::VDIVPSrr:
9577 case X86::VDIVPSYrm:
9578 case X86::VDIVPSYrr:
9579 case X86::VDIVSDrm:
9580 case X86::VDIVSDrm_Int:
9581 case X86::VDIVSDrr:
9582 case X86::VDIVSDrr_Int:
9583 case X86::VDIVSSrm:
9584 case X86::VDIVSSrm_Int:
9585 case X86::VDIVSSrr:
9586 case X86::VDIVSSrr_Int:
9587 case X86::VSQRTPDm:
9588 case X86::VSQRTPDr:
9589 case X86::VSQRTPDYm:
9590 case X86::VSQRTPDYr:
9591 case X86::VSQRTPSm:
9592 case X86::VSQRTPSr:
9593 case X86::VSQRTPSYm:
9594 case X86::VSQRTPSYr:
9595 case X86::VSQRTSDm:
9596 case X86::VSQRTSDm_Int:
9597 case X86::VSQRTSDr:
9598 case X86::VSQRTSDr_Int:
9599 case X86::VSQRTSSm:
9600 case X86::VSQRTSSm_Int:
9601 case X86::VSQRTSSr:
9602 case X86::VSQRTSSr_Int:
9603 // AVX512 instructions with high latency
9604 case X86::VDIVPDZ128rm:
9605 case X86::VDIVPDZ128rmb:
9606 case X86::VDIVPDZ128rmbk:
9607 case X86::VDIVPDZ128rmbkz:
9608 case X86::VDIVPDZ128rmk:
9609 case X86::VDIVPDZ128rmkz:
9610 case X86::VDIVPDZ128rr:
9611 case X86::VDIVPDZ128rrk:
9612 case X86::VDIVPDZ128rrkz:
9613 case X86::VDIVPDZ256rm:
9614 case X86::VDIVPDZ256rmb:
9615 case X86::VDIVPDZ256rmbk:
9616 case X86::VDIVPDZ256rmbkz:
9617 case X86::VDIVPDZ256rmk:
9618 case X86::VDIVPDZ256rmkz:
9619 case X86::VDIVPDZ256rr:
9620 case X86::VDIVPDZ256rrk:
9621 case X86::VDIVPDZ256rrkz:
9622 case X86::VDIVPDZrrb:
9623 case X86::VDIVPDZrrbk:
9624 case X86::VDIVPDZrrbkz:
9625 case X86::VDIVPDZrm:
9626 case X86::VDIVPDZrmb:
9627 case X86::VDIVPDZrmbk:
9628 case X86::VDIVPDZrmbkz:
9629 case X86::VDIVPDZrmk:
9630 case X86::VDIVPDZrmkz:
9631 case X86::VDIVPDZrr:
9632 case X86::VDIVPDZrrk:
9633 case X86::VDIVPDZrrkz:
9634 case X86::VDIVPSZ128rm:
9635 case X86::VDIVPSZ128rmb:
9636 case X86::VDIVPSZ128rmbk:
9637 case X86::VDIVPSZ128rmbkz:
9638 case X86::VDIVPSZ128rmk:
9639 case X86::VDIVPSZ128rmkz:
9640 case X86::VDIVPSZ128rr:
9641 case X86::VDIVPSZ128rrk:
9642 case X86::VDIVPSZ128rrkz:
9643 case X86::VDIVPSZ256rm:
9644 case X86::VDIVPSZ256rmb:
9645 case X86::VDIVPSZ256rmbk:
9646 case X86::VDIVPSZ256rmbkz:
9647 case X86::VDIVPSZ256rmk:
9648 case X86::VDIVPSZ256rmkz:
9649 case X86::VDIVPSZ256rr:
9650 case X86::VDIVPSZ256rrk:
9651 case X86::VDIVPSZ256rrkz:
9652 case X86::VDIVPSZrrb:
9653 case X86::VDIVPSZrrbk:
9654 case X86::VDIVPSZrrbkz:
9655 case X86::VDIVPSZrm:
9656 case X86::VDIVPSZrmb:
9657 case X86::VDIVPSZrmbk:
9658 case X86::VDIVPSZrmbkz:
9659 case X86::VDIVPSZrmk:
9660 case X86::VDIVPSZrmkz:
9661 case X86::VDIVPSZrr:
9662 case X86::VDIVPSZrrk:
9663 case X86::VDIVPSZrrkz:
9664 case X86::VDIVSDZrm:
9665 case X86::VDIVSDZrr:
9666 case X86::VDIVSDZrm_Int:
9667 case X86::VDIVSDZrmk_Int:
9668 case X86::VDIVSDZrmkz_Int:
9669 case X86::VDIVSDZrr_Int:
9670 case X86::VDIVSDZrrk_Int:
9671 case X86::VDIVSDZrrkz_Int:
9672 case X86::VDIVSDZrrb_Int:
9673 case X86::VDIVSDZrrbk_Int:
9674 case X86::VDIVSDZrrbkz_Int:
9675 case X86::VDIVSSZrm:
9676 case X86::VDIVSSZrr:
9677 case X86::VDIVSSZrm_Int:
9678 case X86::VDIVSSZrmk_Int:
9679 case X86::VDIVSSZrmkz_Int:
9680 case X86::VDIVSSZrr_Int:
9681 case X86::VDIVSSZrrk_Int:
9682 case X86::VDIVSSZrrkz_Int:
9683 case X86::VDIVSSZrrb_Int:
9684 case X86::VDIVSSZrrbk_Int:
9685 case X86::VDIVSSZrrbkz_Int:
9686 case X86::VSQRTPDZ128m:
9687 case X86::VSQRTPDZ128mb:
9688 case X86::VSQRTPDZ128mbk:
9689 case X86::VSQRTPDZ128mbkz:
9690 case X86::VSQRTPDZ128mk:
9691 case X86::VSQRTPDZ128mkz:
9692 case X86::VSQRTPDZ128r:
9693 case X86::VSQRTPDZ128rk:
9694 case X86::VSQRTPDZ128rkz:
9695 case X86::VSQRTPDZ256m:
9696 case X86::VSQRTPDZ256mb:
9697 case X86::VSQRTPDZ256mbk:
9698 case X86::VSQRTPDZ256mbkz:
9699 case X86::VSQRTPDZ256mk:
9700 case X86::VSQRTPDZ256mkz:
9701 case X86::VSQRTPDZ256r:
9702 case X86::VSQRTPDZ256rk:
9703 case X86::VSQRTPDZ256rkz:
9704 case X86::VSQRTPDZm:
9705 case X86::VSQRTPDZmb:
9706 case X86::VSQRTPDZmbk:
9707 case X86::VSQRTPDZmbkz:
9708 case X86::VSQRTPDZmk:
9709 case X86::VSQRTPDZmkz:
9710 case X86::VSQRTPDZr:
9711 case X86::VSQRTPDZrb:
9712 case X86::VSQRTPDZrbk:
9713 case X86::VSQRTPDZrbkz:
9714 case X86::VSQRTPDZrk:
9715 case X86::VSQRTPDZrkz:
9716 case X86::VSQRTPSZ128m:
9717 case X86::VSQRTPSZ128mb:
9718 case X86::VSQRTPSZ128mbk:
9719 case X86::VSQRTPSZ128mbkz:
9720 case X86::VSQRTPSZ128mk:
9721 case X86::VSQRTPSZ128mkz:
9722 case X86::VSQRTPSZ128r:
9723 case X86::VSQRTPSZ128rk:
9724 case X86::VSQRTPSZ128rkz:
9725 case X86::VSQRTPSZ256m:
9726 case X86::VSQRTPSZ256mb:
9727 case X86::VSQRTPSZ256mbk:
9728 case X86::VSQRTPSZ256mbkz:
9729 case X86::VSQRTPSZ256mk:
9730 case X86::VSQRTPSZ256mkz:
9731 case X86::VSQRTPSZ256r:
9732 case X86::VSQRTPSZ256rk:
9733 case X86::VSQRTPSZ256rkz:
9734 case X86::VSQRTPSZm:
9735 case X86::VSQRTPSZmb:
9736 case X86::VSQRTPSZmbk:
9737 case X86::VSQRTPSZmbkz:
9738 case X86::VSQRTPSZmk:
9739 case X86::VSQRTPSZmkz:
9740 case X86::VSQRTPSZr:
9741 case X86::VSQRTPSZrb:
9742 case X86::VSQRTPSZrbk:
9743 case X86::VSQRTPSZrbkz:
9744 case X86::VSQRTPSZrk:
9745 case X86::VSQRTPSZrkz:
9746 case X86::VSQRTSDZm:
9747 case X86::VSQRTSDZm_Int:
9748 case X86::VSQRTSDZmk_Int:
9749 case X86::VSQRTSDZmkz_Int:
9750 case X86::VSQRTSDZr:
9751 case X86::VSQRTSDZr_Int:
9752 case X86::VSQRTSDZrk_Int:
9753 case X86::VSQRTSDZrkz_Int:
9754 case X86::VSQRTSDZrb_Int:
9755 case X86::VSQRTSDZrbk_Int:
9756 case X86::VSQRTSDZrbkz_Int:
9757 case X86::VSQRTSSZm:
9758 case X86::VSQRTSSZm_Int:
9759 case X86::VSQRTSSZmk_Int:
9760 case X86::VSQRTSSZmkz_Int:
9761 case X86::VSQRTSSZr:
9762 case X86::VSQRTSSZr_Int:
9763 case X86::VSQRTSSZrk_Int:
9764 case X86::VSQRTSSZrkz_Int:
9765 case X86::VSQRTSSZrb_Int:
9766 case X86::VSQRTSSZrbk_Int:
9767 case X86::VSQRTSSZrbkz_Int:
9768
// Gather/scatter and gather-prefetch instructions are also high latency.
9769 case X86::VGATHERDPDYrm:
9770 case X86::VGATHERDPDZ128rm:
9771 case X86::VGATHERDPDZ256rm:
9772 case X86::VGATHERDPDZrm:
9773 case X86::VGATHERDPDrm:
9774 case X86::VGATHERDPSYrm:
9775 case X86::VGATHERDPSZ128rm:
9776 case X86::VGATHERDPSZ256rm:
9777 case X86::VGATHERDPSZrm:
9778 case X86::VGATHERDPSrm:
9779 case X86::VGATHERPF0DPDm:
9780 case X86::VGATHERPF0DPSm:
9781 case X86::VGATHERPF0QPDm:
9782 case X86::VGATHERPF0QPSm:
9783 case X86::VGATHERPF1DPDm:
9784 case X86::VGATHERPF1DPSm:
9785 case X86::VGATHERPF1QPDm:
9786 case X86::VGATHERPF1QPSm:
9787 case X86::VGATHERQPDYrm:
9788 case X86::VGATHERQPDZ128rm:
9789 case X86::VGATHERQPDZ256rm:
9790 case X86::VGATHERQPDZrm:
9791 case X86::VGATHERQPDrm:
9792 case X86::VGATHERQPSYrm:
9793 case X86::VGATHERQPSZ128rm:
9794 case X86::VGATHERQPSZ256rm:
9795 case X86::VGATHERQPSZrm:
9796 case X86::VGATHERQPSrm:
9797 case X86::VPGATHERDDYrm:
9798 case X86::VPGATHERDDZ128rm:
9799 case X86::VPGATHERDDZ256rm:
9800 case X86::VPGATHERDDZrm:
9801 case X86::VPGATHERDDrm:
9802 case X86::VPGATHERDQYrm:
9803 case X86::VPGATHERDQZ128rm:
9804 case X86::VPGATHERDQZ256rm:
9805 case X86::VPGATHERDQZrm:
9806 case X86::VPGATHERDQrm:
9807 case X86::VPGATHERQDYrm:
9808 case X86::VPGATHERQDZ128rm:
9809 case X86::VPGATHERQDZ256rm:
9810 case X86::VPGATHERQDZrm:
9811 case X86::VPGATHERQDrm:
9812 case X86::VPGATHERQQYrm:
9813 case X86::VPGATHERQQZ128rm:
9814 case X86::VPGATHERQQZ256rm:
9815 case X86::VPGATHERQQZrm:
9816 case X86::VPGATHERQQrm:
9817 case X86::VSCATTERDPDZ128mr:
9818 case X86::VSCATTERDPDZ256mr:
9819 case X86::VSCATTERDPDZmr:
9820 case X86::VSCATTERDPSZ128mr:
9821 case X86::VSCATTERDPSZ256mr:
9822 case X86::VSCATTERDPSZmr:
9823 case X86::VSCATTERPF0DPDm:
9824 case X86::VSCATTERPF0DPSm:
9825 case X86::VSCATTERPF0QPDm:
9826 case X86::VSCATTERPF0QPSm:
9827 case X86::VSCATTERPF1DPDm:
9828 case X86::VSCATTERPF1DPSm:
9829 case X86::VSCATTERPF1QPDm:
9830 case X86::VSCATTERPF1QPSm:
9831 case X86::VSCATTERQPDZ128mr:
9832 case X86::VSCATTERQPDZ256mr:
9833 case X86::VSCATTERQPDZmr:
9834 case X86::VSCATTERQPSZ128mr:
9835 case X86::VSCATTERQPSZ256mr:
9836 case X86::VSCATTERQPSZmr:
9837 case X86::VPSCATTERDDZ128mr:
9838 case X86::VPSCATTERDDZ256mr:
9839 case X86::VPSCATTERDDZmr:
9840 case X86::VPSCATTERDQZ128mr:
9841 case X86::VPSCATTERDQZ256mr:
9842 case X86::VPSCATTERDQZmr:
9843 case X86::VPSCATTERQDZ128mr:
9844 case X86::VPSCATTERQDZ256mr:
9845 case X86::VPSCATTERQDZmr:
9846 case X86::VPSCATTERQQZ128mr:
9847 case X86::VPSCATTERQQZ256mr:
9848 case X86::VPSCATTERQQZmr:
9849 return true;
9850 }
9851}
9852
// A def/use pair has high operand latency exactly when the defining
// instruction is a high-latency opcode; the use side is ignored.
// NOTE(review): the extraction dropped line 9853 carrying the start of the
// X86InstrInfo::hasHighOperandLatency signature — confirm against the
// original source.
9854 const MachineRegisterInfo *MRI,
9855 const MachineInstr &DefMI,
9856 unsigned DefIdx,
9857 const MachineInstr &UseMI,
9858 unsigned UseIdx) const {
9859 return isHighLatencyDef(DefMI.getOpcode());
9860}
9861
// Checks whether a binary operation's operands may be reassociated: for
// integer ops this additionally requires the implicit EFLAGS def (if any)
// to be dead, since reassociation would change the observed flag values.
// NOTE(review): the extraction dropped line 9862 (start of the
// X86InstrInfo::hasReassociableOperands signature) and lines 9879-9880
// (presumably the fall-through delegating to
// TargetInstrInfo::hasReassociableOperands) — confirm against the original
// source.
9863 const MachineBasicBlock *MBB) const {
9864 assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
9865 Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
9866
9867 // Integer binary math/logic instructions have a third source operand:
9868 // the EFLAGS register. That operand must be both defined here and never
9869 // used; ie, it must be dead. If the EFLAGS operand is live, then we can
9870 // not change anything because rearranging the operands could affect other
9871 // instructions that depend on the exact status flags (zero, sign, etc.)
9872 // that are set by using these particular operands with this operation.
9873 const MachineOperand *FlagDef =
9874 Inst.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
9875 assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
9876 if (FlagDef && !FlagDef->isDead())
9877 return false;
9878
9880}
9881
9882// TODO: There are many more machine instruction opcodes to match:
9883// 1. Other data types (integer, vectors)
9884// 2. Other math / logic operations (xor, or)
9885// 3. Other forms of the same operation (intrinsics and other variants)
9887 bool Invert) const {
9888 if (Invert)
9889 return false;
9890 switch (Inst.getOpcode()) {
9891 CASE_ND(ADD8rr)
9892 CASE_ND(ADD16rr)
9893 CASE_ND(ADD32rr)
9894 CASE_ND(ADD64rr)
9895 CASE_ND(AND8rr)
9896 CASE_ND(AND16rr)
9897 CASE_ND(AND32rr)
9898 CASE_ND(AND64rr)
9899 CASE_ND(OR8rr)
9900 CASE_ND(OR16rr)
9901 CASE_ND(OR32rr)
9902 CASE_ND(OR64rr)
9903 CASE_ND(XOR8rr)
9904 CASE_ND(XOR16rr)
9905 CASE_ND(XOR32rr)
9906 CASE_ND(XOR64rr)
9907 CASE_ND(IMUL16rr)
9908 CASE_ND(IMUL32rr)
9909 CASE_ND(IMUL64rr)
9910 case X86::PANDrr:
9911 case X86::PORrr:
9912 case X86::PXORrr:
9913 case X86::ANDPDrr:
9914 case X86::ANDPSrr:
9915 case X86::ORPDrr:
9916 case X86::ORPSrr:
9917 case X86::XORPDrr:
9918 case X86::XORPSrr:
9919 case X86::PADDBrr:
9920 case X86::PADDWrr:
9921 case X86::PADDDrr:
9922 case X86::PADDQrr:
9923 case X86::PMULLWrr:
9924 case X86::PMULLDrr:
9925 case X86::PMAXSBrr:
9926 case X86::PMAXSDrr:
9927 case X86::PMAXSWrr:
9928 case X86::PMAXUBrr:
9929 case X86::PMAXUDrr:
9930 case X86::PMAXUWrr:
9931 case X86::PMINSBrr:
9932 case X86::PMINSDrr:
9933 case X86::PMINSWrr:
9934 case X86::PMINUBrr:
9935 case X86::PMINUDrr:
9936 case X86::PMINUWrr:
9937 case X86::VPANDrr:
9938 case X86::VPANDYrr:
9939 case X86::VPANDDZ128rr:
9940 case X86::VPANDDZ256rr:
9941 case X86::VPANDDZrr:
9942 case X86::VPANDQZ128rr:
9943 case X86::VPANDQZ256rr:
9944 case X86::VPANDQZrr:
9945 case X86::VPORrr:
9946 case X86::VPORYrr:
9947 case X86::VPORDZ128rr:
9948 case X86::VPORDZ256rr:
9949 case X86::VPORDZrr:
9950 case X86::VPORQZ128rr:
9951 case X86::VPORQZ256rr:
9952 case X86::VPORQZrr:
9953 case X86::VPXORrr:
9954 case X86::VPXORYrr:
9955 case X86::VPXORDZ128rr:
9956 case X86::VPXORDZ256rr:
9957 case X86::VPXORDZrr:
9958 case X86::VPXORQZ128rr:
9959 case X86::VPXORQZ256rr:
9960 case X86::VPXORQZrr:
9961 case X86::VANDPDrr:
9962 case X86::VANDPSrr:
9963 case X86::VANDPDYrr:
9964 case X86::VANDPSYrr:
9965 case X86::VANDPDZ128rr:
9966 case X86::VANDPSZ128rr:
9967 case X86::VANDPDZ256rr:
9968 case X86::VANDPSZ256rr:
9969 case X86::VANDPDZrr:
9970 case X86::VANDPSZrr:
9971 case X86::VORPDrr:
9972 case X86::VORPSrr:
9973 case X86::VORPDYrr:
9974 case X86::VORPSYrr:
9975 case X86::VORPDZ128rr:
9976 case X86::VORPSZ128rr:
9977 case X86::VORPDZ256rr:
9978 case X86::VORPSZ256rr:
9979 case X86::VORPDZrr:
9980 case X86::VORPSZrr:
9981 case X86::VXORPDrr:
9982 case X86::VXORPSrr:
9983 case X86::VXORPDYrr:
9984 case X86::VXORPSYrr:
9985 case X86::VXORPDZ128rr:
9986 case X86::VXORPSZ128rr:
9987 case X86::VXORPDZ256rr:
9988 case X86::VXORPSZ256rr:
9989 case X86::VXORPDZrr:
9990 case X86::VXORPSZrr:
9991 case X86::KADDBkk:
9992 case X86::KADDWkk:
9993 case X86::KADDDkk:
9994 case X86::KADDQkk:
9995 case X86::KANDBkk:
9996 case X86::KANDWkk:
9997 case X86::KANDDkk:
9998 case X86::KANDQkk:
9999 case X86::KORBkk:
10000 case X86::KORWkk:
10001 case X86::KORDkk:
10002 case X86::KORQkk:
10003 case X86::KXORBkk:
10004 case X86::KXORWkk:
10005 case X86::KXORDkk:
10006 case X86::KXORQkk:
10007 case X86::VPADDBrr:
10008 case X86::VPADDWrr:
10009 case X86::VPADDDrr:
10010 case X86::VPADDQrr:
10011 case X86::VPADDBYrr:
10012 case X86::VPADDWYrr:
10013 case X86::VPADDDYrr:
10014 case X86::VPADDQYrr:
10015 case X86::VPADDBZ128rr:
10016 case X86::VPADDWZ128rr:
10017 case X86::VPADDDZ128rr:
10018 case X86::VPADDQZ128rr:
10019 case X86::VPADDBZ256rr:
10020 case X86::VPADDWZ256rr:
10021 case X86::VPADDDZ256rr:
10022 case X86::VPADDQZ256rr:
10023 case X86::VPADDBZrr:
10024 case X86::VPADDWZrr:
10025 case X86::VPADDDZrr:
10026 case X86::VPADDQZrr:
10027 case X86::VPMULLWrr:
10028 case X86::VPMULLWYrr:
10029 case X86::VPMULLWZ128rr:
10030 case X86::VPMULLWZ256rr:
10031 case X86::VPMULLWZrr:
10032 case X86::VPMULLDrr:
10033 case X86::VPMULLDYrr:
10034 case X86::VPMULLDZ128rr:
10035 case X86::VPMULLDZ256rr:
10036 case X86::VPMULLDZrr:
10037 case X86::VPMULLQZ128rr:
10038 case X86::VPMULLQZ256rr:
10039 case X86::VPMULLQZrr:
10040 case X86::VPMAXSBrr:
10041 case X86::VPMAXSBYrr:
10042 case X86::VPMAXSBZ128rr:
10043 case X86::VPMAXSBZ256rr:
10044 case X86::VPMAXSBZrr:
10045 case X86::VPMAXSDrr:
10046 case X86::VPMAXSDYrr:
10047 case X86::VPMAXSDZ128rr:
10048 case X86::VPMAXSDZ256rr:
10049 case X86::VPMAXSDZrr:
10050 case X86::VPMAXSQZ128rr:
10051 case X86::VPMAXSQZ256rr:
10052 case X86::VPMAXSQZrr:
10053 case X86::VPMAXSWrr:
10054 case X86::VPMAXSWYrr:
10055 case X86::VPMAXSWZ128rr:
10056 case X86::VPMAXSWZ256rr:
10057 case X86::VPMAXSWZrr:
10058 case X86::VPMAXUBrr:
10059 case X86::VPMAXUBYrr:
10060 case X86::VPMAXUBZ128rr:
10061 case X86::VPMAXUBZ256rr:
10062 case X86::VPMAXUBZrr:
10063 case X86::VPMAXUDrr:
10064 case X86::VPMAXUDYrr:
10065 case X86::VPMAXUDZ128rr:
10066 case X86::VPMAXUDZ256rr:
10067 case X86::VPMAXUDZrr:
10068 case X86::VPMAXUQZ128rr:
10069 case X86::VPMAXUQZ256rr:
10070 case X86::VPMAXUQZrr:
10071 case X86::VPMAXUWrr:
10072 case X86::VPMAXUWYrr:
10073 case X86::VPMAXUWZ128rr:
10074 case X86::VPMAXUWZ256rr:
10075 case X86::VPMAXUWZrr:
10076 case X86::VPMINSBrr:
10077 case X86::VPMINSBYrr:
10078 case X86::VPMINSBZ128rr:
10079 case X86::VPMINSBZ256rr:
10080 case X86::VPMINSBZrr:
10081 case X86::VPMINSDrr:
10082 case X86::VPMINSDYrr:
10083 case X86::VPMINSDZ128rr:
10084 case X86::VPMINSDZ256rr:
10085 case X86::VPMINSDZrr:
10086 case X86::VPMINSQZ128rr:
10087 case X86::VPMINSQZ256rr:
10088 case X86::VPMINSQZrr:
10089 case X86::VPMINSWrr:
10090 case X86::VPMINSWYrr:
10091 case X86::VPMINSWZ128rr:
10092 case X86::VPMINSWZ256rr:
10093 case X86::VPMINSWZrr:
10094 case X86::VPMINUBrr:
10095 case X86::VPMINUBYrr:
10096 case X86::VPMINUBZ128rr:
10097 case X86::VPMINUBZ256rr:
10098 case X86::VPMINUBZrr:
10099 case X86::VPMINUDrr:
10100 case X86::VPMINUDYrr:
10101 case X86::VPMINUDZ128rr:
10102 case X86::VPMINUDZ256rr:
10103 case X86::VPMINUDZrr:
10104 case X86::VPMINUQZ128rr:
10105 case X86::VPMINUQZ256rr:
10106 case X86::VPMINUQZrr:
10107 case X86::VPMINUWrr:
10108 case X86::VPMINUWYrr:
10109 case X86::VPMINUWZ128rr:
10110 case X86::VPMINUWZ256rr:
10111 case X86::VPMINUWZrr:
10112 // Normal min/max instructions are not commutative because of NaN and signed
10113 // zero semantics, but these are. Thus, there's no need to check for global
10114 // relaxed math; the instructions themselves have the properties we need.
10115 case X86::MAXCPDrr:
10116 case X86::MAXCPSrr:
10117 case X86::MAXCSDrr:
10118 case X86::MAXCSSrr:
10119 case X86::MINCPDrr:
10120 case X86::MINCPSrr:
10121 case X86::MINCSDrr:
10122 case X86::MINCSSrr:
10123 case X86::VMAXCPDrr:
10124 case X86::VMAXCPSrr:
10125 case X86::VMAXCPDYrr:
10126 case X86::VMAXCPSYrr:
10127 case X86::VMAXCPDZ128rr:
10128 case X86::VMAXCPSZ128rr:
10129 case X86::VMAXCPDZ256rr:
10130 case X86::VMAXCPSZ256rr:
10131 case X86::VMAXCPDZrr:
10132 case X86::VMAXCPSZrr:
10133 case X86::VMAXCSDrr:
10134 case X86::VMAXCSSrr:
10135 case X86::VMAXCSDZrr:
10136 case X86::VMAXCSSZrr:
10137 case X86::VMINCPDrr:
10138 case X86::VMINCPSrr:
10139 case X86::VMINCPDYrr:
10140 case X86::VMINCPSYrr:
10141 case X86::VMINCPDZ128rr:
10142 case X86::VMINCPSZ128rr:
10143 case X86::VMINCPDZ256rr:
10144 case X86::VMINCPSZ256rr:
10145 case X86::VMINCPDZrr:
10146 case X86::VMINCPSZrr:
10147 case X86::VMINCSDrr:
10148 case X86::VMINCSSrr:
10149 case X86::VMINCSDZrr:
10150 case X86::VMINCSSZrr:
10151 case X86::VMAXCPHZ128rr:
10152 case X86::VMAXCPHZ256rr:
10153 case X86::VMAXCPHZrr:
10154 case X86::VMAXCSHZrr:
10155 case X86::VMINCPHZ128rr:
10156 case X86::VMINCPHZ256rr:
10157 case X86::VMINCPHZrr:
10158 case X86::VMINCSHZrr:
10159 return true;
10160 case X86::ADDPDrr:
10161 case X86::ADDPSrr:
10162 case X86::ADDSDrr:
10163 case X86::ADDSSrr:
10164 case X86::MULPDrr:
10165 case X86::MULPSrr:
10166 case X86::MULSDrr:
10167 case X86::MULSSrr:
10168 case X86::VADDPDrr:
10169 case X86::VADDPSrr:
10170 case X86::VADDPDYrr:
10171 case X86::VADDPSYrr:
10172 case X86::VADDPDZ128rr:
10173 case X86::VADDPSZ128rr:
10174 case X86::VADDPDZ256rr:
10175 case X86::VADDPSZ256rr:
10176 case X86::VADDPDZrr:
10177 case X86::VADDPSZrr:
10178 case X86::VADDSDrr:
10179 case X86::VADDSSrr:
10180 case X86::VADDSDZrr:
10181 case X86::VADDSSZrr:
10182 case X86::VMULPDrr:
10183 case X86::VMULPSrr:
10184 case X86::VMULPDYrr:
10185 case X86::VMULPSYrr:
10186 case X86::VMULPDZ128rr:
10187 case X86::VMULPSZ128rr:
10188 case X86::VMULPDZ256rr:
10189 case X86::VMULPSZ256rr:
10190 case X86::VMULPDZrr:
10191 case X86::VMULPSZrr:
10192 case X86::VMULSDrr:
10193 case X86::VMULSSrr:
10194 case X86::VMULSDZrr:
10195 case X86::VMULSSZrr:
10196 case X86::VADDPHZ128rr:
10197 case X86::VADDPHZ256rr:
10198 case X86::VADDPHZrr:
10199 case X86::VADDSHZrr:
10200 case X86::VMULPHZ128rr:
10201 case X86::VMULPHZ256rr:
10202 case X86::VMULPHZrr:
10203 case X86::VMULSHZrr:
10206 default:
10207 return false;
10208 }
10209}
10210
10211/// If \p DescribedReg overlaps with the MOVrr instruction's destination
10212/// register then, if possible, describe the value in terms of the source
10213/// register.
/// Helper for describeLoadedValue() below, handling MOV8rr/MOV16rr/
/// MOV32rr/MOV64rr. Returns the source (sub-)register paired with an empty
/// DIExpression, or std::nullopt when the value cannot be described.
10214static std::optional<ParamLoadedValue>
  // NOTE(review): the parameter line of the signature (MI, DescribedReg, ...)
  // appears to be missing from this copy — confirm against upstream.
10216 const TargetRegisterInfo *TRI) {
10217 Register DestReg = MI.getOperand(0).getReg();
10218 Register SrcReg = MI.getOperand(1).getReg();
10219
  // Empty expression: the described value is exactly the returned register.
10220 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10221
10222 // If the described register is the destination, just return the source.
10223 if (DestReg == DescribedReg)
10224 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10225
10226 // If the described register is a sub-register of the destination register,
10227 // then pick out the source register's corresponding sub-register.
10228 if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
10229 Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
10230 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10231 }
10232
10233 // The remaining case to consider is when the described register is a
10234 // super-register of the destination register. MOV8rr and MOV16rr does not
10235 // write to any of the other bytes in the register, meaning that we'd have to
10236 // describe the value using a combination of the source register and the
10237 // non-overlapping bits in the described register, which is not currently
10238 // possible.
10239 if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
10240 !TRI->isSuperRegister(DestReg, DescribedReg))
10241 return std::nullopt;
10242
  // MOV32rr only: on x86-64 a 32-bit GPR write zero-extends into the 64-bit
  // super-register, so describing the 64-bit value by the source register
  // alone is presumed sufficient here — confirm against upstream rationale.
10243 assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
10244 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10245}
10246
10247std::optional<ParamLoadedValue>
  // Describes the value loaded into \p Reg by \p MI for call-site parameter
  // debug info: returns a machine operand plus a DIExpression computing the
  // value, or std::nullopt when it cannot be expressed.
  // NOTE(review): the signature line is missing from this copy — confirm
  // against upstream (X86InstrInfo::describeLoadedValue(const MachineInstr &,
  // Register) const).
10249 const MachineOperand *Op = nullptr;
10250 DIExpression *Expr = nullptr;
10251
  // NOTE(review): the declaration of the TRI local (a TargetRegisterInfo
  // pointer upstream) sits on a line missing from this copy.
10253
10254 switch (MI.getOpcode()) {
10255 case X86::LEA32r:
10256 case X86::LEA64r:
10257 case X86::LEA64_32r: {
10258 // We may need to describe a 64-bit parameter with a 32-bit LEA.
10259 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10260 return std::nullopt;
10261
10262 // Operand 4 could be global address. For now we do not support
10263 // such situation.
10264 if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
10265 return std::nullopt;
10266
  // LEA memory reference: operand 1 = base, 2 = scale, 3 = index, 4 = disp.
10267 const MachineOperand &Op1 = MI.getOperand(1);
10268 const MachineOperand &Op2 = MI.getOperand(3);
10269 assert(Op2.isReg() &&
10270 (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical()));
10271
10272 // Omit situations like:
10273 // %rsi = lea %rsi, 4, ...
  // i.e. give up when the LEA overwrites (or overlaps) one of its own source
  // registers, since the inputs are no longer recoverable afterwards.
10274 if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
10275 Op2.getReg() == MI.getOperand(0).getReg())
10276 return std::nullopt;
10277 else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
10278 TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
10279 (Op2.getReg() != X86::NoRegister &&
10280 TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
10281 return std::nullopt;
10282
10283 int64_t Coef = MI.getOperand(2).getImm();
10284 int64_t Offset = MI.getOperand(4).getImm();
  // NOTE(review): the declaration of the DWARF-op buffer `Ops` (a
  // SmallVector<uint64_t, 8> upstream) appears to be missing here.
10286
  // Pick the operand the expression is based on: prefer the base register
  // (or frame index) when present.
10287 if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
10288 Op = &Op1;
10289 } else if (Op1.isFI())
10290 Op = &Op1;
10291
  // base == index: base + Coef*index == (Coef+1) * reg, so a single
  // multiplication by Coef + 1 suffices.
10292 if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
10293 Ops.push_back(dwarf::DW_OP_constu);
10294 Ops.push_back(Coef + 1);
10295 Ops.push_back(dwarf::DW_OP_mul);
10296 } else {
  // Distinct base and index. If both are present, push the index register
  // via DW_OP_breg<n> (register numbers 0-31) or DW_OP_bregx (otherwise),
  // each with a zero offset operand.
10297 if (Op && Op2.getReg() != X86::NoRegister) {
10298 int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
10299 if (dwarfReg < 0)
10300 return std::nullopt;
10301 else if (dwarfReg < 32) {
10302 Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
10303 Ops.push_back(0);
10304 } else {
10305 Ops.push_back(dwarf::DW_OP_bregx);
10306 Ops.push_back(dwarfReg);
10307 Ops.push_back(0);
10308 }
10309 } else if (!Op) {
  // No base at all: the index register becomes the primary operand.
10310 assert(Op2.getReg() != X86::NoRegister);
10311 Op = &Op2;
10312 }
10313
  // Scale the index by Coef when the scale is non-trivial.
10314 if (Coef > 1) {
10315 assert(Op2.getReg() != X86::NoRegister);
10316 Ops.push_back(dwarf::DW_OP_constu);
10317 Ops.push_back(Coef);
10318 Ops.push_back(dwarf::DW_OP_mul);
10319 }
10320
  // Both a base (register or frame index) and an index: add them.
10321 if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
10322 Op2.getReg() != X86::NoRegister) {
10323 Ops.push_back(dwarf::DW_OP_plus);
10324 }
10325 }
10326
  // NOTE(review): upstream appends the displacement with
  // DIExpression::appendOffset(Ops, Offset) right here; that line appears to
  // be missing from this copy — otherwise `Offset` would be unused.
10328 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
10329
10330 return ParamLoadedValue(*Op, Expr);
10331 }
10332 case X86::MOV8ri:
10333 case X86::MOV16ri:
10334 // TODO: Handle MOV8ri and MOV16ri.
10335 return std::nullopt;
10336 case X86::MOV32ri:
10337 case X86::MOV64ri:
10338 case X86::MOV64ri32:
10339 // MOV32ri may be used for producing zero-extended 32-bit immediates in
10340 // 64-bit parameters, so we need to consider super-registers.
10341 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10342 return std::nullopt;
  // The immediate operand itself is the loaded value (Expr is still null).
10343 return ParamLoadedValue(MI.getOperand(1), Expr);
10344 case X86::MOV8rr:
10345 case X86::MOV16rr:
10346 case X86::MOV32rr:
10347 case X86::MOV64rr:
10348 return describeMOVrrLoadedValue(MI, Reg, TRI);
10349 case X86::XOR32rr: {
10350 // 64-bit parameters are zero-materialized using XOR32rr, so also consider
10351 // super-registers.
10352 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10353 return std::nullopt;
10354 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
  // NOTE(review): upstream returns an immediate-zero ParamLoadedValue here
  // when both sources match (reg ^ reg == 0); that line appears to be
  // missing from this copy — confirm before relying on this control flow.
10356 return std::nullopt;
10357 }
10358 case X86::MOVSX64rr32: {
10359 // We may need to describe the lower 32 bits of the MOVSX; for example, in
10360 // cases like this:
10361 //
10362 // $ebx = [...]
10363 // $rdi = MOVSX64rr32 $ebx
10364 // $esi = MOV32rr $edi
10365 if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
10366 return std::nullopt;
10367
10368 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10369
10370 // If the described register is the destination register we need to
10371 // sign-extend the source register from 32 bits. The other case we handle
10372 // is when the described register is the 32-bit sub-register of the
10373 // destination register, in case we just need to return the source
10374 // register.
10375 if (Reg == MI.getOperand(0).getReg())
10376 Expr = DIExpression::appendExt(Expr, 32, 64, true);
10377 else
10378 assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
10379 "Unhandled sub-register case for MOVSX64rr32");
10380
10381 return ParamLoadedValue(MI.getOperand(1), Expr);
10382 }
10383 default:
10384 assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
  // NOTE(review): upstream delegates to the base-class
  // TargetInstrInfo::describeLoadedValue(MI, Reg) here; that return line
  // appears to be missing from this copy.
10386 }
10387}
10388
10389/// This is an architecture-specific helper function of reassociateOps.
10390/// Set special operand attributes for new instructions after reassociation.
/// Integer reassociation candidates all define EFLAGS as a dead implicit
/// operand; this propagates that dead marking from the old pair of
/// instructions to the newly built pair.
  // NOTE(review): the first signature line (OldMI1 parameter) appears to be
  // missing from this copy — confirm against upstream.
10392 MachineInstr &OldMI2,
10393 MachineInstr &NewMI1,
10394 MachineInstr &NewMI2) const {
10395 // Integer instructions may define an implicit EFLAGS dest register operand.
10396 MachineOperand *OldFlagDef1 =
10397 OldMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10398 MachineOperand *OldFlagDef2 =
10399 OldMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10400
  // Either both or neither of the old instructions define EFLAGS; a mixed
  // pair would not have been matched for reassociation.
10401 assert(!OldFlagDef1 == !OldFlagDef2 &&
10402 "Unexpected instruction type for reassociation");
10403
10404 if (!OldFlagDef1 || !OldFlagDef2)
10405 return;
10406
10407 assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
10408 "Must have dead EFLAGS operand in reassociable instruction");
10409
10410 MachineOperand *NewFlagDef1 =
10411 NewMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10412 MachineOperand *NewFlagDef2 =
10413 NewMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10414
10415 assert(NewFlagDef1 && NewFlagDef2 &&
10416 "Unexpected operand in reassociable instruction");
10417
10418 // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
10419 // of this pass or other passes. The EFLAGS operands must be dead in these new
10420 // instructions because the EFLAGS operands in the original instructions must
10421 // be dead in order for reassociation to occur.
10422 NewFlagDef1->setIsDead();
10423 NewFlagDef2->setIsDead();
10424}
10425
10426std::pair<unsigned, unsigned>
  // Splits an operand target-flag word into (direct value, bitmask) halves.
  // X86 target flags have no bitmask component, so the flag is returned
  // unchanged and the mask half is always 0.
  // NOTE(review): the signature line (decomposeMachineOperandsTargetFlags)
  // appears to be missing from this copy.
10428 return std::make_pair(TF, 0u);
10429}
10430
10433 using namespace X86II;
  // Static table pairing each X86 operand target flag with the stable string
  // name it is serialized under (the "x86-..." names below); returned as a
  // non-owning ArrayRef into the function-local static storage, so the data
  // outlives the call.
10434 static const std::pair<unsigned, const char *> TargetFlags[] = {
10435 {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
10436 {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
10437 {MO_GOT, "x86-got"},
10438 {MO_GOTOFF, "x86-gotoff"},
10439 {MO_GOTPCREL, "x86-gotpcrel"},
10440 {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
10441 {MO_PLT, "x86-plt"},
10442 {MO_TLSGD, "x86-tlsgd"},
10443 {MO_TLSLD, "x86-tlsld"},
10444 {MO_TLSLDM, "x86-tlsldm"},
10445 {MO_GOTTPOFF, "x86-gottpoff"},
10446 {MO_INDNTPOFF, "x86-indntpoff"},
10447 {MO_TPOFF, "x86-tpoff"},
10448 {MO_DTPOFF, "x86-dtpoff"},
10449 {MO_NTPOFF, "x86-ntpoff"},
10450 {MO_GOTNTPOFF, "x86-gotntpoff"},
10451 {MO_DLLIMPORT, "x86-dllimport"},
10452 {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
10453 {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
10454 {MO_TLVP, "x86-tlvp"},
10455 {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
10456 {MO_SECREL, "x86-secrel"},
10457 {MO_COFFSTUB, "x86-coffstub"}};
10458 return ArrayRef(TargetFlags);
10459}
10460
10461/// Constants defining how certain sequences should be outlined.
10462///
10463/// \p MachineOutlinerDefault implies that the function is called with a call
10464/// instruction, and a return must be emitted for the outlined function frame.
10465///
10466/// That is,
10467///
10468/// I1 OUTLINED_FUNCTION:
10469/// I2 --> call OUTLINED_FUNCTION I1
10470/// I3 I2
10471/// I3
10472/// ret
10473///
10474/// * Call construction overhead: 1 (call instruction)
10475/// * Frame construction overhead: 1 (return instruction)
10476///
10477/// \p MachineOutlinerTailCall implies that the function is being tail called.
10478/// A jump is emitted instead of a call, and the return is already present in
10479/// the outlined sequence. That is,
10480///
10481/// I1 OUTLINED_FUNCTION:
10482/// I2 --> jmp OUTLINED_FUNCTION I1
10483/// ret I2
10484/// ret
10485///
10486/// * Call construction overhead: 1 (jump instruction)
10487/// * Frame construction overhead: 0 (don't need to return)
10488///
10490
10491std::optional<std::unique_ptr<outliner::OutlinedFunction>>
  // Decides whether the repeated sequence is profitable/legal to outline and,
  // if so, builds the OutlinedFunction descriptor (frame kind + cost model).
  // Returns std::nullopt to reject the candidate set.
  // NOTE(review): the signature line (getOutliningCandidateInfo) appears to
  // be missing from this copy.
10493 const MachineModuleInfo &MMI,
10494 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10495 unsigned MinRepeats) const {
10496 unsigned SequenceSize = 0;
10497 for (auto &MI : RepeatedSequenceLocs[0]) {
10498 // FIXME: x86 doesn't implement getInstSizeInBytes, so
10499 // we can't tell the cost. Just assume each instruction
10500 // is one byte.
10501 if (MI.isDebugInstr() || MI.isKill())
10502 continue;
10503 SequenceSize += 1;
10504 }
10505
10506 // We check to see if CFI Instructions are present, and if they are
10507 // we find the number of CFI Instructions in the candidates.
10508 unsigned CFICount = 0;
10509 for (auto &I : RepeatedSequenceLocs[0]) {
10510 if (I.isCFIInstruction())
10511 CFICount++;
10512 }
10513
10514 // We compare the number of found CFI Instructions to the number of CFI
10515 // instructions in the parent function for each candidate. We must check this
10516 // since if we outline one of the CFI instructions in a function, we have to
10517 // outline them all for correctness. If we do not, the address offsets will be
10518 // incorrect between the two sections of the program.
10519 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10520 std::vector<MCCFIInstruction> CFIInstructions =
10521 C.getMF()->getFrameInstructions();
10522
10523 if (CFICount > 0 && CFICount != CFIInstructions.size())
10524 return std::nullopt;
10525 }
10526
  // Sequence ends in a terminator: outline as a tail call (jmp), which needs
  // no frame-construction bytes and costs 1 byte per call site.
10527 // FIXME: Use real size in bytes for call and ret instructions.
10528 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10529 for (outliner::Candidate &C : RepeatedSequenceLocs)
10530 C.setCallInfo(MachineOutlinerTailCall, 1);
10531
10532 return std::make_unique<outliner::OutlinedFunction>(
10533 RepeatedSequenceLocs, SequenceSize,
10534 0, // Number of bytes to emit frame.
10535 MachineOutlinerTailCall // Type of frame.
10536 );
10537 }
10538
  // A non-tail-call outlined function would split the CFI sequence, so any
  // remaining CFI instructions make the candidate illegal.
10539 if (CFICount > 0)
10540 return std::nullopt;
10541
  // Default frame: call at each site (1 byte assumed), plus 1 byte for the
  // ret emitted in the outlined function.
10542 for (outliner::Candidate &C : RepeatedSequenceLocs)
10543 C.setCallInfo(MachineOutlinerDefault, 1);
10544
10545 return std::make_unique<outliner::OutlinedFunction>(
10546 RepeatedSequenceLocs, SequenceSize, 1, MachineOutlinerDefault);
10547}
10548
10550 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10551 const Function &F = MF.getFunction();
10552
10553 // Does the function use a red zone? If it does, then we can't risk messing
10554 // with the stack.
10555 if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
10556 // It could have a red zone. If it does, then we don't want to touch it.
10558 if (!X86FI || X86FI->getUsesRedZone())
10559 return false;
10560 }
10561
10562 // If we *don't* want to outline from things that could potentially be deduped
10563 // then return false.
10564 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10565 return false;
10566
10567 // This function is viable for outlining, so return true.
10568 return true;
10569}
10570
  // Per-instruction legality classification for the machine outliner.
  // NOTE(review): the signature lines (getOutliningTypeImpl) and every
  // `return outliner::InstrType::...` line appear to be missing from this
  // copy; each guard below presumably returns Illegal and the terminator /
  // fall-through paths Legal — confirm against upstream.
10574 unsigned Flags) const {
10575 MachineInstr &MI = *MIT;
10576
10577 // Is this a terminator for a basic block?
10578 if (MI.isTerminator())
10579 // TargetInstrInfo::getOutliningType has already filtered out anything
10580 // that would break this, so we can allow it here.
10582
10583 // Don't outline anything that modifies or reads from the stack pointer.
10584 //
10585 // FIXME: There are instructions which are being manually built without
10586 // explicit uses/defs so we also have to check the MCInstrDesc. We should be
10587 // able to remove the extra checks once those are fixed up. For example,
10588 // sometimes we might get something like %rax = POP64r 1. This won't be
10589 // caught by modifiesRegister or readsRegister even though the instruction
10590 // really ought to be formed so that modifiesRegister/readsRegister would
10591 // catch it.
10592 if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
10593 MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
10594 MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
10596
10597 // Outlined calls change the instruction pointer, so don't read from it.
10598 if (MI.readsRegister(X86::RIP, &RI) ||
10599 MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
10600 MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
10602
10603 // Don't outline CFI instructions.
10604 if (MI.isCFIInstruction())
10606
10608}
10609
  // Emits the frame for an outlined function body: tail-call frames already
  // end in a return, so only the default frame kind needs a RET64 appended.
  // NOTE(review): the first signature lines (buildOutlinedFrame with MBB/MF
  // parameters) appear to be missing from this copy.
10612 const outliner::OutlinedFunction &OF) const {
10613 // If we're a tail call, we already have a return, so don't do anything.
10614 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10615 return;
10616
10617 // We're a normal call, so our sequence doesn't have a return instruction.
10618 // Add it in.
10619 MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
10620 MBB.insert(MBB.end(), retq);
10621}
10622
  // Inserts the call to an outlined function at a call site: a TAILJMPd64 for
  // tail-call candidates, a CALL64pcrel32 otherwise. The outlined function is
  // looked up in the module by its MachineFunction name.
  // NOTE(review): the signature lines (insertOutlinedCall) appear to be
  // missing from this copy.
10626 // Is it a tail call?
10627 if (C.CallConstructionID == MachineOutlinerTailCall) {
10628 // Yes, just insert a JMP.
10629 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
10630 .addGlobalAddress(M.getNamedValue(MF.getName())));
10631 } else {
10632 // No, insert a call.
10633 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
10634 .addGlobalAddress(M.getNamedValue(MF.getName())));
10635 }
10636
  // Return the iterator at the inserted instruction so callers can continue
  // from it.
10637 return It;
10638}
10639
  // Emits an instruction sequence that zeroes \p Reg before \p Iter, picking
  // the cheapest idiom per register class. When AllowSideEffects is false the
  // sequence must not clobber EFLAGS. Silently does nothing for register
  // classes the required ISA extension is missing for.
  // NOTE(review): the first signature lines (buildClearRegister with
  // Reg/MBB/Iter parameters) appear to be missing from this copy.
10642 DebugLoc &DL,
10643 bool AllowSideEffects) const {
10644 const MachineFunction &MF = *MBB.getParent();
10645 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  // NOTE(review): the declaration of the TRI local used below appears to be
  // missing from this copy — confirm against upstream.
10647
10648 if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
10649 // FIXME: Should we ignore MMX registers?
10650 return;
10651
10652 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10653 // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
10654 // upper bits of a 64-bit register automagically.
10655 Reg = getX86SubSuperRegister(Reg, 32);
10656
10657 if (!AllowSideEffects)
10658 // XOR affects flags, so use a MOV instead.
10659 BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
10660 else
10661 BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
10662 .addReg(Reg, RegState::Undef)
10663 .addReg(Reg, RegState::Undef);
10664 } else if (X86::VR128RegClass.contains(Reg)) {
10665 // XMM#
10666 if (!ST.hasSSE1())
10667 return;
10668
10669 BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
10670 } else if (X86::VR256RegClass.contains(Reg)) {
10671 // YMM#
10672 if (!ST.hasAVX())
10673 return;
10674
10675 BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
10676 } else if (X86::VR512RegClass.contains(Reg)) {
10677 // ZMM#
10678 if (!ST.hasAVX512())
10679 return;
10680
10681 BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
10682 } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
10683 X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
10684 X86::VK16RegClass.contains(Reg)) {
  // AVX-512 mask registers; gated on VLX here — presumably matching the
  // contexts this hook is used in upstream. Confirm the gate if reused.
10685 if (!ST.hasVLX())
10686 return;
10687
  // KSET0Q (64-bit clear) needs BWI; otherwise fall back to KSET0W.
10688 unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
10689 BuildMI(MBB, Iter, DL, get(Op), Reg);
10690 }
10691}
10692
  // Collects machine-combiner patterns rooted at \p Root. X86-specific part:
  // on targets without fast VPDPWSSD, offer rewriting the dot-product
  // instruction into VPMADDWD + VPADDD (see genAlternativeDpCodeSequence).
  // NOTE(review): the signature line (getMachineCombinerPatterns), the
  // `Patterns.push_back(...)` lines inside both if-bodies, and the leading
  // part of the final base-class call appear to be missing from this copy —
  // confirm against upstream.
10694 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
10695 bool DoRegPressureReduce) const {
10696 unsigned Opc = Root.getOpcode();
10697 switch (Opc) {
10698 case X86::VPDPWSSDrr:
10699 case X86::VPDPWSSDrm:
10700 case X86::VPDPWSSDYrr:
10701 case X86::VPDPWSSDYrm: {
10702 if (!Subtarget.hasFastDPWSSD()) {
10704 return true;
10705 }
10706 break;
10707 }
10708 case X86::VPDPWSSDZ128rr:
10709 case X86::VPDPWSSDZ128rm:
10710 case X86::VPDPWSSDZ256rr:
10711 case X86::VPDPWSSDZ256rm:
10712 case X86::VPDPWSSDZrr:
10713 case X86::VPDPWSSDZrm: {
  // EVEX forms: the replacement VPMADDWD/VPADDD also need BWI to exist in
  // the matching vector widths.
10714 if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
10716 return true;
10717 }
10718 break;
10719 }
10720 }
  // Fall through to the generic (reassociation) pattern finder.
10722 Patterns, DoRegPressureReduce);
10723}
10724
10725static void
  // Expands one VPDPWSSD (multiply-accumulate of 16-bit pairs) into the
  // equivalent VPMADDWD + VPADDD pair, for targets where the fused form is
  // slow. The new instructions go into InsInstrs, the root into DelInstrs;
  // InstrIdxForVirtReg maps the new virtual register to its InsInstrs index.
  // NOTE(review): the signature lines (genAlternativeDpCodeSequence
  // parameters) and the declaration of RegInfo appear to be missing from
  // this copy — confirm against upstream.
10729 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
10730 MachineFunction *MF = Root.getMF();
10732
10733 unsigned Opc = Root.getOpcode();
10734 unsigned AddOpc = 0;
10735 unsigned MaddOpc = 0;
10736 switch (Opc) {
10737 default:
  // Callers only pass the opcodes matched in getMachineCombinerPatterns.
10738 assert(false && "It should not reach here");
10739 break;
10740 // vpdpwssd xmm2,xmm3,xmm1
10741 // -->
10742 // vpmaddwd xmm3,xmm3,xmm1
10743 // vpaddd xmm2,xmm2,xmm3
10744 case X86::VPDPWSSDrr:
10745 MaddOpc = X86::VPMADDWDrr;
10746 AddOpc = X86::VPADDDrr;
10747 break;
10748 case X86::VPDPWSSDrm:
10749 MaddOpc = X86::VPMADDWDrm;
10750 AddOpc = X86::VPADDDrr;
10751 break;
10752 case X86::VPDPWSSDZ128rr:
10753 MaddOpc = X86::VPMADDWDZ128rr;
10754 AddOpc = X86::VPADDDZ128rr;
10755 break;
10756 case X86::VPDPWSSDZ128rm:
10757 MaddOpc = X86::VPMADDWDZ128rm;
10758 AddOpc = X86::VPADDDZ128rr;
10759 break;
10760 // vpdpwssd ymm2,ymm3,ymm1
10761 // -->
10762 // vpmaddwd ymm3,ymm3,ymm1
10763 // vpaddd ymm2,ymm2,ymm3
10764 case X86::VPDPWSSDYrr:
10765 MaddOpc = X86::VPMADDWDYrr;
10766 AddOpc = X86::VPADDDYrr;
10767 break;
10768 case X86::VPDPWSSDYrm:
10769 MaddOpc = X86::VPMADDWDYrm;
10770 AddOpc = X86::VPADDDYrr;
10771 break;
10772 case X86::VPDPWSSDZ256rr:
10773 MaddOpc = X86::VPMADDWDZ256rr;
10774 AddOpc = X86::VPADDDZ256rr;
10775 break;
10776 case X86::VPDPWSSDZ256rm:
10777 MaddOpc = X86::VPMADDWDZ256rm;
10778 AddOpc = X86::VPADDDZ256rr;
10779 break;
10780 // vpdpwssd zmm2,zmm3,zmm1
10781 // -->
10782 // vpmaddwd zmm3,zmm3,zmm1
10783 // vpaddd zmm2,zmm2,zmm3
10784 case X86::VPDPWSSDZrr:
10785 MaddOpc = X86::VPMADDWDZrr;
10786 AddOpc = X86::VPADDDZrr;
10787 break;
10788 case X86::VPDPWSSDZrm:
10789 MaddOpc = X86::VPMADDWDZrm;
10790 AddOpc = X86::VPADDDZrr;
10791 break;
10792 }
10793 // Create vpmaddwd.
  // Clone the root, retarget it to the madd opcode, drop the tied
  // accumulator operand (operand 1), and give it a fresh destination.
10794 const TargetRegisterClass *RC =
10795 RegInfo.getRegClass(Root.getOperand(0).getReg());
10796 Register NewReg = RegInfo.createVirtualRegister(RC);
10797 MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
10798 Madd->setDesc(TII.get(MaddOpc));
10799 Madd->untieRegOperand(1);
10800 Madd->removeOperand(1);
10801 Madd->getOperand(0).setReg(NewReg);
  // Index 0 = position of Madd in InsInstrs below.
10802 InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
10803 // Create vpaddd.
  // Add the old accumulator (operand 1 of the root) to the madd result,
  // writing the root's original destination.
10804 Register DstReg = Root.getOperand(0).getReg();
10805 bool IsKill = Root.getOperand(1).isKill();
10806 MachineInstr *Add =
10807 BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
10808 .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
10809 .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
10810 InsInstrs.push_back(Madd);
10811 InsInstrs.push_back(Add);
10812 DelInstrs.push_back(&Root);
10813}
10814
  // Dispatches a matched machine-combiner pattern to its code generator:
  // the X86 dot-product expansion for its dedicated pattern, the generic
  // reassociation path for everything else.
  // NOTE(review): the signature lines (genAlternativeCodeSequence with the
  // InsInstrs/DelInstrs parameters), the leading part of the base-class call
  // in the default case, and the case label before the DP expansion
  // (presumably the DPWSSD pattern) appear to be missing from this copy —
  // confirm against upstream.
10816 MachineInstr &Root, unsigned Pattern,
10819 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
10820 switch (Pattern) {
10821 default:
10822 // Reassociate instructions.
10824 DelInstrs, InstrIdxForVirtReg);
10825 return;
10827 genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
10828 InstrIdxForVirtReg);
10829 return;
10830 }
10831}
10832
10833// See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
// Appends the full 5-operand X86 memory reference for frame index \p FI to
// \p Ops, by way of an X86AddressMode.
  // NOTE(review): the signature line (getFrameIndexOperands with the Ops
  // parameter) and the X86AddressMode local's declaration/setup lines appear
  // to be missing from this copy — confirm against upstream.
10835 int FI) const {
10838 M.Base.FrameIndex = FI;
10839 M.getFullAddress(Ops);
10840}
10841
  // Builds a PREFETCHIT1 (code prefetch) of global \p GV, RIP-relative, and
  // inserts it before \p InsertBefore; returns the created instruction.
  // NOTE(review): the return-type line and the MIB.addMemOperand(...) lines
  // appear to be missing from this copy — confirm against upstream.
10843X86InstrInfo::insertCodePrefetchInstr(MachineBasicBlock &MBB,
10844 MachineBasicBlock::iterator InsertBefore,
10845 const GlobalValue *GV) const {
10846 MachineFunction &MF = *MBB.getParent();
  // Use the previous debug location when inserting at the block end (no
  // instruction to borrow a location from).
10847 MachineInstr *PrefetchInstr = MF.CreateMachineInstr(
10848 get(X86::PREFETCHIT1),
10849 InsertBefore == MBB.instr_end() ? MBB.findPrevDebugLoc(InsertBefore)
10850 : InsertBefore->getDebugLoc(),
10851 true);
10852 MachineInstrBuilder MIB(MF, PrefetchInstr);
10855 /*base_alignment=*/llvm::Align(1)));
  // Memory-operand quintet: base = RIP, scale = 1, no index register,
  // displacement = the global's address, no segment register.
10856 MIB.addReg(X86::RIP).addImm(1).addReg(X86::NoRegister);
10857 MIB.addGlobalAddress(GV);
10858 MIB.addReg(X86::NoRegister);
10859 MBB.insert(InsertBefore, PrefetchInstr);
10860 return PrefetchInstr;
10861}
10862
10863#define GET_INSTRINFO_HELPERS
10864#include "X86GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
return SDValue()
static bool isFrameStoreOpcode(int Opcode)
static bool isFrameLoadOpcode(int Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerDefault
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
static SDValue isNOT(SDValue V, SelectionDAG &DAG)
static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
Expand a single-def pseudo instruction to a two-addr instruction with two undef reads of the register...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
Provides some synthesis utilities to produce sequences of values.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
#define LLVM_DEBUG(...)
Definition Debug.h:114
#define FROM_TO(FROM, TO)
cl::opt< bool > X86EnableAPXForRelocation
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static bool isLEA(unsigned Opcode)
static void addOperands(MachineInstrBuilder &MIB, ArrayRef< MachineOperand > MOs, int PtrOffset=0)
static std::optional< ParamLoadedValue > describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetRegisterInfo *TRI)
If DescribedReg overlaps with the MOVrr instruction's destination register then, if possible,...
static cl::opt< unsigned > PartialRegUpdateClearance("partial-reg-update-clearance", cl::desc("Clearance between two register writes " "for inserting XOR to avoid partial " "register update"), cl::init(64), cl::Hidden)
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI)
static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg, const X86Subtarget &Subtarget)
static bool isConvertibleLEA(MachineInstr *MI)
static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, const X86Subtarget &Subtarget)
static bool isAMXOpcode(unsigned Opc)
static int getJumpTableIndexFromReg(const MachineRegisterInfo &MRI, Register Reg)
static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII)
static int getJumpTableIndexFromAddr(const MachineInstr &MI)
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth, unsigned NewWidth, unsigned *pNewMask=nullptr)
static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne)
static unsigned getNewOpcFromTable(ArrayRef< X86TableEntry > Table, unsigned Opc)
static unsigned getStoreRegOpcode(Register SrcReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
#define FOLD_BROADCAST(SIZE)
static cl::opt< unsigned > UndefRegClearance("undef-reg-clearance", cl::desc("How many idle instructions we would like before " "certain undef register reads"), cl::init(128), cl::Hidden)
#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64)
static bool isTruncatedShiftCountForLEA(unsigned ShAmt)
Check whether the given shift count is appropriate can be represented by a LEA instruction.
static cl::opt< bool > ReMatPICStubLoad("remat-pic-stub-load", cl::desc("Re-materialize load from stub in PIC mode"), cl::init(false), cl::Hidden)
static SmallVector< MachineMemOperand *, 2 > extractLoadMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static MachineInstr * fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII)
static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx)
static bool canConvert2Copy(unsigned Opc)
static cl::opt< bool > NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions"), cl::Hidden)
static bool expandNOVLXStore(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx)
static bool isX87Reg(Register Reg)
Return true if the Reg is X87 register.
static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, Register Reg)
Expand a single-def pseudo instruction to a two-addr instruction with two k0 reads.
#define VPERM_CASES_BROADCAST(Suffix)
static std::pair< X86::CondCode, unsigned > isUseDefConvertible(const MachineInstr &MI)
Check whether the use can be converted to remove a comparison against zero.
static bool findRedundantFlagInstr(MachineInstr &CmpInstr, MachineInstr &CmpValDefInstr, const MachineRegisterInfo *MRI, MachineInstr **AndInstr, const TargetRegisterInfo *TRI, const X86Subtarget &ST, bool &NoSignFlag, bool &ClearsOverflowFlag)
static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
static unsigned getLoadRegOpcode(Register DestReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold=false)
static MachineInstr * makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI)
#define GET_ND_IF_ENABLED(OPC)
static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI, const TargetInstrInfo &TII, bool HasAVX)
static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, bool ForLoadFold=false)
Return true for all instructions that only update the first 32 or 64-bits of the destination register...
#define CASE_NF(OP)
static const uint16_t * lookupAVX512(unsigned opcode, unsigned domain, ArrayRef< uint16_t[4]> Table)
static unsigned getLoadStoreRegOpcode(Register Reg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI, bool Load)
#define VPERM_CASES(Suffix)
#define FROM_TO_SIZE(A, B, S)
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, bool &ClearsOverflowFlag)
Check whether the definition can be converted to remove a comparison against zero.
static MachineInstr * fuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII, int PtrOffset=0)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode)
static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static unsigned convertALUrr2ALUri(unsigned Opc, bool HasNDDI)
Convert an ALUrr opcode to corresponding ALUri opcode.
static MachineBasicBlock * getFallThroughMBB(MachineBasicBlock *MBB, MachineBasicBlock *TBB)
static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineInstr &UserMI, const MachineFunction &MF)
Check if LoadMI is a partial register load that we can't fold into MI because the latter uses content...
static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI)
static bool isHReg(Register Reg)
Test if the given register is a physical h register.
static cl::opt< bool > PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" " fuse, but the X86 backend currently can't"), cl::Hidden)
static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx)
static void genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
#define CASE_ND(OP)
static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
This determines which of three possible cases of a three source commute the source indexes correspond...
static unsigned getTruncatedShiftCount(const MachineInstr &MI, unsigned ShiftAmtOperandIdx)
Check whether the shift count for a machine operand is non-zero.
static SmallVector< MachineMemOperand *, 2 > extractStoreMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static unsigned getBroadcastOpcode(const X86FoldTableEntry *I, const TargetRegisterClass *RC, const X86Subtarget &STI)
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI)
Return true if the register is a PIC base, i.e., defined by X86::MOVPC32r.
static bool isCommutableVPERMV3Instruction(unsigned Opcode)
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for a specific bit width.
Definition APInt.h:207
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DWARF expression.
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
static LLVM_ABI DIExpression * appendExt(const DIExpression *Expr, unsigned FromSize, unsigned ToSize, bool Signed)
Append a zero- or sign-extension to Expr.
A debug info location.
Definition DebugLoc.h:123
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
LiveInterval - This class represents the liveness of a register, or stack slot.
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
A set of physical registers with utility functions to track liveness when walking backward/forward th...
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
static LocationSize precise(uint64_t Value)
bool usesWindowsCFI() const
Definition MCAsmInfo.h:665
static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int64_t Adjustment, SMLoc Loc={})
.cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but Offset is a relative value that is added/subt...
Definition MCDwarf.h:599
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
void setOpcode(unsigned Op)
Definition MCInst.h:201
Describe properties that are true of each instruction in the target description file.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
Set of metadata that should be preserved when using BuildMI().
SimpleValueType SimpleTy
MachineInstrBundleIterator< const MachineInstr > const_iterator
void push_back(MachineInstr *MI)
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
This class is a data container for one entry in a MachineConstantPool.
union llvm::MachineConstantPoolEntry::(anonymous union) Val
The constant itself.
bool isMachineConstantPoolEntry() const
isMachineConstantPoolEntry - Return true if the MachineConstantPoolEntry is indeed a target specific ...
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
bool needsFrameMoves() const
True if this function needs frame moves for debug or exceptions.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
mop_iterator operands_begin()
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isImplicitDef() const
const MachineBasicBlock * getParent() const
void dropDebugNumber()
Drop any variable location debugging information associated with this instruction.
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Retuns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI unsigned getNumExplicitDefs() const
Returns the number of non-implicit definitions.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
void setFlag(MIFlag Flag)
Set a MI flag.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
unsigned getNumDefs() const
Returns the total number of definitions.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void setDebugLoc(DebugLoc DL)
Replace the current source location information with the given one.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImplicit(bool Val=true)
void setImm(int64_t immVal)
int64_t getImm() const
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
bool isCPI() const
isCPI - Tests if this is a MO_ConstantPoolIndex operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
bool isJTI() const
isJTI - Tests if this is a MO_JumpTableIndex operand.
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateCPI(unsigned Idx, int Offset, unsigned TargetFlags=0)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< def_instr_iterator > def_instructions(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
MachineFunction & getMachineFunction() const
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
virtual const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum) const
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
virtual bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const
Returns true iff the routine could find two commutable operands in the given machine instruction.
virtual bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const
Return true when \P Inst has reassociable operands in the same \P MBB.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
bool isPositionIndependent() const
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
virtual const TargetFrameLowering * getFrameLowering() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getZero()
Definition TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
static LLVM_ABI Type * getFP128Ty(LLVMContext &C)
Definition Type.cpp:295
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:291
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:288
SlotIndex def
The index of the defining instruction.
LLVM Value Representation.
Definition Value.h:75
MCRegister getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition VirtRegMap.h:91
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst, MachineInstr::MIFlag Flag=MachineInstr::NoFlags) const
Wraps up getting a CFI index and building a MachineInstr for it.
void getFrameIndexOperands(SmallVectorImpl< MachineOperand > &Ops, int FI) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
Check if there exists an earlier instruction that operates on the same source operands and sets eflag...
bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
Overrides the isSchedulingBoundary from Codegen/TargetInstrInfo.cpp to make it capable of identifying...
MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, outliner::Candidate &C) const override
void replaceBranchWithTailCall(MachineBasicBlock &MBB, SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex=nullptr) const override
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override
Returns true iff the routine could find two commutable operands in the given machine instruction.
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
X86InstrInfo(const X86Subtarget &STI)
static bool isDataInvariantLoad(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value l...
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override
bool isFunctionSafeToOutlineFrom(MachineFunction &MF, bool OutlineFromLinkOnceODRs) const override
const X86RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
bool hasCommutePreference(MachineInstr &MI, bool &Commute) const override
Returns true if we have preference on the operands order in MI, the commute decision is returned in C...
bool hasLiveCondCodeDef(MachineInstr &MI) const
True if MI has a condition code def, e.g.
std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const override
bool canMakeTailCallConditional(SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl< MachineInstr * > &NewMIs) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
convertToThreeAddress - This method must be implemented by targets that set the M_CONVERTIBLE_TO_3_AD...
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool expandPostRAPseudo(MachineInstr &MI) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isAssociativeAndCommutative(const MachineInstr &Inst, bool Invert) const override
MCInst getNop() const override
Return the noop instruction to use for a noop.
outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI, MachineBasicBlock::iterator &MIT, unsigned Flags) const override
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override
This is a used by the pre-regalloc scheduler to determine (in conjunction with areLoadsFromSameBasePt...
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isStoreToStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum) const override
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
bool isUnconditionalTailCall(const MachineInstr &MI) const override
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
std::optional< std::unique_ptr< outliner::OutlinedFunction > > getOutliningCandidateInfo(const MachineModuleInfo &MMI, std::vector< outliner::Candidate > &RepeatedSequenceLocs, unsigned MinRepeats) const override
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, Register &NewSrc, unsigned &NewSrcSubReg, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV, LiveIntervals *LIS) const
Given an operand within a MachineInstr, insert preceding code to put it into the right format for a p...
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isLoadFromStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
Fold a load or store of the specified stack slot into the specified machine instruction for the speci...
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const
int getSPAdjust(const MachineInstr &MI) const override
getSPAdjust - This returns the stack pointer adjustment made by this instruction.
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isReMaterializableImpl(const MachineInstr &MI) const override
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the the global base register value.
int getJumpTableIndex(const MachineInstr &MI) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const override
This is an architecture-specific helper function of reassociateOps.
std::pair< uint16_t, uint16_t > getExecutionDomain(const MachineInstr &MI) const override
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
isCoalescableExtInstr - Return true if the instruction is a "coalescable" extension instruction.
void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Opc, Register Reg, int FrameIdx, bool isKill=false) const
void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg) const override
When getMachineCombinerPatterns() finds potential patterns, this function generates the instructions ...
bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify=false) const override
static bool isDataInvariant(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value o...
unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before certain undef register...
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
void buildClearRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator Iter, DebugLoc &DL, bool AllowSideEffects=true) const override
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
int64_t getFrameAdjustment(const MachineInstr &I) const
Returns the stack pointer adjustment that happens inside the frame setup..destroy sequence (e....
bool hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr &DefMI, unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const override
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override
uint16_t getExecutionDomainCustom(const MachineInstr &MI) const
bool isHighLatencyDef(int opc) const override
void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const override
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const override
foldImmediate - 'Reg' is known to be defined by a move immediate instruction, try to fold the immedia...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const
Returns an adjusted FMA opcode that must be used in FMA instruction that performs the same computatio...
bool preservesZeroValueInReg(const MachineInstr *MI, const Register NullValueReg, const TargetRegisterInfo *TRI) const override
unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before a partial register upd...
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
const TargetRegisterClass * constrainRegClassToNonRex2(const TargetRegisterClass *RC) const
bool hasAVX512() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
const X86FrameLowering * getFrameLowering() const override
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
X86II - This namespace holds all of the target specific flags that instruction info tracks.
bool isKMergeMasked(uint64_t TSFlags)
bool hasNewDataDest(uint64_t TSFlags)
@ MO_GOT_ABSOLUTE_ADDRESS
MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a relocation of: SYMBOL_LABEL + [.
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ EVEX
EVEX - Specifies that this instruction use EVEX form which provides syntax support up to 32 512-bit r...
@ SSEDomainShift
Execution domain for SSE instructions.
bool canUseApxExtendedReg(const MCInstrDesc &Desc)
bool isPseudo(uint64_t TSFlags)
bool isKMasked(uint64_t TSFlags)
int getMemoryOperandNo(uint64_t TSFlags)
unsigned getOperandBias(const MCInstrDesc &Desc)
Compute whether all of the def operands are repeated in the uses and therefore should be skipped.
Define some predicates that are used for node matching.
CondCode getCondFromBranch(const MachineInstr &MI)
CondCode getCondFromCFCMov(const MachineInstr &MI)
@ LAST_VALID_COND
Definition X86BaseInfo.h:94
CondCode getCondFromMI(const MachineInstr &MI)
Return the condition code of the instruction.
int getFirstAddrOperandIdx(const MachineInstr &MI)
Return the index of the instruction's first address operand, if it has a memory reference,...
@ AddrNumOperands
Definition X86BaseInfo.h:36
unsigned getSwappedVCMPImm(unsigned Imm)
Get the VCMP immediate if the opcodes are swapped.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
unsigned getSwappedVPCOMImm(unsigned Imm)
Get the VPCOM immediate if the opcodes are swapped.
bool isX87Instruction(MachineInstr &MI)
Check if the instruction is X87 instruction.
unsigned getNonNDVariant(unsigned Opc)
unsigned getVPCMPImmForCond(ISD::CondCode CC)
Get the VPCMP immediate for the given condition.
std::pair< CondCode, bool > getX86ConditionCode(CmpInst::Predicate Predicate)
Return a pair of condition code for the given predicate and whether the instruction operands should b...
CondCode getCondFromSETCC(const MachineInstr &MI)
unsigned getSwappedVPCMPImm(unsigned Imm)
Get the VPCMP immediate if the opcodes are swapped.
CondCode getCondFromCCMP(const MachineInstr &MI)
int getCCMPCondFlagsFromCondCode(CondCode CC)
int getCondSrcNoFromDesc(const MCInstrDesc &MCID)
Return the source operand # for condition code by MCID.
const Constant * getConstantFromPool(const MachineInstr &MI, unsigned OpNo)
Find any constant pool entry associated with a specific instruction operand.
unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand=false, bool HasNDD=false)
Return a cmov opcode for the given register size in bytes, and operand type.
unsigned getNFVariant(unsigned Opc)
unsigned getVectorRegisterWidth(const MCOperandInfo &Info)
Get the width of the vector register operand.
CondCode getCondFromCMov(const MachineInstr &MI)
initializer< Ty > init(const Ty &Val)
InstrType
Represents how an instruction should be mapped by the outliner.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
static bool isAddMemInstrWithRelocation(const MachineInstr &MI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
static bool isMem(const MachineInstr &MI, unsigned Op)
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition Alignment.h:134
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
static const MachineInstrBuilder & addRegReg(const MachineInstrBuilder &MIB, Register Reg1, bool isKill1, unsigned SubReg1, Register Reg2, bool isKill2, unsigned SubReg2)
addRegReg - This function is used to add a memory reference of the form: [Reg + Reg].
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
bool isNonFoldableWithSameMask(unsigned RegOp)
const X86FoldTableEntry * lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
const X86InstrFMA3Group * getFMA3Group(unsigned Opcode, uint64_t TSFlags)
Returns a reference to a group of FMA3 opcodes to where the given Opcode is included.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
const X86FoldTableEntry * lookupTwoAddrFoldTable(unsigned RegOp)
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1970
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
RegState getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
static bool isMemInstrWithGOTPCREL(const MachineInstr &MI)
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2052
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
const X86FoldTableEntry * lookupUnfoldTable(unsigned MemOp)
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool matchBroadcastSize(const X86FoldTableEntry &Entry, unsigned BroadcastBits)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const X86FoldTableEntry * lookupFoldTable(unsigned RegOp, unsigned OpNum)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
constexpr RegState getUndefRegState(bool B)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
This represents a simple continuous liveness interval for a value.
std::vector< MachineInstr * > Kills
Kills - List of MachineInstruction's which are the last use of this virtual register (kill it) in the...
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
X86AddressMode - This struct holds a generalized full x86 address mode.
enum llvm::X86AddressMode::@202116273335065351270200035056227005202106004277 BaseType
This class is used to group {132, 213, 231} forms of FMA opcodes together.
unsigned get213Opcode() const
Returns the 213 form of FMA opcode.
unsigned get231Opcode() const
Returns the 231 form of FMA opcode.
bool isIntrinsic() const
Returns true iff the group of FMA opcodes holds intrinsic opcodes.
unsigned get132Opcode() const
Returns the 132 form of FMA opcode.
An individual sequence of instructions to be replaced with a call to an outlined function.
The information necessary to create an outlined function for some class of candidate.