LLVM 23.0.0git
X86InstrInfo.cpp
Go to the documentation of this file.
1//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the X86 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Module.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCInst.h"
43#include "llvm/Support/Debug.h"
48#include <atomic>
49#include <optional>
50
51using namespace llvm;
52
53#define DEBUG_TYPE "x86-instr-info"
54
55#define GET_INSTRINFO_CTOR_DTOR
56#include "X86GenInstrInfo.inc"
57
59
60static cl::opt<bool>
61 NoFusing("disable-spill-fusing",
62 cl::desc("Disable fusing of spill code into instructions"),
64static cl::opt<bool>
65 PrintFailedFusing("print-failed-fuse-candidates",
66 cl::desc("Print instructions that the allocator wants to"
67 " fuse, but the X86 backend currently can't"),
69static cl::opt<bool>
70 ReMatPICStubLoad("remat-pic-stub-load",
71 cl::desc("Re-materialize load from stub in PIC mode"),
72 cl::init(false), cl::Hidden);
74 PartialRegUpdateClearance("partial-reg-update-clearance",
75 cl::desc("Clearance between two register writes "
76 "for inserting XOR to avoid partial "
77 "register update"),
78 cl::init(64), cl::Hidden);
80 "undef-reg-clearance",
81 cl::desc("How many idle instructions we would like before "
82 "certain undef register reads"),
83 cl::init(128), cl::Hidden);
84
85// Pin the vtable to this file.
// NOTE(review): the body is intentionally empty; the function appears to exist
// only so that an out-of-line member is defined in this file -- confirm against
// the class declaration in X86InstrInfo.h.
86void X86InstrInfo::anchor() {}
87
89 : X86GenInstrInfo(STI, RI,
90 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
91 : X86::ADJCALLSTACKDOWN32),
92 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
93 : X86::ADJCALLSTACKUP32),
94 X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
95 Subtarget(STI), RI(STI.getTargetTriple()) {}
96
98 unsigned OpNum) const {
99 auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum);
100 // If the target does not have egpr, then r16-r31 will be reserved for all
101 // instructions.
102 if (!RC || !Subtarget.hasEGPR())
103 return RC;
104
106 return RC;
107
108 const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
109 return RI->constrainRegClassToNonRex2(RC);
110}
111
113 Register &SrcReg, Register &DstReg,
114 unsigned &SubIdx) const {
115 switch (MI.getOpcode()) {
116 default:
117 break;
118 case X86::MOVSX16rr8:
119 case X86::MOVZX16rr8:
120 case X86::MOVSX32rr8:
121 case X86::MOVZX32rr8:
122 case X86::MOVSX64rr8:
123 if (!Subtarget.is64Bit())
124 // It's not always legal to reference the low 8-bit of the larger
125 // register in 32-bit mode.
126 return false;
127 [[fallthrough]];
128 case X86::MOVSX32rr16:
129 case X86::MOVZX32rr16:
130 case X86::MOVSX64rr16:
131 case X86::MOVSX64rr32: {
132 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
133 // Be conservative.
134 return false;
135 SrcReg = MI.getOperand(1).getReg();
136 DstReg = MI.getOperand(0).getReg();
137 switch (MI.getOpcode()) {
138 default:
139 llvm_unreachable("Unreachable!");
140 case X86::MOVSX16rr8:
141 case X86::MOVZX16rr8:
142 case X86::MOVSX32rr8:
143 case X86::MOVZX32rr8:
144 case X86::MOVSX64rr8:
145 SubIdx = X86::sub_8bit;
146 break;
147 case X86::MOVSX32rr16:
148 case X86::MOVZX32rr16:
149 case X86::MOVSX64rr16:
150 SubIdx = X86::sub_16bit;
151 break;
152 case X86::MOVSX64rr32:
153 SubIdx = X86::sub_32bit;
154 break;
155 }
156 return true;
157 }
158 }
159 return false;
160}
161
163 if (MI.mayLoad() || MI.mayStore())
164 return false;
165
166 // Some target-independent operations that trivially lower to data-invariant
167 // instructions.
168 if (MI.isCopyLike() || MI.isInsertSubreg())
169 return true;
170
171 unsigned Opcode = MI.getOpcode();
172 using namespace X86;
173 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
174 // However, they set flags and are perhaps the most surprisingly constant
175 // time operations so we call them out here separately.
176 if (isIMUL(Opcode))
177 return true;
178 // Bit scanning and counting instructions that are somewhat surprisingly
179 // constant time as they scan across bits and do other fairly complex
180 // operations like popcnt, but are believed to be constant time on x86.
181 // However, these set flags.
182 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
183 isTZCNT(Opcode))
184 return true;
185 // Bit manipulation instructions are effectively combinations of basic
186 // arithmetic ops, and should still execute in constant time. These also
187 // set flags.
188 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
189 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
190 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
191 isTZMSK(Opcode))
192 return true;
193 // Bit extracting and clearing instructions should execute in constant time,
194 // and set flags.
195 if (isBEXTR(Opcode) || isBZHI(Opcode))
196 return true;
197 // Shift and rotate.
198 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
199 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
200 return true;
201 // Basic arithmetic is constant time on the input but does set flags.
202 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
203 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
204 return true;
205 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
206 if (isANDN(Opcode))
207 return true;
208 // Unary arithmetic operations.
209 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
210 return true;
211 // Unlike other arithmetic, NOT doesn't set EFLAGS.
212 if (isNOT(Opcode))
213 return true;
214 // Various move instructions used to zero or sign extend things. Note that we
215 // intentionally don't support the _NOREX variants as we can't handle that
216 // register constraint anyways.
217 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
218 return true;
219 // Arithmetic instructions that are both constant time and don't set flags.
220 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
221 return true;
222 // LEA doesn't actually access memory, and its arithmetic is constant time.
223 if (isLEA(Opcode))
224 return true;
225 // By default, assume that the instruction is not data invariant.
226 return false;
227}
228
230 switch (MI.getOpcode()) {
231 default:
232 // By default, assume that the load will immediately leak.
233 return false;
234
235 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
236 // However, they set flags and are perhaps the most surprisingly constant
237 // time operations so we call them out here separately.
238 case X86::IMUL16rm:
239 case X86::IMUL16rmi:
240 case X86::IMUL32rm:
241 case X86::IMUL32rmi:
242 case X86::IMUL64rm:
243 case X86::IMUL64rmi32:
244
245 // Bit scanning and counting instructions that are somewhat surprisingly
246 // constant time as they scan across bits and do other fairly complex
247 // operations like popcnt, but are believed to be constant time on x86.
248 // However, these set flags.
249 case X86::BSF16rm:
250 case X86::BSF32rm:
251 case X86::BSF64rm:
252 case X86::BSR16rm:
253 case X86::BSR32rm:
254 case X86::BSR64rm:
255 case X86::LZCNT16rm:
256 case X86::LZCNT32rm:
257 case X86::LZCNT64rm:
258 case X86::POPCNT16rm:
259 case X86::POPCNT32rm:
260 case X86::POPCNT64rm:
261 case X86::TZCNT16rm:
262 case X86::TZCNT32rm:
263 case X86::TZCNT64rm:
264
265 // Bit manipulation instructions are effectively combinations of basic
266 // arithmetic ops, and should still execute in constant time. These also
267 // set flags.
268 case X86::BLCFILL32rm:
269 case X86::BLCFILL64rm:
270 case X86::BLCI32rm:
271 case X86::BLCI64rm:
272 case X86::BLCIC32rm:
273 case X86::BLCIC64rm:
274 case X86::BLCMSK32rm:
275 case X86::BLCMSK64rm:
276 case X86::BLCS32rm:
277 case X86::BLCS64rm:
278 case X86::BLSFILL32rm:
279 case X86::BLSFILL64rm:
280 case X86::BLSI32rm:
281 case X86::BLSI64rm:
282 case X86::BLSIC32rm:
283 case X86::BLSIC64rm:
284 case X86::BLSMSK32rm:
285 case X86::BLSMSK64rm:
286 case X86::BLSR32rm:
287 case X86::BLSR64rm:
288 case X86::TZMSK32rm:
289 case X86::TZMSK64rm:
290
291 // Bit extracting and clearing instructions should execute in constant time,
292 // and set flags.
293 case X86::BEXTR32rm:
294 case X86::BEXTR64rm:
295 case X86::BEXTRI32mi:
296 case X86::BEXTRI64mi:
297 case X86::BZHI32rm:
298 case X86::BZHI64rm:
299
300 // Basic arithmetic is constant time on the input but does set flags.
301 case X86::ADC8rm:
302 case X86::ADC16rm:
303 case X86::ADC32rm:
304 case X86::ADC64rm:
305 case X86::ADD8rm:
306 case X86::ADD16rm:
307 case X86::ADD32rm:
308 case X86::ADD64rm:
309 case X86::AND8rm:
310 case X86::AND16rm:
311 case X86::AND32rm:
312 case X86::AND64rm:
313 case X86::ANDN32rm:
314 case X86::ANDN64rm:
315 case X86::OR8rm:
316 case X86::OR16rm:
317 case X86::OR32rm:
318 case X86::OR64rm:
319 case X86::SBB8rm:
320 case X86::SBB16rm:
321 case X86::SBB32rm:
322 case X86::SBB64rm:
323 case X86::SUB8rm:
324 case X86::SUB16rm:
325 case X86::SUB32rm:
326 case X86::SUB64rm:
327 case X86::XOR8rm:
328 case X86::XOR16rm:
329 case X86::XOR32rm:
330 case X86::XOR64rm:
331
332 // Integer multiply w/o affecting flags is still believed to be constant
333 // time on x86. Called out separately as this is among the most surprising
334 // instructions to exhibit that behavior.
335 case X86::MULX32rm:
336 case X86::MULX64rm:
337
338 // Arithmetic instructions that are both constant time and don't set flags.
339 case X86::RORX32mi:
340 case X86::RORX64mi:
341 case X86::SARX32rm:
342 case X86::SARX64rm:
343 case X86::SHLX32rm:
344 case X86::SHLX64rm:
345 case X86::SHRX32rm:
346 case X86::SHRX64rm:
347
348 // Conversions are believed to be constant time and don't set flags.
349 case X86::CVTTSD2SI64rm:
350 case X86::VCVTTSD2SI64rm:
351 case X86::VCVTTSD2SI64Zrm:
352 case X86::CVTTSD2SIrm:
353 case X86::VCVTTSD2SIrm:
354 case X86::VCVTTSD2SIZrm:
355 case X86::CVTTSS2SI64rm:
356 case X86::VCVTTSS2SI64rm:
357 case X86::VCVTTSS2SI64Zrm:
358 case X86::CVTTSS2SIrm:
359 case X86::VCVTTSS2SIrm:
360 case X86::VCVTTSS2SIZrm:
361 case X86::CVTSI2SDrm:
362 case X86::VCVTSI2SDrm:
363 case X86::VCVTSI2SDZrm:
364 case X86::CVTSI2SSrm:
365 case X86::VCVTSI2SSrm:
366 case X86::VCVTSI2SSZrm:
367 case X86::CVTSI642SDrm:
368 case X86::VCVTSI642SDrm:
369 case X86::VCVTSI642SDZrm:
370 case X86::CVTSI642SSrm:
371 case X86::VCVTSI642SSrm:
372 case X86::VCVTSI642SSZrm:
373 case X86::CVTSS2SDrm:
374 case X86::VCVTSS2SDrm:
375 case X86::VCVTSS2SDZrm:
376 case X86::CVTSD2SSrm:
377 case X86::VCVTSD2SSrm:
378 case X86::VCVTSD2SSZrm:
379 // AVX512 added unsigned integer conversions.
380 case X86::VCVTTSD2USI64Zrm:
381 case X86::VCVTTSD2USIZrm:
382 case X86::VCVTTSS2USI64Zrm:
383 case X86::VCVTTSS2USIZrm:
384 case X86::VCVTUSI2SDZrm:
385 case X86::VCVTUSI642SDZrm:
386 case X86::VCVTUSI2SSZrm:
387 case X86::VCVTUSI642SSZrm:
388
389 // Loads to register don't set flags.
390 case X86::MOV8rm:
391 case X86::MOV8rm_NOREX:
392 case X86::MOV16rm:
393 case X86::MOV32rm:
394 case X86::MOV64rm:
395 case X86::MOVSX16rm8:
396 case X86::MOVSX32rm16:
397 case X86::MOVSX32rm8:
398 case X86::MOVSX32rm8_NOREX:
399 case X86::MOVSX64rm16:
400 case X86::MOVSX64rm32:
401 case X86::MOVSX64rm8:
402 case X86::MOVZX16rm8:
403 case X86::MOVZX32rm16:
404 case X86::MOVZX32rm8:
405 case X86::MOVZX32rm8_NOREX:
406 case X86::MOVZX64rm16:
407 case X86::MOVZX64rm8:
408 return true;
409 }
410}
411
413 const MachineFunction *MF = MI.getParent()->getParent();
415
416 if (isFrameInstr(MI)) {
417 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
418 SPAdj -= getFrameAdjustment(MI);
419 if (!isFrameSetup(MI))
420 SPAdj = -SPAdj;
421 return SPAdj;
422 }
423
424 // To know whether a call adjusts the stack, we need information
425 // that is bound to the following ADJCALLSTACKUP pseudo.
426 // Look for the next ADJCALLSTACKUP that follows the call.
427 if (MI.isCall()) {
428 const MachineBasicBlock *MBB = MI.getParent();
430 for (auto E = MBB->end(); I != E; ++I) {
431 if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
432 break;
433 }
434
435 // If we could not find a frame destroy opcode, then it has already
436 // been simplified, so we don't care.
437 if (I->getOpcode() != getCallFrameDestroyOpcode())
438 return 0;
439
440 return -(I->getOperand(1).getImm());
441 }
442
443 // Currently handle only PUSHes we can reasonably expect to see
444 // in call sequences
445 switch (MI.getOpcode()) {
446 default:
447 return 0;
448 case X86::PUSH32r:
449 case X86::PUSH32rmm:
450 case X86::PUSH32rmr:
451 case X86::PUSH32i:
452 return 4;
453 case X86::PUSH64r:
454 case X86::PUSH64rmm:
455 case X86::PUSH64rmr:
456 case X86::PUSH64i32:
457 return 8;
458 }
459}
460
461/// Return true and the FrameIndex if the specified
462/// operand and follow operands form a reference to the stack frame.
463bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
464 int &FrameIndex) const {
465 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
466 MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
467 MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
468 MI.getOperand(Op + X86::AddrDisp).isImm() &&
469 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
470 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
471 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
472 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
473 return true;
474 }
475 return false;
476}
477
478static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
479 switch (Opcode) {
480 default:
481 return false;
482 case X86::MOV8rm:
483 case X86::KMOVBkm:
484 case X86::KMOVBkm_EVEX:
485 MemBytes = TypeSize::getFixed(1);
486 return true;
487 case X86::MOV16rm:
488 case X86::KMOVWkm:
489 case X86::KMOVWkm_EVEX:
490 case X86::VMOVSHZrm:
491 case X86::VMOVSHZrm_alt:
492 MemBytes = TypeSize::getFixed(2);
493 return true;
494 case X86::MOV32rm:
495 case X86::MOVSSrm:
496 case X86::MOVSSrm_alt:
497 case X86::VMOVSSrm:
498 case X86::VMOVSSrm_alt:
499 case X86::VMOVSSZrm:
500 case X86::VMOVSSZrm_alt:
501 case X86::KMOVDkm:
502 case X86::KMOVDkm_EVEX:
503 MemBytes = TypeSize::getFixed(4);
504 return true;
505 case X86::MOV64rm:
506 case X86::LD_Fp64m:
507 case X86::MOVSDrm:
508 case X86::MOVSDrm_alt:
509 case X86::VMOVSDrm:
510 case X86::VMOVSDrm_alt:
511 case X86::VMOVSDZrm:
512 case X86::VMOVSDZrm_alt:
513 case X86::MMX_MOVD64rm:
514 case X86::MMX_MOVQ64rm:
515 case X86::KMOVQkm:
516 case X86::KMOVQkm_EVEX:
517 MemBytes = TypeSize::getFixed(8);
518 return true;
519 case X86::MOVAPSrm:
520 case X86::MOVUPSrm:
521 case X86::MOVAPDrm:
522 case X86::MOVUPDrm:
523 case X86::MOVDQArm:
524 case X86::MOVDQUrm:
525 case X86::VMOVAPSrm:
526 case X86::VMOVUPSrm:
527 case X86::VMOVAPDrm:
528 case X86::VMOVUPDrm:
529 case X86::VMOVDQArm:
530 case X86::VMOVDQUrm:
531 case X86::VMOVAPSZ128rm:
532 case X86::VMOVUPSZ128rm:
533 case X86::VMOVAPSZ128rm_NOVLX:
534 case X86::VMOVUPSZ128rm_NOVLX:
535 case X86::VMOVAPDZ128rm:
536 case X86::VMOVUPDZ128rm:
537 case X86::VMOVDQU8Z128rm:
538 case X86::VMOVDQU16Z128rm:
539 case X86::VMOVDQA32Z128rm:
540 case X86::VMOVDQU32Z128rm:
541 case X86::VMOVDQA64Z128rm:
542 case X86::VMOVDQU64Z128rm:
543 MemBytes = TypeSize::getFixed(16);
544 return true;
545 case X86::VMOVAPSYrm:
546 case X86::VMOVUPSYrm:
547 case X86::VMOVAPDYrm:
548 case X86::VMOVUPDYrm:
549 case X86::VMOVDQAYrm:
550 case X86::VMOVDQUYrm:
551 case X86::VMOVAPSZ256rm:
552 case X86::VMOVUPSZ256rm:
553 case X86::VMOVAPSZ256rm_NOVLX:
554 case X86::VMOVUPSZ256rm_NOVLX:
555 case X86::VMOVAPDZ256rm:
556 case X86::VMOVUPDZ256rm:
557 case X86::VMOVDQU8Z256rm:
558 case X86::VMOVDQU16Z256rm:
559 case X86::VMOVDQA32Z256rm:
560 case X86::VMOVDQU32Z256rm:
561 case X86::VMOVDQA64Z256rm:
562 case X86::VMOVDQU64Z256rm:
563 MemBytes = TypeSize::getFixed(32);
564 return true;
565 case X86::VMOVAPSZrm:
566 case X86::VMOVUPSZrm:
567 case X86::VMOVAPDZrm:
568 case X86::VMOVUPDZrm:
569 case X86::VMOVDQU8Zrm:
570 case X86::VMOVDQU16Zrm:
571 case X86::VMOVDQA32Zrm:
572 case X86::VMOVDQU32Zrm:
573 case X86::VMOVDQA64Zrm:
574 case X86::VMOVDQU64Zrm:
575 MemBytes = TypeSize::getFixed(64);
576 return true;
577 }
578}
579
580static bool isFrameStoreOpcode(int Opcode, TypeSize &MemBytes) {
581 switch (Opcode) {
582 default:
583 return false;
584 case X86::MOV8mr:
585 case X86::KMOVBmk:
586 case X86::KMOVBmk_EVEX:
587 MemBytes = TypeSize::getFixed(1);
588 return true;
589 case X86::MOV16mr:
590 case X86::KMOVWmk:
591 case X86::KMOVWmk_EVEX:
592 case X86::VMOVSHZmr:
593 MemBytes = TypeSize::getFixed(2);
594 return true;
595 case X86::MOV32mr:
596 case X86::MOVSSmr:
597 case X86::VMOVSSmr:
598 case X86::VMOVSSZmr:
599 case X86::KMOVDmk:
600 case X86::KMOVDmk_EVEX:
601 MemBytes = TypeSize::getFixed(4);
602 return true;
603 case X86::MOV64mr:
604 case X86::ST_FpP64m:
605 case X86::MOVSDmr:
606 case X86::VMOVSDmr:
607 case X86::VMOVSDZmr:
608 case X86::MMX_MOVD64mr:
609 case X86::MMX_MOVQ64mr:
610 case X86::MMX_MOVNTQmr:
611 case X86::KMOVQmk:
612 case X86::KMOVQmk_EVEX:
613 MemBytes = TypeSize::getFixed(8);
614 return true;
615 case X86::MOVAPSmr:
616 case X86::MOVUPSmr:
617 case X86::MOVAPDmr:
618 case X86::MOVUPDmr:
619 case X86::MOVDQAmr:
620 case X86::MOVDQUmr:
621 case X86::VMOVAPSmr:
622 case X86::VMOVUPSmr:
623 case X86::VMOVAPDmr:
624 case X86::VMOVUPDmr:
625 case X86::VMOVDQAmr:
626 case X86::VMOVDQUmr:
627 case X86::VMOVUPSZ128mr:
628 case X86::VMOVAPSZ128mr:
629 case X86::VMOVUPSZ128mr_NOVLX:
630 case X86::VMOVAPSZ128mr_NOVLX:
631 case X86::VMOVUPDZ128mr:
632 case X86::VMOVAPDZ128mr:
633 case X86::VMOVDQA32Z128mr:
634 case X86::VMOVDQU32Z128mr:
635 case X86::VMOVDQA64Z128mr:
636 case X86::VMOVDQU64Z128mr:
637 case X86::VMOVDQU8Z128mr:
638 case X86::VMOVDQU16Z128mr:
639 MemBytes = TypeSize::getFixed(16);
640 return true;
641 case X86::VMOVUPSYmr:
642 case X86::VMOVAPSYmr:
643 case X86::VMOVUPDYmr:
644 case X86::VMOVAPDYmr:
645 case X86::VMOVDQUYmr:
646 case X86::VMOVDQAYmr:
647 case X86::VMOVUPSZ256mr:
648 case X86::VMOVAPSZ256mr:
649 case X86::VMOVUPSZ256mr_NOVLX:
650 case X86::VMOVAPSZ256mr_NOVLX:
651 case X86::VMOVUPDZ256mr:
652 case X86::VMOVAPDZ256mr:
653 case X86::VMOVDQU8Z256mr:
654 case X86::VMOVDQU16Z256mr:
655 case X86::VMOVDQA32Z256mr:
656 case X86::VMOVDQU32Z256mr:
657 case X86::VMOVDQA64Z256mr:
658 case X86::VMOVDQU64Z256mr:
659 MemBytes = TypeSize::getFixed(32);
660 return true;
661 case X86::VMOVUPSZmr:
662 case X86::VMOVAPSZmr:
663 case X86::VMOVUPDZmr:
664 case X86::VMOVAPDZmr:
665 case X86::VMOVDQU8Zmr:
666 case X86::VMOVDQU16Zmr:
667 case X86::VMOVDQA32Zmr:
668 case X86::VMOVDQU32Zmr:
669 case X86::VMOVDQA64Zmr:
670 case X86::VMOVDQU64Zmr:
671 MemBytes = TypeSize::getFixed(64);
672 return true;
673 }
674 return false;
675}
676
678 int &FrameIndex) const {
679 TypeSize Dummy = TypeSize::getZero();
680 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
681}
682
684 int &FrameIndex,
685 TypeSize &MemBytes) const {
686 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
687 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
688 return MI.getOperand(0).getReg();
689 return Register();
690}
691
693 int &FrameIndex) const {
694 TypeSize Dummy = TypeSize::getZero();
695 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
696 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
697 return Reg;
698 // Check for post-frame index elimination operations
700 if (hasLoadFromStackSlot(MI, Accesses)) {
701 FrameIndex =
702 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
703 ->getFrameIndex();
704 return MI.getOperand(0).getReg();
705 }
706 }
707 return Register();
708}
709
711 int &FrameIndex) const {
712 TypeSize Dummy = TypeSize::getZero();
713 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
714}
715
717 int &FrameIndex,
718 TypeSize &MemBytes) const {
719 if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
720 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
721 isFrameOperand(MI, 0, FrameIndex))
722 return MI.getOperand(X86::AddrNumOperands).getReg();
723 return Register();
724}
725
727 int &FrameIndex) const {
728 TypeSize Dummy = TypeSize::getZero();
729 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
730 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
731 return Reg;
732 // Check for post-frame index elimination operations
734 if (hasStoreToStackSlot(MI, Accesses)) {
735 FrameIndex =
736 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
737 ->getFrameIndex();
738 return MI.getOperand(X86::AddrNumOperands).getReg();
739 }
740 }
741 return Register();
742}
743
744/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
745static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
746 // Don't waste compile time scanning use-def chains of physregs.
747 if (!BaseReg.isVirtual())
748 return false;
749 bool isPICBase = false;
750 for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
751 if (DefMI.getOpcode() != X86::MOVPC32r)
752 return false;
753 assert(!isPICBase && "More than one PIC base?");
754 isPICBase = true;
755 }
756 return isPICBase;
757}
758
760 const MachineInstr &MI) const {
761 switch (MI.getOpcode()) {
762 default:
763 // This function should only be called for opcodes with the ReMaterializable
764 // flag set.
765 llvm_unreachable("Unknown rematerializable operation!");
766 break;
767 case X86::IMPLICIT_DEF:
768 // Defer to generic logic.
769 break;
770 case X86::LOAD_STACK_GUARD:
771 case X86::LD_Fp032:
772 case X86::LD_Fp064:
773 case X86::LD_Fp080:
774 case X86::LD_Fp132:
775 case X86::LD_Fp164:
776 case X86::LD_Fp180:
777 case X86::AVX1_SETALLONES:
778 case X86::AVX2_SETALLONES:
779 case X86::AVX512_128_SET0:
780 case X86::AVX512_256_SET0:
781 case X86::AVX512_512_SET0:
782 case X86::AVX512_128_SETALLONES:
783 case X86::AVX512_256_SETALLONES:
784 case X86::AVX512_512_SETALLONES:
785 case X86::AVX512_FsFLD0SD:
786 case X86::AVX512_FsFLD0SH:
787 case X86::AVX512_FsFLD0SS:
788 case X86::AVX512_FsFLD0F128:
789 case X86::AVX_SET0:
790 case X86::FsFLD0SD:
791 case X86::FsFLD0SS:
792 case X86::FsFLD0SH:
793 case X86::FsFLD0F128:
794 case X86::KSET0B:
795 case X86::KSET0D:
796 case X86::KSET0Q:
797 case X86::KSET0W:
798 case X86::KSET1B:
799 case X86::KSET1D:
800 case X86::KSET1Q:
801 case X86::KSET1W:
802 case X86::MMX_SET0:
803 case X86::MOV32ImmSExti8:
804 case X86::MOV32r0:
805 case X86::MOV32r1:
806 case X86::MOV32r_1:
807 case X86::MOV32ri64:
808 case X86::MOV64ImmSExti8:
809 case X86::V_SET0:
810 case X86::V_SETALLONES:
811 case X86::MOV16ri:
812 case X86::MOV32ri:
813 case X86::MOV64ri:
814 case X86::MOV64ri32:
815 case X86::MOV8ri:
816 case X86::PTILEZEROV:
817 return true;
818
819 case X86::MOV8rm:
820 case X86::MOV8rm_NOREX:
821 case X86::MOV16rm:
822 case X86::MOV32rm:
823 case X86::MOV64rm:
824 case X86::MOVSSrm:
825 case X86::MOVSSrm_alt:
826 case X86::MOVSDrm:
827 case X86::MOVSDrm_alt:
828 case X86::MOVAPSrm:
829 case X86::MOVUPSrm:
830 case X86::MOVAPDrm:
831 case X86::MOVUPDrm:
832 case X86::MOVDQArm:
833 case X86::MOVDQUrm:
834 case X86::VMOVSSrm:
835 case X86::VMOVSSrm_alt:
836 case X86::VMOVSDrm:
837 case X86::VMOVSDrm_alt:
838 case X86::VMOVAPSrm:
839 case X86::VMOVUPSrm:
840 case X86::VMOVAPDrm:
841 case X86::VMOVUPDrm:
842 case X86::VMOVDQArm:
843 case X86::VMOVDQUrm:
844 case X86::VMOVAPSYrm:
845 case X86::VMOVUPSYrm:
846 case X86::VMOVAPDYrm:
847 case X86::VMOVUPDYrm:
848 case X86::VMOVDQAYrm:
849 case X86::VMOVDQUYrm:
850 case X86::MMX_MOVD64rm:
851 case X86::MMX_MOVQ64rm:
852 case X86::VBROADCASTSSrm:
853 case X86::VBROADCASTSSYrm:
854 case X86::VBROADCASTSDYrm:
855 // AVX-512
856 case X86::VPBROADCASTBZ128rm:
857 case X86::VPBROADCASTBZ256rm:
858 case X86::VPBROADCASTBZrm:
859 case X86::VBROADCASTF32X2Z256rm:
860 case X86::VBROADCASTF32X2Zrm:
861 case X86::VBROADCASTI32X2Z128rm:
862 case X86::VBROADCASTI32X2Z256rm:
863 case X86::VBROADCASTI32X2Zrm:
864 case X86::VPBROADCASTWZ128rm:
865 case X86::VPBROADCASTWZ256rm:
866 case X86::VPBROADCASTWZrm:
867 case X86::VPBROADCASTDZ128rm:
868 case X86::VPBROADCASTDZ256rm:
869 case X86::VPBROADCASTDZrm:
870 case X86::VBROADCASTSSZ128rm:
871 case X86::VBROADCASTSSZ256rm:
872 case X86::VBROADCASTSSZrm:
873 case X86::VPBROADCASTQZ128rm:
874 case X86::VPBROADCASTQZ256rm:
875 case X86::VPBROADCASTQZrm:
876 case X86::VBROADCASTSDZ256rm:
877 case X86::VBROADCASTSDZrm:
878 case X86::VMOVSSZrm:
879 case X86::VMOVSSZrm_alt:
880 case X86::VMOVSDZrm:
881 case X86::VMOVSDZrm_alt:
882 case X86::VMOVSHZrm:
883 case X86::VMOVSHZrm_alt:
884 case X86::VMOVAPDZ128rm:
885 case X86::VMOVAPDZ256rm:
886 case X86::VMOVAPDZrm:
887 case X86::VMOVAPSZ128rm:
888 case X86::VMOVAPSZ256rm:
889 case X86::VMOVAPSZ128rm_NOVLX:
890 case X86::VMOVAPSZ256rm_NOVLX:
891 case X86::VMOVAPSZrm:
892 case X86::VMOVDQA32Z128rm:
893 case X86::VMOVDQA32Z256rm:
894 case X86::VMOVDQA32Zrm:
895 case X86::VMOVDQA64Z128rm:
896 case X86::VMOVDQA64Z256rm:
897 case X86::VMOVDQA64Zrm:
898 case X86::VMOVDQU16Z128rm:
899 case X86::VMOVDQU16Z256rm:
900 case X86::VMOVDQU16Zrm:
901 case X86::VMOVDQU32Z128rm:
902 case X86::VMOVDQU32Z256rm:
903 case X86::VMOVDQU32Zrm:
904 case X86::VMOVDQU64Z128rm:
905 case X86::VMOVDQU64Z256rm:
906 case X86::VMOVDQU64Zrm:
907 case X86::VMOVDQU8Z128rm:
908 case X86::VMOVDQU8Z256rm:
909 case X86::VMOVDQU8Zrm:
910 case X86::VMOVUPDZ128rm:
911 case X86::VMOVUPDZ256rm:
912 case X86::VMOVUPDZrm:
913 case X86::VMOVUPSZ128rm:
914 case X86::VMOVUPSZ256rm:
915 case X86::VMOVUPSZ128rm_NOVLX:
916 case X86::VMOVUPSZ256rm_NOVLX:
917 case X86::VMOVUPSZrm: {
918 // Loads from constant pools are trivially rematerializable.
919 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
920 MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
921 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
922 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
923 MI.isDereferenceableInvariantLoad()) {
924 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
925 if (BaseReg == 0 || BaseReg == X86::RIP)
926 return true;
927 // Allow re-materialization of PIC load.
928 if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
929 const MachineFunction &MF = *MI.getParent()->getParent();
930 const MachineRegisterInfo &MRI = MF.getRegInfo();
931 if (regIsPICBase(BaseReg, MRI))
932 return true;
933 }
934 }
935 break;
936 }
937
938 case X86::LEA32r:
939 case X86::LEA64r: {
940 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
941 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
942 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
943 !MI.getOperand(1 + X86::AddrDisp).isReg()) {
944 // lea fi#, lea GV, etc. are all rematerializable.
945 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
946 return true;
947 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
948 if (BaseReg == 0)
949 return true;
950 // Allow re-materialization of lea PICBase + x.
951 const MachineFunction &MF = *MI.getParent()->getParent();
952 const MachineRegisterInfo &MRI = MF.getRegInfo();
953 if (regIsPICBase(BaseReg, MRI))
954 return true;
955 }
956 break;
957 }
958 }
960}
961
964 Register DestReg, unsigned SubIdx,
965 const MachineInstr &Orig,
966 LaneBitmask UsedLanes) const {
967 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
968 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
970 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
971 // effects.
972 int Value;
973 switch (Orig.getOpcode()) {
974 case X86::MOV32r0:
975 Value = 0;
976 break;
977 case X86::MOV32r1:
978 Value = 1;
979 break;
980 case X86::MOV32r_1:
981 Value = -1;
982 break;
983 default:
984 llvm_unreachable("Unexpected instruction!");
985 }
986
987 const DebugLoc &DL = Orig.getDebugLoc();
988 BuildMI(MBB, I, DL, get(X86::MOV32ri))
989 .add(Orig.getOperand(0))
990 .addImm(Value);
991 } else {
992 MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
993 MBB.insert(I, MI);
994 }
995
996 MachineInstr &NewMI = *std::prev(I);
997 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
998}
999
1000/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
1002 for (const MachineOperand &MO : MI.operands()) {
1003 if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
1004 !MO.isDead()) {
1005 return true;
1006 }
1007 }
1008 return false;
1009}
1010
1011/// Check whether the shift count for a machine operand is non-zero.
1012inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1013 unsigned ShiftAmtOperandIdx) {
1014 // The shift count is six bits with the REX.W prefix and five bits without.
1015 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1016 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1017 return Imm & ShiftCountMask;
1018}
1019
/// Return true if a left shift by \p ShAmt can be represented by the scale of
/// a LEA instruction, which is the case exactly for the amounts 1, 2 and 3.
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
  // LEA encodes its scale factor in the SIB byte, whose scale field is two
  // bits wide, so only shift amounts below 4 are encodable. A zero shift is
  // not accepted.
  switch (ShAmt) {
  case 1:
  case 2:
  case 3:
    return true;
  default:
    return false;
  }
}
1030
1031static bool
1033 const MachineRegisterInfo *MRI, MachineInstr **AndInstr,
1034 const TargetRegisterInfo *TRI, const X86Subtarget &ST,
1035 bool &NoSignFlag, bool &ClearsOverflowFlag) {
1036 if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
1037 CmpInstr.getOpcode() == X86::TEST64rr) &&
1038 !(CmpValDefInstr.getOpcode() == X86::COPY &&
1039 CmpInstr.getOpcode() == X86::TEST16rr))
1040 return false;
1041
1042 // CmpInstr is a TEST16rr/TEST64rr instruction, and
1043 // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two
1044 // registers are identical.
1045 assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
1046 "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
1047 "`X86InstrInfo::analyzeCompare` requires two reg operands are the"
1048 "same.");
1049
1050 // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
1051 // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
1052 // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
1053 // redundant.
1054 assert(
1055 (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
1056 "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
1057 "is a user of COPY sub16bit.");
1058 MachineInstr *VregDefInstr = nullptr;
1059 if (CmpInstr.getOpcode() == X86::TEST16rr) {
1060 if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
1061 return false;
1062 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
1063 if (!VregDefInstr)
1064 return false;
1065 // We can only remove the test when the definition is an AND32ri or
1066 // AND64ri32 whose immediate fits in 16 bits; other 32/64-bit ops would
1067 // also test the higher bits, which test16rr does not want.
1068 if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
1069 VregDefInstr->getOpcode() == X86::AND64ri32) &&
1070 isUInt<16>(VregDefInstr->getOperand(2).getImm())))
1071 return false;
1072 }
1073
1074 if (CmpInstr.getOpcode() == X86::TEST64rr) {
1075 // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1076 // sub_32bit or sub_xmm.
1077 if (CmpValDefInstr.getOperand(2).getImm() != X86::sub_32bit)
1078 return false;
1079
1080 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
1081 }
1082
1083 assert(VregDefInstr && "Must have a definition (SSA)");
1084
1085 // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1086 // to simplify the subsequent analysis.
1087 //
1088 // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1089 // `CmpValDefInstr.getParent()`, this could be handled.
1090 if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1091 return false;
1092
1093 if (X86::isAND(VregDefInstr->getOpcode()) &&
1094 (!ST.hasNF() || VregDefInstr->modifiesRegister(X86::EFLAGS, TRI))) {
1095 // Get a sequence of instructions like
1096 // %reg = and* ... // Set EFLAGS
1097 // ... // EFLAGS not changed
1098 // %extended_reg = subreg_to_reg %reg, %subreg.sub_32bit
1099 // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1100 // or
1101 // %reg = and32* ...
1102 // ... // EFLAGS not changed.
1103 // %src_reg = copy %reg.sub_16bit:gr32
1104 // test16rr %src_reg, %src_reg, implicit-def $eflags
1105 //
1106 // If subsequent readers use a subset of bits that don't change
1107 // after `and*` instructions, it's likely that the test64rr could
1108 // be optimized away.
1109 for (const MachineInstr &Instr :
1110 make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1111 MachineBasicBlock::iterator(CmpValDefInstr))) {
1112 // There are instructions between 'VregDefInstr' and
1113 // 'CmpValDefInstr' that modifies EFLAGS.
1114 if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1115 return false;
1116 }
1117
1118 *AndInstr = VregDefInstr;
1119
1120 // AND instruction will essentially update SF and clear OF, so
1121 // NoSignFlag should be false in the sense that SF is modified by `AND`.
1122 //
1123 // However, the implementation artifically sets `NoSignFlag` to true
1124 // to poison the SF bit; that is to say, if SF is looked at later, the
1125 // optimization (to erase TEST64rr) will be disabled.
1126 //
1127 // The reason to poison SF bit is that SF bit value could be different
1128 // in the `AND` and `TEST` operation; signed bit is not known for `AND`,
1129 // and is known to be 0 as a result of `TEST64rr`.
1130 //
1131 // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1132 // the AND instruction and using the static information to guide peephole
1133 // optimization if possible. For example, it's possible to fold a
1134 // conditional move into a copy if the relevant EFLAG bits could be deduced
1135 // from an immediate operand of and operation.
1136 //
1137 NoSignFlag = true;
1138 // ClearsOverflowFlag is true for AND operation (no surprise).
1139 ClearsOverflowFlag = true;
1140 return true;
1141 }
1142 return false;
1143}
1144
1146 unsigned Opc, bool AllowSP, Register &NewSrc,
1147 unsigned &NewSrcSubReg, bool &isKill,
1148 MachineOperand &ImplicitOp, LiveVariables *LV,
1149 LiveIntervals *LIS) const {
1150 MachineFunction &MF = *MI.getParent()->getParent();
1151 const TargetRegisterClass *RC;
1152 if (AllowSP) {
1153 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1154 } else {
1155 RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1156 }
1157 Register SrcReg = Src.getReg();
1158 unsigned SubReg = Src.getSubReg();
1159 isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr);
1160
1161 NewSrcSubReg = X86::NoSubRegister;
1162
1163 // For both LEA64 and LEA32 the register already has essentially the right
1164 // type (32-bit or 64-bit) we may just need to forbid SP.
1165 if (Opc != X86::LEA64_32r) {
1166 NewSrc = SrcReg;
1167 NewSrcSubReg = SubReg;
1168 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1169
1170 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1171 return false;
1172
1173 return true;
1174 }
1175
1176 // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1177 // another we need to add 64-bit registers to the final MI.
1178 if (SrcReg.isPhysical()) {
1179 ImplicitOp = Src;
1180 ImplicitOp.setImplicit();
1181
1182 NewSrc = getX86SubSuperRegister(SrcReg, 64);
1183 assert(!SubReg && "no superregister for source");
1184 assert(NewSrc.isValid() && "Invalid Operand");
1185 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1186 } else {
1187 // Virtual register of the wrong class, we have to create a temporary 64-bit
1188 // vreg to feed into the LEA.
1189 NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1190 NewSrcSubReg = X86::NoSubRegister;
1191 MachineInstr *Copy =
1192 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1193 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1194 .addReg(SrcReg, getKillRegState(isKill), SubReg);
1195
1196 // Which is obviously going to be dead after we're done with it.
1197 isKill = true;
1198
1199 if (LV)
1200 LV->replaceKillInstruction(SrcReg, MI, *Copy);
1201
1202 if (LIS) {
1203 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1204 SlotIndex Idx = LIS->getInstructionIndex(MI);
1205 LiveInterval &LI = LIS->getInterval(SrcReg);
1207 if (S->end.getBaseIndex() == Idx)
1208 S->end = CopyIdx.getRegSlot();
1209 }
1210 }
1211
1212 // We've set all the parameters without issue.
1213 return true;
1214}
1215
1216MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1218 LiveVariables *LV,
1219 LiveIntervals *LIS,
1220 bool Is8BitOp) const {
1221 // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1222 MachineBasicBlock &MBB = *MI.getParent();
1223 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1224 assert((Is8BitOp ||
1225 RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1226 *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1227 "Unexpected type for LEA transform");
1228
1229 // TODO: For a 32-bit target, we need to adjust the LEA variables with
1230 // something like this:
1231 // Opcode = X86::LEA32r;
1232 // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1233 // OutRegLEA =
1234 // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1235 // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1236 if (!Subtarget.is64Bit())
1237 return nullptr;
1238
1239 unsigned Opcode = X86::LEA64_32r;
1240 Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1241 Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1242 Register InRegLEA2;
1243
1244 // Build and insert into an implicit UNDEF value. This is OK because
1245 // we will be shifting and then extracting the lower 8/16-bits.
1246 // This has the potential to cause partial register stall. e.g.
1247 // movw (%rbp,%rcx,2), %dx
1248 // leal -65(%rdx), %esi
1249 // But testing has shown this *does* help performance in 64-bit mode (at
1250 // least on modern x86 machines).
1251 MachineBasicBlock::iterator MBBI = MI.getIterator();
1252 Register Dest = MI.getOperand(0).getReg();
1253 Register Src = MI.getOperand(1).getReg();
1254 unsigned SrcSubReg = MI.getOperand(1).getSubReg();
1255 Register Src2;
1256 unsigned Src2SubReg;
1257 bool IsDead = MI.getOperand(0).isDead();
1258 bool IsKill = MI.getOperand(1).isKill();
1259 unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1260 assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1261 MachineInstr *ImpDef =
1262 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1263 MachineInstr *InsMI =
1264 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1265 .addReg(InRegLEA, RegState::Define, SubReg)
1266 .addReg(Src, getKillRegState(IsKill), SrcSubReg);
1267 MachineInstr *ImpDef2 = nullptr;
1268 MachineInstr *InsMI2 = nullptr;
1269
1271 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1272#define CASE_NF(OP) \
1273 case X86::OP: \
1274 case X86::OP##_NF:
1275 switch (MIOpc) {
1276 default:
1277 llvm_unreachable("Unreachable!");
1278 CASE_NF(SHL8ri)
1279 CASE_NF(SHL16ri) {
1280 unsigned ShAmt = MI.getOperand(2).getImm();
1281 MIB.addReg(0)
1282 .addImm(1LL << ShAmt)
1283 .addReg(InRegLEA, RegState::Kill)
1284 .addImm(0)
1285 .addReg(0);
1286 break;
1287 }
1288 CASE_NF(INC8r)
1289 CASE_NF(INC16r)
1290 addRegOffset(MIB, InRegLEA, true, 1);
1291 break;
1292 CASE_NF(DEC8r)
1293 CASE_NF(DEC16r)
1294 addRegOffset(MIB, InRegLEA, true, -1);
1295 break;
1296 CASE_NF(ADD8ri)
1297 CASE_NF(ADD16ri)
1298 case X86::ADD8ri_DB:
1299 case X86::ADD16ri_DB:
1300 addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1301 break;
1302 CASE_NF(ADD8rr)
1303 CASE_NF(ADD16rr)
1304 case X86::ADD8rr_DB:
1305 case X86::ADD16rr_DB: {
1306 Src2 = MI.getOperand(2).getReg();
1307 Src2SubReg = MI.getOperand(2).getSubReg();
1308 bool IsKill2 = MI.getOperand(2).isKill();
1309 assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1310 if (Src == Src2) {
1311 // ADD8rr/ADD16rr killed %reg1028, %reg1028
1312 // just a single insert_subreg.
1313 addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA, false,
1314 X86::NoSubRegister);
1315 } else {
1316 if (Subtarget.is64Bit())
1317 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1318 else
1319 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1320 // Build and insert into an implicit UNDEF value. This is OK because
1321 // we will be shifting and then extracting the lower 8/16-bits.
1322 ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1323 InRegLEA2);
1324 InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1325 .addReg(InRegLEA2, RegState::Define, SubReg)
1326 .addReg(Src2, getKillRegState(IsKill2), Src2SubReg);
1327 addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA2, true,
1328 X86::NoSubRegister);
1329 }
1330 if (LV && IsKill2 && InsMI2)
1331 LV->replaceKillInstruction(Src2, MI, *InsMI2);
1332 break;
1333 }
1334 }
1335
1336 MachineInstr *NewMI = MIB;
1337 MachineInstr *ExtMI =
1338 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1340 .addReg(OutRegLEA, RegState::Kill, SubReg);
1341
1342 if (LV) {
1343 // Update live variables.
1344 LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1345 if (InRegLEA2)
1346 LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
1347 LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1348 if (IsKill)
1349 LV->replaceKillInstruction(Src, MI, *InsMI);
1350 if (IsDead)
1351 LV->replaceKillInstruction(Dest, MI, *ExtMI);
1352 }
1353
1354 if (LIS) {
1355 LIS->InsertMachineInstrInMaps(*ImpDef);
1356 SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1357 if (ImpDef2)
1358 LIS->InsertMachineInstrInMaps(*ImpDef2);
1359 SlotIndex Ins2Idx;
1360 if (InsMI2)
1361 Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1362 SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1363 SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
1364 LIS->getInterval(InRegLEA);
1365 LIS->getInterval(OutRegLEA);
1366 if (InRegLEA2)
1367 LIS->getInterval(InRegLEA2);
1368
1369 // Move the use of Src up to InsMI.
1370 LiveInterval &SrcLI = LIS->getInterval(Src);
1371 LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1372 if (SrcSeg->end == NewIdx.getRegSlot())
1373 SrcSeg->end = InsIdx.getRegSlot();
1374
1375 if (InsMI2) {
1376 // Move the use of Src2 up to InsMI2.
1377 LiveInterval &Src2LI = LIS->getInterval(Src2);
1378 LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1379 if (Src2Seg->end == NewIdx.getRegSlot())
1380 Src2Seg->end = Ins2Idx.getRegSlot();
1381 }
1382
1383 // Move the definition of Dest down to ExtMI.
1384 LiveInterval &DestLI = LIS->getInterval(Dest);
1385 LiveRange::Segment *DestSeg =
1386 DestLI.getSegmentContaining(NewIdx.getRegSlot());
1387 assert(DestSeg->start == NewIdx.getRegSlot() &&
1388 DestSeg->valno->def == NewIdx.getRegSlot());
1389 DestSeg->start = ExtIdx.getRegSlot();
1390 DestSeg->valno->def = ExtIdx.getRegSlot();
1391 }
1392
1393 return ExtMI;
1394}
1395
1396/// This method must be implemented by targets that
1397/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1398/// may be able to convert a two-address instruction into a true
1399/// three-address instruction on demand. This allows the X86 target (for
1400/// example) to convert ADD and SHL instructions into LEA instructions if they
1401/// would require register copies due to two-addressness.
1402///
1403/// This method returns a null pointer if the transformation cannot be
1404/// performed, otherwise it returns the new instruction.
1405///
1407 LiveVariables *LV,
1408 LiveIntervals *LIS) const {
1409 // The following opcodes also sets the condition code register(s). Only
1410 // convert them to equivalent lea if the condition code register def's
1411 // are dead!
1413 return nullptr;
1414
1415 MachineFunction &MF = *MI.getParent()->getParent();
1416 // All instructions input are two-addr instructions. Get the known operands.
1417 const MachineOperand &Dest = MI.getOperand(0);
1418 const MachineOperand &Src = MI.getOperand(1);
1419
1420 // Ideally, operations with undef should be folded before we get here, but we
1421 // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1422 // Without this, we have to forward undef state to new register operands to
1423 // avoid machine verifier errors.
1424 if (Src.isUndef())
1425 return nullptr;
1426 if (MI.getNumOperands() > 2)
1427 if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1428 return nullptr;
1429
1430 MachineInstr *NewMI = nullptr;
1431 Register SrcReg, SrcReg2;
1432 unsigned SrcSubReg, SrcSubReg2;
1433 bool Is64Bit = Subtarget.is64Bit();
1434
1435 bool Is8BitOp = false;
1436 unsigned NumRegOperands = 2;
1437 unsigned MIOpc = MI.getOpcode();
1438 switch (MIOpc) {
1439 default:
1440 llvm_unreachable("Unreachable!");
1441 CASE_NF(SHL64ri) {
1442 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1443 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1444 if (!isTruncatedShiftCountForLEA(ShAmt))
1445 return nullptr;
1446
1447 // LEA can't handle RSP.
1448 if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1449 Src.getReg(), &X86::GR64_NOSPRegClass))
1450 return nullptr;
1451
1452 NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1453 .add(Dest)
1454 .addReg(0)
1455 .addImm(1LL << ShAmt)
1456 .add(Src)
1457 .addImm(0)
1458 .addReg(0);
1459 break;
1460 }
1461 CASE_NF(SHL32ri) {
1462 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1463 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1464 if (!isTruncatedShiftCountForLEA(ShAmt))
1465 return nullptr;
1466
1467 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1468
1469 // LEA can't handle ESP.
1470 bool isKill;
1471 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1472 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1473 isKill, ImplicitOp, LV, LIS))
1474 return nullptr;
1475
1477 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1478 .add(Dest)
1479 .addReg(0)
1480 .addImm(1LL << ShAmt)
1481 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
1482 .addImm(0)
1483 .addReg(0);
1484 if (ImplicitOp.getReg() != 0)
1485 MIB.add(ImplicitOp);
1486 NewMI = MIB;
1487
1488 // Add kills if classifyLEAReg created a new register.
1489 if (LV && SrcReg != Src.getReg())
1490 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1491 break;
1492 }
1493 CASE_NF(SHL8ri)
1494 Is8BitOp = true;
1495 [[fallthrough]];
1496 CASE_NF(SHL16ri) {
1497 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1498 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1499 if (!isTruncatedShiftCountForLEA(ShAmt))
1500 return nullptr;
1501 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1502 }
1503 CASE_NF(INC64r)
1504 CASE_NF(INC32r) {
1505 assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1506 unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
1507 ? X86::LEA64r
1508 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1509 bool isKill;
1510 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1511 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1512 isKill, ImplicitOp, LV, LIS))
1513 return nullptr;
1514
1515 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1516 .add(Dest)
1517 .addReg(SrcReg, getKillRegState(isKill));
1518 if (ImplicitOp.getReg() != 0)
1519 MIB.add(ImplicitOp);
1520
1521 NewMI = addOffset(MIB, 1);
1522
1523 // Add kills if classifyLEAReg created a new register.
1524 if (LV && SrcReg != Src.getReg())
1525 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1526 break;
1527 }
1528 CASE_NF(DEC64r)
1529 CASE_NF(DEC32r) {
1530 assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1531 unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
1532 ? X86::LEA64r
1533 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1534
1535 bool isKill;
1536 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1537 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1538 isKill, ImplicitOp, LV, LIS))
1539 return nullptr;
1540
1541 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1542 .add(Dest)
1543 .addReg(SrcReg, getKillRegState(isKill));
1544 if (ImplicitOp.getReg() != 0)
1545 MIB.add(ImplicitOp);
1546
1547 NewMI = addOffset(MIB, -1);
1548
1549 // Add kills if classifyLEAReg created a new register.
1550 if (LV && SrcReg != Src.getReg())
1551 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1552 break;
1553 }
1554 CASE_NF(DEC8r)
1555 CASE_NF(INC8r)
1556 Is8BitOp = true;
1557 [[fallthrough]];
1558 CASE_NF(DEC16r)
1559 CASE_NF(INC16r)
1560 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1561 CASE_NF(ADD64rr)
1562 CASE_NF(ADD32rr)
1563 case X86::ADD64rr_DB:
1564 case X86::ADD32rr_DB: {
1565 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1566 unsigned Opc;
1567 if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_NF ||
1568 MIOpc == X86::ADD64rr_DB)
1569 Opc = X86::LEA64r;
1570 else
1571 Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1572
1573 const MachineOperand &Src2 = MI.getOperand(2);
1574 bool isKill2;
1575 MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1576 if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, SrcSubReg2,
1577 isKill2, ImplicitOp2, LV, LIS))
1578 return nullptr;
1579
1580 bool isKill;
1581 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1582 if (Src.getReg() == Src2.getReg()) {
1583 // Don't call classify LEAReg a second time on the same register, in case
1584 // the first call inserted a COPY from Src2 and marked it as killed.
1585 isKill = isKill2;
1586 SrcReg = SrcReg2;
1587 SrcSubReg = SrcSubReg2;
1588 } else {
1589 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1590 isKill, ImplicitOp, LV, LIS))
1591 return nullptr;
1592 }
1593
1594 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1595 if (ImplicitOp.getReg() != 0)
1596 MIB.add(ImplicitOp);
1597 if (ImplicitOp2.getReg() != 0)
1598 MIB.add(ImplicitOp2);
1599
1600 NewMI =
1601 addRegReg(MIB, SrcReg, isKill, SrcSubReg, SrcReg2, isKill2, SrcSubReg2);
1602
1603 // Add kills if classifyLEAReg created a new register.
1604 if (LV) {
1605 if (SrcReg2 != Src2.getReg())
1606 LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1607 if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1608 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1609 }
1610 NumRegOperands = 3;
1611 break;
1612 }
1613 CASE_NF(ADD8rr)
1614 case X86::ADD8rr_DB:
1615 Is8BitOp = true;
1616 [[fallthrough]];
1617 CASE_NF(ADD16rr)
1618 case X86::ADD16rr_DB:
1619 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1620 CASE_NF(ADD64ri32)
1621 case X86::ADD64ri32_DB:
1622 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1623 NewMI = addOffset(
1624 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1625 MI.getOperand(2));
1626 break;
1627 CASE_NF(ADD32ri)
1628 case X86::ADD32ri_DB: {
1629 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1630 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1631
1632 bool isKill;
1633 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1634 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1635 isKill, ImplicitOp, LV, LIS))
1636 return nullptr;
1637
1639 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1640 .add(Dest)
1641 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
1642 if (ImplicitOp.getReg() != 0)
1643 MIB.add(ImplicitOp);
1644
1645 NewMI = addOffset(MIB, MI.getOperand(2));
1646
1647 // Add kills if classifyLEAReg created a new register.
1648 if (LV && SrcReg != Src.getReg())
1649 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1650 break;
1651 }
1652 CASE_NF(ADD8ri)
1653 case X86::ADD8ri_DB:
1654 Is8BitOp = true;
1655 [[fallthrough]];
1656 CASE_NF(ADD16ri)
1657 case X86::ADD16ri_DB:
1658 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1659 CASE_NF(SUB8ri)
1660 CASE_NF(SUB16ri)
1661 /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1662 return nullptr;
1663 CASE_NF(SUB32ri) {
1664 if (!MI.getOperand(2).isImm())
1665 return nullptr;
1666 int64_t Imm = MI.getOperand(2).getImm();
1667 if (!isInt<32>(-Imm))
1668 return nullptr;
1669
1670 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1671 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1672
1673 bool isKill;
1674 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1675 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1676 isKill, ImplicitOp, LV, LIS))
1677 return nullptr;
1678
1680 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1681 .add(Dest)
1682 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
1683 if (ImplicitOp.getReg() != 0)
1684 MIB.add(ImplicitOp);
1685
1686 NewMI = addOffset(MIB, -Imm);
1687
1688 // Add kills if classifyLEAReg created a new register.
1689 if (LV && SrcReg != Src.getReg())
1690 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1691 break;
1692 }
1693
1694 CASE_NF(SUB64ri32) {
1695 if (!MI.getOperand(2).isImm())
1696 return nullptr;
1697 int64_t Imm = MI.getOperand(2).getImm();
1698 if (!isInt<32>(-Imm))
1699 return nullptr;
1700
1701 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1702
1704 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
1705 NewMI = addOffset(MIB, -Imm);
1706 break;
1707 }
1708
1709 case X86::VMOVDQU8Z128rmk:
1710 case X86::VMOVDQU8Z256rmk:
1711 case X86::VMOVDQU8Zrmk:
1712 case X86::VMOVDQU16Z128rmk:
1713 case X86::VMOVDQU16Z256rmk:
1714 case X86::VMOVDQU16Zrmk:
1715 case X86::VMOVDQU32Z128rmk:
1716 case X86::VMOVDQA32Z128rmk:
1717 case X86::VMOVDQU32Z256rmk:
1718 case X86::VMOVDQA32Z256rmk:
1719 case X86::VMOVDQU32Zrmk:
1720 case X86::VMOVDQA32Zrmk:
1721 case X86::VMOVDQU64Z128rmk:
1722 case X86::VMOVDQA64Z128rmk:
1723 case X86::VMOVDQU64Z256rmk:
1724 case X86::VMOVDQA64Z256rmk:
1725 case X86::VMOVDQU64Zrmk:
1726 case X86::VMOVDQA64Zrmk:
1727 case X86::VMOVUPDZ128rmk:
1728 case X86::VMOVAPDZ128rmk:
1729 case X86::VMOVUPDZ256rmk:
1730 case X86::VMOVAPDZ256rmk:
1731 case X86::VMOVUPDZrmk:
1732 case X86::VMOVAPDZrmk:
1733 case X86::VMOVUPSZ128rmk:
1734 case X86::VMOVAPSZ128rmk:
1735 case X86::VMOVUPSZ256rmk:
1736 case X86::VMOVAPSZ256rmk:
1737 case X86::VMOVUPSZrmk:
1738 case X86::VMOVAPSZrmk:
1739 case X86::VBROADCASTSDZ256rmk:
1740 case X86::VBROADCASTSDZrmk:
1741 case X86::VBROADCASTSSZ128rmk:
1742 case X86::VBROADCASTSSZ256rmk:
1743 case X86::VBROADCASTSSZrmk:
1744 case X86::VPBROADCASTDZ128rmk:
1745 case X86::VPBROADCASTDZ256rmk:
1746 case X86::VPBROADCASTDZrmk:
1747 case X86::VPBROADCASTQZ128rmk:
1748 case X86::VPBROADCASTQZ256rmk:
1749 case X86::VPBROADCASTQZrmk: {
1750 unsigned Opc;
1751 switch (MIOpc) {
1752 default:
1753 llvm_unreachable("Unreachable!");
1754 case X86::VMOVDQU8Z128rmk:
1755 Opc = X86::VPBLENDMBZ128rmk;
1756 break;
1757 case X86::VMOVDQU8Z256rmk:
1758 Opc = X86::VPBLENDMBZ256rmk;
1759 break;
1760 case X86::VMOVDQU8Zrmk:
1761 Opc = X86::VPBLENDMBZrmk;
1762 break;
1763 case X86::VMOVDQU16Z128rmk:
1764 Opc = X86::VPBLENDMWZ128rmk;
1765 break;
1766 case X86::VMOVDQU16Z256rmk:
1767 Opc = X86::VPBLENDMWZ256rmk;
1768 break;
1769 case X86::VMOVDQU16Zrmk:
1770 Opc = X86::VPBLENDMWZrmk;
1771 break;
1772 case X86::VMOVDQU32Z128rmk:
1773 Opc = X86::VPBLENDMDZ128rmk;
1774 break;
1775 case X86::VMOVDQU32Z256rmk:
1776 Opc = X86::VPBLENDMDZ256rmk;
1777 break;
1778 case X86::VMOVDQU32Zrmk:
1779 Opc = X86::VPBLENDMDZrmk;
1780 break;
1781 case X86::VMOVDQU64Z128rmk:
1782 Opc = X86::VPBLENDMQZ128rmk;
1783 break;
1784 case X86::VMOVDQU64Z256rmk:
1785 Opc = X86::VPBLENDMQZ256rmk;
1786 break;
1787 case X86::VMOVDQU64Zrmk:
1788 Opc = X86::VPBLENDMQZrmk;
1789 break;
1790 case X86::VMOVUPDZ128rmk:
1791 Opc = X86::VBLENDMPDZ128rmk;
1792 break;
1793 case X86::VMOVUPDZ256rmk:
1794 Opc = X86::VBLENDMPDZ256rmk;
1795 break;
1796 case X86::VMOVUPDZrmk:
1797 Opc = X86::VBLENDMPDZrmk;
1798 break;
1799 case X86::VMOVUPSZ128rmk:
1800 Opc = X86::VBLENDMPSZ128rmk;
1801 break;
1802 case X86::VMOVUPSZ256rmk:
1803 Opc = X86::VBLENDMPSZ256rmk;
1804 break;
1805 case X86::VMOVUPSZrmk:
1806 Opc = X86::VBLENDMPSZrmk;
1807 break;
1808 case X86::VMOVDQA32Z128rmk:
1809 Opc = X86::VPBLENDMDZ128rmk;
1810 break;
1811 case X86::VMOVDQA32Z256rmk:
1812 Opc = X86::VPBLENDMDZ256rmk;
1813 break;
1814 case X86::VMOVDQA32Zrmk:
1815 Opc = X86::VPBLENDMDZrmk;
1816 break;
1817 case X86::VMOVDQA64Z128rmk:
1818 Opc = X86::VPBLENDMQZ128rmk;
1819 break;
1820 case X86::VMOVDQA64Z256rmk:
1821 Opc = X86::VPBLENDMQZ256rmk;
1822 break;
1823 case X86::VMOVDQA64Zrmk:
1824 Opc = X86::VPBLENDMQZrmk;
1825 break;
1826 case X86::VMOVAPDZ128rmk:
1827 Opc = X86::VBLENDMPDZ128rmk;
1828 break;
1829 case X86::VMOVAPDZ256rmk:
1830 Opc = X86::VBLENDMPDZ256rmk;
1831 break;
1832 case X86::VMOVAPDZrmk:
1833 Opc = X86::VBLENDMPDZrmk;
1834 break;
1835 case X86::VMOVAPSZ128rmk:
1836 Opc = X86::VBLENDMPSZ128rmk;
1837 break;
1838 case X86::VMOVAPSZ256rmk:
1839 Opc = X86::VBLENDMPSZ256rmk;
1840 break;
1841 case X86::VMOVAPSZrmk:
1842 Opc = X86::VBLENDMPSZrmk;
1843 break;
1844 case X86::VBROADCASTSDZ256rmk:
1845 Opc = X86::VBLENDMPDZ256rmbk;
1846 break;
1847 case X86::VBROADCASTSDZrmk:
1848 Opc = X86::VBLENDMPDZrmbk;
1849 break;
1850 case X86::VBROADCASTSSZ128rmk:
1851 Opc = X86::VBLENDMPSZ128rmbk;
1852 break;
1853 case X86::VBROADCASTSSZ256rmk:
1854 Opc = X86::VBLENDMPSZ256rmbk;
1855 break;
1856 case X86::VBROADCASTSSZrmk:
1857 Opc = X86::VBLENDMPSZrmbk;
1858 break;
1859 case X86::VPBROADCASTDZ128rmk:
1860 Opc = X86::VPBLENDMDZ128rmbk;
1861 break;
1862 case X86::VPBROADCASTDZ256rmk:
1863 Opc = X86::VPBLENDMDZ256rmbk;
1864 break;
1865 case X86::VPBROADCASTDZrmk:
1866 Opc = X86::VPBLENDMDZrmbk;
1867 break;
1868 case X86::VPBROADCASTQZ128rmk:
1869 Opc = X86::VPBLENDMQZ128rmbk;
1870 break;
1871 case X86::VPBROADCASTQZ256rmk:
1872 Opc = X86::VPBLENDMQZ256rmbk;
1873 break;
1874 case X86::VPBROADCASTQZrmk:
1875 Opc = X86::VPBLENDMQZrmbk;
1876 break;
1877 }
1878
1879 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1880 .add(Dest)
1881 .add(MI.getOperand(2))
1882 .add(Src)
1883 .add(MI.getOperand(3))
1884 .add(MI.getOperand(4))
1885 .add(MI.getOperand(5))
1886 .add(MI.getOperand(6))
1887 .add(MI.getOperand(7));
1888 NumRegOperands = 4;
1889 break;
1890 }
1891
1892 case X86::VMOVDQU8Z128rrk:
1893 case X86::VMOVDQU8Z256rrk:
1894 case X86::VMOVDQU8Zrrk:
1895 case X86::VMOVDQU16Z128rrk:
1896 case X86::VMOVDQU16Z256rrk:
1897 case X86::VMOVDQU16Zrrk:
1898 case X86::VMOVDQU32Z128rrk:
1899 case X86::VMOVDQA32Z128rrk:
1900 case X86::VMOVDQU32Z256rrk:
1901 case X86::VMOVDQA32Z256rrk:
1902 case X86::VMOVDQU32Zrrk:
1903 case X86::VMOVDQA32Zrrk:
1904 case X86::VMOVDQU64Z128rrk:
1905 case X86::VMOVDQA64Z128rrk:
1906 case X86::VMOVDQU64Z256rrk:
1907 case X86::VMOVDQA64Z256rrk:
1908 case X86::VMOVDQU64Zrrk:
1909 case X86::VMOVDQA64Zrrk:
1910 case X86::VMOVUPDZ128rrk:
1911 case X86::VMOVAPDZ128rrk:
1912 case X86::VMOVUPDZ256rrk:
1913 case X86::VMOVAPDZ256rrk:
1914 case X86::VMOVUPDZrrk:
1915 case X86::VMOVAPDZrrk:
1916 case X86::VMOVUPSZ128rrk:
1917 case X86::VMOVAPSZ128rrk:
1918 case X86::VMOVUPSZ256rrk:
1919 case X86::VMOVAPSZ256rrk:
1920 case X86::VMOVUPSZrrk:
1921 case X86::VMOVAPSZrrk: {
1922 unsigned Opc;
1923 switch (MIOpc) {
1924 default:
1925 llvm_unreachable("Unreachable!");
1926 case X86::VMOVDQU8Z128rrk:
1927 Opc = X86::VPBLENDMBZ128rrk;
1928 break;
1929 case X86::VMOVDQU8Z256rrk:
1930 Opc = X86::VPBLENDMBZ256rrk;
1931 break;
1932 case X86::VMOVDQU8Zrrk:
1933 Opc = X86::VPBLENDMBZrrk;
1934 break;
1935 case X86::VMOVDQU16Z128rrk:
1936 Opc = X86::VPBLENDMWZ128rrk;
1937 break;
1938 case X86::VMOVDQU16Z256rrk:
1939 Opc = X86::VPBLENDMWZ256rrk;
1940 break;
1941 case X86::VMOVDQU16Zrrk:
1942 Opc = X86::VPBLENDMWZrrk;
1943 break;
1944 case X86::VMOVDQU32Z128rrk:
1945 Opc = X86::VPBLENDMDZ128rrk;
1946 break;
1947 case X86::VMOVDQU32Z256rrk:
1948 Opc = X86::VPBLENDMDZ256rrk;
1949 break;
1950 case X86::VMOVDQU32Zrrk:
1951 Opc = X86::VPBLENDMDZrrk;
1952 break;
1953 case X86::VMOVDQU64Z128rrk:
1954 Opc = X86::VPBLENDMQZ128rrk;
1955 break;
1956 case X86::VMOVDQU64Z256rrk:
1957 Opc = X86::VPBLENDMQZ256rrk;
1958 break;
1959 case X86::VMOVDQU64Zrrk:
1960 Opc = X86::VPBLENDMQZrrk;
1961 break;
1962 case X86::VMOVUPDZ128rrk:
1963 Opc = X86::VBLENDMPDZ128rrk;
1964 break;
1965 case X86::VMOVUPDZ256rrk:
1966 Opc = X86::VBLENDMPDZ256rrk;
1967 break;
1968 case X86::VMOVUPDZrrk:
1969 Opc = X86::VBLENDMPDZrrk;
1970 break;
1971 case X86::VMOVUPSZ128rrk:
1972 Opc = X86::VBLENDMPSZ128rrk;
1973 break;
1974 case X86::VMOVUPSZ256rrk:
1975 Opc = X86::VBLENDMPSZ256rrk;
1976 break;
1977 case X86::VMOVUPSZrrk:
1978 Opc = X86::VBLENDMPSZrrk;
1979 break;
1980 case X86::VMOVDQA32Z128rrk:
1981 Opc = X86::VPBLENDMDZ128rrk;
1982 break;
1983 case X86::VMOVDQA32Z256rrk:
1984 Opc = X86::VPBLENDMDZ256rrk;
1985 break;
1986 case X86::VMOVDQA32Zrrk:
1987 Opc = X86::VPBLENDMDZrrk;
1988 break;
1989 case X86::VMOVDQA64Z128rrk:
1990 Opc = X86::VPBLENDMQZ128rrk;
1991 break;
1992 case X86::VMOVDQA64Z256rrk:
1993 Opc = X86::VPBLENDMQZ256rrk;
1994 break;
1995 case X86::VMOVDQA64Zrrk:
1996 Opc = X86::VPBLENDMQZrrk;
1997 break;
1998 case X86::VMOVAPDZ128rrk:
1999 Opc = X86::VBLENDMPDZ128rrk;
2000 break;
2001 case X86::VMOVAPDZ256rrk:
2002 Opc = X86::VBLENDMPDZ256rrk;
2003 break;
2004 case X86::VMOVAPDZrrk:
2005 Opc = X86::VBLENDMPDZrrk;
2006 break;
2007 case X86::VMOVAPSZ128rrk:
2008 Opc = X86::VBLENDMPSZ128rrk;
2009 break;
2010 case X86::VMOVAPSZ256rrk:
2011 Opc = X86::VBLENDMPSZ256rrk;
2012 break;
2013 case X86::VMOVAPSZrrk:
2014 Opc = X86::VBLENDMPSZrrk;
2015 break;
2016 }
2017
2018 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
2019 .add(Dest)
2020 .add(MI.getOperand(2))
2021 .add(Src)
2022 .add(MI.getOperand(3));
2023 NumRegOperands = 4;
2024 break;
2025 }
2026 }
2027#undef CASE_NF
2028
2029 if (!NewMI)
2030 return nullptr;
2031
2032 if (LV) { // Update live variables
2033 for (unsigned I = 0; I < NumRegOperands; ++I) {
2034 MachineOperand &Op = MI.getOperand(I);
2035 if (Op.isReg() && (Op.isDead() || Op.isKill()))
2036 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
2037 }
2038 }
2039
2040 MachineBasicBlock &MBB = *MI.getParent();
2041 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
2042
2043 if (LIS) {
2044 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
2045 if (SrcReg)
2046 LIS->getInterval(SrcReg);
2047 if (SrcReg2)
2048 LIS->getInterval(SrcReg2);
2049 }
2050
2051 return NewMI;
2052}
2053
2054/// This determines which of three possible cases of a three source commute
2055/// the source indexes correspond to taking into account any mask operands.
2056/// All prevents commuting a passthru operand. Returns -1 if the commute isn't
2057/// possible.
2058/// Case 0 - Possible to commute the first and second operands.
2059/// Case 1 - Possible to commute the first and third operands.
2060/// Case 2 - Possible to commute the second and third operands.
2061static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2062 unsigned SrcOpIdx2) {
2063 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
2064 if (SrcOpIdx1 > SrcOpIdx2)
2065 std::swap(SrcOpIdx1, SrcOpIdx2);
2066
2067 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2068 if (X86II::isKMasked(TSFlags)) {
2069 Op2++;
2070 Op3++;
2071 }
2072
2073 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2074 return 0;
2075 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2076 return 1;
2077 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2078 return 2;
2079 llvm_unreachable("Unknown three src commute case.");
2080}
2081
2083 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
2084 const X86InstrFMA3Group &FMA3Group) const {
2085
2086 unsigned Opc = MI.getOpcode();
2087
2088 // TODO: Commuting the 1st operand of FMA*_Int requires some additional
2089 // analysis. The commute optimization is legal only if all users of FMA*_Int
2090 // use only the lowest element of the FMA*_Int instruction. Such analysis are
2091 // not implemented yet. So, just return 0 in that case.
2092 // When such analysis are available this place will be the right place for
2093 // calling it.
2094 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
2095 "Intrinsic instructions can't commute operand 1");
2096
2097 // Determine which case this commute is or if it can't be done.
2098 unsigned Case =
2099 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2100 assert(Case < 3 && "Unexpected case number!");
2101
2102 // Define the FMA forms mapping array that helps to map input FMA form
2103 // to output FMA form to preserve the operation semantics after
2104 // commuting the operands.
2105 const unsigned Form132Index = 0;
2106 const unsigned Form213Index = 1;
2107 const unsigned Form231Index = 2;
2108 static const unsigned FormMapping[][3] = {
2109 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
2110 // FMA132 A, C, b; ==> FMA231 C, A, b;
2111 // FMA213 B, A, c; ==> FMA213 A, B, c;
2112 // FMA231 C, A, b; ==> FMA132 A, C, b;
2113 {Form231Index, Form213Index, Form132Index},
2114 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
2115 // FMA132 A, c, B; ==> FMA132 B, c, A;
2116 // FMA213 B, a, C; ==> FMA231 C, a, B;
2117 // FMA231 C, a, B; ==> FMA213 B, a, C;
2118 {Form132Index, Form231Index, Form213Index},
2119 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
2120 // FMA132 a, C, B; ==> FMA213 a, B, C;
2121 // FMA213 b, A, C; ==> FMA132 b, C, A;
2122 // FMA231 c, A, B; ==> FMA231 c, B, A;
2123 {Form213Index, Form132Index, Form231Index}};
2124
2125 unsigned FMAForms[3];
2126 FMAForms[0] = FMA3Group.get132Opcode();
2127 FMAForms[1] = FMA3Group.get213Opcode();
2128 FMAForms[2] = FMA3Group.get231Opcode();
2129
2130 // Everything is ready, just adjust the FMA opcode and return it.
2131 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
2132 if (Opc == FMAForms[FormIndex])
2133 return FMAForms[FormMapping[Case][FormIndex]];
2134
2135 llvm_unreachable("Illegal FMA3 format");
2136}
2137
2138static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2139 unsigned SrcOpIdx2) {
2140 // Determine which case this commute is or if it can't be done.
2141 unsigned Case =
2142 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2143 assert(Case < 3 && "Unexpected case value!");
2144
2145 // For each case we need to swap two pairs of bits in the final immediate.
2146 static const uint8_t SwapMasks[3][4] = {
2147 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2148 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2149 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2150 };
2151
2152 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2153 // Clear out the bits we are swapping.
2154 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2155 SwapMasks[Case][2] | SwapMasks[Case][3]);
2156 // If the immediate had a bit of the pair set, then set the opposite bit.
2157 if (Imm & SwapMasks[Case][0])
2158 NewImm |= SwapMasks[Case][1];
2159 if (Imm & SwapMasks[Case][1])
2160 NewImm |= SwapMasks[Case][0];
2161 if (Imm & SwapMasks[Case][2])
2162 NewImm |= SwapMasks[Case][3];
2163 if (Imm & SwapMasks[Case][3])
2164 NewImm |= SwapMasks[Case][2];
2165 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2166}
2167
2168// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
2169// commuted.
2170static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
2171#define VPERM_CASES(Suffix) \
2172 case X86::VPERMI2##Suffix##Z128rr: \
2173 case X86::VPERMT2##Suffix##Z128rr: \
2174 case X86::VPERMI2##Suffix##Z256rr: \
2175 case X86::VPERMT2##Suffix##Z256rr: \
2176 case X86::VPERMI2##Suffix##Zrr: \
2177 case X86::VPERMT2##Suffix##Zrr: \
2178 case X86::VPERMI2##Suffix##Z128rm: \
2179 case X86::VPERMT2##Suffix##Z128rm: \
2180 case X86::VPERMI2##Suffix##Z256rm: \
2181 case X86::VPERMT2##Suffix##Z256rm: \
2182 case X86::VPERMI2##Suffix##Zrm: \
2183 case X86::VPERMT2##Suffix##Zrm: \
2184 case X86::VPERMI2##Suffix##Z128rrkz: \
2185 case X86::VPERMT2##Suffix##Z128rrkz: \
2186 case X86::VPERMI2##Suffix##Z256rrkz: \
2187 case X86::VPERMT2##Suffix##Z256rrkz: \
2188 case X86::VPERMI2##Suffix##Zrrkz: \
2189 case X86::VPERMT2##Suffix##Zrrkz: \
2190 case X86::VPERMI2##Suffix##Z128rmkz: \
2191 case X86::VPERMT2##Suffix##Z128rmkz: \
2192 case X86::VPERMI2##Suffix##Z256rmkz: \
2193 case X86::VPERMT2##Suffix##Z256rmkz: \
2194 case X86::VPERMI2##Suffix##Zrmkz: \
2195 case X86::VPERMT2##Suffix##Zrmkz:
2196
2197#define VPERM_CASES_BROADCAST(Suffix) \
2198 VPERM_CASES(Suffix) \
2199 case X86::VPERMI2##Suffix##Z128rmb: \
2200 case X86::VPERMT2##Suffix##Z128rmb: \
2201 case X86::VPERMI2##Suffix##Z256rmb: \
2202 case X86::VPERMT2##Suffix##Z256rmb: \
2203 case X86::VPERMI2##Suffix##Zrmb: \
2204 case X86::VPERMT2##Suffix##Zrmb: \
2205 case X86::VPERMI2##Suffix##Z128rmbkz: \
2206 case X86::VPERMT2##Suffix##Z128rmbkz: \
2207 case X86::VPERMI2##Suffix##Z256rmbkz: \
2208 case X86::VPERMT2##Suffix##Z256rmbkz: \
2209 case X86::VPERMI2##Suffix##Zrmbkz: \
2210 case X86::VPERMT2##Suffix##Zrmbkz:
2211
2212 switch (Opcode) {
2213 default:
2214 return false;
2215 VPERM_CASES(B)
2220 VPERM_CASES(W)
2221 return true;
2222 }
2223#undef VPERM_CASES_BROADCAST
2224#undef VPERM_CASES
2225}
2226
2227// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
2228// from the I opcode to the T opcode and vice versa.
2229static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2230#define VPERM_CASES(Orig, New) \
2231 case X86::Orig##Z128rr: \
2232 return X86::New##Z128rr; \
2233 case X86::Orig##Z128rrkz: \
2234 return X86::New##Z128rrkz; \
2235 case X86::Orig##Z128rm: \
2236 return X86::New##Z128rm; \
2237 case X86::Orig##Z128rmkz: \
2238 return X86::New##Z128rmkz; \
2239 case X86::Orig##Z256rr: \
2240 return X86::New##Z256rr; \
2241 case X86::Orig##Z256rrkz: \
2242 return X86::New##Z256rrkz; \
2243 case X86::Orig##Z256rm: \
2244 return X86::New##Z256rm; \
2245 case X86::Orig##Z256rmkz: \
2246 return X86::New##Z256rmkz; \
2247 case X86::Orig##Zrr: \
2248 return X86::New##Zrr; \
2249 case X86::Orig##Zrrkz: \
2250 return X86::New##Zrrkz; \
2251 case X86::Orig##Zrm: \
2252 return X86::New##Zrm; \
2253 case X86::Orig##Zrmkz: \
2254 return X86::New##Zrmkz;
2255
2256#define VPERM_CASES_BROADCAST(Orig, New) \
2257 VPERM_CASES(Orig, New) \
2258 case X86::Orig##Z128rmb: \
2259 return X86::New##Z128rmb; \
2260 case X86::Orig##Z128rmbkz: \
2261 return X86::New##Z128rmbkz; \
2262 case X86::Orig##Z256rmb: \
2263 return X86::New##Z256rmb; \
2264 case X86::Orig##Z256rmbkz: \
2265 return X86::New##Z256rmbkz; \
2266 case X86::Orig##Zrmb: \
2267 return X86::New##Zrmb; \
2268 case X86::Orig##Zrmbkz: \
2269 return X86::New##Zrmbkz;
2270
2271 switch (Opcode) {
2272 VPERM_CASES(VPERMI2B, VPERMT2B)
2273 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2274 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2275 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2276 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2277 VPERM_CASES(VPERMI2W, VPERMT2W)
2278 VPERM_CASES(VPERMT2B, VPERMI2B)
2279 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2280 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2281 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2282 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2283 VPERM_CASES(VPERMT2W, VPERMI2W)
2284 }
2285
2286 llvm_unreachable("Unreachable!");
2287#undef VPERM_CASES_BROADCAST
2288#undef VPERM_CASES
2289}
2290
2292 unsigned OpIdx1,
2293 unsigned OpIdx2) const {
2294 auto CloneIfNew = [&](MachineInstr &MI) {
2295 return std::exchange(NewMI, false)
2296 ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
2297 : &MI;
2298 };
2299 MachineInstr *WorkingMI = nullptr;
2300 unsigned Opc = MI.getOpcode();
2301
2302#define CASE_ND(OP) \
2303 case X86::OP: \
2304 case X86::OP##_ND:
2305
2306 switch (Opc) {
2307 // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
2308 CASE_ND(SHRD16rri8)
2309 CASE_ND(SHLD16rri8)
2310 CASE_ND(SHRD32rri8)
2311 CASE_ND(SHLD32rri8)
2312 CASE_ND(SHRD64rri8)
2313 CASE_ND(SHLD64rri8) {
2314 unsigned Size;
2315 switch (Opc) {
2316 default:
2317 llvm_unreachable("Unreachable!");
2318#define FROM_TO_SIZE(A, B, S) \
2319 case X86::A: \
2320 Opc = X86::B; \
2321 Size = S; \
2322 break; \
2323 case X86::A##_ND: \
2324 Opc = X86::B##_ND; \
2325 Size = S; \
2326 break; \
2327 case X86::B: \
2328 Opc = X86::A; \
2329 Size = S; \
2330 break; \
2331 case X86::B##_ND: \
2332 Opc = X86::A##_ND; \
2333 Size = S; \
2334 break;
2335
2336 FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
2337 FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
2338 FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
2339#undef FROM_TO_SIZE
2340 }
2341 WorkingMI = CloneIfNew(MI);
2342 WorkingMI->setDesc(get(Opc));
2343 WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
2344 break;
2345 }
2346 case X86::PFSUBrr:
2347 case X86::PFSUBRrr:
2348 // PFSUB x, y: x = x - y
2349 // PFSUBR x, y: x = y - x
2350 WorkingMI = CloneIfNew(MI);
2351 WorkingMI->setDesc(
2352 get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
2353 break;
2354 case X86::BLENDPDrri:
2355 case X86::BLENDPSrri:
2356 case X86::PBLENDWrri:
2357 case X86::VBLENDPDrri:
2358 case X86::VBLENDPSrri:
2359 case X86::VBLENDPDYrri:
2360 case X86::VBLENDPSYrri:
2361 case X86::VPBLENDDrri:
2362 case X86::VPBLENDWrri:
2363 case X86::VPBLENDDYrri:
2364 case X86::VPBLENDWYrri: {
2365 int8_t Mask;
2366 switch (Opc) {
2367 default:
2368 llvm_unreachable("Unreachable!");
2369 case X86::BLENDPDrri:
2370 Mask = (int8_t)0x03;
2371 break;
2372 case X86::BLENDPSrri:
2373 Mask = (int8_t)0x0F;
2374 break;
2375 case X86::PBLENDWrri:
2376 Mask = (int8_t)0xFF;
2377 break;
2378 case X86::VBLENDPDrri:
2379 Mask = (int8_t)0x03;
2380 break;
2381 case X86::VBLENDPSrri:
2382 Mask = (int8_t)0x0F;
2383 break;
2384 case X86::VBLENDPDYrri:
2385 Mask = (int8_t)0x0F;
2386 break;
2387 case X86::VBLENDPSYrri:
2388 Mask = (int8_t)0xFF;
2389 break;
2390 case X86::VPBLENDDrri:
2391 Mask = (int8_t)0x0F;
2392 break;
2393 case X86::VPBLENDWrri:
2394 Mask = (int8_t)0xFF;
2395 break;
2396 case X86::VPBLENDDYrri:
2397 Mask = (int8_t)0xFF;
2398 break;
2399 case X86::VPBLENDWYrri:
2400 Mask = (int8_t)0xFF;
2401 break;
2402 }
2403 // Only the least significant bits of Imm are used.
2404 // Using int8_t to ensure it will be sign extended to the int64_t that
2405 // setImm takes in order to match isel behavior.
2406 int8_t Imm = MI.getOperand(3).getImm() & Mask;
2407 WorkingMI = CloneIfNew(MI);
2408 WorkingMI->getOperand(3).setImm(Mask ^ Imm);
2409 break;
2410 }
2411 case X86::INSERTPSrri:
2412 case X86::VINSERTPSrri:
2413 case X86::VINSERTPSZrri: {
2414 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2415 unsigned ZMask = Imm & 15;
2416 unsigned DstIdx = (Imm >> 4) & 3;
2417 unsigned SrcIdx = (Imm >> 6) & 3;
2418
2419 // We can commute insertps if we zero 2 of the elements, the insertion is
2420 // "inline" and we don't override the insertion with a zero.
2421 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2422 llvm::popcount(ZMask) == 2) {
2423 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2424 assert(AltIdx < 4 && "Illegal insertion index");
2425 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2426 WorkingMI = CloneIfNew(MI);
2427 WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2428 break;
2429 }
2430 return nullptr;
2431 }
2432 case X86::MOVSDrr:
2433 case X86::MOVSSrr:
2434 case X86::VMOVSDrr:
2435 case X86::VMOVSSrr: {
2436 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2437 if (Subtarget.hasSSE41()) {
2438 unsigned Mask;
2439 switch (Opc) {
2440 default:
2441 llvm_unreachable("Unreachable!");
2442 case X86::MOVSDrr:
2443 Opc = X86::BLENDPDrri;
2444 Mask = 0x02;
2445 break;
2446 case X86::MOVSSrr:
2447 Opc = X86::BLENDPSrri;
2448 Mask = 0x0E;
2449 break;
2450 case X86::VMOVSDrr:
2451 Opc = X86::VBLENDPDrri;
2452 Mask = 0x02;
2453 break;
2454 case X86::VMOVSSrr:
2455 Opc = X86::VBLENDPSrri;
2456 Mask = 0x0E;
2457 break;
2458 }
2459
2460 WorkingMI = CloneIfNew(MI);
2461 WorkingMI->setDesc(get(Opc));
2462 WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
2463 break;
2464 }
2465
2466 assert(Opc == X86::MOVSDrr && "Only MOVSD can commute to SHUFPD");
2467 WorkingMI = CloneIfNew(MI);
2468 WorkingMI->setDesc(get(X86::SHUFPDrri));
2469 WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
2470 break;
2471 }
2472 case X86::SHUFPDrri: {
2473 // Commute to MOVSD.
2474 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2475 WorkingMI = CloneIfNew(MI);
2476 WorkingMI->setDesc(get(X86::MOVSDrr));
2477 WorkingMI->removeOperand(3);
2478 break;
2479 }
2480 case X86::PCLMULQDQrri:
2481 case X86::VPCLMULQDQrri:
2482 case X86::VPCLMULQDQYrri:
2483 case X86::VPCLMULQDQZrri:
2484 case X86::VPCLMULQDQZ128rri:
2485 case X86::VPCLMULQDQZ256rri: {
2486 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2487 // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2488 unsigned Imm = MI.getOperand(3).getImm();
2489 unsigned Src1Hi = Imm & 0x01;
2490 unsigned Src2Hi = Imm & 0x10;
2491 WorkingMI = CloneIfNew(MI);
2492 WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2493 break;
2494 }
2495 case X86::VPCMPBZ128rri:
2496 case X86::VPCMPUBZ128rri:
2497 case X86::VPCMPBZ256rri:
2498 case X86::VPCMPUBZ256rri:
2499 case X86::VPCMPBZrri:
2500 case X86::VPCMPUBZrri:
2501 case X86::VPCMPDZ128rri:
2502 case X86::VPCMPUDZ128rri:
2503 case X86::VPCMPDZ256rri:
2504 case X86::VPCMPUDZ256rri:
2505 case X86::VPCMPDZrri:
2506 case X86::VPCMPUDZrri:
2507 case X86::VPCMPQZ128rri:
2508 case X86::VPCMPUQZ128rri:
2509 case X86::VPCMPQZ256rri:
2510 case X86::VPCMPUQZ256rri:
2511 case X86::VPCMPQZrri:
2512 case X86::VPCMPUQZrri:
2513 case X86::VPCMPWZ128rri:
2514 case X86::VPCMPUWZ128rri:
2515 case X86::VPCMPWZ256rri:
2516 case X86::VPCMPUWZ256rri:
2517 case X86::VPCMPWZrri:
2518 case X86::VPCMPUWZrri:
2519 case X86::VPCMPBZ128rrik:
2520 case X86::VPCMPUBZ128rrik:
2521 case X86::VPCMPBZ256rrik:
2522 case X86::VPCMPUBZ256rrik:
2523 case X86::VPCMPBZrrik:
2524 case X86::VPCMPUBZrrik:
2525 case X86::VPCMPDZ128rrik:
2526 case X86::VPCMPUDZ128rrik:
2527 case X86::VPCMPDZ256rrik:
2528 case X86::VPCMPUDZ256rrik:
2529 case X86::VPCMPDZrrik:
2530 case X86::VPCMPUDZrrik:
2531 case X86::VPCMPQZ128rrik:
2532 case X86::VPCMPUQZ128rrik:
2533 case X86::VPCMPQZ256rrik:
2534 case X86::VPCMPUQZ256rrik:
2535 case X86::VPCMPQZrrik:
2536 case X86::VPCMPUQZrrik:
2537 case X86::VPCMPWZ128rrik:
2538 case X86::VPCMPUWZ128rrik:
2539 case X86::VPCMPWZ256rrik:
2540 case X86::VPCMPUWZ256rrik:
2541 case X86::VPCMPWZrrik:
2542 case X86::VPCMPUWZrrik:
2543 WorkingMI = CloneIfNew(MI);
2544 // Flip comparison mode immediate (if necessary).
2545 WorkingMI->getOperand(MI.getNumOperands() - 1)
2547 MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
2548 break;
2549 case X86::VPCOMBri:
2550 case X86::VPCOMUBri:
2551 case X86::VPCOMDri:
2552 case X86::VPCOMUDri:
2553 case X86::VPCOMQri:
2554 case X86::VPCOMUQri:
2555 case X86::VPCOMWri:
2556 case X86::VPCOMUWri:
2557 WorkingMI = CloneIfNew(MI);
2558 // Flip comparison mode immediate (if necessary).
2559 WorkingMI->getOperand(3).setImm(
2560 X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
2561 break;
2562 case X86::VCMPSDZrri:
2563 case X86::VCMPSSZrri:
2564 case X86::VCMPPDZrri:
2565 case X86::VCMPPSZrri:
2566 case X86::VCMPSHZrri:
2567 case X86::VCMPPHZrri:
2568 case X86::VCMPPHZ128rri:
2569 case X86::VCMPPHZ256rri:
2570 case X86::VCMPPDZ128rri:
2571 case X86::VCMPPSZ128rri:
2572 case X86::VCMPPDZ256rri:
2573 case X86::VCMPPSZ256rri:
2574 case X86::VCMPPDZrrik:
2575 case X86::VCMPPSZrrik:
2576 case X86::VCMPPHZrrik:
2577 case X86::VCMPPDZ128rrik:
2578 case X86::VCMPPSZ128rrik:
2579 case X86::VCMPPHZ128rrik:
2580 case X86::VCMPPDZ256rrik:
2581 case X86::VCMPPSZ256rrik:
2582 case X86::VCMPPHZ256rrik:
2583 WorkingMI = CloneIfNew(MI);
2584 WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
2586 MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
2587 break;
2588 case X86::VPERM2F128rri:
2589 case X86::VPERM2I128rri:
2590 // Flip permute source immediate.
2591 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2592 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2593 WorkingMI = CloneIfNew(MI);
2594 WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
2595 break;
2596 case X86::MOVHLPSrr:
2597 case X86::UNPCKHPDrr:
2598 case X86::VMOVHLPSrr:
2599 case X86::VUNPCKHPDrr:
2600 case X86::VMOVHLPSZrr:
2601 case X86::VUNPCKHPDZ128rr:
2602 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2603
2604 switch (Opc) {
2605 default:
2606 llvm_unreachable("Unreachable!");
2607 case X86::MOVHLPSrr:
2608 Opc = X86::UNPCKHPDrr;
2609 break;
2610 case X86::UNPCKHPDrr:
2611 Opc = X86::MOVHLPSrr;
2612 break;
2613 case X86::VMOVHLPSrr:
2614 Opc = X86::VUNPCKHPDrr;
2615 break;
2616 case X86::VUNPCKHPDrr:
2617 Opc = X86::VMOVHLPSrr;
2618 break;
2619 case X86::VMOVHLPSZrr:
2620 Opc = X86::VUNPCKHPDZ128rr;
2621 break;
2622 case X86::VUNPCKHPDZ128rr:
2623 Opc = X86::VMOVHLPSZrr;
2624 break;
2625 }
2626 WorkingMI = CloneIfNew(MI);
2627 WorkingMI->setDesc(get(Opc));
2628 break;
2629 CASE_ND(CMOV16rr)
2630 CASE_ND(CMOV32rr)
2631 CASE_ND(CMOV64rr) {
2632 WorkingMI = CloneIfNew(MI);
2633 unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2634 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2636 break;
2637 }
2638 case X86::VPTERNLOGDZrri:
2639 case X86::VPTERNLOGDZrmi:
2640 case X86::VPTERNLOGDZ128rri:
2641 case X86::VPTERNLOGDZ128rmi:
2642 case X86::VPTERNLOGDZ256rri:
2643 case X86::VPTERNLOGDZ256rmi:
2644 case X86::VPTERNLOGQZrri:
2645 case X86::VPTERNLOGQZrmi:
2646 case X86::VPTERNLOGQZ128rri:
2647 case X86::VPTERNLOGQZ128rmi:
2648 case X86::VPTERNLOGQZ256rri:
2649 case X86::VPTERNLOGQZ256rmi:
2650 case X86::VPTERNLOGDZrrik:
2651 case X86::VPTERNLOGDZ128rrik:
2652 case X86::VPTERNLOGDZ256rrik:
2653 case X86::VPTERNLOGQZrrik:
2654 case X86::VPTERNLOGQZ128rrik:
2655 case X86::VPTERNLOGQZ256rrik:
2656 case X86::VPTERNLOGDZrrikz:
2657 case X86::VPTERNLOGDZrmikz:
2658 case X86::VPTERNLOGDZ128rrikz:
2659 case X86::VPTERNLOGDZ128rmikz:
2660 case X86::VPTERNLOGDZ256rrikz:
2661 case X86::VPTERNLOGDZ256rmikz:
2662 case X86::VPTERNLOGQZrrikz:
2663 case X86::VPTERNLOGQZrmikz:
2664 case X86::VPTERNLOGQZ128rrikz:
2665 case X86::VPTERNLOGQZ128rmikz:
2666 case X86::VPTERNLOGQZ256rrikz:
2667 case X86::VPTERNLOGQZ256rmikz:
2668 case X86::VPTERNLOGDZ128rmbi:
2669 case X86::VPTERNLOGDZ256rmbi:
2670 case X86::VPTERNLOGDZrmbi:
2671 case X86::VPTERNLOGQZ128rmbi:
2672 case X86::VPTERNLOGQZ256rmbi:
2673 case X86::VPTERNLOGQZrmbi:
2674 case X86::VPTERNLOGDZ128rmbikz:
2675 case X86::VPTERNLOGDZ256rmbikz:
2676 case X86::VPTERNLOGDZrmbikz:
2677 case X86::VPTERNLOGQZ128rmbikz:
2678 case X86::VPTERNLOGQZ256rmbikz:
2679 case X86::VPTERNLOGQZrmbikz: {
2680 WorkingMI = CloneIfNew(MI);
2681 commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
2682 break;
2683 }
2684 default:
2686 WorkingMI = CloneIfNew(MI);
2688 break;
2689 }
2690
2691 if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
2692 WorkingMI = CloneIfNew(MI);
2693 WorkingMI->setDesc(
2694 get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
2695 break;
2696 }
2697 }
2698 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2699}
2700
2701bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2702 unsigned &SrcOpIdx1,
2703 unsigned &SrcOpIdx2,
2704 bool IsIntrinsic) const {
2705 uint64_t TSFlags = MI.getDesc().TSFlags;
2706
2707 unsigned FirstCommutableVecOp = 1;
2708 unsigned LastCommutableVecOp = 3;
2709 unsigned KMaskOp = -1U;
2710 if (X86II::isKMasked(TSFlags)) {
2711 // For k-zero-masked operations it is Ok to commute the first vector
2712 // operand. Unless this is an intrinsic instruction.
2713 // For regular k-masked operations a conservative choice is done as the
2714 // elements of the first vector operand, for which the corresponding bit
2715 // in the k-mask operand is set to 0, are copied to the result of the
2716 // instruction.
2717 // TODO/FIXME: The commute still may be legal if it is known that the
2718 // k-mask operand is set to either all ones or all zeroes.
2719 // It is also Ok to commute the 1st operand if all users of MI use only
2720 // the elements enabled by the k-mask operand. For example,
2721 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2722 // : v1[i];
2723 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2724 // // Ok, to commute v1 in FMADD213PSZrk.
2725
2726 // The k-mask operand has index = 2 for masked and zero-masked operations.
2727 KMaskOp = 2;
2728
2729 // The operand with index = 1 is used as a source for those elements for
2730 // which the corresponding bit in the k-mask is set to 0.
2731 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2732 FirstCommutableVecOp = 3;
2733
2734 LastCommutableVecOp++;
2735 } else if (IsIntrinsic) {
2736 // Commuting the first operand of an intrinsic instruction isn't possible
2737 // unless we can prove that only the lowest element of the result is used.
2738 FirstCommutableVecOp = 2;
2739 }
2740
2741 if (isMem(MI, LastCommutableVecOp))
2742 LastCommutableVecOp--;
2743
2744 // Only the first RegOpsNum operands are commutable.
2745 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2746 // that the operand is not specified/fixed.
2747 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2748 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2749 SrcOpIdx1 == KMaskOp))
2750 return false;
2751 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2752 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2753 SrcOpIdx2 == KMaskOp))
2754 return false;
2755
2756 // Look for two different register operands assumed to be commutable
2757 // regardless of the FMA opcode. The FMA opcode is adjusted later.
2758 if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2759 SrcOpIdx2 == CommuteAnyOperandIndex) {
2760 unsigned CommutableOpIdx2 = SrcOpIdx2;
2761
2762 // At least one of operands to be commuted is not specified and
2763 // this method is free to choose appropriate commutable operands.
2764 if (SrcOpIdx1 == SrcOpIdx2)
2765 // Both of operands are not fixed. By default set one of commutable
2766 // operands to the last register operand of the instruction.
2767 CommutableOpIdx2 = LastCommutableVecOp;
2768 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2769 // Only one of operands is not fixed.
2770 CommutableOpIdx2 = SrcOpIdx1;
2771
2772 // CommutableOpIdx2 is well defined now. Let's choose another commutable
2773 // operand and assign its index to CommutableOpIdx1.
2774 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2775
2776 unsigned CommutableOpIdx1;
2777 for (CommutableOpIdx1 = LastCommutableVecOp;
2778 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2779 // Just ignore and skip the k-mask operand.
2780 if (CommutableOpIdx1 == KMaskOp)
2781 continue;
2782
2783 // The commuted operands must have different registers.
2784 // Otherwise, the commute transformation does not change anything and
2785 // is useless then.
2786 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2787 break;
2788 }
2789
2790 // No appropriate commutable operands were found.
2791 if (CommutableOpIdx1 < FirstCommutableVecOp)
2792 return false;
2793
2794 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
2795 // to return those values.
2796 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
2797 CommutableOpIdx2))
2798 return false;
2799 }
2800
2801 return true;
2802}
2803
2805 unsigned &SrcOpIdx1,
2806 unsigned &SrcOpIdx2) const {
2807 const MCInstrDesc &Desc = MI.getDesc();
2808 if (!Desc.isCommutable())
2809 return false;
2810
2811 switch (MI.getOpcode()) {
2812 case X86::CMPSDrri:
2813 case X86::CMPSSrri:
2814 case X86::CMPPDrri:
2815 case X86::CMPPSrri:
2816 case X86::VCMPSDrri:
2817 case X86::VCMPSSrri:
2818 case X86::VCMPPDrri:
2819 case X86::VCMPPSrri:
2820 case X86::VCMPPDYrri:
2821 case X86::VCMPPSYrri:
2822 case X86::VCMPSDZrri:
2823 case X86::VCMPSSZrri:
2824 case X86::VCMPPDZrri:
2825 case X86::VCMPPSZrri:
2826 case X86::VCMPSHZrri:
2827 case X86::VCMPPHZrri:
2828 case X86::VCMPPHZ128rri:
2829 case X86::VCMPPHZ256rri:
2830 case X86::VCMPPDZ128rri:
2831 case X86::VCMPPSZ128rri:
2832 case X86::VCMPPDZ256rri:
2833 case X86::VCMPPSZ256rri:
2834 case X86::VCMPPDZrrik:
2835 case X86::VCMPPSZrrik:
2836 case X86::VCMPPHZrrik:
2837 case X86::VCMPPDZ128rrik:
2838 case X86::VCMPPSZ128rrik:
2839 case X86::VCMPPHZ128rrik:
2840 case X86::VCMPPDZ256rrik:
2841 case X86::VCMPPSZ256rrik:
2842 case X86::VCMPPHZ256rrik: {
2843 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2844
2845 // Float comparison can be safely commuted for
2846 // Ordered/Unordered/Equal/NotEqual tests
2847 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2848 switch (Imm) {
2849 default:
2850 // EVEX versions can be commuted.
2851 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2852 break;
2853 return false;
2854 case 0x00: // EQUAL
2855 case 0x03: // UNORDERED
2856 case 0x04: // NOT EQUAL
2857 case 0x07: // ORDERED
2858 break;
2859 }
2860
2861 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2862 // when masked).
2863 // Assign them to the returned operand indices here.
2864 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2865 2 + OpOffset);
2866 }
2867 case X86::MOVSSrr:
2868 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2869 // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2870 // AVX implies sse4.1.
2871 if (Subtarget.hasSSE41())
2872 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2873 return false;
2874 case X86::SHUFPDrri:
2875 // We can commute this to MOVSD.
2876 if (MI.getOperand(3).getImm() == 0x02)
2877 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2878 return false;
2879 case X86::MOVHLPSrr:
2880 case X86::UNPCKHPDrr:
2881 case X86::VMOVHLPSrr:
2882 case X86::VUNPCKHPDrr:
2883 case X86::VMOVHLPSZrr:
2884 case X86::VUNPCKHPDZ128rr:
2885 if (Subtarget.hasSSE2())
2886 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2887 return false;
2888 case X86::VPTERNLOGDZrri:
2889 case X86::VPTERNLOGDZrmi:
2890 case X86::VPTERNLOGDZ128rri:
2891 case X86::VPTERNLOGDZ128rmi:
2892 case X86::VPTERNLOGDZ256rri:
2893 case X86::VPTERNLOGDZ256rmi:
2894 case X86::VPTERNLOGQZrri:
2895 case X86::VPTERNLOGQZrmi:
2896 case X86::VPTERNLOGQZ128rri:
2897 case X86::VPTERNLOGQZ128rmi:
2898 case X86::VPTERNLOGQZ256rri:
2899 case X86::VPTERNLOGQZ256rmi:
2900 case X86::VPTERNLOGDZrrik:
2901 case X86::VPTERNLOGDZ128rrik:
2902 case X86::VPTERNLOGDZ256rrik:
2903 case X86::VPTERNLOGQZrrik:
2904 case X86::VPTERNLOGQZ128rrik:
2905 case X86::VPTERNLOGQZ256rrik:
2906 case X86::VPTERNLOGDZrrikz:
2907 case X86::VPTERNLOGDZrmikz:
2908 case X86::VPTERNLOGDZ128rrikz:
2909 case X86::VPTERNLOGDZ128rmikz:
2910 case X86::VPTERNLOGDZ256rrikz:
2911 case X86::VPTERNLOGDZ256rmikz:
2912 case X86::VPTERNLOGQZrrikz:
2913 case X86::VPTERNLOGQZrmikz:
2914 case X86::VPTERNLOGQZ128rrikz:
2915 case X86::VPTERNLOGQZ128rmikz:
2916 case X86::VPTERNLOGQZ256rrikz:
2917 case X86::VPTERNLOGQZ256rmikz:
2918 case X86::VPTERNLOGDZ128rmbi:
2919 case X86::VPTERNLOGDZ256rmbi:
2920 case X86::VPTERNLOGDZrmbi:
2921 case X86::VPTERNLOGQZ128rmbi:
2922 case X86::VPTERNLOGQZ256rmbi:
2923 case X86::VPTERNLOGQZrmbi:
2924 case X86::VPTERNLOGDZ128rmbikz:
2925 case X86::VPTERNLOGDZ256rmbikz:
2926 case X86::VPTERNLOGDZrmbikz:
2927 case X86::VPTERNLOGQZ128rmbikz:
2928 case X86::VPTERNLOGQZ256rmbikz:
2929 case X86::VPTERNLOGQZrmbikz:
2930 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2931 case X86::VPDPWSSDYrr:
2932 case X86::VPDPWSSDrr:
2933 case X86::VPDPWSSDSYrr:
2934 case X86::VPDPWSSDSrr:
2935 case X86::VPDPWUUDrr:
2936 case X86::VPDPWUUDYrr:
2937 case X86::VPDPWUUDSrr:
2938 case X86::VPDPWUUDSYrr:
2939 case X86::VPDPBSSDSrr:
2940 case X86::VPDPBSSDSYrr:
2941 case X86::VPDPBSSDrr:
2942 case X86::VPDPBSSDYrr:
2943 case X86::VPDPBUUDSrr:
2944 case X86::VPDPBUUDSYrr:
2945 case X86::VPDPBUUDrr:
2946 case X86::VPDPBUUDYrr:
2947 case X86::VPDPBSSDSZ128rr:
2948 case X86::VPDPBSSDSZ128rrk:
2949 case X86::VPDPBSSDSZ128rrkz:
2950 case X86::VPDPBSSDSZ256rr:
2951 case X86::VPDPBSSDSZ256rrk:
2952 case X86::VPDPBSSDSZ256rrkz:
2953 case X86::VPDPBSSDSZrr:
2954 case X86::VPDPBSSDSZrrk:
2955 case X86::VPDPBSSDSZrrkz:
2956 case X86::VPDPBSSDZ128rr:
2957 case X86::VPDPBSSDZ128rrk:
2958 case X86::VPDPBSSDZ128rrkz:
2959 case X86::VPDPBSSDZ256rr:
2960 case X86::VPDPBSSDZ256rrk:
2961 case X86::VPDPBSSDZ256rrkz:
2962 case X86::VPDPBSSDZrr:
2963 case X86::VPDPBSSDZrrk:
2964 case X86::VPDPBSSDZrrkz:
2965 case X86::VPDPBUUDSZ128rr:
2966 case X86::VPDPBUUDSZ128rrk:
2967 case X86::VPDPBUUDSZ128rrkz:
2968 case X86::VPDPBUUDSZ256rr:
2969 case X86::VPDPBUUDSZ256rrk:
2970 case X86::VPDPBUUDSZ256rrkz:
2971 case X86::VPDPBUUDSZrr:
2972 case X86::VPDPBUUDSZrrk:
2973 case X86::VPDPBUUDSZrrkz:
2974 case X86::VPDPBUUDZ128rr:
2975 case X86::VPDPBUUDZ128rrk:
2976 case X86::VPDPBUUDZ128rrkz:
2977 case X86::VPDPBUUDZ256rr:
2978 case X86::VPDPBUUDZ256rrk:
2979 case X86::VPDPBUUDZ256rrkz:
2980 case X86::VPDPBUUDZrr:
2981 case X86::VPDPBUUDZrrk:
2982 case X86::VPDPBUUDZrrkz:
2983 case X86::VPDPWSSDZ128rr:
2984 case X86::VPDPWSSDZ128rrk:
2985 case X86::VPDPWSSDZ128rrkz:
2986 case X86::VPDPWSSDZ256rr:
2987 case X86::VPDPWSSDZ256rrk:
2988 case X86::VPDPWSSDZ256rrkz:
2989 case X86::VPDPWSSDZrr:
2990 case X86::VPDPWSSDZrrk:
2991 case X86::VPDPWSSDZrrkz:
2992 case X86::VPDPWSSDSZ128rr:
2993 case X86::VPDPWSSDSZ128rrk:
2994 case X86::VPDPWSSDSZ128rrkz:
2995 case X86::VPDPWSSDSZ256rr:
2996 case X86::VPDPWSSDSZ256rrk:
2997 case X86::VPDPWSSDSZ256rrkz:
2998 case X86::VPDPWSSDSZrr:
2999 case X86::VPDPWSSDSZrrk:
3000 case X86::VPDPWSSDSZrrkz:
3001 case X86::VPDPWUUDZ128rr:
3002 case X86::VPDPWUUDZ128rrk:
3003 case X86::VPDPWUUDZ128rrkz:
3004 case X86::VPDPWUUDZ256rr:
3005 case X86::VPDPWUUDZ256rrk:
3006 case X86::VPDPWUUDZ256rrkz:
3007 case X86::VPDPWUUDZrr:
3008 case X86::VPDPWUUDZrrk:
3009 case X86::VPDPWUUDZrrkz:
3010 case X86::VPDPWUUDSZ128rr:
3011 case X86::VPDPWUUDSZ128rrk:
3012 case X86::VPDPWUUDSZ128rrkz:
3013 case X86::VPDPWUUDSZ256rr:
3014 case X86::VPDPWUUDSZ256rrk:
3015 case X86::VPDPWUUDSZ256rrkz:
3016 case X86::VPDPWUUDSZrr:
3017 case X86::VPDPWUUDSZrrk:
3018 case X86::VPDPWUUDSZrrkz:
3019 case X86::VPMADD52HUQrr:
3020 case X86::VPMADD52HUQYrr:
3021 case X86::VPMADD52HUQZ128r:
3022 case X86::VPMADD52HUQZ128rk:
3023 case X86::VPMADD52HUQZ128rkz:
3024 case X86::VPMADD52HUQZ256r:
3025 case X86::VPMADD52HUQZ256rk:
3026 case X86::VPMADD52HUQZ256rkz:
3027 case X86::VPMADD52HUQZr:
3028 case X86::VPMADD52HUQZrk:
3029 case X86::VPMADD52HUQZrkz:
3030 case X86::VPMADD52LUQrr:
3031 case X86::VPMADD52LUQYrr:
3032 case X86::VPMADD52LUQZ128r:
3033 case X86::VPMADD52LUQZ128rk:
3034 case X86::VPMADD52LUQZ128rkz:
3035 case X86::VPMADD52LUQZ256r:
3036 case X86::VPMADD52LUQZ256rk:
3037 case X86::VPMADD52LUQZ256rkz:
3038 case X86::VPMADD52LUQZr:
3039 case X86::VPMADD52LUQZrk:
3040 case X86::VPMADD52LUQZrkz:
3041 case X86::VFMADDCPHZr:
3042 case X86::VFMADDCPHZrk:
3043 case X86::VFMADDCPHZrkz:
3044 case X86::VFMADDCPHZ128r:
3045 case X86::VFMADDCPHZ128rk:
3046 case X86::VFMADDCPHZ128rkz:
3047 case X86::VFMADDCPHZ256r:
3048 case X86::VFMADDCPHZ256rk:
3049 case X86::VFMADDCPHZ256rkz:
3050 case X86::VFMADDCSHZr:
3051 case X86::VFMADDCSHZrk:
3052 case X86::VFMADDCSHZrkz: {
3053 unsigned CommutableOpIdx1 = 2;
3054 unsigned CommutableOpIdx2 = 3;
3055 if (X86II::isKMasked(Desc.TSFlags)) {
3056 // Skip the mask register.
3057 ++CommutableOpIdx1;
3058 ++CommutableOpIdx2;
3059 }
3060 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3061 CommutableOpIdx2))
3062 return false;
3063 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3064 // No idea.
3065 return false;
3066 return true;
3067 }
3068
3069 default:
3070 const X86InstrFMA3Group *FMA3Group =
3071 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3072 if (FMA3Group)
3073 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3074 FMA3Group->isIntrinsic());
3075
3076 // Handled masked instructions since we need to skip over the mask input
3077 // and the preserved input.
3078 if (X86II::isKMasked(Desc.TSFlags)) {
3079 // First assume that the first input is the mask operand and skip past it.
3080 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3081 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
3082 // Check if the first input is tied. If there isn't one then we only
3083 // need to skip the mask operand which we did above.
3084 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3085 MCOI::TIED_TO) != -1)) {
3086 // If this is zero masking instruction with a tied operand, we need to
3087 // move the first index back to the first input since this must
3088 // be a 3 input instruction and we want the first two non-mask inputs.
3089 // Otherwise this is a 2 input instruction with a preserved input and
3090 // mask, so we need to move the indices to skip one more input.
3091 if (X86II::isKMergeMasked(Desc.TSFlags)) {
3092 ++CommutableOpIdx1;
3093 ++CommutableOpIdx2;
3094 } else {
3095 --CommutableOpIdx1;
3096 }
3097 }
3098
3099 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3100 CommutableOpIdx2))
3101 return false;
3102
3103 if (!MI.getOperand(SrcOpIdx1).isReg() ||
3104 !MI.getOperand(SrcOpIdx2).isReg())
3105 // No idea.
3106 return false;
3107 return true;
3108 }
3109
3110 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3111 }
3112 return false;
3113}
3114
// Returns true only for an LEA32r/LEA64r/LEA64_32r whose address uses no
// segment register, a zero immediate displacement and a scale of at most 1.
3116 unsigned Opcode = MI->getOpcode();
3117 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3118 Opcode != X86::LEA64_32r)
3119 return false;
3120
// The address operands are read starting at operand index 1 (index 0 is
// presumably the destination register -- confirm against the LEA layout).
3121 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3122 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3123 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3124
// Reject anything with a segment override, a non-immediate or non-zero
// displacement, or a scale greater than 1.
3125 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3126 Scale.getImm() > 1)
3127 return false;
3128
3129 return true;
3130}
3131
3133 // Currently we're interested in the following sequence only:
3134 //   r3 = lea r1, r2
3135 //   r5 = add r3, r4
3136 // Both r3 and r4 are killed in the add; we hope the add instruction has the
3137 // operand order
3138 //   r5 = add r4, r3
3139 // so that later, in X86FixupLEAs, the lea instruction can be rewritten as add.
// Returns true (and sets Commute) only when a preference exists; Commute is
// left untouched on the paths that return false.
3140 unsigned Opcode = MI.getOpcode();
3141 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3142 return false;
3143
3144 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3145 Register Reg1 = MI.getOperand(1).getReg();
3146 Register Reg2 = MI.getOperand(2).getReg();
3147
3148 // Check if Reg1 comes from an LEA in the same MBB. If so, request a commute.
3149 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3150 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3151 Commute = true;
3152 return true;
3153 }
3154 }
3155
3156 // Check if Reg2 comes from an LEA in the same MBB. If so, the current operand
// order is already the preferred one, so request that it is kept.
3157 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3158 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3159 Commute = false;
3160 return true;
3161 }
3162 }
3163
// Neither source operand is defined by a convertible LEA in this block.
3164 return false;
3165}
3166
// Returns the position of the condition-code operand among the use operands
// (i.e. not counting defs), or -1 when the opcode is not one of the
// JCC/SETCC/SETZUCC/CMOVCC/CFCMOVCC/CCMPCC/CTESTCC kinds tested below.
3168 unsigned Opcode = MCID.getOpcode();
3169 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
3170 X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
3171 X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
3172 return -1;
3173 // Assume that the condition code is always the last use operand.
3174 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3175 return NumUses - 1;
3176}
3177
// Reads the condition-code immediate of MI. Yields X86::COND_INVALID when
// getCondSrcNoFromDesc() reports no condition operand (negative index).
3179 const MCInstrDesc &MCID = MI.getDesc();
3180 int CondNo = getCondSrcNoFromDesc(MCID);
3181 if (CondNo < 0)
3182 return X86::COND_INVALID;
// CondNo is relative to the first use operand; skip the defs to obtain the
// absolute operand index.
3183 CondNo += MCID.getNumDefs();
3184 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3185}
3186
3188 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3190}
3191
3193 return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
3196}
3197
3199 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3201}
3202
3204 return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3206}
3207
3209 return X86::isCCMPCC(MI.getOpcode()) || X86::isCTESTCC(MI.getOpcode())
3212}
3213
3215 // CCMP/CTEST has two conditional operands:
3216 // - SCC: source conditional code (same as CMOV)
3217 // - DCF: destination conditional flags, which has 4 valid bits
3218 //
3219 // +----+----+----+----+
3220 // | OF | SF | ZF | CF |
3221 // +----+----+----+----+
3222 //
3223 // If SCC (source conditional code) evaluates to false, CCMP/CTEST updates
3224 // the condition flags as follows:
3225 //
3226 // OF = DCF.OF
3227 // SF = DCF.SF
3228 // ZF = DCF.ZF
3229 // CF = DCF.CF
3230 // PF = DCF.CF
3231 // AF = 0 (Auxiliary Carry Flag)
3232 //
3233 // Otherwise, the CMP or TEST is executed and it updates the
3234 // CSPAZO flags normally.
3235 //
3236 // NOTE:
3237 // If SCC = P, then SCC evaluates to true regardless of the CSPAZO value.
3238 // If SCC = NP, then SCC evaluates to false regardless of the CSPAZO value.
3239
// Bit values of the DCF fields; PF aliases CF, matching "PF = DCF.CF" above.
3240 enum { CF = 1, ZF = 2, SF = 4, OF = 8, PF = CF };
3241
// Each case returns a DCF bit mask for CC. The values look like flag settings
// under which CC itself would hold (e.g. COND_O -> OF, COND_NE -> no bits) --
// confirm against the callers before relying on that reading.
3242 switch (CC) {
3243 default:
3244 llvm_unreachable("Illegal condition code!");
3245 case X86::COND_NO:
3246 case X86::COND_NE:
3247 case X86::COND_GE:
3248 case X86::COND_G:
3249 case X86::COND_AE:
3250 case X86::COND_A:
3251 case X86::COND_NS:
3252 case X86::COND_NP:
3253 return 0;
3254 case X86::COND_O:
3255 return OF;
3256 case X86::COND_B:
3257 case X86::COND_BE:
3258 return CF;
3259 break; // NOTE(review): unreachable after the return above; candidate for removal.
3260 case X86::COND_E:
3261 case X86::COND_LE:
3262 return ZF;
3263 case X86::COND_S:
3264 case X86::COND_L:
3265 return SF;
3266 case X86::COND_P:
3267 return PF;
3268 }
3269}
3270
3271#define GET_X86_NF_TRANSFORM_TABLE
3272#define GET_X86_ND2NONND_TABLE
3273#include "X86GenInstrMapping.inc"
3274
3276 unsigned Opc) {
// Binary-search Table for Opc (so Table must be sorted). Returns the entry's
// NewOpc, or 0 when no entry has OldOpc == Opc.
3277 const auto I = llvm::lower_bound(Table, Opc);
3278 return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
3279}
3280unsigned X86::getNFVariant(unsigned Opc) {
3281#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3282 // Make sure the tables are sorted.
3283 static std::atomic<bool> NFTableChecked(false);
3284 if (!NFTableChecked.load(std::memory_order_relaxed)) {
3285 assert(llvm::is_sorted(X86NFTransformTable) &&
3286 "X86NFTransformTable is not sorted!");
3287 NFTableChecked.store(true, std::memory_order_relaxed);
3288 }
3289#endif
3290 return getNewOpcFromTable(X86NFTransformTable, Opc);
3291}
3292
3293unsigned X86::getNonNDVariant(unsigned Opc) {
3294#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3295 // Make sure the tables are sorted.
3296 static std::atomic<bool> NDTableChecked(false);
3297 if (!NDTableChecked.load(std::memory_order_relaxed)) {
3298 assert(llvm::is_sorted(X86ND2NonNDTable) &&
3299 "X86ND2NonNDTableis not sorted!");
3300 NDTableChecked.store(true, std::memory_order_relaxed);
3301 }
3302#endif
3303 return getNewOpcFromTable(X86ND2NonNDTable, Opc);
3304}
3305
3306/// Return the inverse of the specified condition,
3307/// e.g. turning COND_E to COND_NE.
// Every condition listed below has an inverse that is also listed, so applying
// the mapping twice yields the original condition. Any condition code not
// listed is treated as a programming error.
3309 switch (CC) {
3310 default:
3311 llvm_unreachable("Illegal condition code!");
3312 case X86::COND_E:
3313 return X86::COND_NE;
3314 case X86::COND_NE:
3315 return X86::COND_E;
3316 case X86::COND_L:
3317 return X86::COND_GE;
3318 case X86::COND_LE:
3319 return X86::COND_G;
3320 case X86::COND_G:
3321 return X86::COND_LE;
3322 case X86::COND_GE:
3323 return X86::COND_L;
3324 case X86::COND_B:
3325 return X86::COND_AE;
3326 case X86::COND_BE:
3327 return X86::COND_A;
3328 case X86::COND_A:
3329 return X86::COND_BE;
3330 case X86::COND_AE:
3331 return X86::COND_B;
3332 case X86::COND_S:
3333 return X86::COND_NS;
3334 case X86::COND_NS:
3335 return X86::COND_S;
3336 case X86::COND_P:
3337 return X86::COND_NP;
3338 case X86::COND_NP:
3339 return X86::COND_P;
3340 case X86::COND_O:
3341 return X86::COND_NO;
3342 case X86::COND_NO:
3343 return X86::COND_O;
// The two compound conditions are each other's inverse.
3344 case X86::COND_NE_OR_P:
3345 return X86::COND_E_AND_NP;
3346 case X86::COND_E_AND_NP:
3347 return X86::COND_NE_OR_P;
3348 }
3349}
3350
3351/// Assuming the flags are set by MI(a,b), return the condition code if we
3352/// modify the instructions such that flags are set by MI(b,a).
// Only equality and the signed/unsigned ordering conditions are handled; any
// other condition yields X86::COND_INVALID.
3354 switch (CC) {
3355 default:
3356 return X86::COND_INVALID;
// Equality tests are symmetric and map to themselves.
3357 case X86::COND_E:
3358 return X86::COND_E;
3359 case X86::COND_NE:
3360 return X86::COND_NE;
// Ordering tests are mirrored: L <-> G, LE <-> GE, B <-> A, BE <-> AE.
3361 case X86::COND_L:
3362 return X86::COND_G;
3363 case X86::COND_LE:
3364 return X86::COND_GE;
3365 case X86::COND_G:
3366 return X86::COND_L;
3367 case X86::COND_GE:
3368 return X86::COND_LE;
3369 case X86::COND_B:
3370 return X86::COND_A;
3371 case X86::COND_BE:
3372 return X86::COND_AE;
3373 case X86::COND_A:
3374 return X86::COND_B;
3375 case X86::COND_AE:
3376 return X86::COND_BE;
3377 }
3378}
3379
3380std::pair<X86::CondCode, bool>
3383 bool NeedSwap = false;
3384 switch (Predicate) {
3385 default:
3386 break;
3387 // Floating-point Predicates
3388 case CmpInst::FCMP_UEQ:
3389 CC = X86::COND_E;
3390 break;
3391 case CmpInst::FCMP_OLT:
3392 NeedSwap = true;
3393 [[fallthrough]];
3394 case CmpInst::FCMP_OGT:
3395 CC = X86::COND_A;
3396 break;
3397 case CmpInst::FCMP_OLE:
3398 NeedSwap = true;
3399 [[fallthrough]];
3400 case CmpInst::FCMP_OGE:
3401 CC = X86::COND_AE;
3402 break;
3403 case CmpInst::FCMP_UGT:
3404 NeedSwap = true;
3405 [[fallthrough]];
3406 case CmpInst::FCMP_ULT:
3407 CC = X86::COND_B;
3408 break;
3409 case CmpInst::FCMP_UGE:
3410 NeedSwap = true;
3411 [[fallthrough]];
3412 case CmpInst::FCMP_ULE:
3413 CC = X86::COND_BE;
3414 break;
3415 case CmpInst::FCMP_ONE:
3416 CC = X86::COND_NE;
3417 break;
3418 case CmpInst::FCMP_UNO:
3419 CC = X86::COND_P;
3420 break;
3421 case CmpInst::FCMP_ORD:
3422 CC = X86::COND_NP;
3423 break;
3424 case CmpInst::FCMP_OEQ:
3425 [[fallthrough]];
3426 case CmpInst::FCMP_UNE:
3427 CC = X86::COND_INVALID;
3428 break;
3429
3430 // Integer Predicates
3431 case CmpInst::ICMP_EQ:
3432 CC = X86::COND_E;
3433 break;
3434 case CmpInst::ICMP_NE:
3435 CC = X86::COND_NE;
3436 break;
3437 case CmpInst::ICMP_UGT:
3438 CC = X86::COND_A;
3439 break;
3440 case CmpInst::ICMP_UGE:
3441 CC = X86::COND_AE;
3442 break;
3443 case CmpInst::ICMP_ULT:
3444 CC = X86::COND_B;
3445 break;
3446 case CmpInst::ICMP_ULE:
3447 CC = X86::COND_BE;
3448 break;
3449 case CmpInst::ICMP_SGT:
3450 CC = X86::COND_G;
3451 break;
3452 case CmpInst::ICMP_SGE:
3453 CC = X86::COND_GE;
3454 break;
3455 case CmpInst::ICMP_SLT:
3456 CC = X86::COND_L;
3457 break;
3458 case CmpInst::ICMP_SLE:
3459 CC = X86::COND_LE;
3460 break;
3461 }
3462
3463 return std::make_pair(CC, NeedSwap);
3464}
3465
3466/// Return a cmov opcode for the given register size in bytes, and operand type.
3467unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
3468 bool HasNDD) {
3469 switch (RegBytes) {
3470 default:
3471 llvm_unreachable("Illegal register size!");
3472#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
3473 case 2:
3474 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
3475 : GET_ND_IF_ENABLED(X86::CMOV16rr);
3476 case 4:
3477 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
3478 : GET_ND_IF_ENABLED(X86::CMOV32rr);
3479 case 8:
3480 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
3481 : GET_ND_IF_ENABLED(X86::CMOV64rr);
3482 }
3483}
3484
3485unsigned X86::getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
3486 if (!Use64BitReg)
3487 return X86::MOV32ri;
3488
3489 if (isUInt<32>(Imm))
3490 return X86::MOV32ri64;
3491 if (isInt<32>(Imm))
3492 return X86::MOV64ri32;
3493 return X86::MOV64ri;
3494}
3495
3496/// Get the VPCMP immediate for the given condition.
// Signed and unsigned forms of each ordering share one immediate; signedness
// is presumably selected by the opcode rather than the immediate -- confirm
// at the call sites. Any condition not listed is a programming error.
3498 switch (CC) {
3499 default:
3500 llvm_unreachable("Unexpected SETCC condition");
3501 case ISD::SETNE:
3502 return 4;
3503 case ISD::SETEQ:
3504 return 0;
3505 case ISD::SETULT:
3506 case ISD::SETLT:
3507 return 1;
3508 case ISD::SETUGT:
3509 case ISD::SETGT:
3510 return 6;
3511 case ISD::SETUGE:
3512 case ISD::SETGE:
3513 return 5;
3514 case ISD::SETULE:
3515 case ISD::SETLE:
3516 return 2;
3517 }
3518}
3519
3520/// Get the VPCMP immediate if the operands are swapped.
3521unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3522 switch (Imm) {
3523 default:
3524 llvm_unreachable("Unreachable!");
3525 case 0x01:
3526 Imm = 0x06;
3527 break; // LT -> NLE
3528 case 0x02:
3529 Imm = 0x05;
3530 break; // LE -> NLT
3531 case 0x05:
3532 Imm = 0x02;
3533 break; // NLT -> LE
3534 case 0x06:
3535 Imm = 0x01;
3536 break; // NLE -> LT
3537 case 0x00: // EQ
3538 case 0x03: // FALSE
3539 case 0x04: // NE
3540 case 0x07: // TRUE
3541 break;
3542 }
3543
3544 return Imm;
3545}
3546
3547/// Get the VPCOM immediate if the operands are swapped.
3548unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3549 switch (Imm) {
3550 default:
3551 llvm_unreachable("Unreachable!");
3552 case 0x00:
3553 Imm = 0x02;
3554 break; // LT -> GT
3555 case 0x01:
3556 Imm = 0x03;
3557 break; // LE -> GE
3558 case 0x02:
3559 Imm = 0x00;
3560 break; // GT -> LT
3561 case 0x03:
3562 Imm = 0x01;
3563 break; // GE -> LE
3564 case 0x04: // EQ
3565 case 0x05: // NE
3566 case 0x06: // FALSE
3567 case 0x07: // TRUE
3568 break;
3569 }
3570
3571 return Imm;
3572}
3573
3574/// Get the VCMP immediate if the operands are swapped.
3575unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3576 // Only need the lower 2 bits to distinquish.
3577 switch (Imm & 0x3) {
3578 default:
3579 llvm_unreachable("Unreachable!");
3580 case 0x00:
3581 case 0x03:
3582 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3583 break;
3584 case 0x01:
3585 case 0x02:
3586 // Need to toggle bits 3:0. Bit 4 stays the same.
3587 Imm ^= 0xf;
3588 break;
3589 }
3590
3591 return Imm;
3592}
3593
// Maps the operand's register class ID to a width (presumably in bits):
// 128 for VR128/VR128X, 256 for VR256/VR256X and 512 for VR512.
3595 if (Info.RegClass == X86::VR128RegClassID ||
3596 Info.RegClass == X86::VR128XRegClassID)
3597 return 128;
3598 if (Info.RegClass == X86::VR256RegClassID ||
3599 Info.RegClass == X86::VR256XRegClassID)
3600 return 256;
3601 if (Info.RegClass == X86::VR512RegClassID)
3602 return 512;
// Any other register class is treated as a programming error.
3603 llvm_unreachable("Unknown register class!");
3604}
3605
3606/// Return true if the Reg is X87 register.
3607static bool isX87Reg(Register Reg) {
3608 return (Reg == X86::FPCW || Reg == X86::FPSW ||
3609 (Reg >= X86::ST0 && Reg <= X86::ST7));
3610}
3611
3612/// Check if the instruction is an X87 instruction, i.e. whether any of its
/// register operands is an X87 register.
3614 // Calls and inline asm may define X87 registers, so special-case them here;
3615 // otherwise they would be incorrectly flagged as x87 instructions
3616 // as a result.
3617 if (MI.isCall() || MI.isInlineAsm())
3618 return false;
// Scan all operands of MI; a single X87 register operand is enough.
3619 for (const MachineOperand &MO : MI.operands()) {
3620 if (!MO.isReg())
3621 continue;
3622 if (isX87Reg(MO.getReg()))
3623 return true;
3624 }
3625 return false;
3626}
3627
3629 auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3630 return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3631 };
3632
3633 const MCInstrDesc &Desc = MI.getDesc();
3634
3635 // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3636 // instructions (fast case).
3637 if (!X86II::isPseudo(Desc.TSFlags)) {
3638 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
3639 if (MemRefIdx >= 0)
3640 return MemRefIdx + X86II::getOperandBias(Desc);
3641#ifdef EXPENSIVE_CHECKS
3642 assert(none_of(Desc.operands(), IsMemOp) &&
3643 "Got false negative from X86II::getMemoryOperandNo()!");
3644#endif
3645 return -1;
3646 }
3647
3648 // Otherwise, handle pseudo instructions by examining the type of their
3649 // operands (slow case). An instruction cannot have a memory reference if it
3650 // has fewer than AddrNumOperands (= 5) explicit operands.
3651 unsigned NumOps = Desc.getNumOperands();
3653#ifdef EXPENSIVE_CHECKS
3654 assert(none_of(Desc.operands(), IsMemOp) &&
3655 "Expected no operands to have OPERAND_MEMORY type!");
3656#endif
3657 return -1;
3658 }
3659
3660 // The first operand with type OPERAND_MEMORY indicates the start of a memory
3661 // reference. We expect the following AddrNumOperand-1 operands to also have
3662 // OPERAND_MEMORY type.
3663 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3664 if (IsMemOp(Desc.operands()[I])) {
3665#ifdef EXPENSIVE_CHECKS
3666 assert(std::all_of(Desc.operands().begin() + I,
3667 Desc.operands().begin() + I + X86::AddrNumOperands,
3668 IsMemOp) &&
3669 "Expected all five operands in the memory reference to have "
3670 "OPERAND_MEMORY type!");
3671#endif
3672 return I;
3673 }
3674 }
3675
3676 return -1;
3677}
3678
3680 unsigned OpNo) {
3681 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3682 "Unexpected number of operands!");
3683
3684 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3685 if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3686 return nullptr;
3687
3688 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3689 if (!Disp.isCPI() || Disp.getOffset() != 0)
3690 return nullptr;
3691
3693 MI.getParent()->getParent()->getConstantPool()->getConstants();
3694 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3695
3696 // Bail if this is a machine constant pool entry, we won't be able to dig out
3697 // anything useful.
3698 if (ConstantEntry.isMachineConstantPoolEntry())
3699 return nullptr;
3700
3701 return ConstantEntry.Val.ConstVal;
3702}
3703
// Returns true exactly when MI's opcode is one of the TCRETURN* opcodes listed
// below; every other opcode yields false.
3705 switch (MI.getOpcode()) {
3706 case X86::TCRETURNdi:
3707 case X86::TCRETURNri:
3708 case X86::TCRETURNmi:
3709 case X86::TCRETURNdi64:
3710 case X86::TCRETURNri64:
3711 case X86::TCRETURNri64_ImpCall:
3712 case X86::TCRETURNmi64:
3713 return true;
3714 default:
3715 return false;
3716 }
3717}
3718
3721 const MachineInstr &TailCall) const {
3722
3723 const MachineFunction *MF = TailCall.getMF();
3724
3725 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3726 // Kernel patches thunk calls in runtime, these should never be conditional.
3727 const MachineOperand &Target = TailCall.getOperand(0);
3728 if (Target.isSymbol()) {
3729 StringRef Symbol(Target.getSymbolName());
3730 // this is currently only relevant to r11/kernel indirect thunk.
3731 if (Symbol == "__x86_indirect_thunk_r11")
3732 return false;
3733 }
3734 }
3735
3736 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3737 TailCall.getOpcode() != X86::TCRETURNdi64) {
3738 // Only direct calls can be done with a conditional branch.
3739 return false;
3740 }
3741
3742 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3743 // Conditional tail calls confuse the Win64 unwinder.
3744 return false;
3745 }
3746
3747 assert(BranchCond.size() == 1);
3748 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3749 // Can't make a conditional tail call with this condition.
3750 return false;
3751 }
3752
3754 if (X86FI->getTCReturnAddrDelta() != 0 ||
3755 TailCall.getOperand(1).getImm() != 0) {
3756 // A conditional tail call cannot do any stack adjustment.
3757 return false;
3758 }
3759
3760 return true;
3761}
3762
3765 const MachineInstr &TailCall) const {
3766 assert(canMakeTailCallConditional(BranchCond, TailCall));
3767
3769 while (I != MBB.begin()) {
3770 --I;
3771 if (I->isDebugInstr())
3772 continue;
3773 if (!I->isBranch())
3774 assert(0 && "Can't find the branch to replace!");
3775
3777 assert(BranchCond.size() == 1);
3778 if (CC != BranchCond[0].getImm())
3779 continue;
3780
3781 break;
3782 }
3783
3784 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3785 : X86::TCRETURNdi64cc;
3786
3787 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3788 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3789 MIB.addImm(0); // Stack offset (not used).
3790 MIB->addOperand(BranchCond[0]); // Condition.
3791 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3792
3793 // Add implicit uses and defs of all live regs potentially clobbered by the
3794 // call. This way they still appear live across the call.
3796 LiveRegs.addLiveOuts(MBB);
3798 LiveRegs.stepForward(*MIB, Clobbers);
3799 for (const auto &C : Clobbers) {
3800 MIB.addReg(C.first, RegState::Implicit);
3802 }
3803
3804 I->eraseFromParent();
3805}
3806
3807// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3808// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3809// fallthrough MBB cannot be identified.
3812 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3813 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3814 // and fallthrough MBB. If we find more than one, we cannot identify the
3815 // fallthrough MBB and should return nullptr.
3816 MachineBasicBlock *FallthroughBB = nullptr;
3817 for (MachineBasicBlock *Succ : MBB->successors()) {
// Skip EH pads, and skip TBB once some candidate has already been recorded
// (TBB is only accepted as a first, provisional candidate).
3818 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3819 continue;
3820 // Return a nullptr if we found more than one fallthrough successor.
// A provisional TBB candidate does not count and is simply replaced below.
3821 if (FallthroughBB && FallthroughBB != TBB)
3822 return nullptr;
3823 FallthroughBB = Succ;
3824 }
3825 return FallthroughBB;
3826}
3827
3828bool X86InstrInfo::analyzeBranchImpl(
3831 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3832
3833 // Start from the bottom of the block and work up, examining the
3834 // terminator instructions.
3836 MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3837 while (I != MBB.begin()) {
3838 --I;
3839 if (I->isDebugInstr())
3840 continue;
3841
3842 // Working from the bottom, when we see a non-terminator instruction, we're
3843 // done.
3844 if (!isUnpredicatedTerminator(*I))
3845 break;
3846
3847 // A terminator that isn't a branch can't easily be handled by this
3848 // analysis.
3849 if (!I->isBranch())
3850 return true;
3851
3852 // Handle unconditional branches.
3853 if (I->getOpcode() == X86::JMP_1) {
3854 UnCondBrIter = I;
3855
3856 if (!AllowModify) {
3857 TBB = I->getOperand(0).getMBB();
3858 continue;
3859 }
3860
3861 // If the block has any instructions after a JMP, delete them.
3862 MBB.erase(std::next(I), MBB.end());
3863
3864 Cond.clear();
3865 FBB = nullptr;
3866
3867 // Delete the JMP if it's equivalent to a fall-through.
3868 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3869 TBB = nullptr;
3870 I->eraseFromParent();
3871 I = MBB.end();
3872 UnCondBrIter = MBB.end();
3873 continue;
3874 }
3875
3876 // TBB is used to indicate the unconditional destination.
3877 TBB = I->getOperand(0).getMBB();
3878 continue;
3879 }
3880
3881 // Handle conditional branches.
3882 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3883 if (BranchCode == X86::COND_INVALID)
3884 return true; // Can't handle indirect branch.
3885
3886 // In practice we should never have an undef eflags operand, if we do
3887 // abort here as we are not prepared to preserve the flag.
3888 if (I->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->isUndef())
3889 return true;
3890
3891 // Working from the bottom, handle the first conditional branch.
3892 if (Cond.empty()) {
3893 FBB = TBB;
3894 TBB = I->getOperand(0).getMBB();
3896 CondBranches.push_back(&*I);
3897 continue;
3898 }
3899
3900 // Handle subsequent conditional branches. Only handle the case where all
3901 // conditional branches branch to the same destination and their condition
3902 // opcodes fit one of the special multi-branch idioms.
3903 assert(Cond.size() == 1);
3904 assert(TBB);
3905
3906 // If the conditions are the same, we can leave them alone.
3907 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3908 auto NewTBB = I->getOperand(0).getMBB();
3909 if (OldBranchCode == BranchCode && TBB == NewTBB)
3910 continue;
3911
3912 // If they differ, see if they fit one of the known patterns. Theoretically,
3913 // we could handle more patterns here, but we shouldn't expect to see them
3914 // if instruction selection has done a reasonable job.
3915 if (TBB == NewTBB &&
3916 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3917 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3918 BranchCode = X86::COND_NE_OR_P;
3919 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3920 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3921 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3922 return true;
3923
3924 // X86::COND_E_AND_NP usually has two different branch destinations.
3925 //
3926 // JP B1
3927 // JE B2
3928 // JMP B1
3929 // B1:
3930 // B2:
3931 //
3932 // Here this condition branches to B2 only if NP && E. It has another
3933 // equivalent form:
3934 //
3935 // JNE B1
3936 // JNP B2
3937 // JMP B1
3938 // B1:
3939 // B2:
3940 //
3941 // Similarly it branches to B2 only if E && NP. That is why this condition
3942 // is named with COND_E_AND_NP.
3943 BranchCode = X86::COND_E_AND_NP;
3944 } else
3945 return true;
3946
3947 // Update the MachineOperand.
3948 Cond[0].setImm(BranchCode);
3949 CondBranches.push_back(&*I);
3950 }
3951
3952 return false;
3953}
3954
3957 MachineBasicBlock *&FBB,
3959 bool AllowModify) const {
3960 SmallVector<MachineInstr *, 4> CondBranches;
3961 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3962}
3963
// Returns the jump-table index held in the displacement operand of MI's memory
// reference, or -1 when that displacement is not a jump-table index. MI is
// required (by assertion) to have a memory operand.
3965 const MCInstrDesc &Desc = MI.getDesc();
3966 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3967 assert(MemRefBegin >= 0 && "instr should have memory operand");
// Shift by the operand bias so that the index addresses MI's operand list.
3968 MemRefBegin += X86II::getOperandBias(Desc);
3969
3970 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3971 if (!MO.isJTI())
3972 return -1;
3973
3974 return MO.getIndex();
3975}
3976
3978 Register Reg) {
3979 if (!Reg.isVirtual())
3980 return -1;
3982 if (MI == nullptr)
3983 return -1;
3984 unsigned Opcode = MI->getOpcode();
3985 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3986 return -1;
3988}
3989
3991 unsigned Opcode = MI.getOpcode();
3992 // Switch-jump pattern for non-PIC code looks like:
3993 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3994 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3996 }
3997 // The pattern for PIC code looks like:
3998 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3999 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
4000 // %2 = ADD64rr %1, %0
4001 // JMP64r %2
4002 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
4003 Register Reg = MI.getOperand(0).getReg();
4004 if (!Reg.isVirtual())
4005 return -1;
4006 const MachineFunction &MF = *MI.getParent()->getParent();
4007 const MachineRegisterInfo &MRI = MF.getRegInfo();
4008 MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
4009 if (Add == nullptr)
4010 return -1;
4011 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
4012 return -1;
4013 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
4014 if (JTI1 >= 0)
4015 return JTI1;
4016 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
4017 if (JTI2 >= 0)
4018 return JTI2;
4019 }
4020 return -1;
4021}
4022
4024 MachineBranchPredicate &MBP,
4025 bool AllowModify) const {
4026 using namespace std::placeholders;
4027
4029 SmallVector<MachineInstr *, 4> CondBranches;
4030 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
4031 AllowModify))
4032 return true;
4033
4034 if (Cond.size() != 1)
4035 return true;
4036
4037 assert(MBP.TrueDest && "expected!");
4038
4039 if (!MBP.FalseDest)
4040 MBP.FalseDest = MBB.getNextNode();
4041
4043
4044 MachineInstr *ConditionDef = nullptr;
4045 bool SingleUseCondition = true;
4046
4048 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
4049 ConditionDef = &MI;
4050 break;
4051 }
4052
4053 if (MI.readsRegister(X86::EFLAGS, TRI))
4054 SingleUseCondition = false;
4055 }
4056
4057 if (!ConditionDef)
4058 return true;
4059
4060 if (SingleUseCondition) {
4061 for (auto *Succ : MBB.successors())
4062 if (Succ->isLiveIn(X86::EFLAGS))
4063 SingleUseCondition = false;
4064 }
4065
4066 MBP.ConditionDef = ConditionDef;
4067 MBP.SingleUseCondition = SingleUseCondition;
4068
4069 // Currently we only recognize the simple pattern:
4070 //
4071 // test %reg, %reg
4072 // je %label
4073 //
4074 const unsigned TestOpcode =
4075 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
4076
4077 if (ConditionDef->getOpcode() == TestOpcode &&
4078 ConditionDef->getNumOperands() == 3 &&
4079 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
4080 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
4081 MBP.LHS = ConditionDef->getOperand(0);
4082 MBP.RHS = MachineOperand::CreateImm(0);
4083 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
4084 ? MachineBranchPredicate::PRED_NE
4085 : MachineBranchPredicate::PRED_EQ;
4086 return false;
4087 }
4088
4089 return true;
4090}
4091
4093 int *BytesRemoved) const {
4094 assert(!BytesRemoved && "code size not handled");
4095
4097 unsigned Count = 0;
4098
4099 while (I != MBB.begin()) {
4100 --I;
4101 if (I->isDebugInstr())
4102 continue;
4103 if (I->getOpcode() != X86::JMP_1 &&
4105 break;
4106 // Remove the branch.
4107 I->eraseFromParent();
4108 I = MBB.end();
4109 ++Count;
4110 }
4111
4112 return Count;
4113}
4114
4117 MachineBasicBlock *FBB,
4119 const DebugLoc &DL, int *BytesAdded) const {
     // NOTE(review): the leading signature lines (function name, MBB, TBB,
     // Cond) are missing from this listing.
     // Emits the branch instruction(s) that transfer control to TBB (and to
     // FBB when it is non-null) and returns the number of instructions built.
4120 // Shouldn't be a fall through.
4121 assert(TBB && "insertBranch must not be told to insert a fallthrough");
4122 assert((Cond.size() == 1 || Cond.size() == 0) &&
4123 "X86 branch conditions have one component!");
     // Byte-size accounting is not implemented; BytesAdded must be null.
4124 assert(!BytesAdded && "code size not handled");
4125
4126 if (Cond.empty()) {
4127 // Unconditional branch?
4128 assert(!FBB && "Unconditional branch with multiple successors!");
4129 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
4130 return 1;
4131 }
4132
4133 // If FBB is null, it is implied to be a fall-through block.
     // FallThru is captured here, before the COND_E_AND_NP case below may
     // assign FBB, so the trailing JMP_1 is only emitted when the caller
     // supplied an explicit FBB.
4134 bool FallThru = FBB == nullptr;
4135
4136 // Conditional branch.
4137 unsigned Count = 0;
     // NOTE(review): the declaration of `CC` is on a line missing from this
     // listing; presumably it is the condition code taken from Cond[0].
4139 switch (CC) {
4140 case X86::COND_NE_OR_P:
4141 // Synthesize NE_OR_P with two branches.
     // Both branches target TBB: taken if NE, otherwise taken if P.
4142 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
4143 ++Count;
4144 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
4145 ++Count;
4146 break;
4147 case X86::COND_E_AND_NP:
4148 // Use the next block of MBB as FBB if it is null.
4149 if (FBB == nullptr) {
4150 FBB = getFallThroughMBB(&MBB, TBB);
4151 assert(FBB && "MBB cannot be the last block in function when the false "
4152 "body is a fall-through.");
4153 }
4154 // Synthesize COND_E_AND_NP with two branches.
     // First branch leaves to FBB on NE; the second reaches TBB on NP.
4155 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
4156 ++Count;
4157 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
4158 ++Count;
4159 break;
4160 default: {
     // Any other condition code maps to a single JCC_1 to TBB.
4161 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
4162 ++Count;
4163 }
4164 }
4165 if (!FallThru) {
4166 // Two-way Conditional branch. Insert the second branch.
4167 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
4168 ++Count;
4169 }
4170 return Count;
4171}
4172
4175 Register DstReg, Register TrueReg,
4176 Register FalseReg, int &CondCycles,
4177 int &TrueCycles, int &FalseCycles) const {
     // NOTE(review): the leading signature lines (function name, MBB, Cond)
     // are missing from this listing.
     // Returns true, and fills in the three cycle counts, only when a select
     // between TrueReg and FalseReg can be done on 16/32/64-bit general
     // purpose registers; every other situation returns false and leaves the
     // cycle outputs untouched.
4178 // Not all subtargets have cmov instructions.
4179 if (!Subtarget.canUseCMOV())
4180 return false;
4181 if (Cond.size() != 1)
4182 return false;
4183 // We cannot do the composite conditions, at least not in SSA form.
     // NOTE(review): the condition guarding this return is on a line missing
     // from this listing.
4185 return false;
4186
4187 // Check register classes.
     // The two inputs must share a common register sub-class.
4188 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4189 const TargetRegisterClass *RC =
4190 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4191 if (!RC)
4192 return false;
4193
4194 // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
4195 if (X86::GR16RegClass.hasSubClassEq(RC) ||
4196 X86::GR32RegClass.hasSubClassEq(RC) ||
4197 X86::GR64RegClass.hasSubClassEq(RC)) {
4198 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4199 // Bridge. Probably Ivy Bridge as well.
4200 CondCycles = 2;
4201 TrueCycles = 2;
4202 FalseCycles = 2;
4203 return true;
4204 }
4205
4206 // Can't do vectors.
4207 return false;
4208}
4209
4212 const DebugLoc &DL, Register DstReg,
4214 Register FalseReg) const {
     // NOTE(review): several signature lines (function name, MBB, I, Cond,
     // TrueReg) and the declaration of `TRI` are missing from this listing.
     // Builds one CMOV before `I` that defines DstReg from FalseReg/TrueReg
     // under the condition code in Cond[0].
4215 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4217 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4218 assert(Cond.size() == 1 && "Invalid Cond array");
     // The opcode is selected by the byte width of DstReg's register class;
     // the register (non-memory) form is requested, and Subtarget.hasNDD() is
     // forwarded to the opcode lookup.
4219 unsigned Opc =
4220 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4221 false /*HasMemoryOperand*/, Subtarget.hasNDD());
     // Operand order: FalseReg first, then TrueReg, then the condition code.
4222 BuildMI(MBB, I, DL, get(Opc), DstReg)
4223 .addReg(FalseReg)
4224 .addReg(TrueReg)
4225 .addImm(Cond[0].getImm());
4226}
4227
4228/// Test if the given register is a physical h register.
4229static bool isHReg(Register Reg) {
4230 return X86::GR8_ABCD_HRegClass.contains(Reg);
4231}
4232
4233// Try and copy between VR128/VR64 and GR64 registers.
4234static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg,
4235 const X86Subtarget &Subtarget) {
4236 bool HasAVX = Subtarget.hasAVX();
4237 bool HasAVX512 = Subtarget.hasAVX512();
4238 bool HasEGPR = Subtarget.hasEGPR();
4239
4240 // SrcReg(MaskReg) -> DestReg(GR64)
4241 // SrcReg(MaskReg) -> DestReg(GR32)
4242
4243 // All KMASK RegClasses hold the same k registers, can be tested against
4244 // anyone.
4245 if (X86::VK16RegClass.contains(SrcReg)) {
4246 if (X86::GR64RegClass.contains(DestReg)) {
4247 assert(Subtarget.hasBWI());
4248 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4249 }
4250 if (X86::GR32RegClass.contains(DestReg))
4251 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4252 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4253 }
4254
4255 // SrcReg(GR64) -> DestReg(MaskReg)
4256 // SrcReg(GR32) -> DestReg(MaskReg)
4257
4258 // All KMASK RegClasses hold the same k registers, can be tested against
4259 // anyone.
4260 if (X86::VK16RegClass.contains(DestReg)) {
4261 if (X86::GR64RegClass.contains(SrcReg)) {
4262 assert(Subtarget.hasBWI());
4263 return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4264 }
4265 if (X86::GR32RegClass.contains(SrcReg))
4266 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4267 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4268 }
4269
4270 // SrcReg(VR128) -> DestReg(GR64)
4271 // SrcReg(VR64) -> DestReg(GR64)
4272 // SrcReg(GR64) -> DestReg(VR128)
4273 // SrcReg(GR64) -> DestReg(VR64)
4274
4275 if (X86::GR64RegClass.contains(DestReg)) {
4276 if (X86::VR128XRegClass.contains(SrcReg))
4277 // Copy from a VR128 register to a GR64 register.
4278 return HasAVX512 ? X86::VMOVPQIto64Zrr
4279 : HasAVX ? X86::VMOVPQIto64rr
4280 : X86::MOVPQIto64rr;
4281 if (X86::VR64RegClass.contains(SrcReg))
4282 // Copy from a VR64 register to a GR64 register.
4283 return X86::MMX_MOVD64from64rr;
4284 } else if (X86::GR64RegClass.contains(SrcReg)) {
4285 // Copy from a GR64 register to a VR128 register.
4286 if (X86::VR128XRegClass.contains(DestReg))
4287 return HasAVX512 ? X86::VMOV64toPQIZrr
4288 : HasAVX ? X86::VMOV64toPQIrr
4289 : X86::MOV64toPQIrr;
4290 // Copy from a GR64 register to a VR64 register.
4291 if (X86::VR64RegClass.contains(DestReg))
4292 return X86::MMX_MOVD64to64rr;
4293 }
4294
4295 // SrcReg(VR128) -> DestReg(GR32)
4296 // SrcReg(GR32) -> DestReg(VR128)
4297
4298 if (X86::GR32RegClass.contains(DestReg) &&
4299 X86::VR128XRegClass.contains(SrcReg))
4300 // Copy from a VR128 register to a GR32 register.
4301 return HasAVX512 ? X86::VMOVPDI2DIZrr
4302 : HasAVX ? X86::VMOVPDI2DIrr
4303 : X86::MOVPDI2DIrr;
4304
4305 if (X86::VR128XRegClass.contains(DestReg) &&
4306 X86::GR32RegClass.contains(SrcReg))
4307 // Copy from a GR32 register to a VR128 register.
4308 return HasAVX512 ? X86::VMOVDI2PDIZrr
4309 : HasAVX ? X86::VMOVDI2PDIrr
4310 : X86::MOVDI2PDIrr;
4311
4312 return 0;
4313}
4314
4317 const DebugLoc &DL, Register DestReg,
4318 Register SrcReg, bool KillSrc,
4319 bool RenamableDest, bool RenamableSrc) const {
     // NOTE(review): the leading signature lines (function name, MBB, MI) are
     // missing from this listing.
     // Emits a single register-to-register move from SrcReg to DestReg before
     // MI. Same-class copies are tried first, then CopyToFromAsymmetricReg;
     // if neither yields an opcode the function ends in report_fatal_error.
     // RenamableDest and RenamableSrc are not read in the visible body.
4320 // First deal with the normal symmetric copies.
4321 bool HasAVX = Subtarget.hasAVX();
4322 bool HasVLX = Subtarget.hasVLX();
4323 bool HasEGPR = Subtarget.hasEGPR();
4324 unsigned Opc = 0;
4325 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4326 Opc = X86::MOV64rr;
4327 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4328 Opc = X86::MOV32rr;
4329 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4330 Opc = X86::MOV16rr;
4331 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4332 // Copying to or from a physical H register on x86-64 requires a NOREX
4333 // move. Otherwise use a normal move.
4334 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4335 Opc = X86::MOV8rr_NOREX;
4336 // Both operands must be encodable without an REX prefix.
4337 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4338 "8-bit H register can not be copied outside GR8_NOREX");
4339 } else
4340 Opc = X86::MOV8rr;
4341 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4342 Opc = X86::MMX_MOVQ64rr;
4343 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4344 if (HasVLX)
4345 Opc = X86::VMOVAPSZ128rr;
4346 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4347 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4348 else {
4349 // If this is an extended register and we don't have VLX we need to use a
4350 // 512-bit move.
4351 Opc = X86::VMOVAPSZrr;
     // NOTE(review): the declaration of `TRI` is on a line missing from this
     // listing. Both registers are widened to the VR512 super-register that
     // contains them as sub_xmm.
4353 DestReg =
4354 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4355 SrcReg =
4356 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4357 }
4358 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4359 if (HasVLX)
4360 Opc = X86::VMOVAPSZ256rr;
4361 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4362 Opc = X86::VMOVAPSYrr;
4363 else {
4364 // If this is an extended register and we don't have VLX we need to use a
4365 // 512-bit move.
4366 Opc = X86::VMOVAPSZrr;
     // NOTE(review): `TRI` is again declared on a missing line. Same widening
     // as above, this time via sub_ymm.
4368 DestReg =
4369 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4370 SrcReg =
4371 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4372 }
4373 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4374 Opc = X86::VMOVAPSZrr;
4375 // All KMASK RegClasses hold the same k registers, can be tested against
4376 // anyone.
4377 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
4378 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4379 : (HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk);
4380
     // No same-class opcode found: try the cross-register-file copies.
4381 if (!Opc)
4382 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4383
4384 if (Opc) {
4385 BuildMI(MBB, MI, DL, get(Opc), DestReg)
4386 .addReg(SrcReg, getKillRegState(KillSrc));
4387 return;
4388 }
4389
4390 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4391 // FIXME: We use a fatal error here because historically LLVM has tried
4392 // lower some of these physreg copies and we want to ensure we get
4393 // reasonable bug reports if someone encounters a case no other testing
4394 // found. This path should be removed after the LLVM 7 release.
     // NOTE(review): this file is from LLVM 23.0.0git, so the FIXME above is
     // long overdue; the path is still present.
4395 report_fatal_error("Unable to copy EFLAGS physical register!");
4396 }
4397
4398 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4399 << RI.getName(DestReg) << '\n');
4400 report_fatal_error("Cannot emit physreg copy instruction");
4401}
4402
4403std::optional<DestSourcePair>
     // NOTE(review): the signature line (function name and the `MI`
     // parameter) is missing from this listing.
     // Returns the {destination, source} operand pair (operands 0 and 1) when
     // MI is a register move, and std::nullopt otherwise.
4405 if (MI.isMoveReg()) {
4406 // FIXME: Dirty hack for apparent invariant that doesn't hold when
4407 // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4408 // were asserted as 0 are now undef.
     // A move whose destination is an undef sub-register write is not
     // reported as a copy.
4409 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4410 return std::nullopt;
4411
4412 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4413 }
4414 return std::nullopt;
4415}
4416
4417static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4418 if (STI.hasFP16())
4419 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4420 if (Load)
4421 return X86::MOVSHPrm;
4422 return X86::MOVSHPmr;
4423}
4424
4426 const TargetRegisterClass *RC,
4427 bool IsStackAligned,
4428 const X86Subtarget &STI, bool Load) {
     // NOTE(review): the first signature line (return type, function name and
     // the `Reg` parameter) is missing from this listing.
     // Picks the load (Load == true) or store opcode for a register of class
     // RC. Dispatch is on the spill size of RC in bytes; within a size the
     // register class and subtarget features select the concrete opcode.
     // IsStackAligned chooses aligned vs. unaligned vector moves.
4429 bool HasAVX = STI.hasAVX();
4430 bool HasAVX512 = STI.hasAVX512();
4431 bool HasVLX = STI.hasVLX();
4432 bool HasEGPR = STI.hasEGPR();
4433
4434 assert(RC != nullptr && "Invalid target register class");
4435 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4436 default:
4437 llvm_unreachable("Unknown spill size");
4438 case 1:
4439 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
     // The H-register check below is nested under the is64Bit() test (two
     // unbraced ifs), so the NOREX opcodes are only chosen in 64-bit mode.
4440 if (STI.is64Bit())
4441 // Copying to or from a physical H register on x86-64 requires a NOREX
4442 // move. Otherwise use a normal move.
4443 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4444 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4445 return Load ? X86::MOV8rm : X86::MOV8mr;
4446 case 2:
     // 2-byte spill: a 16-bit mask register class or GR16.
4447 if (X86::VK16RegClass.hasSubClassEq(RC))
4448 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4449 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4450 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4451 return Load ? X86::MOV16rm : X86::MOV16mr;
4452 case 4:
     // 4-byte spill: GR32, FR32X, RFP32, VK32, mask pairs, or FR16/FR16X.
4453 if (X86::GR32RegClass.hasSubClassEq(RC))
4454 return Load ? X86::MOV32rm : X86::MOV32mr;
4455 if (X86::FR32XRegClass.hasSubClassEq(RC))
4456 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4457 : HasAVX ? X86::VMOVSSrm_alt
4458 : X86::MOVSSrm_alt)
4459 : (HasAVX512 ? X86::VMOVSSZmr
4460 : HasAVX ? X86::VMOVSSmr
4461 : X86::MOVSSmr);
4462 if (X86::RFP32RegClass.hasSubClassEq(RC))
4463 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4464 if (X86::VK32RegClass.hasSubClassEq(RC)) {
4465 assert(STI.hasBWI() && "KMOVD requires BWI");
4466 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4467 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4468 }
4469 // All of these mask pair classes have the same spill size, the same kind
4470 // of kmov instructions can be used with all of them.
4471 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4472 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4473 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4474 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4475 X86::VK16PAIRRegClass.hasSubClassEq(RC))
4476 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
     // FR16/FR16X share this 4-byte case and defer to the FP16 helper.
4477 if (X86::FR16RegClass.hasSubClassEq(RC) ||
4478 X86::FR16XRegClass.hasSubClassEq(RC))
4479 return getLoadStoreOpcodeForFP16(Load, STI);
4480 llvm_unreachable("Unknown 4-byte regclass");
4481 case 8:
     // 8-byte spill: GR64, FR64X, VR64 (MMX), RFP64, or VK64.
4482 if (X86::GR64RegClass.hasSubClassEq(RC))
4483 return Load ? X86::MOV64rm : X86::MOV64mr;
4484 if (X86::FR64XRegClass.hasSubClassEq(RC))
4485 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4486 : HasAVX ? X86::VMOVSDrm_alt
4487 : X86::MOVSDrm_alt)
4488 : (HasAVX512 ? X86::VMOVSDZmr
4489 : HasAVX ? X86::VMOVSDmr
4490 : X86::MOVSDmr);
4491 if (X86::VR64RegClass.hasSubClassEq(RC))
4492 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4493 if (X86::RFP64RegClass.hasSubClassEq(RC))
4494 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4495 if (X86::VK64RegClass.hasSubClassEq(RC)) {
4496 assert(STI.hasBWI() && "KMOVQ requires BWI");
4497 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4498 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4499 }
4500 llvm_unreachable("Unknown 8-byte regclass");
4501 case 10:
4502 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4503 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4504 case 16: {
4505 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4506 // If stack is realigned we can use aligned stores.
     // Preference order: VLX form, AVX512 without VLX (_NOVLX), AVX, SSE.
4507 if (IsStackAligned)
4508 return Load ? (HasVLX ? X86::VMOVAPSZ128rm
4509 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4510 : HasAVX ? X86::VMOVAPSrm
4511 : X86::MOVAPSrm)
4512 : (HasVLX ? X86::VMOVAPSZ128mr
4513 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4514 : HasAVX ? X86::VMOVAPSmr
4515 : X86::MOVAPSmr);
4516 else
4517 return Load ? (HasVLX ? X86::VMOVUPSZ128rm
4518 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4519 : HasAVX ? X86::VMOVUPSrm
4520 : X86::MOVUPSrm)
4521 : (HasVLX ? X86::VMOVUPSZ128mr
4522 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4523 : HasAVX ? X86::VMOVUPSmr
4524 : X86::MOVUPSmr);
4525 }
4526 llvm_unreachable("Unknown 16-byte regclass");
4527 }
4528 case 32:
4529 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4530 // If stack is realigned we can use aligned stores.
     // Unlike the 16-byte case there is no SSE fallback: the last choice is
     // the VMOVAPSY/VMOVUPSY form.
4531 if (IsStackAligned)
4532 return Load ? (HasVLX ? X86::VMOVAPSZ256rm
4533 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4534 : X86::VMOVAPSYrm)
4535 : (HasVLX ? X86::VMOVAPSZ256mr
4536 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4537 : X86::VMOVAPSYmr);
4538 else
4539 return Load ? (HasVLX ? X86::VMOVUPSZ256rm
4540 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4541 : X86::VMOVUPSYrm)
4542 : (HasVLX ? X86::VMOVUPSZ256mr
4543 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4544 : X86::VMOVUPSYmr);
4545 case 64:
4546 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4547 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4548 if (IsStackAligned)
4549 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4550 else
4551 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4552 case 1024:
4553 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4554 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
     // Local helper macro: append _EVEX to the opcode name when EGPR is
     // available. It is undefined again right after use.
4555#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4556 return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4557 : GET_EGPR_IF_ENABLED(X86::TILESTORED);
4558#undef GET_EGPR_IF_ENABLED
4559 }
4560}
4561
4562std::optional<ExtAddrMode>
     // NOTE(review): the signature line (function name and the `MemI`
     // parameter) is missing from this listing.
     // Decodes the memory reference of MemI into an ExtAddrMode
     // (base register, scaled register, scale, displacement). Returns
     // std::nullopt when MemI has no memory operand, when the base operand is
     // not a register, or when the displacement is not an immediate.
     // TRI is not read in the visible body.
4564 const TargetRegisterInfo *TRI) const {
4565 const MCInstrDesc &Desc = MemI.getDesc();
     // A negative result means the instruction has no memory operand.
4566 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4567 if (MemRefBegin < 0)
4568 return std::nullopt;
4569
     // Shift by the operand bias to get the index into MemI's operand list.
4570 MemRefBegin += X86II::getOperandBias(Desc);
4571
4572 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4573 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4574 return std::nullopt;
4575
4576 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4577 // Displacement can be symbolic
4578 if (!DispMO.isImm())
4579 return std::nullopt;
4580
4581 ExtAddrMode AM;
4582 AM.BaseReg = BaseOp.getReg();
4583 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4584 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4585 AM.Displacement = DispMO.getImm();
4586 return AM;
4587}
4588
4590 StringRef &ErrInfo) const {
     // NOTE(review): the first signature line (function name and the `MI`
     // parameter) and one body line after the `AM` declaration are missing
     // from this listing.
     // Checks the addressing mode of MI. Returns true when it is acceptable
     // (or when no addressing mode could be extracted); otherwise sets
     // ErrInfo to a description of the problem and returns false.
4591 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
     // Nothing to verify if the address could not be decoded.
4592 if (!AMOrNone)
4593 return true;
4594
4595 ExtAddrMode AM = *AMOrNone;
     // The scale is only validated when a scaled (index) register is present.
4597 if (AM.ScaledReg != X86::NoRegister) {
4598 switch (AM.Scale) {
4599 case 1:
4600 case 2:
4601 case 4:
4602 case 8:
4603 break;
4604 default:
4605 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4606 return false;
4607 }
4608 }
4609 if (!isInt<32>(AM.Displacement)) {
4610 ErrInfo = "Displacement in address must fit into 32-bit signed "
4611 "integer";
4612 return false;
4613 }
4614
4615 return true;
4616}
4617
4619 const Register Reg,
4620 int64_t &ImmVal) const {
     // NOTE(review): the first signature line (function name and the `MI`
     // parameter) is missing from this listing.
     // If MI (looking through one SUBREG_TO_REG of sub_32bit) is a
     // move-immediate that defines the register in question, stores the
     // constant in ImmVal and returns true. Returns false otherwise, leaving
     // ImmVal unchanged.
4621 Register MovReg = Reg;
4622 const MachineInstr *MovMI = &MI;
4623
4624 // Follow use-def for SUBREG_TO_REG to find the real move immediate
4625 // instruction. It is quite common for x86-64.
4626 if (MI.isSubregToReg()) {
4627 // We use following pattern to setup 64b immediate.
4628 // %8:gr32 = MOV32r0 implicit-def dead $eflags
4629 // %6:gr64 = SUBREG_TO_REG killed %8:gr32, %subreg.sub_32bit
4630 unsigned SubIdx = MI.getOperand(2).getImm();
4631 MovReg = MI.getOperand(1).getReg();
     // Only the 32-bit sub-register form is followed.
4632 if (SubIdx != X86::sub_32bit)
4633 return false;
4634 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
     // Give up unless the source register has a single defining instruction.
4635 MovMI = MRI.getUniqueVRegDef(MovReg);
4636 if (!MovMI)
4637 return false;
4638 }
4639
     // MOV32r0 defining MovReg is reported as the constant 0.
4640 if (MovMI->getOpcode() == X86::MOV32r0 &&
4641 MovMI->getOperand(0).getReg() == MovReg) {
4642 ImmVal = 0;
4643 return true;
4644 }
4645
     // Otherwise only these four move-immediate opcodes are recognised.
4646 if (MovMI->getOpcode() != X86::MOV32ri &&
4647 MovMI->getOpcode() != X86::MOV64ri &&
4648 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4649 return false;
4650 // Mov Src can be a global address.
4651 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4652 return false;
4653 ImmVal = MovMI->getOperand(1).getImm();
4654 return true;
4655}
4656
4658 const MachineInstr *MI, const Register NullValueReg,
4659 const TargetRegisterInfo *TRI) const {
     // NOTE(review): the first signature line (return type and function name)
     // is missing from this listing.
     // Returns true when MI leaves a zero ("null") value held in NullValueReg
     // intact: either MI does not modify the register at all, or it is one of
     // the specific shift/move forms handled below.
4660 if (!MI->modifiesRegister(NullValueReg, TRI))
4661 return true;
4662 switch (MI->getOpcode()) {
4663 // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax
4664 // X.
4665 case X86::SHR64ri:
4666 case X86::SHR32ri:
4667 case X86::SHL64ri:
4668 case X86::SHL32ri:
4669 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4670 "expected for shift opcode!");
     // Both the defined register and the shifted register must be
     // NullValueReg itself.
4671 return MI->getOperand(0).getReg() == NullValueReg &&
4672 MI->getOperand(1).getReg() == NullValueReg;
4673 // Zero extend of a sub-reg of NullValueReg into itself does not change the
4674 // null value.
4675 case X86::MOV32rr:
     // Every operand must be NullValueReg or one of its sub-registers.
4676 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4677 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4678 });
4679 default:
4680 return false;
4681 }
     // Every switch path above returns, so this is never reached.
4682 llvm_unreachable("Should be handled above!");
4683}
4684
4687 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
4688 const TargetRegisterInfo *TRI) const {
     // NOTE(review): the leading signature lines (function name, MemOp,
     // BaseOps) are missing from this listing.
     // Succeeds only for the simple addressing form "base register +
     // immediate displacement" (scale 1, no index register). On success it
     // appends the base operand to BaseOps, sets Offset, Width and
     // OffsetIsScalable (always false) and returns true.
     // TRI is not read in the visible body.
4689 const MCInstrDesc &Desc = MemOp.getDesc();
4690 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4691 if (MemRefBegin < 0)
4692 return false;
4693
4694 MemRefBegin += X86II::getOperandBias(Desc);
4695
4696 const MachineOperand *BaseOp =
4697 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4698 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4699 return false;
4700
4701 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4702 return false;
4703
4704 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4705 X86::NoRegister)
4706 return false;
4707
4708 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4709
4710 // Displacement can be symbolic
4711 if (!DispMO.isImm())
4712 return false;
4713
     // Offset is written before the check below, so it may already have been
     // modified when that check returns false.
4714 Offset = DispMO.getImm();
4715
     // NOTE(review): this repeats the BaseOp->isReg() test made earlier in
     // this function; BaseOp is not reassigned in between, so it cannot fail
     // here.
4716 if (!BaseOp->isReg())
4717 return false;
4718
4719 OffsetIsScalable = false;
4720 // FIXME: Relying on memoperands() may not be right thing to do here. Check
4721 // with X86 maintainers, and fix it accordingly. For now, it is ok, since
4722 // there is no use of `Width` for X86 back-end at the moment.
     // NOTE(review): the else-branch of this conditional expression is on a
     // line missing from this listing.
4723 Width = !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize()
4725 BaseOps.push_back(BaseOp);
4726 return true;
4727}
4728
4729static unsigned getStoreRegOpcode(Register SrcReg,
4730 const TargetRegisterClass *RC,
4731 bool IsStackAligned,
4732 const X86Subtarget &STI) {
4733 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4734}
4735
4736static unsigned getLoadRegOpcode(Register DestReg,
4737 const TargetRegisterClass *RC,
4738 bool IsStackAligned, const X86Subtarget &STI) {
4739 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4740}
4741
4742static bool isAMXOpcode(unsigned Opc) {
4743 switch (Opc) {
4744 default:
4745 return false;
4746 case X86::TILELOADD:
4747 case X86::TILESTORED:
4748 case X86::TILELOADD_EVEX:
4749 case X86::TILESTORED_EVEX:
4750 return true;
4751 }
4752}
4753
4756 unsigned Opc, Register Reg, int FrameIdx,
4757 bool isKill) const {
     // NOTE(review): the leading signature lines (function name, MBB, MI) and
     // the declarations of `MO` in both cases are missing from this listing.
     // Emits a tile store or tile load of Reg to/from stack slot FrameIdx.
     // In both cases a fresh GR64_NOSP virtual register is loaded with the
     // immediate 64 and then installed, marked killed, into operand `MO` of
     // the new instruction.
     // Only the four tile load/store opcodes are accepted.
4758 switch (Opc) {
4759 default:
4760 llvm_unreachable("Unexpected special opcode!");
4761 case X86::TILESTORED:
4762 case X86::TILESTORED_EVEX: {
4763 // tilestored %tmm, (%sp, %idx)
4764 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4765 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4766 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4767 MachineInstr *NewMI =
4768 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4769 .addReg(Reg, getKillRegState(isKill));
     // NOTE(review): presumably `MO` is the index-register operand of NewMI's
     // address; its declaration is not visible here.
4771 MO.setReg(VirtReg);
4772 MO.setIsKill(true);
4773 break;
4774 }
4775 case X86::TILELOADD:
4776 case X86::TILELOADD_EVEX: {
4777 // tileloadd (%sp, %idx), %tmm
4778 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4779 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4780 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
     // NOTE(review): the start of this statement and the following `MO`
     // declaration are on lines missing from this listing. isKill is not
     // used in the visible part of this case.
4782 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
4784 MO.setReg(VirtReg);
4785 MO.setIsKill(true);
4786 break;
4787 }
4788 }
4789}
4790
4793 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4794
4795 Register VReg, MachineInstr::MIFlag Flags) const {
     // NOTE(review): the leading signature lines (function name, MBB, MI,
     // SrcReg) are missing from this listing.
     // Emits a store of SrcReg (class RC) to stack slot FrameIdx before MI.
     // VReg is not read in the visible body.
4796 const MachineFunction &MF = *MBB.getParent();
4797 const MachineFrameInfo &MFI = MF.getFrameInfo();
4798 assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
4799 "Stack slot too small for store");
4800
     // Alignment wanted for the slot: the spill size, but at least 16.
4801 unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
     // The slot counts as aligned if the stack alignment already reaches
     // that value, or if the stack can be realigned and the slot is not a
     // fixed object.
4802 bool isAligned =
4803 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4804 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4805
4806 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
     // Tile stores need the extra stride register and go through
     // loadStoreTileReg; note that Flags is not applied on that path.
4807 if (isAMXOpcode(Opc))
4808 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4809 else
4810 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4811 .addReg(SrcReg, getKillRegState(isKill))
4812 .setMIFlag(Flags);
4813}
4814
4817 Register DestReg, int FrameIdx,
4818 const TargetRegisterClass *RC,
4819 Register VReg, unsigned SubReg,
4820 MachineInstr::MIFlag Flags) const {
     // NOTE(review): the leading signature lines (function name, MBB, MI) are
     // missing from this listing.
     // Emits a load of DestReg (class RC) from stack slot FrameIdx before MI.
     // VReg and SubReg are not read in the visible body.
4821 const MachineFunction &MF = *MBB.getParent();
4822 const MachineFrameInfo &MFI = MF.getFrameInfo();
4823 assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
4824 "Load size exceeds stack slot");
     // Alignment wanted for the slot: the spill size, but at least 16.
4825 unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
     // Aligned if the stack alignment already reaches that value, or if the
     // stack can be realigned and the slot is not a fixed object.
4826 bool isAligned =
4827 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4828 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4829
4830 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
     // Tile loads go through loadStoreTileReg, called here without an
     // explicit isKill argument (presumably a default in the declaration —
     // TODO confirm); Flags is not applied on that path.
4831 if (isAMXOpcode(Opc))
4832 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4833 else
4834 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx)
4835 .setMIFlag(Flags);
4836}
4837
4839 Register &SrcReg2, int64_t &CmpMask,
4840 int64_t &CmpValue) const {
     // NOTE(review): the first signature line (function name, MI, SrcReg) is
     // missing from this listing.
     // Recognises CMP, SUB and TEST forms and describes them through the
     // output parameters: SrcReg/SrcReg2 are the compared registers (SrcReg2
     // is 0 when there is no second register), CmpMask is ~0 when CmpValue
     // holds a known immediate and 0 otherwise. Returns false for any other
     // opcode and for TEST rr with two different registers.
4841 switch (MI.getOpcode()) {
4842 default:
4843 break;
4844 case X86::CMP64ri32:
4845 case X86::CMP32ri:
4846 case X86::CMP16ri:
4847 case X86::CMP8ri:
     // CMP reg, imm: operand 0 is the register, operand 1 the immediate.
4848 SrcReg = MI.getOperand(0).getReg();
4849 SrcReg2 = 0;
4850 if (MI.getOperand(1).isImm()) {
4851 CmpMask = ~0;
4852 CmpValue = MI.getOperand(1).getImm();
4853 } else {
     // Operand 1 is not a plain immediate: report no known value.
4854 CmpMask = CmpValue = 0;
4855 }
4856 return true;
4857 // A SUB can be used to perform comparison.
4858 CASE_ND(SUB64rm)
4859 CASE_ND(SUB32rm)
4860 CASE_ND(SUB16rm)
4861 CASE_ND(SUB8rm)
     // SUB reg, mem: only the register source (operand 1) is reported.
4862 SrcReg = MI.getOperand(1).getReg();
4863 SrcReg2 = 0;
4864 CmpMask = 0;
4865 CmpValue = 0;
4866 return true;
4867 CASE_ND(SUB64rr)
4868 CASE_ND(SUB32rr)
4869 CASE_ND(SUB16rr)
4870 CASE_ND(SUB8rr)
     // SUB reg, reg: the sources are operands 1 and 2.
4871 SrcReg = MI.getOperand(1).getReg();
4872 SrcReg2 = MI.getOperand(2).getReg();
4873 CmpMask = 0;
4874 CmpValue = 0;
4875 return true;
4876 CASE_ND(SUB64ri32)
4877 CASE_ND(SUB32ri)
4878 CASE_ND(SUB16ri)
4879 CASE_ND(SUB8ri)
     // SUB reg, imm: register in operand 1, immediate in operand 2.
4880 SrcReg = MI.getOperand(1).getReg();
4881 SrcReg2 = 0;
4882 if (MI.getOperand(2).isImm()) {
4883 CmpMask = ~0;
4884 CmpValue = MI.getOperand(2).getImm();
4885 } else {
4886 CmpMask = CmpValue = 0;
4887 }
4888 return true;
4889 case X86::CMP64rr:
4890 case X86::CMP32rr:
4891 case X86::CMP16rr:
4892 case X86::CMP8rr:
     // CMP reg, reg: the sources are operands 0 and 1.
4893 SrcReg = MI.getOperand(0).getReg();
4894 SrcReg2 = MI.getOperand(1).getReg();
4895 CmpMask = 0;
4896 CmpValue = 0;
4897 return true;
4898 case X86::TEST8rr:
4899 case X86::TEST16rr:
4900 case X86::TEST32rr:
4901 case X86::TEST64rr:
     // TEST reg, reg is only handled when both operands are the same
     // register. SrcReg has already been written when this returns false.
4902 SrcReg = MI.getOperand(0).getReg();
4903 if (MI.getOperand(1).getReg() != SrcReg)
4904 return false;
4905 // Compare against zero.
4906 SrcReg2 = 0;
4907 CmpMask = ~0;
4908 CmpValue = 0;
4909 return true;
4910 case X86::TEST64ri32:
4911 case X86::TEST32ri:
4912 case X86::TEST16ri:
4913 case X86::TEST8ri:
     // TEST reg, imm: the immediate is deliberately not reported.
4914 SrcReg = MI.getOperand(0).getReg();
4915 SrcReg2 = 0;
4916 // Force identical compare.
4917 CmpMask = 0;
4918 CmpValue = 0;
4919 return true;
4920 }
4921 return false;
4922}
4923
4924bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4925 Register SrcReg, Register SrcReg2,
4926 int64_t ImmMask, int64_t ImmValue,
4927 const MachineInstr &OI, bool *IsSwapped,
4928 int64_t *ImmDelta) const {
4929 switch (OI.getOpcode()) {
4930 case X86::CMP64rr:
4931 case X86::CMP32rr:
4932 case X86::CMP16rr:
4933 case X86::CMP8rr:
4934 CASE_ND(SUB64rr)
4935 CASE_ND(SUB32rr)
4936 CASE_ND(SUB16rr)
4937 CASE_ND(SUB8rr) {
4938 Register OISrcReg;
4939 Register OISrcReg2;
4940 int64_t OIMask;
4941 int64_t OIValue;
4942 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4943 OIMask != ImmMask || OIValue != ImmValue)
4944 return false;
4945 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4946 *IsSwapped = false;
4947 return true;
4948 }
4949 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4950 *IsSwapped = true;
4951 return true;
4952 }
4953 return false;
4954 }
4955 case X86::CMP64ri32:
4956 case X86::CMP32ri:
4957 case X86::CMP16ri:
4958 case X86::CMP8ri:
4959 case X86::TEST64ri32:
4960 case X86::TEST32ri:
4961 case X86::TEST16ri:
4962 case X86::TEST8ri:
4963 CASE_ND(SUB64ri32)
4964 CASE_ND(SUB32ri)
4965 CASE_ND(SUB16ri)
4966 CASE_ND(SUB8ri)
4967 case X86::TEST64rr:
4968 case X86::TEST32rr:
4969 case X86::TEST16rr:
4970 case X86::TEST8rr: {
4971 if (ImmMask != 0) {
4972 Register OISrcReg;
4973 Register OISrcReg2;
4974 int64_t OIMask;
4975 int64_t OIValue;
4976 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4977 SrcReg == OISrcReg && ImmMask == OIMask) {
4978 if (OIValue == ImmValue) {
4979 *ImmDelta = 0;
4980 return true;
4981 } else if (static_cast<uint64_t>(ImmValue) ==
4982 static_cast<uint64_t>(OIValue) - 1) {
4983 *ImmDelta = -1;
4984 return true;
4985 } else if (static_cast<uint64_t>(ImmValue) ==
4986 static_cast<uint64_t>(OIValue) + 1) {
4987 *ImmDelta = 1;
4988 return true;
4989 } else {
4990 return false;
4991 }
4992 }
4993 }
4994 return FlagI.isIdenticalTo(OI);
4995 }
4996 default:
4997 return false;
4998 }
4999}
5000
5001/// Check whether the definition can be converted
5002/// to remove a comparison against zero.
5003inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
5004 bool &ClearsOverflowFlag) {
5005 NoSignFlag = false;
5006 ClearsOverflowFlag = false;
5007
5008 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
5009 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
5010 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
5011 // on the EFLAGS modification of ADD actually happening in the final binary.
5012 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
5013 unsigned Flags = MI.getOperand(5).getTargetFlags();
5014 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
5015 Flags == X86II::MO_GOTNTPOFF)
5016 return false;
5017 }
5018
5019 switch (MI.getOpcode()) {
5020 default:
5021 return false;
5022
5023 // The shift instructions only modify ZF if their shift count is non-zero.
5024 // N.B.: The processor truncates the shift count depending on the encoding.
5025 CASE_ND(SAR8ri)
5026 CASE_ND(SAR16ri)
5027 CASE_ND(SAR32ri)
5028 CASE_ND(SAR64ri)
5029 CASE_ND(SHR8ri)
5030 CASE_ND(SHR16ri)
5031 CASE_ND(SHR32ri)
5032 CASE_ND(SHR64ri)
5033 return getTruncatedShiftCount(MI, 2) != 0;
5034
5035 // Some left shift instructions can be turned into LEA instructions but only
5036 // if their flags aren't used. Avoid transforming such instructions.
5037 CASE_ND(SHL8ri)
5038 CASE_ND(SHL16ri)
5039 CASE_ND(SHL32ri)
5040 CASE_ND(SHL64ri) {
5041 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
5042 if (isTruncatedShiftCountForLEA(ShAmt))
5043 return false;
5044 return ShAmt != 0;
5045 }
5046
5047 CASE_ND(SHRD16rri8)
5048 CASE_ND(SHRD32rri8)
5049 CASE_ND(SHRD64rri8)
5050 CASE_ND(SHLD16rri8)
5051 CASE_ND(SHLD32rri8)
5052 CASE_ND(SHLD64rri8)
5053 return getTruncatedShiftCount(MI, 3) != 0;
5054
5055 CASE_ND(SUB64ri32)
5056 CASE_ND(SUB32ri)
5057 CASE_ND(SUB16ri)
5058 CASE_ND(SUB8ri)
5059 CASE_ND(SUB64rr)
5060 CASE_ND(SUB32rr)
5061 CASE_ND(SUB16rr)
5062 CASE_ND(SUB8rr)
5063 CASE_ND(SUB64rm)
5064 CASE_ND(SUB32rm)
5065 CASE_ND(SUB16rm)
5066 CASE_ND(SUB8rm)
5067 CASE_ND(DEC64r)
5068 CASE_ND(DEC32r)
5069 CASE_ND(DEC16r)
5070 CASE_ND(DEC8r)
5071 CASE_ND(ADD64ri32)
5072 CASE_ND(ADD32ri)
5073 CASE_ND(ADD16ri)
5074 CASE_ND(ADD8ri)
5075 CASE_ND(ADD64rr)
5076 CASE_ND(ADD32rr)
5077 CASE_ND(ADD16rr)
5078 CASE_ND(ADD8rr)
5079 CASE_ND(ADD64rm)
5080 CASE_ND(ADD32rm)
5081 CASE_ND(ADD16rm)
5082 CASE_ND(ADD8rm)
5083 CASE_ND(INC64r)
5084 CASE_ND(INC32r)
5085 CASE_ND(INC16r)
5086 CASE_ND(INC8r)
5087 CASE_ND(ADC64ri32)
5088 CASE_ND(ADC32ri)
5089 CASE_ND(ADC16ri)
5090 CASE_ND(ADC8ri)
5091 CASE_ND(ADC64rr)
5092 CASE_ND(ADC32rr)
5093 CASE_ND(ADC16rr)
5094 CASE_ND(ADC8rr)
5095 CASE_ND(ADC64rm)
5096 CASE_ND(ADC32rm)
5097 CASE_ND(ADC16rm)
5098 CASE_ND(ADC8rm)
5099 CASE_ND(SBB64ri32)
5100 CASE_ND(SBB32ri)
5101 CASE_ND(SBB16ri)
5102 CASE_ND(SBB8ri)
5103 CASE_ND(SBB64rr)
5104 CASE_ND(SBB32rr)
5105 CASE_ND(SBB16rr)
5106 CASE_ND(SBB8rr)
5107 CASE_ND(SBB64rm)
5108 CASE_ND(SBB32rm)
5109 CASE_ND(SBB16rm)
5110 CASE_ND(SBB8rm)
5111 CASE_ND(NEG8r)
5112 CASE_ND(NEG16r)
5113 CASE_ND(NEG32r)
5114 CASE_ND(NEG64r)
5115 case X86::LZCNT16rr:
5116 case X86::LZCNT16rm:
5117 case X86::LZCNT32rr:
5118 case X86::LZCNT32rm:
5119 case X86::LZCNT64rr:
5120 case X86::LZCNT64rm:
5121 case X86::POPCNT16rr:
5122 case X86::POPCNT16rm:
5123 case X86::POPCNT32rr:
5124 case X86::POPCNT32rm:
5125 case X86::POPCNT64rr:
5126 case X86::POPCNT64rm:
5127 case X86::TZCNT16rr:
5128 case X86::TZCNT16rm:
5129 case X86::TZCNT32rr:
5130 case X86::TZCNT32rm:
5131 case X86::TZCNT64rr:
5132 case X86::TZCNT64rm:
5133 return true;
5134 CASE_ND(AND64ri32)
5135 CASE_ND(AND32ri)
5136 CASE_ND(AND16ri)
5137 CASE_ND(AND8ri)
5138 CASE_ND(AND64rr)
5139 CASE_ND(AND32rr)
5140 CASE_ND(AND16rr)
5141 CASE_ND(AND8rr)
5142 CASE_ND(AND64rm)
5143 CASE_ND(AND32rm)
5144 CASE_ND(AND16rm)
5145 CASE_ND(AND8rm)
5146 CASE_ND(XOR64ri32)
5147 CASE_ND(XOR32ri)
5148 CASE_ND(XOR16ri)
5149 CASE_ND(XOR8ri)
5150 CASE_ND(XOR64rr)
5151 CASE_ND(XOR32rr)
5152 CASE_ND(XOR16rr)
5153 CASE_ND(XOR8rr)
5154 CASE_ND(XOR64rm)
5155 CASE_ND(XOR32rm)
5156 CASE_ND(XOR16rm)
5157 CASE_ND(XOR8rm)
5158 CASE_ND(OR64ri32)
5159 CASE_ND(OR32ri)
5160 CASE_ND(OR16ri)
5161 CASE_ND(OR8ri)
5162 CASE_ND(OR64rr)
5163 CASE_ND(OR32rr)
5164 CASE_ND(OR16rr)
5165 CASE_ND(OR8rr)
5166 CASE_ND(OR64rm)
5167 CASE_ND(OR32rm)
5168 CASE_ND(OR16rm)
5169 CASE_ND(OR8rm)
5170 case X86::ANDN32rr:
5171 case X86::ANDN32rm:
5172 case X86::ANDN64rr:
5173 case X86::ANDN64rm:
5174 case X86::BLSI32rr:
5175 case X86::BLSI32rm:
5176 case X86::BLSI64rr:
5177 case X86::BLSI64rm:
5178 case X86::BLSMSK32rr:
5179 case X86::BLSMSK32rm:
5180 case X86::BLSMSK64rr:
5181 case X86::BLSMSK64rm:
5182 case X86::BLSR32rr:
5183 case X86::BLSR32rm:
5184 case X86::BLSR64rr:
5185 case X86::BLSR64rm:
5186 case X86::BLCFILL32rr:
5187 case X86::BLCFILL32rm:
5188 case X86::BLCFILL64rr:
5189 case X86::BLCFILL64rm:
5190 case X86::BLCI32rr:
5191 case X86::BLCI32rm:
5192 case X86::BLCI64rr:
5193 case X86::BLCI64rm:
5194 case X86::BLCIC32rr:
5195 case X86::BLCIC32rm:
5196 case X86::BLCIC64rr:
5197 case X86::BLCIC64rm:
5198 case X86::BLCMSK32rr:
5199 case X86::BLCMSK32rm:
5200 case X86::BLCMSK64rr:
5201 case X86::BLCMSK64rm:
5202 case X86::BLCS32rr:
5203 case X86::BLCS32rm:
5204 case X86::BLCS64rr:
5205 case X86::BLCS64rm:
5206 case X86::BLSFILL32rr:
5207 case X86::BLSFILL32rm:
5208 case X86::BLSFILL64rr:
5209 case X86::BLSFILL64rm:
5210 case X86::BLSIC32rr:
5211 case X86::BLSIC32rm:
5212 case X86::BLSIC64rr:
5213 case X86::BLSIC64rm:
5214 case X86::BZHI32rr:
5215 case X86::BZHI32rm:
5216 case X86::BZHI64rr:
5217 case X86::BZHI64rm:
5218 case X86::T1MSKC32rr:
5219 case X86::T1MSKC32rm:
5220 case X86::T1MSKC64rr:
5221 case X86::T1MSKC64rm:
5222 case X86::TZMSK32rr:
5223 case X86::TZMSK32rm:
5224 case X86::TZMSK64rr:
5225 case X86::TZMSK64rm:
5226 // These instructions clear the overflow flag just like TEST.
5227 // FIXME: These are not the only instructions in this switch that clear the
5228 // overflow flag.
5229 ClearsOverflowFlag = true;
5230 return true;
5231 case X86::BEXTR32rr:
5232 case X86::BEXTR64rr:
5233 case X86::BEXTR32rm:
5234 case X86::BEXTR64rm:
5235 case X86::BEXTRI32ri:
5236 case X86::BEXTRI32mi:
5237 case X86::BEXTRI64ri:
5238 case X86::BEXTRI64mi:
5239 // BEXTR doesn't update the sign flag so we can't use it. It does clear
5240 // the overflow flag, but that's not useful without the sign flag.
5241 NoSignFlag = true;
5242 return true;
5243 }
5244}
5245
5246/// Check whether the use can be converted to remove a comparison against zero.
5247/// Returns the EFLAGS condition and the operand that we are comparing against zero.
5248static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
5249 switch (MI.getOpcode()) {
5250 default:
5251 return std::make_pair(X86::COND_INVALID, ~0U);
5252 CASE_ND(NEG8r)
5253 CASE_ND(NEG16r)
5254 CASE_ND(NEG32r)
5255 CASE_ND(NEG64r)
5256 return std::make_pair(X86::COND_AE, 1U);
5257 case X86::LZCNT16rr:
5258 case X86::LZCNT32rr:
5259 case X86::LZCNT64rr:
5260 return std::make_pair(X86::COND_B, 1U);
5261 case X86::POPCNT16rr:
5262 case X86::POPCNT32rr:
5263 case X86::POPCNT64rr:
5264 return std::make_pair(X86::COND_E, 1U);
5265 case X86::TZCNT16rr:
5266 case X86::TZCNT32rr:
5267 case X86::TZCNT64rr:
5268 return std::make_pair(X86::COND_B, 1U);
5269 case X86::BSF16rr:
5270 case X86::BSF32rr:
5271 case X86::BSF64rr:
5272 case X86::BSR16rr:
5273 case X86::BSR32rr:
5274 case X86::BSR64rr:
5275 return std::make_pair(X86::COND_E, 2U);
5276 case X86::BLSI32rr:
5277 case X86::BLSI64rr:
5278 return std::make_pair(X86::COND_AE, 1U);
5279 case X86::BLSR32rr:
5280 case X86::BLSR64rr:
5281 case X86::BLSMSK32rr:
5282 case X86::BLSMSK64rr:
5283 return std::make_pair(X86::COND_B, 1U);
5284 // TODO: TBM instructions.
5285 }
5286}
5287
5288 /// Check if there exists an earlier instruction that
5289 /// operates on the same source operands and sets flags in the same way as
5290 /// Compare; remove Compare if possible.
///
/// On success CmpInstr is erased, the EFLAGS def of the earlier instruction is
/// marked live (not dead), the condition codes of the EFLAGS users collected
/// while scanning forward are rewritten, and EFLAGS is added as a live-in to
/// every block between the compare's block and the flag producer's block.
///
/// \returns true if CmpInstr was erased, false if it was kept. Note that a SUB
/// whose result has no non-debug use is rewritten to the matching CMP up
/// front, and that rewrite is kept even when false is returned.
5292 Register SrcReg2, int64_t CmpMask,
5293 int64_t CmpValue,
5294 const MachineRegisterInfo *MRI) const {
5295 // Check whether we can replace SUB with CMP.
5296 switch (CmpInstr.getOpcode()) {
5297 default:
5298 break;
5299 CASE_ND(SUB64ri32)
5300 CASE_ND(SUB32ri)
5301 CASE_ND(SUB16ri)
5302 CASE_ND(SUB8ri)
5303 CASE_ND(SUB64rm)
5304 CASE_ND(SUB32rm)
5305 CASE_ND(SUB16rm)
5306 CASE_ND(SUB8rm)
5307 CASE_ND(SUB64rr)
5308 CASE_ND(SUB32rr)
5309 CASE_ND(SUB16rr)
5310 CASE_ND(SUB8rr) {
5311 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
5312 return false;
5313 // There is no use of the destination register, we can replace SUB with CMP.
5314 unsigned NewOpcode = 0;
5315#define FROM_TO(A, B) \
5316 CASE_ND(A) NewOpcode = X86::B; \
5317 break;
5318 switch (CmpInstr.getOpcode()) {
5319 default:
5320 llvm_unreachable("Unreachable!");
5321 FROM_TO(SUB64rm, CMP64rm)
5322 FROM_TO(SUB32rm, CMP32rm)
5323 FROM_TO(SUB16rm, CMP16rm)
5324 FROM_TO(SUB8rm, CMP8rm)
5325 FROM_TO(SUB64rr, CMP64rr)
5326 FROM_TO(SUB32rr, CMP32rr)
5327 FROM_TO(SUB16rr, CMP16rr)
5328 FROM_TO(SUB8rr, CMP8rr)
5329 FROM_TO(SUB64ri32, CMP64ri32)
5330 FROM_TO(SUB32ri, CMP32ri)
5331 FROM_TO(SUB16ri, CMP16ri)
5332 FROM_TO(SUB8ri, CMP8ri)
5333 }
5334#undef FROM_TO
5335 CmpInstr.setDesc(get(NewOpcode));
5336 CmpInstr.removeOperand(0);
5337 // Mutating this instruction invalidates any debug data associated with it.
5338 CmpInstr.dropDebugNumber();
5339 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
// The memory forms (CMPrm) are not optimized any further here.
5340 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
5341 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
5342 return false;
5343 }
5344 }
5345
5346 // The following code tries to remove the comparison by re-using EFLAGS
5347 // from earlier instructions.
5348
// A compare against zero: non-zero mask and a zero compare value.
5349 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
5350
5351 // Transformation currently requires SSA values.
5352 if (SrcReg2.isPhysical())
5353 return false;
5354 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
5355 assert(SrcRegDef && "Must have a definition (SSA)");
5356
5357 MachineInstr *MI = nullptr;
5358 MachineInstr *Sub = nullptr;
5359 MachineInstr *Movr0Inst = nullptr;
5361 bool NoSignFlag = false;
5362 bool ClearsOverflowFlag = false;
5363 bool ShouldUpdateCC = false;
5364 bool IsSwapped = false;
5365 bool HasNF = Subtarget.hasNF();
5366 unsigned OpNo = 0;
5368 int64_t ImmDelta = 0;
5369
5370 // Search backward from CmpInstr for the next instruction defining EFLAGS.
5372 MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
5374 std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
5375 for (MachineBasicBlock *MBB = &CmpMBB;;) {
5376 for (MachineInstr &Inst : make_range(From, MBB->rend())) {
5377 // Try to use EFLAGS from the instruction defining %SrcReg. Example:
5378 // %eax = addl ...
5379 // ... // EFLAGS not changed
5380 // testl %eax, %eax // <-- can be removed
5381 if (&Inst == SrcRegDef) {
5382 if (IsCmpZero &&
5383 isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
5384 MI = &Inst;
5385 break;
5386 }
5387
5388 // Look back for the following pattern, in which case the
5389 // test16rr/test64rr instruction could be erased.
5390 //
5391 // Example for test16rr:
5392 // %reg = and32ri %in_reg, 5
5393 // ... // EFLAGS not changed.
5394 // %src_reg = copy %reg.sub_16bit:gr32
5395 // test16rr %src_reg, %src_reg, implicit-def $eflags
5396 // Example for test64rr:
5397 // %reg = and32ri %in_reg, 5
5398 // ... // EFLAGS not changed.
5399 // %src_reg = subreg_to_reg %reg, %subreg.sub_index
5400 // test64rr %src_reg, %src_reg, implicit-def $eflags
5401 MachineInstr *AndInstr = nullptr;
5402 if (IsCmpZero &&
5403 findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
5404 Subtarget, NoSignFlag, ClearsOverflowFlag)) {
5405 assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
5406 MI = AndInstr;
5407 break;
5408 }
5409 // Cannot find other candidates before definition of SrcReg.
5410 return false;
5411 }
5412
5413 if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
5414 // Try to use EFLAGS produced by an instruction reading %SrcReg.
5415 // Example:
5416 // %eax = ...
5417 // ...
5418 // popcntl %eax
5419 // ... // EFLAGS not changed
5420 // testl %eax, %eax // <-- can be removed
5421 if (IsCmpZero) {
5422 std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
5423 if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
5424 Inst.getOperand(OpNo).getReg() == SrcReg) {
5425 ShouldUpdateCC = true;
5426 MI = &Inst;
5427 break;
5428 }
5429 }
5430
5431 // Try to use EFLAGS from an instruction with similar flag results.
5432 // Example:
5433 // sub x, y or cmp x, y
5434 // ... // EFLAGS not changed
5435 // cmp x, y // <-- can be removed
5436 if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
5437 Inst, &IsSwapped, &ImmDelta)) {
5438 Sub = &Inst;
5439 break;
5440 }
5441
5442 // MOV32r0 is implemented with xor which clobbers condition code. It is
5443 // safe to move up, if the definition to EFLAGS is dead and earlier
5444 // instructions do not read or write EFLAGS.
5445 if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
5446 Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
5447 Movr0Inst = &Inst;
5448 continue;
5449 }
5450
5451 // For ADDrm/ADDmr instructions with relocation, we skip the
5452 // optimization of replacing non-NF with NF. This keeps backward
5453 // compatibility with old linker versions without APX relocation type
5454 // support on Linux OS.
5455 bool IsWithReloc = X86EnableAPXForRelocation
5456 ? false
5458
5459 // Try to replace non-NF with NF instructions.
5460 if (HasNF && Inst.registerDefIsDead(X86::EFLAGS, TRI) && !IsWithReloc) {
5461 unsigned NewOp = X86::getNFVariant(Inst.getOpcode());
5462 if (!NewOp)
5463 return false;
5464
5465 InstsToUpdate.push_back(std::make_pair(&Inst, NewOp));
5466 continue;
5467 }
5468
5469 // Cannot do anything for any other EFLAG changes.
5470 return false;
5471 }
5472 }
5473
5474 if (MI || Sub)
5475 break;
5476
5477 // Reached begin of basic block. Continue in predecessor if there is
5478 // exactly one.
5479 if (MBB->pred_size() != 1)
5480 return false;
5481 MBB = *MBB->pred_begin();
5482 From = MBB->rbegin();
5483 }
5484
5485 // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
5486 // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
5487 // If we are done with the basic block, we need to check whether EFLAGS is
5488 // live-out.
5489 bool FlagsMayLiveOut = true;
5491 MachineBasicBlock::iterator AfterCmpInstr =
5492 std::next(MachineBasicBlock::iterator(CmpInstr));
5493 for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
5494 bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
5495 bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
5496 // We should check the usage if this instruction uses and updates EFLAGS.
5497 if (!UseEFLAGS && ModifyEFLAGS) {
5498 // It is safe to remove CmpInstr if EFLAGS is updated again.
5499 FlagsMayLiveOut = false;
5500 break;
5501 }
5502 if (!UseEFLAGS && !ModifyEFLAGS)
5503 continue;
5504
5505 // EFLAGS is used by this instruction.
5506 X86::CondCode OldCC = X86::getCondFromMI(Instr);
5507 if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
5508 return false;
5509
5510 X86::CondCode ReplacementCC = X86::COND_INVALID;
5511 if (MI) {
5512 switch (OldCC) {
5513 default:
5514 break;
5515 case X86::COND_A:
5516 case X86::COND_AE:
5517 case X86::COND_B:
5518 case X86::COND_BE:
5519 // CF is used, we can't perform this optimization.
5520 return false;
5521 case X86::COND_G:
5522 case X86::COND_GE:
5523 case X86::COND_L:
5524 case X86::COND_LE:
5525 // If SF is used, but the instruction doesn't update the SF, then we
5526 // can't do the optimization.
5527 if (NoSignFlag)
5528 return false;
5529 [[fallthrough]];
5530 case X86::COND_O:
5531 case X86::COND_NO:
5532 // If OF is used, the instruction needs to clear it like CmpZero does.
5533 if (!ClearsOverflowFlag)
5534 return false;
5535 break;
5536 case X86::COND_S:
5537 case X86::COND_NS:
5538 // If SF is used, but the instruction doesn't update the SF, then we
5539 // can't do the optimization.
5540 if (NoSignFlag)
5541 return false;
5542 break;
5543 }
5544
5545 // If we're updating the condition code check if we have to reverse the
5546 // condition.
5547 if (ShouldUpdateCC)
5548 switch (OldCC) {
5549 default:
5550 return false;
5551 case X86::COND_E:
5552 ReplacementCC = NewCC;
5553 break;
5554 case X86::COND_NE:
5555 ReplacementCC = GetOppositeBranchCondition(NewCC);
5556 break;
5557 }
5558 } else if (IsSwapped) {
5559 // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
5560 // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
5561 // We swap the condition code and synthesize the new opcode.
5562 ReplacementCC = getSwappedCondition(OldCC);
5563 if (ReplacementCC == X86::COND_INVALID)
5564 return false;
5565 ShouldUpdateCC = true;
5566 } else if (ImmDelta != 0) {
5567 unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg));
5568 // Register width in bits, used to build the signed/unsigned min/max
5569 // constants that match the compare's operand size.
5570 switch (OldCC) {
5571 case X86::COND_L: // x <s (C + 1) --> x <=s C
5572 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5573 return false;
5574 ReplacementCC = X86::COND_LE;
5575 break;
5576 case X86::COND_B: // x <u (C + 1) --> x <=u C
5577 if (ImmDelta != 1 || CmpValue == 0)
5578 return false;
5579 ReplacementCC = X86::COND_BE;
5580 break;
5581 case X86::COND_GE: // x >=s (C + 1) --> x >s C
5582 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5583 return false;
5584 ReplacementCC = X86::COND_G;
5585 break;
5586 case X86::COND_AE: // x >=u (C + 1) --> x >u C
5587 if (ImmDelta != 1 || CmpValue == 0)
5588 return false;
5589 ReplacementCC = X86::COND_A;
5590 break;
5591 case X86::COND_G: // x >s (C - 1) --> x >=s C
5592 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5593 return false;
5594 ReplacementCC = X86::COND_GE;
5595 break;
5596 case X86::COND_A: // x >u (C - 1) --> x >=u C
5597 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5598 return false;
5599 ReplacementCC = X86::COND_AE;
5600 break;
5601 case X86::COND_LE: // x <=s (C - 1) --> x <s C
5602 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5603 return false;
5604 ReplacementCC = X86::COND_L;
5605 break;
5606 case X86::COND_BE: // x <=u (C - 1) --> x <u C
5607 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5608 return false;
5609 ReplacementCC = X86::COND_B;
5610 break;
5611 default:
5612 return false;
5613 }
5614 ShouldUpdateCC = true;
5615 }
5616
5617 if (ShouldUpdateCC && ReplacementCC != OldCC) {
5618 // Push the MachineInstr to OpsToUpdate.
5619 // If it is safe to remove CmpInstr, the condition code of these
5620 // instructions will be modified.
5621 OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
5622 }
5623 if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
5624 // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
5625 FlagsMayLiveOut = false;
5626 break;
5627 }
5628 }
5629
5630 // If we have to update users but EFLAGS is live-out abort, since we cannot
5631 // easily find all of the users.
5632 if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
5633 for (MachineBasicBlock *Successor : CmpMBB.successors())
5634 if (Successor->isLiveIn(X86::EFLAGS))
5635 return false;
5636 }
5637
5638 // The instruction to be updated is either Sub or MI.
5639 assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
5640 Sub = MI != nullptr ? MI : Sub;
5641 MachineBasicBlock *SubBB = Sub->getParent();
5642 // Move Movr0Inst to the appropriate place before Sub.
5643 if (Movr0Inst) {
5644 // Only move within the same block so we don't accidentally move to a
5645 // block with higher execution frequency.
5646 if (&CmpMBB != SubBB)
5647 return false;
5648 // Look backwards until we find a def that doesn't use the current EFLAGS.
5650 InsertE = Sub->getParent()->rend();
5651 for (; InsertI != InsertE; ++InsertI) {
5652 MachineInstr *Instr = &*InsertI;
5653 if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
5654 Instr->modifiesRegister(X86::EFLAGS, TRI)) {
5655 Movr0Inst->getParent()->remove(Movr0Inst);
5656 Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
5657 Movr0Inst);
5658 break;
5659 }
5660 }
5661 if (InsertI == InsertE)
5662 return false;
5663 }
5664
5665 // Replace non-NF with NF instructions.
5666 for (auto &Inst : InstsToUpdate) {
5667 Inst.first->setDesc(get(Inst.second));
5668 Inst.first->removeOperand(
5669 Inst.first->findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5670 }
5671
5672 // Make sure Sub instruction defines EFLAGS and mark the def live.
5673 MachineOperand *FlagDef =
5674 Sub->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
5675 assert(FlagDef && "Unable to locate a def EFLAGS operand");
5676 FlagDef->setIsDead(false);
5677
5678 CmpInstr.eraseFromParent();
5679
5680 // Modify the condition code of instructions in OpsToUpdate.
5681 for (auto &Op : OpsToUpdate) {
5682 Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
5683 .setImm(Op.second);
5684 }
5685 // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
5686 for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
5687 MBB = *MBB->pred_begin()) {
5688 assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
5689 if (!MBB->isLiveIn(X86::EFLAGS))
5690 MBB->addLiveIn(X86::EFLAGS);
5691 }
5692 return true;
5693}
5694
5695/// \returns true if the instruction can be changed to COPY when imm is 0.
5696static bool canConvert2Copy(unsigned Opc) {
5697 switch (Opc) {
5698 default:
5699 return false;
5700 CASE_ND(ADD64ri32)
5701 CASE_ND(SUB64ri32)
5702 CASE_ND(OR64ri32)
5703 CASE_ND(XOR64ri32)
5704 CASE_ND(ADD32ri)
5705 CASE_ND(SUB32ri)
5706 CASE_ND(OR32ri)
5707 CASE_ND(XOR32ri)
5708 return true;
5709 }
5710}
5711
5712/// Convert an ALUrr opcode to corresponding ALUri opcode. Such as
5713/// ADD32rr ==> ADD32ri
5714static unsigned convertALUrr2ALUri(unsigned Opc) {
5715 switch (Opc) {
5716 default:
5717 return 0;
5718#define FROM_TO(FROM, TO) \
5719 case X86::FROM: \
5720 return X86::TO; \
5721 case X86::FROM##_ND: \
5722 return X86::TO##_ND;
5723 FROM_TO(ADC64rr, ADC64ri32)
5724 FROM_TO(SBB64rr, SBB64ri32)
5725 FROM_TO(AND64rr, AND64ri32)
5726 FROM_TO(OR64rr, OR64ri32)
5727 FROM_TO(XOR64rr, XOR64ri32)
5728 FROM_TO(SHR64rCL, SHR64ri)
5729 FROM_TO(SHL64rCL, SHL64ri)
5730 FROM_TO(SAR64rCL, SAR64ri)
5731 FROM_TO(ROL64rCL, ROL64ri)
5732 FROM_TO(ROR64rCL, ROR64ri)
5733 FROM_TO(RCL64rCL, RCL64ri)
5734 FROM_TO(RCR64rCL, RCR64ri)
5735 FROM_TO(ADD32rr, ADD32ri)
5736 FROM_TO(ADC32rr, ADC32ri)
5737 FROM_TO(SUB32rr, SUB32ri)
5738 FROM_TO(SBB32rr, SBB32ri)
5739 FROM_TO(AND32rr, AND32ri)
5740 FROM_TO(OR32rr, OR32ri)
5741 FROM_TO(XOR32rr, XOR32ri)
5742 FROM_TO(SHR32rCL, SHR32ri)
5743 FROM_TO(SHL32rCL, SHL32ri)
5744 FROM_TO(SAR32rCL, SAR32ri)
5745 FROM_TO(ROL32rCL, ROL32ri)
5746 FROM_TO(ROR32rCL, ROR32ri)
5747 FROM_TO(RCL32rCL, RCL32ri)
5748 FROM_TO(RCR32rCL, RCR32ri)
5749#undef FROM_TO
5750#define FROM_TO(FROM, TO) \
5751 case X86::FROM: \
5752 return X86::TO;
5753 FROM_TO(ADD64rr, ADD64ri32)
5754 FROM_TO(SUB64rr, SUB64ri32)
5755 FROM_TO(TEST64rr, TEST64ri32)
5756 FROM_TO(CTEST64rr, CTEST64ri32)
5757 FROM_TO(CMP64rr, CMP64ri32)
5758 FROM_TO(CCMP64rr, CCMP64ri32)
5759 FROM_TO(TEST32rr, TEST32ri)
5760 FROM_TO(CTEST32rr, CTEST32ri)
5761 FROM_TO(CMP32rr, CMP32ri)
5762 FROM_TO(CCMP32rr, CCMP32ri)
5763#undef FROM_TO
5764 case X86::ADD64rr_ND:
5765 return X86::ADD64ri32_ND;
5766 case X86::SUB64rr_ND:
5767 return X86::SUB64ri32_ND;
5768 }
5769}
5770
5771 /// Reg is assigned ImmVal in DefMI, and is used in UseMI.
5772 /// If MakeChange is true, this function tries to replace Reg by ImmVal in
5773 /// UseMI. If MakeChange is false, just check if folding is possible.
5774///
5775 /// \returns true if folding is successful or possible.
5776bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
5777 Register Reg, int64_t ImmVal,
5779 bool MakeChange) const {
// Set once UseMI has already been rewritten in place (the MOV32r0 case), so
// the generic rewrite further down is skipped.
5780 bool Modified = false;
5781
5782 // 64 bit operations accept sign extended 32 bit immediates.
5783 // 32 bit operations accept all 32 bit immediates, so we don't need to check
5784 // them.
5785 const TargetRegisterClass *RC = nullptr;
5786 if (Reg.isVirtual())
5787 RC = MRI->getRegClass(Reg);
5788 if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) ||
5789 (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) {
5790 if (!isInt<32>(ImmVal))
5791 return false;
5792 }
5793
// Do not fold when UseMI reads Reg through a sub-register index.
5794 if (UseMI.findRegisterUseOperand(Reg, /*TRI=*/nullptr)->getSubReg())
5795 return false;
5796 // Immediate has larger code size than register. So avoid folding the
5797 // immediate if it has more than 1 use and we are optimizing for size.
5798 if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() &&
5799 !MRI->hasOneNonDBGUse(Reg))
5800 return false;
5801
5802 unsigned Opc = UseMI.getOpcode();
5803 unsigned NewOpc;
5804 if (Opc == TargetOpcode::COPY) {
5805 Register ToReg = UseMI.getOperand(0).getReg();
5806 const TargetRegisterClass *RC = nullptr;
5807 if (ToReg.isVirtual())
5808 RC = MRI->getRegClass(ToReg);
5809 bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
5810 (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
5811 bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
5812 (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
5813 bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
5814 (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));
5815
5816 if (ImmVal == 0) {
5817 // We have MOV32r0 only.
5818 if (!GR32Reg)
5819 return false;
5820 }
5821
5822 if (GR64Reg) {
5823 if (isUInt<32>(ImmVal))
5824 NewOpc = X86::MOV32ri64;
5825 else
5826 NewOpc = X86::MOV64ri;
5827 } else if (GR32Reg) {
5828 NewOpc = X86::MOV32ri;
5829 if (ImmVal == 0) {
5830 // MOV32r0 clobbers EFLAGS.
5831 const TargetRegisterInfo *TRI = &getRegisterInfo();
5832 if (UseMI.getParent()->computeRegisterLiveness(
5833 TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
5834 return false;
5835
5836 // MOV32r0 is different than other cases because it doesn't encode the
5837 // immediate in the instruction. So we directly modify it here.
5838 if (!MakeChange)
5839 return true;
5840 UseMI.setDesc(get(X86::MOV32r0));
5841 UseMI.removeOperand(
5842 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5843 UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
5844 /*isImp=*/true,
5845 /*isKill=*/false,
5846 /*isDead=*/true));
5847 Modified = true;
5848 }
5849 } else if (GR8Reg)
5850 NewOpc = X86::MOV8ri;
5851 else
5852 return false;
5853 } else
5854 NewOpc = convertALUrr2ALUri(Opc);
5855
// convertALUrr2ALUri returns 0 for opcodes without an immediate form.
5856 if (!NewOpc)
5857 return false;
5858
5859 // For SUB instructions the immediate can only be the second source operand.
5860 if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri ||
5861 NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri ||
5862 NewOpc == X86::SUB64ri32_ND || NewOpc == X86::SUB32ri_ND ||
5863 NewOpc == X86::SBB64ri32_ND || NewOpc == X86::SBB32ri_ND) &&
5864 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 2)
5865 return false;
5866 // For CMP instructions the immediate can only be at index 1.
5867 if (((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) ||
5868 (NewOpc == X86::CCMP64ri32 || NewOpc == X86::CCMP32ri)) &&
5869 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 1)
5870 return false;
5871
5872 using namespace X86;
5873 if (isSHL(Opc) || isSHR(Opc) || isSAR(Opc) || isROL(Opc) || isROR(Opc) ||
5874 isRCL(Opc) || isRCR(Opc)) {
5875 unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr);
5876 if (RegIdx < 2)
5877 return false;
5878 if (!isInt<8>(ImmVal))
5879 return false;
5880 assert(Reg == X86::CL);
5881
5882 if (!MakeChange)
5883 return true;
5884 UseMI.setDesc(get(NewOpc));
5885 UseMI.removeOperand(RegIdx);
5886 UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
5887 // Reg is physical register $cl, so we don't know if DefMI is dead through
5888 // MRI. Let the caller handle it, or the dead-mi-elimination pass can delete
5889 // the dead physical register define instruction.
5890 return true;
5891 }
5892
5893 if (!MakeChange)
5894 return true;
5895
5896 if (!Modified) {
5897 // Modify the instruction.
5898 if (ImmVal == 0 && canConvert2Copy(NewOpc) &&
5899 UseMI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) {
5900 // %100 = add %101, 0
5901 // ==>
5902 // %100 = COPY %101
5903 UseMI.setDesc(get(TargetOpcode::COPY));
5904 UseMI.removeOperand(
5905 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5906 UseMI.removeOperand(
5907 UseMI.findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5908 UseMI.untieRegOperand(0);
5911 } else {
5912 unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
5913 unsigned ImmOpNum = 2;
5914 if (!UseMI.getOperand(0).isDef()) {
5915 Op1 = 0; // TEST, CMP, CTEST, CCMP
5916 ImmOpNum = 1;
5917 }
5918 if (Opc == TargetOpcode::COPY)
5919 ImmOpNum = 1;
// If Reg sits in the first source slot, commute so that it ends up in the
// slot that becomes the immediate.
5920 if (findCommutedOpIndices(UseMI, Op1, Op2) &&
5921 UseMI.getOperand(Op1).getReg() == Reg)
5922 commuteInstruction(UseMI);
5923
5924 assert(UseMI.getOperand(ImmOpNum).getReg() == Reg);
5925 UseMI.setDesc(get(NewOpc));
5926 UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal);
5927 }
5928 }
5929
5930 if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))
5932
5933 return true;
5934}
5935
5936 /// foldImmediate - 'Reg' is known to be defined by a move immediate
5937 /// instruction, try to fold the immediate into the use instruction.
///
/// \returns false if getConstValDefinedInReg cannot extract a constant for
/// Reg from DefMI; otherwise the result of foldImmediateImpl called with
/// MakeChange set to true.
5939 Register Reg, MachineRegisterInfo *MRI) const {
5940 int64_t ImmVal;
5941 if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
5942 return false;
5943
5944 return foldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
5945}
5946
5947 /// Expand a single-def pseudo instruction to a two-addr
5948 /// instruction with two undef reads of the register being defined.
5949 /// This is used for mapping:
5950 /// %xmm4 = V_SET0
5951 /// to:
5952 /// %xmm4 = PXORrr undef %xmm4, undef %xmm4
5953 ///
/// \p Desc must describe an instruction with exactly three operands.
/// \returns true always.
5955 const MCInstrDesc &Desc) {
5956 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5957 Register Reg = MIB.getReg(0);
5958 MIB->setDesc(Desc);
5959
5960 // MachineInstr::addOperand() will insert explicit operands before any
5961 // implicit operands.
5963 // But we don't trust that.
5964 assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
5965 return true;
5966}
5967
5968 /// Expand a single-def pseudo instruction to a two-addr
5969 /// instruction with two %k0 reads.
5970 /// This is used for mapping:
5971 /// %k4 = K_SET1
5972 /// to:
5973 /// %k4 = KXNORrr %k0, %k0
///
/// \p Desc must describe an instruction with exactly three operands.
/// \returns true always.
5975 Register Reg) {
5976 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5977 MIB->setDesc(Desc);
5979 return true;
5980}
5981
// Expand the pseudo held by MIB into two instructions: an XOR32rr defining the
// pseudo's destination register, inserted in front of the pseudo, followed by
// the pseudo itself rewritten to DEC32r (when MinusOne is true) or INC32r of
// that same register. Always returns true.
5983 bool MinusOne) {
5984 MachineBasicBlock &MBB = *MIB->getParent();
5985 const DebugLoc &DL = MIB->getDebugLoc();
5986 Register Reg = MIB.getReg(0);
5987
5988 // Insert the XOR.
5989 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
5992
5993 // Turn the pseudo into an INC or DEC.
5994 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
5995 MIB.addReg(Reg);
5996
5997 return true;
5998}
5999
// Lower a MOV32ImmSExti8 / MOV64ImmSExti8 pseudo into a push of the immediate
// followed by a pop into the destination register (the pseudo itself becomes
// the pop). In 64-bit mode, if the function uses the red zone, the pseudo is
// turned into a plain MOV32ri / MOV64ri instead. When DWARF CFI is needed and
// the function has no frame pointer, CFA-offset adjustments are emitted around
// the push/pop pair. Always returns true.
6001 const TargetInstrInfo &TII,
6002 const X86Subtarget &Subtarget) {
6003 MachineBasicBlock &MBB = *MIB->getParent();
6004 const DebugLoc &DL = MIB->getDebugLoc();
6005 int64_t Imm = MIB->getOperand(1).getImm();
6006 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
6008
// Number of bytes the push moves the stack by; used for the CFI below.
6009 int StackAdjustment;
6010
6011 if (Subtarget.is64Bit()) {
6012 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
6013 MIB->getOpcode() == X86::MOV32ImmSExti8);
6014
6015 // Can't use push/pop lowering if the function might write to the red zone.
6016 X86MachineFunctionInfo *X86FI =
6017 MBB.getParent()->getInfo<X86MachineFunctionInfo>();
6018 if (X86FI->getUsesRedZone()) {
6019 MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
6020 ? X86::MOV32ri
6021 : X86::MOV64ri));
6022 return true;
6023 }
6024
6025 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
6026 // widen the register if necessary.
6027 StackAdjustment = 8;
6028 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
6029 MIB->setDesc(TII.get(X86::POP64r));
6030 MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
6031 } else {
6032 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
6033 StackAdjustment = 4;
6034 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm);
6035 MIB->setDesc(TII.get(X86::POP32r));
6036 }
// The pop takes no immediate: drop it and add the implicit operands of the
// new opcode.
6037 MIB->removeOperand(1);
6038 MIB->addImplicitDefUseOperands(*MBB.getParent());
6039
6040 // Build CFI if necessary.
6041 MachineFunction &MF = *MBB.getParent();
6042 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
6043 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo().usesWindowsCFI();
6044 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
6045 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
6046 if (EmitCFI) {
6047 TFL->BuildCFI(
6048 MBB, I, DL,
6049 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
6050 TFL->BuildCFI(
6051 MBB, std::next(I), DL,
6052 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
6053 }
6054
6055 return true;
6056}
6057
6058 // LoadStackGuard has so far only been implemented for 64-bit MachO. A
6059 // different code sequence is needed for other targets.
//
// The expansion first emits a RIP-relative MOV64rm into the destination
// register, tagged with a GOT memory operand, and then rewrites the pseudo
// itself into a MOV64rm as well.
6061 const TargetInstrInfo &TII) {
6062 MachineBasicBlock &MBB = *MIB->getParent();
6063 const DebugLoc &DL = MIB->getDebugLoc();
6064 Register Reg = MIB.getReg(0);
6065 const GlobalValue *GV =
6066 cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
6067 auto Flags = MachineMemOperand::MOLoad |
6070 MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
6071 MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
6073
6074 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
6075 .addReg(X86::RIP)
6076 .addImm(1)
6077 .addReg(0)
6079 .addReg(0)
6080 .addMemOperand(MMO);
6081 MIB->setDebugLoc(DL);
6082 MIB->setDesc(TII.get(X86::MOV64rm));
6084}
6085
// Rewrite the pseudo held by MIB into a real XOR: XOR64_FP becomes XOR64rr,
// any other opcode reaching this point becomes XOR32rr. The function's frame
// register is appended as an undef register operand. Always returns true.
6087 MachineBasicBlock &MBB = *MIB->getParent();
6088 MachineFunction &MF = *MBB.getParent();
6089 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
6090 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
6091 unsigned XorOp =
6092 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
6093 MIB->setDesc(TII.get(XorOp));
6094 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
6095 return true;
6096}
6097
6098 // This is used to handle spills for 128/256-bit registers when we have AVX512,
6099 // but not VLX. If it uses an extended register we need to use an instruction
6100 // that loads the lower 128/256-bit, but is available with only AVX512F.
//
// SubIdx is the sub-register index used to look up the 512-bit super-register
// of the destination when the broadcast form is chosen. Always returns true.
6102 const TargetRegisterInfo *TRI,
6103 const MCInstrDesc &LoadDesc,
6104 const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
6105 Register DestReg = MIB.getReg(0);
6106 // Check if DestReg is XMM16-31 or YMM16-31.
6107 if (TRI->getEncodingValue(DestReg) < 16) {
6108 // We can use a normal VEX encoded load.
6109 MIB->setDesc(LoadDesc);
6110 } else {
6111 // Use a 128/256-bit VBROADCAST instruction.
6112 MIB->setDesc(BroadcastDesc);
6113 // Change the destination to a 512-bit register.
6114 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
6115 MIB->getOperand(0).setReg(DestReg);
6116 }
6117 return true;
6118}
6119
6120 // This is used to handle spills for 128/256-bit registers when we have AVX512,
6121 // but not VLX. If it uses an extended register we need to use an instruction
6122 // that stores the lower 128/256-bit, but is available with only AVX512F.
//
// The stored register is the operand that follows the X86::AddrNumOperands
// address operands. Always returns true.
6124 const TargetRegisterInfo *TRI,
6125 const MCInstrDesc &StoreDesc,
6126 const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
6127 Register SrcReg = MIB.getReg(X86::AddrNumOperands);
6128 // Check if SrcReg is XMM16-31 or YMM16-31.
6129 if (TRI->getEncodingValue(SrcReg) < 16) {
6130 // We can use a normal VEX encoded store.
6131 MIB->setDesc(StoreDesc);
6132 } else {
6133 // Use a VEXTRACTF instruction.
6134 MIB->setDesc(ExtractDesc);
6135 // Change the source to a 512-bit register.
6136 SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
6138 MIB.addImm(0x0); // Append immediate to extract from the lower bits.
6139 }
6140
6141 return true;
6142}
6143
// Rewrites a rotate-by-immediate pseudo into the double-shift instruction
// described by Desc. The immediate at operand 2 is saved and removed, operand
// 1's register is appended as a second register source, and the immediate is
// appended again after it. Always returns true.
// NOTE(review): the declaration line for this body is not present in this
// listing (embedded numbering skips 6144). Call sites in this file pass
// (MIB, get(X86::SHLD32rri8)) and similar -- confirm the signature upstream.
6145 MIB->setDesc(Desc);
6146 int64_t ShiftAmt = MIB->getOperand(2).getImm();
6147 // Temporarily remove the immediate so we can add another source register.
6148 MIB->removeOperand(2);
6149 // Add the register. Don't copy the kill flag if there is one.
// Only the undef state of operand 1 is carried over to the new operand.
6150 MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
6151 // Add back the immediate.
6152 MIB.addImm(ShiftAmt);
6153 return true;
6154}
6155
// Replaces the descriptor of a MOVSHPrm / MOVSHPmr pseudo with a concrete
// scalar single-precision move:
//  - MOVSHPrm (register is operand 0): VMOVSSrm with AVX, MOVSSrm without;
//  - any other opcode, i.e. the store form at the call site in this file
//    (register is operand 5): VMOVSSmr with AVX, MOVSSmr without.
// If the register compares greater than X86::XMM15, the EVEX form
// (VMOVSSZrm / VMOVSSZmr) is chosen regardless of HasAVX.
// Always returns true.
// NOTE(review): the first line of the declaration is not present in this
// listing (embedded numbering skips 6156). The call site in this file passes
// (MIB, MI, *this, Subtarget.hasAVX()) -- confirm the signature upstream.
6157 const TargetInstrInfo &TII, bool HasAVX) {
6158 unsigned NewOpc;
6159 if (MI.getOpcode() == X86::MOVSHPrm) {
6160 NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
6161 Register Reg = MI.getOperand(0).getReg();
6162 if (Reg > X86::XMM15)
6163 NewOpc = X86::VMOVSSZrm;
6164 } else {
6165 NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
6166 Register Reg = MI.getOperand(5).getReg();
6167 if (Reg > X86::XMM15)
6168 NewOpc = X86::VMOVSSZmr;
6169 }
6170
6171 MIB->setDesc(TII.get(NewOpc));
6172 return true;
6173}
6174
// Expands X86 pseudo instructions: dispatches on MI's opcode and rewrites the
// pseudo into real instruction(s), mostly by swapping the descriptor in place
// and appending operands through MIB.
// Most cases return directly (either `true` or the result of a helper). The
// ADD*_DB cases at the bottom only swap the descriptor and break out of the
// switch, so they reach the final `return false`, as does any opcode that has
// no case here.
// NOTE(review): the declaration line of this function is not present in this
// listing (embedded numbering skips 6175) -- confirm the signature against the
// upstream source.
6176 bool HasAVX = Subtarget.hasAVX();
6177 MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
6178 switch (MI.getOpcode()) {
6179 case X86::MOV32r0:
6180 return Expand2AddrUndef(MIB, get(X86::XOR32rr));
6181 case X86::MOV32r1:
6182 return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
6183 case X86::MOV32r_1:
6184 return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
6185 case X86::MOV32ImmSExti8:
6186 case X86::MOV64ImmSExti8:
6187 return ExpandMOVImmSExti8(MIB, *this, Subtarget);
6188 case X86::SETB_C32r:
6189 return Expand2AddrUndef(MIB, get(X86::SBB32rr));
6190 case X86::SETB_C64r:
6191 return Expand2AddrUndef(MIB, get(X86::SBB64rr));
6192 case X86::MMX_SET0:
6193 return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
6194 case X86::V_SET0:
6195 case X86::FsFLD0SS:
6196 case X86::FsFLD0SD:
6197 case X86::FsFLD0SH:
6198 case X86::FsFLD0F128:
6199 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
6200 case X86::AVX_SET0: {
6201 assert(HasAVX && "AVX not supported");
// NOTE(review): TRI is used below with no declaration visible in this case
// (embedded numbering skips 6202) -- a line appears to be missing here.
6203 Register SrcReg = MIB.getReg(0);
// Operate on the xmm sub-register and mark the full register as implicitly
// defined.
6204 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6205 MIB->getOperand(0).setReg(XReg);
6206 Expand2AddrUndef(MIB, get(X86::VXORPSrr));
6207 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6208 return true;
6209 }
6210 case X86::AVX512_128_SET0:
6211 case X86::AVX512_FsFLD0SH:
6212 case X86::AVX512_FsFLD0SS:
6213 case X86::AVX512_FsFLD0SD:
6214 case X86::AVX512_FsFLD0F128: {
6215 bool HasVLX = Subtarget.hasVLX();
6216 Register SrcReg = MIB.getReg(0);
// NOTE(review): TRI is used below with no declaration visible in this case
// (embedded numbering skips 6217) -- a line appears to be missing here.
6218 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6219 return Expand2AddrUndef(MIB,
6220 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6221 // Extended register without VLX. Use a larger XOR.
6222 SrcReg =
6223 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
6224 MIB->getOperand(0).setReg(SrcReg);
6225 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6226 }
6227 case X86::AVX512_256_SET0:
6228 case X86::AVX512_512_SET0: {
6229 bool HasVLX = Subtarget.hasVLX();
6230 Register SrcReg = MIB.getReg(0);
// NOTE(review): TRI is used below with no declaration visible in this case
// (embedded numbering skips 6231) -- a line appears to be missing here.
6232 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
6233 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6234 MIB->getOperand(0).setReg(XReg);
6235 Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6236 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6237 return true;
6238 }
6239 if (MI.getOpcode() == X86::AVX512_256_SET0) {
6240 // No VLX so we must reference a zmm.
6241 MCRegister ZReg =
6242 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
6243 MIB->getOperand(0).setReg(ZReg);
6244 }
6245 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6246 }
6247 case X86::MOVSHPmr:
6248 case X86::MOVSHPrm:
6249 return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
6250 case X86::V_SETALLONES:
6251 return Expand2AddrUndef(MIB,
6252 get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
6253 case X86::AVX2_SETALLONES:
6254 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6255 case X86::AVX1_SETALLONES: {
6256 Register Reg = MIB.getReg(0);
6257 // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
6258 MIB->setDesc(get(X86::VCMPPSYrri));
6259 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
6260 return true;
6261 }
6262 case X86::AVX512_128_SETALLONES:
6263 case X86::AVX512_256_SETALLONES:
6264 case X86::AVX512_512_SETALLONES: {
6265 Register Reg = MIB.getReg(0);
6266 unsigned Opc;
// For the 128/256-bit pseudos, use VPCMPEQD when Reg is in VR128/VR256;
// otherwise fall back to the VPTERNLOGD form selected here.
6267 switch (MI.getOpcode()) {
6268 case X86::AVX512_128_SETALLONES: {
6269 if (X86::VR128RegClass.contains(Reg))
6270 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr));
6271
6272 Opc = X86::VPTERNLOGDZ128rri;
6273 break;
6274 }
6275 case X86::AVX512_256_SETALLONES: {
6276 if (X86::VR256RegClass.contains(Reg))
6277 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6278
6279 Opc = X86::VPTERNLOGDZ256rri;
6280 break;
6281 }
6282 case X86::AVX512_512_SETALLONES:
6283 Opc = X86::VPTERNLOGDZrri;
6284 break;
6285 }
6286 MIB->setDesc(get(Opc));
6287 // VPTERNLOGD needs 3 register inputs and an immediate.
6288 // 0xff will return 1s for any input.
6289 MIB.addReg(Reg, RegState::Undef)
6290 .addReg(Reg, RegState::Undef)
6291 .addReg(Reg, RegState::Undef)
6292 .addImm(0xff);
6293 return true;
6294 }
6295 case X86::AVX512_512_SEXT_MASK_32:
6296 case X86::AVX512_512_SEXT_MASK_64: {
6297 Register Reg = MIB.getReg(0);
6298 Register MaskReg = MIB.getReg(1);
6299 RegState MaskState = getRegState(MIB->getOperand(1));
6300 unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
6301 ? X86::VPTERNLOGQZrrikz
6302 : X86::VPTERNLOGDZrrikz;
// Drop the mask operand, then re-add it (with its saved state) in the
// position it takes in the operand list built below.
6303 MI.removeOperand(1);
6304 MIB->setDesc(get(Opc));
6305 // VPTERNLOG needs 3 register inputs and an immediate.
6306 // 0xff will return 1s for any input.
6307 MIB.addReg(Reg, RegState::Undef)
6308 .addReg(MaskReg, MaskState)
6309 .addReg(Reg, RegState::Undef)
6310 .addReg(Reg, RegState::Undef)
6311 .addImm(0xff);
6312 return true;
6313 }
6314 case X86::VMOVAPSZ128rm_NOVLX:
6315 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
6316 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6317 case X86::VMOVUPSZ128rm_NOVLX:
6318 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
6319 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6320 case X86::VMOVAPSZ256rm_NOVLX:
6321 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
6322 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6323 case X86::VMOVUPSZ256rm_NOVLX:
6324 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
6325 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6326 case X86::VMOVAPSZ128mr_NOVLX:
6327 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
6328 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6329 case X86::VMOVUPSZ128mr_NOVLX:
6330 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
6331 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6332 case X86::VMOVAPSZ256mr_NOVLX:
6333 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
6334 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6335 case X86::VMOVUPSZ256mr_NOVLX:
6336 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
6337 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6338 case X86::MOV32ri64: {
6339 Register Reg = MIB.getReg(0);
// Retarget the move at the 32-bit sub-register of the original destination.
6340 Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
6341 MI.setDesc(get(X86::MOV32ri));
6342 MIB->getOperand(0).setReg(Reg32);
// NOTE(review): embedded numbering skips 6343 -- a line appears to be missing
// from this listing here; confirm against the upstream source.
6344 return true;
6345 }
6346
6347 case X86::RDFLAGS32:
6348 case X86::RDFLAGS64: {
6349 unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
6350 MachineBasicBlock &MBB = *MIB->getParent();
6351
// Build a PUSHF at MI's position; MI itself is turned into the matching POP
// further down.
6352 MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
6353 get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
6354 .getInstr();
6355
6356 // Permit reads of the EFLAGS and DF registers without them being defined.
6357 // This intrinsic exists to read external processor state in flags, such as
6358 // the trap flag, interrupt flag, and direction flag, none of which are
6359 // modeled by the backend.
6360 assert(NewMI->getOperand(2).getReg() == X86::EFLAGS &&
6361 "Unexpected register in operand! Should be EFLAGS.");
6362 NewMI->getOperand(2).setIsUndef();
6363 assert(NewMI->getOperand(3).getReg() == X86::DF &&
6364 "Unexpected register in operand! Should be DF.");
6365 NewMI->getOperand(3).setIsUndef();
6366
6367 MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r));
6368 return true;
6369 }
6370
6371 case X86::WRFLAGS32:
6372 case X86::WRFLAGS64: {
6373 unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64;
6374 MachineBasicBlock &MBB = *MIB->getParent();
6375
// Build a PUSH of operand 0 and then a POPF at MI's position, and erase the
// pseudo itself.
6376 BuildMI(MBB, MI, MIB->getDebugLoc(),
6377 get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
6378 .addReg(MI.getOperand(0).getReg());
6379 BuildMI(MBB, MI, MIB->getDebugLoc(),
6380 get(Is64Bit ? X86::POPF64 : X86::POPF32));
6381 MI.eraseFromParent();
6382 return true;
6383 }
6384
6385 // KNL does not recognize dependency-breaking idioms for mask registers,
6386 // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
6387 // Using %k0 as the undef input register is a performance heuristic based
6388 // on the assumption that %k0 is used less frequently than the other mask
6389 // registers, since it is not usable as a write mask.
6390 // FIXME: A more advanced approach would be to choose the best input mask
6391 // register based on context.
6392 case X86::KSET0B:
6393 return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
6394 case X86::KSET0W:
6395 return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
6396 case X86::KSET0D:
6397 return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
6398 case X86::KSET0Q:
6399 return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
6400 case X86::KSET1B:
6401 return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
6402 case X86::KSET1W:
6403 return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
6404 case X86::KSET1D:
6405 return Expand2AddrKreg(MIB, get(X86::KXNORDkk), X86::K0);
6406 case X86::KSET1Q:
6407 return Expand2AddrKreg(MIB, get(X86::KXNORQkk), X86::K0);
6408 case TargetOpcode::LOAD_STACK_GUARD:
6409 expandLoadStackGuard(MIB, *this);
6410 return true;
6411 case X86::XOR64_FP:
6412 case X86::XOR32_FP:
6413 return expandXorFP(MIB, *this);
6414 case X86::SHLDROT32ri:
6415 return expandSHXDROT(MIB, get(X86::SHLD32rri8));
6416 case X86::SHLDROT64ri:
6417 return expandSHXDROT(MIB, get(X86::SHLD64rri8));
6418 case X86::SHRDROT32ri:
6419 return expandSHXDROT(MIB, get(X86::SHRD32rri8));
6420 case X86::SHRDROT64ri:
6421 return expandSHXDROT(MIB, get(X86::SHRD64rri8));
// The *_DB add pseudos below are retargeted to the OR of the same width. They
// break instead of returning, so the function reports false for them.
6422 case X86::ADD8rr_DB:
6423 MIB->setDesc(get(X86::OR8rr));
6424 break;
6425 case X86::ADD16rr_DB:
6426 MIB->setDesc(get(X86::OR16rr));
6427 break;
6428 case X86::ADD32rr_DB:
6429 MIB->setDesc(get(X86::OR32rr));
6430 break;
6431 case X86::ADD64rr_DB:
6432 MIB->setDesc(get(X86::OR64rr));
6433 break;
6434 case X86::ADD8ri_DB:
6435 MIB->setDesc(get(X86::OR8ri));
6436 break;
6437 case X86::ADD16ri_DB:
6438 MIB->setDesc(get(X86::OR16ri));
6439 break;
6440 case X86::ADD32ri_DB:
6441 MIB->setDesc(get(X86::OR32ri));
6442 break;
6443 case X86::ADD64ri32_DB:
6444 MIB->setDesc(get(X86::OR64ri32));
6445 break;
6446 }
6447 return false;
6448}
6449
6450/// Return true for all instructions that only update
6451/// the first 32 or 64-bits of the destination register and leave the rest
6452/// unmodified. This can be used to avoid folding loads if the instructions
6453/// only update part of the destination register, and the non-updated part is
6454/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
6455/// instructions breaks the partial register dependency and it can improve
6456/// performance. e.g.:
6457///
6458/// movss (%rdi), %xmm0
6459/// cvtss2sd %xmm0, %xmm0
6460///
6461/// Instead of
6462/// cvtss2sd (%rdi), %xmm0
6463///
6464/// FIXME: This should be turned into a TSFlags.
6465///
6466static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
6467 bool ForLoadFold = false) {
6468 switch (Opcode) {
6469 case X86::CVTSI2SSrr:
6470 case X86::CVTSI2SSrm:
6471 case X86::CVTSI642SSrr:
6472 case X86::CVTSI642SSrm:
6473 case X86::CVTSI2SDrr:
6474 case X86::CVTSI2SDrm:
6475 case X86::CVTSI642SDrr:
6476 case X86::CVTSI642SDrm:
6477 // Load folding won't effect the undef register update since the input is
6478 // a GPR.
6479 return !ForLoadFold;
6480 case X86::CVTSD2SSrr:
6481 case X86::CVTSD2SSrm:
6482 case X86::CVTSS2SDrr:
6483 case X86::CVTSS2SDrm:
6484 case X86::MOVHPDrm:
6485 case X86::MOVHPSrm:
6486 case X86::MOVLPDrm:
6487 case X86::MOVLPSrm:
6488 case X86::RCPSSr:
6489 case X86::RCPSSm:
6490 case X86::RCPSSr_Int:
6491 case X86::RCPSSm_Int:
6492 case X86::ROUNDSDri:
6493 case X86::ROUNDSDmi:
6494 case X86::ROUNDSSri:
6495 case X86::ROUNDSSmi:
6496 case X86::RSQRTSSr:
6497 case X86::RSQRTSSm:
6498 case X86::RSQRTSSr_Int:
6499 case X86::RSQRTSSm_Int:
6500 case X86::SQRTSSr:
6501 case X86::SQRTSSm:
6502 case X86::SQRTSSr_Int:
6503 case X86::SQRTSSm_Int:
6504 case X86::SQRTSDr:
6505 case X86::SQRTSDm:
6506 case X86::SQRTSDr_Int:
6507 case X86::SQRTSDm_Int:
6508 return true;
6509 case X86::VFCMULCPHZ128rm:
6510 case X86::VFCMULCPHZ128rmb:
6511 case X86::VFCMULCPHZ128rmbkz:
6512 case X86::VFCMULCPHZ128rmkz:
6513 case X86::VFCMULCPHZ128rr:
6514 case X86::VFCMULCPHZ128rrkz:
6515 case X86::VFCMULCPHZ256rm:
6516 case X86::VFCMULCPHZ256rmb:
6517 case X86::VFCMULCPHZ256rmbkz:
6518 case X86::VFCMULCPHZ256rmkz:
6519 case X86::VFCMULCPHZ256rr:
6520 case X86::VFCMULCPHZ256rrkz:
6521 case X86::VFCMULCPHZrm:
6522 case X86::VFCMULCPHZrmb:
6523 case X86::VFCMULCPHZrmbkz:
6524 case X86::VFCMULCPHZrmkz:
6525 case X86::VFCMULCPHZrr:
6526 case X86::VFCMULCPHZrrb:
6527 case X86::VFCMULCPHZrrbkz:
6528 case X86::VFCMULCPHZrrkz:
6529 case X86::VFMULCPHZ128rm:
6530 case X86::VFMULCPHZ128rmb:
6531 case X86::VFMULCPHZ128rmbkz:
6532 case X86::VFMULCPHZ128rmkz:
6533 case X86::VFMULCPHZ128rr:
6534 case X86::VFMULCPHZ128rrkz:
6535 case X86::VFMULCPHZ256rm:
6536 case X86::VFMULCPHZ256rmb:
6537 case X86::VFMULCPHZ256rmbkz:
6538 case X86::VFMULCPHZ256rmkz:
6539 case X86::VFMULCPHZ256rr:
6540 case X86::VFMULCPHZ256rrkz:
6541 case X86::VFMULCPHZrm:
6542 case X86::VFMULCPHZrmb:
6543 case X86::VFMULCPHZrmbkz:
6544 case X86::VFMULCPHZrmkz:
6545 case X86::VFMULCPHZrr:
6546 case X86::VFMULCPHZrrb:
6547 case X86::VFMULCPHZrrbkz:
6548 case X86::VFMULCPHZrrkz:
6549 case X86::VFCMULCSHZrm:
6550 case X86::VFCMULCSHZrmkz:
6551 case X86::VFCMULCSHZrr:
6552 case X86::VFCMULCSHZrrb:
6553 case X86::VFCMULCSHZrrbkz:
6554 case X86::VFCMULCSHZrrkz:
6555 case X86::VFMULCSHZrm:
6556 case X86::VFMULCSHZrmkz:
6557 case X86::VFMULCSHZrr:
6558 case X86::VFMULCSHZrrb:
6559 case X86::VFMULCSHZrrbkz:
6560 case X86::VFMULCSHZrrkz:
6561 return Subtarget.hasMULCFalseDeps();
6562 case X86::VPERMDYrm:
6563 case X86::VPERMDYrr:
6564 case X86::VPERMQYmi:
6565 case X86::VPERMQYri:
6566 case X86::VPERMPSYrm:
6567 case X86::VPERMPSYrr:
6568 case X86::VPERMPDYmi:
6569 case X86::VPERMPDYri:
6570 case X86::VPERMDZ256rm:
6571 case X86::VPERMDZ256rmb:
6572 case X86::VPERMDZ256rmbkz:
6573 case X86::VPERMDZ256rmkz:
6574 case X86::VPERMDZ256rr:
6575 case X86::VPERMDZ256rrkz:
6576 case X86::VPERMDZrm:
6577 case X86::VPERMDZrmb:
6578 case X86::VPERMDZrmbkz:
6579 case X86::VPERMDZrmkz:
6580 case X86::VPERMDZrr:
6581 case X86::VPERMDZrrkz:
6582 case X86::VPERMQZ256mbi:
6583 case X86::VPERMQZ256mbikz:
6584 case X86::VPERMQZ256mi:
6585 case X86::VPERMQZ256mikz:
6586 case X86::VPERMQZ256ri:
6587 case X86::VPERMQZ256rikz:
6588 case X86::VPERMQZ256rm:
6589 case X86::VPERMQZ256rmb:
6590 case X86::VPERMQZ256rmbkz:
6591 case X86::VPERMQZ256rmkz:
6592 case X86::VPERMQZ256rr:
6593 case X86::VPERMQZ256rrkz:
6594 case X86::VPERMQZmbi:
6595 case X86::VPERMQZmbikz:
6596 case X86::VPERMQZmi:
6597 case X86::VPERMQZmikz:
6598 case X86::VPERMQZri:
6599 case X86::VPERMQZrikz:
6600 case X86::VPERMQZrm:
6601 case X86::VPERMQZrmb:
6602 case X86::VPERMQZrmbkz:
6603 case X86::VPERMQZrmkz:
6604 case X86::VPERMQZrr:
6605 case X86::VPERMQZrrkz:
6606 case X86::VPERMPSZ256rm:
6607 case X86::VPERMPSZ256rmb:
6608 case X86::VPERMPSZ256rmbkz:
6609 case X86::VPERMPSZ256rmkz:
6610 case X86::VPERMPSZ256rr:
6611 case X86::VPERMPSZ256rrkz:
6612 case X86::VPERMPSZrm:
6613 case X86::VPERMPSZrmb:
6614 case X86::VPERMPSZrmbkz:
6615 case X86::VPERMPSZrmkz:
6616 case X86::VPERMPSZrr:
6617 case X86::VPERMPSZrrkz:
6618 case X86::VPERMPDZ256mbi:
6619 case X86::VPERMPDZ256mbikz:
6620 case X86::VPERMPDZ256mi:
6621 case X86::VPERMPDZ256mikz:
6622 case X86::VPERMPDZ256ri:
6623 case X86::VPERMPDZ256rikz:
6624 case X86::VPERMPDZ256rm:
6625 case X86::VPERMPDZ256rmb:
6626 case X86::VPERMPDZ256rmbkz:
6627 case X86::VPERMPDZ256rmkz:
6628 case X86::VPERMPDZ256rr:
6629 case X86::VPERMPDZ256rrkz:
6630 case X86::VPERMPDZmbi:
6631 case X86::VPERMPDZmbikz:
6632 case X86::VPERMPDZmi:
6633 case X86::VPERMPDZmikz:
6634 case X86::VPERMPDZri:
6635 case X86::VPERMPDZrikz:
6636 case X86::VPERMPDZrm:
6637 case X86::VPERMPDZrmb:
6638 case X86::VPERMPDZrmbkz:
6639 case X86::VPERMPDZrmkz:
6640 case X86::VPERMPDZrr:
6641 case X86::VPERMPDZrrkz:
6642 return Subtarget.hasPERMFalseDeps();
6643 case X86::VRANGEPDZ128rmbi:
6644 case X86::VRANGEPDZ128rmbikz:
6645 case X86::VRANGEPDZ128rmi:
6646 case X86::VRANGEPDZ128rmikz:
6647 case X86::VRANGEPDZ128rri:
6648 case X86::VRANGEPDZ128rrikz:
6649 case X86::VRANGEPDZ256rmbi:
6650 case X86::VRANGEPDZ256rmbikz:
6651 case X86::VRANGEPDZ256rmi:
6652 case X86::VRANGEPDZ256rmikz:
6653 case X86::VRANGEPDZ256rri:
6654 case X86::VRANGEPDZ256rrikz:
6655 case X86::VRANGEPDZrmbi:
6656 case X86::VRANGEPDZrmbikz:
6657 case X86::VRANGEPDZrmi:
6658 case X86::VRANGEPDZrmikz:
6659 case X86::VRANGEPDZrri:
6660 case X86::VRANGEPDZrrib:
6661 case X86::VRANGEPDZrribkz:
6662 case X86::VRANGEPDZrrikz:
6663 case X86::VRANGEPSZ128rmbi:
6664 case X86::VRANGEPSZ128rmbikz:
6665 case X86::VRANGEPSZ128rmi:
6666 case X86::VRANGEPSZ128rmikz:
6667 case X86::VRANGEPSZ128rri:
6668 case X86::VRANGEPSZ128rrikz:
6669 case X86::VRANGEPSZ256rmbi:
6670 case X86::VRANGEPSZ256rmbikz:
6671 case X86::VRANGEPSZ256rmi:
6672 case X86::VRANGEPSZ256rmikz:
6673 case X86::VRANGEPSZ256rri:
6674 case X86::VRANGEPSZ256rrikz:
6675 case X86::VRANGEPSZrmbi:
6676 case X86::VRANGEPSZrmbikz:
6677 case X86::VRANGEPSZrmi:
6678 case X86::VRANGEPSZrmikz:
6679 case X86::VRANGEPSZrri:
6680 case X86::VRANGEPSZrrib:
6681 case X86::VRANGEPSZrribkz:
6682 case X86::VRANGEPSZrrikz:
6683 case X86::VRANGESDZrmi:
6684 case X86::VRANGESDZrmikz:
6685 case X86::VRANGESDZrri:
6686 case X86::VRANGESDZrrib:
6687 case X86::VRANGESDZrribkz:
6688 case X86::VRANGESDZrrikz:
6689 case X86::VRANGESSZrmi:
6690 case X86::VRANGESSZrmikz:
6691 case X86::VRANGESSZrri:
6692 case X86::VRANGESSZrrib:
6693 case X86::VRANGESSZrribkz:
6694 case X86::VRANGESSZrrikz:
6695 return Subtarget.hasRANGEFalseDeps();
6696 case X86::VGETMANTSSZrmi:
6697 case X86::VGETMANTSSZrmikz:
6698 case X86::VGETMANTSSZrri:
6699 case X86::VGETMANTSSZrrib:
6700 case X86::VGETMANTSSZrribkz:
6701 case X86::VGETMANTSSZrrikz:
6702 case X86::VGETMANTSDZrmi:
6703 case X86::VGETMANTSDZrmikz:
6704 case X86::VGETMANTSDZrri:
6705 case X86::VGETMANTSDZrrib:
6706 case X86::VGETMANTSDZrribkz:
6707 case X86::VGETMANTSDZrrikz:
6708 case X86::VGETMANTSHZrmi:
6709 case X86::VGETMANTSHZrmikz:
6710 case X86::VGETMANTSHZrri:
6711 case X86::VGETMANTSHZrrib:
6712 case X86::VGETMANTSHZrribkz:
6713 case X86::VGETMANTSHZrrikz:
6714 case X86::VGETMANTPSZ128rmbi:
6715 case X86::VGETMANTPSZ128rmbikz:
6716 case X86::VGETMANTPSZ128rmi:
6717 case X86::VGETMANTPSZ128rmikz:
6718 case X86::VGETMANTPSZ256rmbi:
6719 case X86::VGETMANTPSZ256rmbikz:
6720 case X86::VGETMANTPSZ256rmi:
6721 case X86::VGETMANTPSZ256rmikz:
6722 case X86::VGETMANTPSZrmbi:
6723 case X86::VGETMANTPSZrmbikz:
6724 case X86::VGETMANTPSZrmi:
6725 case X86::VGETMANTPSZrmikz:
6726 case X86::VGETMANTPDZ128rmbi:
6727 case X86::VGETMANTPDZ128rmbikz:
6728 case X86::VGETMANTPDZ128rmi:
6729 case X86::VGETMANTPDZ128rmikz:
6730 case X86::VGETMANTPDZ256rmbi:
6731 case X86::VGETMANTPDZ256rmbikz:
6732 case X86::VGETMANTPDZ256rmi:
6733 case X86::VGETMANTPDZ256rmikz:
6734 case X86::VGETMANTPDZrmbi:
6735 case X86::VGETMANTPDZrmbikz:
6736 case X86::VGETMANTPDZrmi:
6737 case X86::VGETMANTPDZrmikz:
6738 return Subtarget.hasGETMANTFalseDeps();
6739 case X86::VPMULLQZ128rm:
6740 case X86::VPMULLQZ128rmb:
6741 case X86::VPMULLQZ128rmbkz:
6742 case X86::VPMULLQZ128rmkz:
6743 case X86::VPMULLQZ128rr:
6744 case X86::VPMULLQZ128rrkz:
6745 case X86::VPMULLQZ256rm:
6746 case X86::VPMULLQZ256rmb:
6747 case X86::VPMULLQZ256rmbkz:
6748 case X86::VPMULLQZ256rmkz:
6749 case X86::VPMULLQZ256rr:
6750 case X86::VPMULLQZ256rrkz:
6751 case X86::VPMULLQZrm:
6752 case X86::VPMULLQZrmb:
6753 case X86::VPMULLQZrmbkz:
6754 case X86::VPMULLQZrmkz:
6755 case X86::VPMULLQZrr:
6756 case X86::VPMULLQZrrkz:
6757 return Subtarget.hasMULLQFalseDeps();
6758 // GPR
6759 case X86::POPCNT32rm:
6760 case X86::POPCNT32rr:
6761 case X86::POPCNT64rm:
6762 case X86::POPCNT64rr:
6763 return Subtarget.hasPOPCNTFalseDeps();
6764 case X86::LZCNT32rm:
6765 case X86::LZCNT32rr:
6766 case X86::LZCNT64rm:
6767 case X86::LZCNT64rr:
6768 case X86::TZCNT32rm:
6769 case X86::TZCNT32rr:
6770 case X86::TZCNT64rm:
6771 case X86::TZCNT64rr:
6772 return Subtarget.hasLZCNTFalseDeps();
6773 }
6774
6775 return false;
6776}
6777
6778/// Inform the BreakFalseDeps pass how many idle
6779/// instructions we would like before a partial register update.
///
/// Returns 0 when \p OpNum is not 0, when \p MI is neither reported by
/// hasPartialRegUpdate nor an NDD instruction whose operand 0 is a physical
/// GR8/GR16 register, or when the read-of-destination check below does not
/// match the NDD case.
/// NOTE(review): the first line of the declaration (embedded numbering skips
/// 6780) and the final statement of the body (6818) are not present in this
/// listing; presumably the latter returns the clearance value -- confirm
/// against the upstream source.
6781 const MachineInstr &MI, unsigned OpNum,
6782 const TargetRegisterInfo *TRI) const {
6783
// Only operand 0 is ever considered.
6784 if (OpNum != 0)
6785 return 0;
6786
6787 // NDD ops with 8/16b results may appear to be partial register
6788 // updates after register allocation.
6789 bool HasNDDPartialWrite = false;
6790 if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
6791 Register Reg = MI.getOperand(0).getReg();
// Register-class membership is only tested for non-virtual registers.
6792 if (!Reg.isVirtual())
6793 HasNDDPartialWrite =
6794 X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
6795 }
6796
6797 if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
6798 return 0;
6799
6800 // Check if the result register is also used as a source.
6801 // For non-NDD ops, this means a partial update is wanted, hence we return 0.
6802 // For NDD ops, this means it is possible to compress the instruction
6803 // to a legacy form in CompressEVEX, which would create an unwanted partial
6804 // update, so we return the clearance.
6805 const MachineOperand &MO = MI.getOperand(0);
6806 Register Reg = MO.getReg();
6807 bool ReadsReg = false;
6808 if (Reg.isVirtual())
6809 ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
6810 else
6811 ReadsReg = MI.readsRegister(Reg, TRI);
// Non-NDD: bail out if the destination is read. NDD: bail out if it is not.
6812 if (ReadsReg != HasNDDPartialWrite)
6813 return 0;
6814
6815 // If any instructions in the clearance range are reading Reg, insert a
6816 // dependency breaking instruction, which is inexpensive and is likely to
6817 // be hidden in other instruction's cycles.
6819}
6820
6821// Return true for any instruction that copies the high bits of the first source
6822// operand into the unused high bits of the destination operand.
6823// Also returns true for instructions that have two inputs where one may
6824// be undef and we want it to use the same register as the other input.
6825static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
6826 bool ForLoadFold = false) {
6827 // Set the OpNum parameter to the first source operand.
6828 switch (Opcode) {
6829 case X86::MMX_PUNPCKHBWrr:
6830 case X86::MMX_PUNPCKHWDrr:
6831 case X86::MMX_PUNPCKHDQrr:
6832 case X86::MMX_PUNPCKLBWrr:
6833 case X86::MMX_PUNPCKLWDrr:
6834 case X86::MMX_PUNPCKLDQrr:
6835 case X86::MOVHLPSrr:
6836 case X86::PACKSSWBrr:
6837 case X86::PACKUSWBrr:
6838 case X86::PACKSSDWrr:
6839 case X86::PACKUSDWrr:
6840 case X86::PUNPCKHBWrr:
6841 case X86::PUNPCKLBWrr:
6842 case X86::PUNPCKHWDrr:
6843 case X86::PUNPCKLWDrr:
6844 case X86::PUNPCKHDQrr:
6845 case X86::PUNPCKLDQrr:
6846 case X86::PUNPCKHQDQrr:
6847 case X86::PUNPCKLQDQrr:
6848 case X86::SHUFPDrri:
6849 case X86::SHUFPSrri:
6850 // These instructions are sometimes used with an undef first or second
6851 // source. Return true here so BreakFalseDeps will assign this source to the
6852 // same register as the first source to avoid a false dependency.
6853 // Operand 1 of these instructions is tied so they're separate from their
6854 // VEX counterparts.
6855 return OpNum == 2 && !ForLoadFold;
6856
6857 case X86::VMOVLHPSrr:
6858 case X86::VMOVLHPSZrr:
6859 case X86::VPACKSSWBrr:
6860 case X86::VPACKUSWBrr:
6861 case X86::VPACKSSDWrr:
6862 case X86::VPACKUSDWrr:
6863 case X86::VPACKSSWBZ128rr:
6864 case X86::VPACKUSWBZ128rr:
6865 case X86::VPACKSSDWZ128rr:
6866 case X86::VPACKUSDWZ128rr:
6867 case X86::VPERM2F128rri:
6868 case X86::VPERM2I128rri:
6869 case X86::VSHUFF32X4Z256rri:
6870 case X86::VSHUFF32X4Zrri:
6871 case X86::VSHUFF64X2Z256rri:
6872 case X86::VSHUFF64X2Zrri:
6873 case X86::VSHUFI32X4Z256rri:
6874 case X86::VSHUFI32X4Zrri:
6875 case X86::VSHUFI64X2Z256rri:
6876 case X86::VSHUFI64X2Zrri:
6877 case X86::VPUNPCKHBWrr:
6878 case X86::VPUNPCKLBWrr:
6879 case X86::VPUNPCKHBWYrr:
6880 case X86::VPUNPCKLBWYrr:
6881 case X86::VPUNPCKHBWZ128rr:
6882 case X86::VPUNPCKLBWZ128rr:
6883 case X86::VPUNPCKHBWZ256rr:
6884 case X86::VPUNPCKLBWZ256rr:
6885 case X86::VPUNPCKHBWZrr:
6886 case X86::VPUNPCKLBWZrr:
6887 case X86::VPUNPCKHWDrr:
6888 case X86::VPUNPCKLWDrr:
6889 case X86::VPUNPCKHWDYrr:
6890 case X86::VPUNPCKLWDYrr:
6891 case X86::VPUNPCKHWDZ128rr:
6892 case X86::VPUNPCKLWDZ128rr:
6893 case X86::VPUNPCKHWDZ256rr:
6894 case X86::VPUNPCKLWDZ256rr:
6895 case X86::VPUNPCKHWDZrr:
6896 case X86::VPUNPCKLWDZrr:
6897 case X86::VPUNPCKHDQrr:
6898 case X86::VPUNPCKLDQrr:
6899 case X86::VPUNPCKHDQYrr:
6900 case X86::VPUNPCKLDQYrr:
6901 case X86::VPUNPCKHDQZ128rr:
6902 case X86::VPUNPCKLDQZ128rr:
6903 case X86::VPUNPCKHDQZ256rr:
6904 case X86::VPUNPCKLDQZ256rr:
6905 case X86::VPUNPCKHDQZrr:
6906 case X86::VPUNPCKLDQZrr:
6907 case X86::VPUNPCKHQDQrr:
6908 case X86::VPUNPCKLQDQrr:
6909 case X86::VPUNPCKHQDQYrr:
6910 case X86::VPUNPCKLQDQYrr:
6911 case X86::VPUNPCKHQDQZ128rr:
6912 case X86::VPUNPCKLQDQZ128rr:
6913 case X86::VPUNPCKHQDQZ256rr:
6914 case X86::VPUNPCKLQDQZ256rr:
6915 case X86::VPUNPCKHQDQZrr:
6916 case X86::VPUNPCKLQDQZrr:
6917 // These instructions are sometimes used with an undef first or second
6918 // source. Return true here so BreakFalseDeps will assign this source to the
6919 // same register as the first source to avoid a false dependency.
6920 return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
6921
6922 case X86::VCVTSI2SSrr:
6923 case X86::VCVTSI2SSrm:
6924 case X86::VCVTSI2SSrr_Int:
6925 case X86::VCVTSI2SSrm_Int:
6926 case X86::VCVTSI642SSrr:
6927 case X86::VCVTSI642SSrm:
6928 case X86::VCVTSI642SSrr_Int:
6929 case X86::VCVTSI642SSrm_Int:
6930 case X86::VCVTSI2SDrr:
6931 case X86::VCVTSI2SDrm:
6932 case X86::VCVTSI2SDrr_Int:
6933 case X86::VCVTSI2SDrm_Int:
6934 case X86::VCVTSI642SDrr:
6935 case X86::VCVTSI642SDrm:
6936 case X86::VCVTSI642SDrr_Int:
6937 case X86::VCVTSI642SDrm_Int:
6938 // AVX-512
6939 case X86::VCVTSI2SSZrr:
6940 case X86::VCVTSI2SSZrm:
6941 case X86::VCVTSI2SSZrr_Int:
6942 case X86::VCVTSI2SSZrrb_Int:
6943 case X86::VCVTSI2SSZrm_Int:
6944 case X86::VCVTSI642SSZrr:
6945 case X86::VCVTSI642SSZrm:
6946 case X86::VCVTSI642SSZrr_Int:
6947 case X86::VCVTSI642SSZrrb_Int:
6948 case X86::VCVTSI642SSZrm_Int:
6949 case X86::VCVTSI2SDZrr:
6950 case X86::VCVTSI2SDZrm:
6951 case X86::VCVTSI2SDZrr_Int:
6952 case X86::VCVTSI2SDZrm_Int:
6953 case X86::VCVTSI642SDZrr:
6954 case X86::VCVTSI642SDZrm:
6955 case X86::VCVTSI642SDZrr_Int:
6956 case X86::VCVTSI642SDZrrb_Int:
6957 case X86::VCVTSI642SDZrm_Int:
6958 case X86::VCVTUSI2SSZrr:
6959 case X86::VCVTUSI2SSZrm:
6960 case X86::VCVTUSI2SSZrr_Int:
6961 case X86::VCVTUSI2SSZrrb_Int:
6962 case X86::VCVTUSI2SSZrm_Int:
6963 case X86::VCVTUSI642SSZrr:
6964 case X86::VCVTUSI642SSZrm:
6965 case X86::VCVTUSI642SSZrr_Int:
6966 case X86::VCVTUSI642SSZrrb_Int:
6967 case X86::VCVTUSI642SSZrm_Int:
6968 case X86::VCVTUSI2SDZrr:
6969 case X86::VCVTUSI2SDZrm:
6970 case X86::VCVTUSI2SDZrr_Int:
6971 case X86::VCVTUSI2SDZrm_Int:
6972 case X86::VCVTUSI642SDZrr:
6973 case X86::VCVTUSI642SDZrm:
6974 case X86::VCVTUSI642SDZrr_Int:
6975 case X86::VCVTUSI642SDZrrb_Int:
6976 case X86::VCVTUSI642SDZrm_Int:
6977 case X86::VCVTSI2SHZrr:
6978 case X86::VCVTSI2SHZrm:
6979 case X86::VCVTSI2SHZrr_Int:
6980 case X86::VCVTSI2SHZrrb_Int:
6981 case X86::VCVTSI2SHZrm_Int:
6982 case X86::VCVTSI642SHZrr:
6983 case X86::VCVTSI642SHZrm:
6984 case X86::VCVTSI642SHZrr_Int:
6985 case X86::VCVTSI642SHZrrb_Int:
6986 case X86::VCVTSI642SHZrm_Int:
6987 case X86::VCVTUSI2SHZrr:
6988 case X86::VCVTUSI2SHZrm:
6989 case X86::VCVTUSI2SHZrr_Int:
6990 case X86::VCVTUSI2SHZrrb_Int:
6991 case X86::VCVTUSI2SHZrm_Int:
6992 case X86::VCVTUSI642SHZrr:
6993 case X86::VCVTUSI642SHZrm:
6994 case X86::VCVTUSI642SHZrr_Int:
6995 case X86::VCVTUSI642SHZrrb_Int:
6996 case X86::VCVTUSI642SHZrm_Int:
6997 // Load folding won't effect the undef register update since the input is
6998 // a GPR.
6999 return OpNum == 1 && !ForLoadFold;
7000 case X86::VCVTSD2SSrr:
7001 case X86::VCVTSD2SSrm:
7002 case X86::VCVTSD2SSrr_Int:
7003 case X86::VCVTSD2SSrm_Int:
7004 case X86::VCVTSS2SDrr:
7005 case X86::VCVTSS2SDrm:
7006 case X86::VCVTSS2SDrr_Int:
7007 case X86::VCVTSS2SDrm_Int:
7008 case X86::VRCPSSr:
7009 case X86::VRCPSSr_Int:
7010 case X86::VRCPSSm:
7011 case X86::VRCPSSm_Int:
7012 case X86::VROUNDSDri:
7013 case X86::VROUNDSDmi:
7014 case X86::VROUNDSDri_Int:
7015 case X86::VROUNDSDmi_Int:
7016 case X86::VROUNDSSri:
7017 case X86::VROUNDSSmi:
7018 case X86::VROUNDSSri_Int:
7019 case X86::VROUNDSSmi_Int:
7020 case X86::VRSQRTSSr:
7021 case X86::VRSQRTSSr_Int:
7022 case X86::VRSQRTSSm:
7023 case X86::VRSQRTSSm_Int:
7024 case X86::VSQRTSSr:
7025 case X86::VSQRTSSr_Int:
7026 case X86::VSQRTSSm:
7027 case X86::VSQRTSSm_Int:
7028 case X86::VSQRTSDr:
7029 case X86::VSQRTSDr_Int:
7030 case X86::VSQRTSDm:
7031 case X86::VSQRTSDm_Int:
7032 // AVX-512
7033 case X86::VCVTSD2SSZrr:
7034 case X86::VCVTSD2SSZrr_Int:
7035 case X86::VCVTSD2SSZrrb_Int:
7036 case X86::VCVTSD2SSZrm:
7037 case X86::VCVTSD2SSZrm_Int:
7038 case X86::VCVTSS2SDZrr:
7039 case X86::VCVTSS2SDZrr_Int:
7040 case X86::VCVTSS2SDZrrb_Int:
7041 case X86::VCVTSS2SDZrm:
7042 case X86::VCVTSS2SDZrm_Int:
7043 case X86::VGETEXPSDZr:
7044 case X86::VGETEXPSDZrb:
7045 case X86::VGETEXPSDZm:
7046 case X86::VGETEXPSSZr:
7047 case X86::VGETEXPSSZrb:
7048 case X86::VGETEXPSSZm:
7049 case X86::VGETMANTSDZrri:
7050 case X86::VGETMANTSDZrrib:
7051 case X86::VGETMANTSDZrmi:
7052 case X86::VGETMANTSSZrri:
7053 case X86::VGETMANTSSZrrib:
7054 case X86::VGETMANTSSZrmi:
7055 case X86::VRNDSCALESDZrri:
7056 case X86::VRNDSCALESDZrri_Int:
7057 case X86::VRNDSCALESDZrrib_Int:
7058 case X86::VRNDSCALESDZrmi:
7059 case X86::VRNDSCALESDZrmi_Int:
7060 case X86::VRNDSCALESSZrri:
7061 case X86::VRNDSCALESSZrri_Int:
7062 case X86::VRNDSCALESSZrrib_Int:
7063 case X86::VRNDSCALESSZrmi:
7064 case X86::VRNDSCALESSZrmi_Int:
7065 case X86::VRCP14SDZrr:
7066 case X86::VRCP14SDZrm:
7067 case X86::VRCP14SSZrr:
7068 case X86::VRCP14SSZrm:
7069 case X86::VRCPSHZrr:
7070 case X86::VRCPSHZrm:
7071 case X86::VRSQRTSHZrr:
7072 case X86::VRSQRTSHZrm:
7073 case X86::VREDUCESHZrmi:
7074 case X86::VREDUCESHZrri:
7075 case X86::VREDUCESHZrrib:
7076 case X86::VGETEXPSHZr:
7077 case X86::VGETEXPSHZrb:
7078 case X86::VGETEXPSHZm:
7079 case X86::VGETMANTSHZrri:
7080 case X86::VGETMANTSHZrrib:
7081 case X86::VGETMANTSHZrmi:
7082 case X86::VRNDSCALESHZrri:
7083 case X86::VRNDSCALESHZrri_Int:
7084 case X86::VRNDSCALESHZrrib_Int:
7085 case X86::VRNDSCALESHZrmi:
7086 case X86::VRNDSCALESHZrmi_Int:
7087 case X86::VSQRTSHZr:
7088 case X86::VSQRTSHZr_Int:
7089 case X86::VSQRTSHZrb_Int:
7090 case X86::VSQRTSHZm:
7091 case X86::VSQRTSHZm_Int:
7092 case X86::VRCP28SDZr:
7093 case X86::VRCP28SDZrb:
7094 case X86::VRCP28SDZm:
7095 case X86::VRCP28SSZr:
7096 case X86::VRCP28SSZrb:
7097 case X86::VRCP28SSZm:
7098 case X86::VREDUCESSZrmi:
7099 case X86::VREDUCESSZrri:
7100 case X86::VREDUCESSZrrib:
7101 case X86::VRSQRT14SDZrr:
7102 case X86::VRSQRT14SDZrm:
7103 case X86::VRSQRT14SSZrr:
7104 case X86::VRSQRT14SSZrm:
7105 case X86::VRSQRT28SDZr:
7106 case X86::VRSQRT28SDZrb:
7107 case X86::VRSQRT28SDZm:
7108 case X86::VRSQRT28SSZr:
7109 case X86::VRSQRT28SSZrb:
7110 case X86::VRSQRT28SSZm:
7111 case X86::VSQRTSSZr:
7112 case X86::VSQRTSSZr_Int:
7113 case X86::VSQRTSSZrb_Int:
7114 case X86::VSQRTSSZm:
7115 case X86::VSQRTSSZm_Int:
7116 case X86::VSQRTSDZr:
7117 case X86::VSQRTSDZr_Int:
7118 case X86::VSQRTSDZrb_Int:
7119 case X86::VSQRTSDZm:
7120 case X86::VSQRTSDZm_Int:
7121 case X86::VCVTSD2SHZrr:
7122 case X86::VCVTSD2SHZrr_Int:
7123 case X86::VCVTSD2SHZrrb_Int:
7124 case X86::VCVTSD2SHZrm:
7125 case X86::VCVTSD2SHZrm_Int:
7126 case X86::VCVTSS2SHZrr:
7127 case X86::VCVTSS2SHZrr_Int:
7128 case X86::VCVTSS2SHZrrb_Int:
7129 case X86::VCVTSS2SHZrm:
7130 case X86::VCVTSS2SHZrm_Int:
7131 case X86::VCVTSH2SDZrr:
7132 case X86::VCVTSH2SDZrr_Int:
7133 case X86::VCVTSH2SDZrrb_Int:
7134 case X86::VCVTSH2SDZrm:
7135 case X86::VCVTSH2SDZrm_Int:
7136 case X86::VCVTSH2SSZrr:
7137 case X86::VCVTSH2SSZrr_Int:
7138 case X86::VCVTSH2SSZrrb_Int:
7139 case X86::VCVTSH2SSZrm:
7140 case X86::VCVTSH2SSZrm_Int:
7141 return OpNum == 1;
7142 case X86::VMOVSSZrrk:
7143 case X86::VMOVSDZrrk:
7144 return OpNum == 3 && !ForLoadFold;
7145 case X86::VMOVSSZrrkz:
7146 case X86::VMOVSDZrrkz:
7147 return OpNum == 2 && !ForLoadFold;
7148 }
7149
7150 return false;
7151}
7152
7153/// Inform the BreakFalseDeps pass how many idle instructions we would like
7154/// before certain undef register reads.
7155///
7156/// This catches the VCVTSI2SD family of instructions:
7157///
7158/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
7159///
7160/// We should be careful *not* to catch VXOR idioms which are presumably
7161/// handled specially in the pipeline:
7162///
7163/// vxorps undef %xmm1, undef %xmm1, %xmm1
7164///
7165/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
7166/// high bits that are passed-through are not live.
///
/// Returns UndefRegClearance when operand OpNum of MI holds a physical
/// register and hasUndefRegUpdate() accepts the (opcode, OpNum) pair;
/// returns 0 in every other case.
7167unsigned
// NOTE(review): listing line 7168 (the function name and the parameters that
// declare MI and OpNum) is absent from this extract.
7169 const TargetRegisterInfo *TRI) const {
7170 const MachineOperand &MO = MI.getOperand(OpNum);
 // Only physical registers are considered; hasUndefRegUpdate() is queried
 // with its default ForLoadFold argument here.
7171 if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum))
7172 return UndefRegClearance;
7173
7174 return 0;
7175}
7176
// Breaks a false dependency on the register held by operand OpNum of MI.
// Nothing is done when MI already kills that register. Otherwise, depending
// on the register class that contains the register, a self-XOR with undef
// source operands is built in front of MI and the register is then marked as
// killed on MI. For 16-bit/8-bit GPRs on new-data-destination instructions no
// instruction is inserted; an implicit def of the 64-bit super-register is
// added to MI instead. Registers matching none of the branches are left alone.
// NOTE(review): listing line 7177 (return type and function name) is absent
// from this extract.
7178 MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
7179 Register Reg = MI.getOperand(OpNum).getReg();
7180 // If MI kills this register, the false dependence is already broken.
7181 if (MI.killsRegister(Reg, TRI))
7182 return;
7183
7184 if (X86::VR128RegClass.contains(Reg)) {
7185 // These instructions are all floating point domain, so xorps is the best
7186 // choice.
7187 unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
7188 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
7189 .addReg(Reg, RegState::Undef)
7190 .addReg(Reg, RegState::Undef);
7191 MI.addRegisterKilled(Reg, TRI, true);
7192 } else if (X86::VR256RegClass.contains(Reg)) {
7193 // Use vxorps to clear the full ymm register.
7194 // It wants to read and write the xmm sub-register.
7195 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7196 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
7197 .addReg(XReg, RegState::Undef)
7198 .addReg(XReg, RegState::Undef)
 // NOTE(review): listing line 7199, which should finish the builder chain
 // above, is absent from this extract.
7200 MI.addRegisterKilled(Reg, TRI, true);
7201 } else if (X86::VR128XRegClass.contains(Reg)) {
7202 // Only handle VLX targets.
7203 if (!Subtarget.hasVLX())
7204 return;
7205 // Since vxorps requires AVX512DQ, vpxord should be the best choice.
7206 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
7207 .addReg(Reg, RegState::Undef)
7208 .addReg(Reg, RegState::Undef);
7209 MI.addRegisterKilled(Reg, TRI, true);
7210 } else if (X86::VR256XRegClass.contains(Reg) ||
7211 X86::VR512RegClass.contains(Reg)) {
7212 // Only handle VLX targets.
7213 if (!Subtarget.hasVLX())
7214 return;
7215 // Use vpxord to clear the full ymm/zmm register.
7216 // It wants to read and write the xmm sub-register.
7217 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7218 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
7219 .addReg(XReg, RegState::Undef)
7220 .addReg(XReg, RegState::Undef)
 // NOTE(review): listing line 7221, which should finish the builder chain
 // above, is absent from this extract.
7222 MI.addRegisterKilled(Reg, TRI, true);
7223 } else if (X86::GR64RegClass.contains(Reg)) {
7224 // Using XOR32rr because it has shorter encoding and zeros up the upper bits
7225 // as well.
7226 Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
7227 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
7228 .addReg(XReg, RegState::Undef)
7229 .addReg(XReg, RegState::Undef)
 // NOTE(review): listing line 7230, which should finish the builder chain
 // above, is absent from this extract.
7231 MI.addRegisterKilled(Reg, TRI, true);
7232 } else if (X86::GR32RegClass.contains(Reg)) {
7233 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7234 .addReg(Reg, RegState::Undef)
7235 .addReg(Reg, RegState::Undef);
7236 MI.addRegisterKilled(Reg, TRI, true);
7237 } else if ((X86::GR16RegClass.contains(Reg) ||
7238 X86::GR8RegClass.contains(Reg)) &&
7239 X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
7240 // This case is only expected for NDD ops which appear to be partial
7241 // writes, but are not due to the zeroing of the upper part. Here
7242 // we add an implicit def of the super-register, which prevents
7243 // CompressEVEX from converting this to a legacy form.
7244 Register SuperReg = getX86SubSuperRegister(Reg, 64);
 // The local below is a MachineInstrBuilder wrapped around MI itself; it
 // shares its name with the BuildMI helper used in the branches above.
7245 MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
 // Add the implicit def only if MI does not already define SuperReg.
7246 if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
7247 BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
7248 }
7249}
7250
// Appends the address operands in MOs to MIB, taking PtrOffset into account.
// With fewer than four operands every operand is added unchanged and the
// offset is appended through addOffset(). Otherwise exactly five operands are
// expected (asserted) and a non-zero PtrOffset is merged into the operand at
// index 3 through addDisp().
// NOTE(review): listing line 7251 (function name and the parameters that
// declare MIB and MOs) is absent from this extract.
7252 int PtrOffset = 0) {
7253 unsigned NumAddrOps = MOs.size();
7254
7255 if (NumAddrOps < 4) {
7256 // FrameIndex only - add an immediate offset (whether it's zero or not).
7257 for (unsigned i = 0; i != NumAddrOps; ++i)
7258 MIB.add(MOs[i]);
7259 addOffset(MIB, PtrOffset);
7260 } else {
7261 // General Memory Addressing - we need to add any offset to an existing
7262 // offset.
7263 assert(MOs.size() == 5 && "Unexpected memory operand list length");
7264 for (unsigned i = 0; i != NumAddrOps; ++i) {
7265 const MachineOperand &MO = MOs[i];
 // Index 3 is the operand that receives the extra displacement; with a
 // zero PtrOffset it is copied like every other operand.
7266 if (i == 3 && PtrOffset != 0) {
7267 MIB.addDisp(MO, PtrOffset);
7268 } else {
7269 MIB.add(MO);
7270 }
7271 }
7272 }
7273}
7274
// Walks every operand of NewMI and, for each operand that is a virtual
// register, asks MachineRegisterInfo to constrain that register to the class
// TII reports for the operand index under NewMI's descriptor. A failed
// constraint is only reported through LLVM_DEBUG; NewMI is not changed.
// NOTE(review): listing line 7275 (function name and the parameter that
// declares MF) is absent from this extract.
7276 MachineInstr &NewMI,
7277 const TargetInstrInfo &TII) {
7278 MachineRegisterInfo &MRI = MF.getRegInfo();
7279
7280 for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7281 MachineOperand &MO = NewMI.getOperand(Idx);
7282 // We only need to update constraints on virtual register operands.
7283 if (!MO.isReg())
7284 continue;
7285 Register Reg = MO.getReg();
7286 if (!Reg.isVirtual())
7287 continue;
7288
7289 auto *NewRC =
7290 MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx));
 // A null result means the register could not be constrained.
7291 if (!NewRC) {
7292 LLVM_DEBUG(
7293 dbgs() << "WARNING: Unable to update register constraint for operand "
7294 << Idx << " of instruction:\n";
7295 NewMI.dump(); dbgs() << "\n");
7296 }
7297 }
7298}
7299
// Builds the Opcode form of the two-address instruction MI with a memory
// reference in place of its first two operands: the address operands in MOs
// are added first, followed by MI's operands from index 2 onwards. The new
// instruction gets its virtual-register constraints updated, is inserted
// before InsertPt and is returned.
7300static MachineInstr *fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
// NOTE(review): listing lines 7301-7303 (the parameters that declare MOs,
// InsertPt and MI) are absent from this extract.
7304 const TargetInstrInfo &TII) {
7305 // Create the base instruction with the memory operand as the first part.
7306 // Omit the implicit operands, something BuildMI can't do.
7307 MachineInstr *NewMI =
7308 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7309 MachineInstrBuilder MIB(MF, NewMI);
7310 addOperands(MIB, MOs);
7311
7312 // Loop over the rest of the ri operands, converting them over.
 // NumOps counts the operands described by MI's descriptor after the first
 // two, which the memory reference above stands in for.
7313 unsigned NumOps = MI.getDesc().getNumOperands() - 2;
7314 for (unsigned i = 0; i != NumOps; ++i) {
7315 MachineOperand &MO = MI.getOperand(i + 2);
7316 MIB.add(MO);
7317 }
 // Carry over whatever operands MI has beyond those in its descriptor.
7318 for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2))
7319 MIB.add(MO);
7320
7321 updateOperandRegConstraints(MF, *NewMI, TII);
7322
7323 MachineBasicBlock *MBB = InsertPt->getParent();
7324 MBB->insert(InsertPt, NewMI);
7325
7326 return MIB;
7327}
7328
// Builds the Opcode form of MI in which operand OpNo is replaced by the
// address operands in MOs (with PtrOffset applied by addOperands); every
// other operand of MI is copied across in order. The new instruction gets its
// virtual-register constraints updated, is inserted before InsertPt and is
// returned.
7329static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode,
7330 unsigned OpNo, ArrayRef<MachineOperand> MOs,
// NOTE(review): listing lines 7331-7332 (the parameters that declare InsertPt,
// MI and TII) are absent from this extract.
7333 int PtrOffset = 0) {
7334 // Omit the implicit operands, something BuildMI can't do.
7335 MachineInstr *NewMI =
7336 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7337 MachineInstrBuilder MIB(MF, NewMI);
7338
7339 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
7340 MachineOperand &MO = MI.getOperand(i);
7341 if (i == OpNo) {
 // The folded operand must be a register; it is replaced by the memory
 // reference rather than copied.
7342 assert(MO.isReg() && "Expected to fold into reg operand!");
7343 addOperands(MIB, MOs, PtrOffset);
7344 } else {
7345 MIB.add(MO);
7346 }
7347 }
7348
7349 updateOperandRegConstraints(MF, *NewMI, TII);
7350
7351 // Copy the NoFPExcept flag from the instruction we're fusing.
 // NOTE(review): listing lines 7352-7353, the code the comment above refers
 // to, are absent from this extract.
7354
7355 MachineBasicBlock *MBB = InsertPt->getParent();
7356 MBB->insert(InsertPt, NewMI);
7357
7358 return MIB;
7359}
7360
// Builds an Opcode instruction before InsertPt (using MI's debug location)
// whose operands are the address operands in MOs followed by an immediate 0,
// and returns it.
7361static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
// NOTE(review): listing lines 7362-7363 (the parameters that declare MOs and
// InsertPt) are absent from this extract.
7364 MachineInstr &MI) {
7365 MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
7366 MI.getDebugLoc(), TII.get(Opcode));
7367 addOperands(MIB, MOs);
7368 return MIB.addImm(0);
7369}
7370
// Handles memory-operand folds that are special-cased per opcode. Returns the
// newly built instruction when one of the cases below applies to MI and
// operand OpNum, and nullptr otherwise. Size and Alignment describe the memory
// object being folded; a Size of 0 is accepted by the size checks below.
7371MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
7372 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
// NOTE(review): listing line 7373 (the parameters that declare MOs and
// InsertPt) is absent from this extract.
7374 unsigned Size, Align Alignment) const {
7375 switch (MI.getOpcode()) {
7376 case X86::INSERTPSrri:
7377 case X86::VINSERTPSrri:
7378 case X86::VINSERTPSZrri:
7379 // Attempt to convert the load of inserted vector into a fold load
7380 // of a single float.
7381 if (OpNum == 2) {
 // The immediate is the last operand. As decoded here: bits 0-3 are the
 // zero mask, bits 4-5 the destination index, bits 6-7 the source index.
7382 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
7383 unsigned ZMask = Imm & 15;
7384 unsigned DstIdx = (Imm >> 4) & 3;
7385 unsigned SrcIdx = (Imm >> 6) & 3;
7386
7387 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7388 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7389 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
 // Only the non-VEX INSERTPSrri form is subject to the alignment check.
7390 if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
7391 (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) {
 // The source index becomes a byte offset into the memory operand (four
 // bytes per element) and is dropped from the new immediate.
7392 int PtrOffset = SrcIdx * 4;
7393 unsigned NewImm = (DstIdx << 4) | ZMask;
7394 unsigned NewOpCode =
7395 (MI.getOpcode() == X86::VINSERTPSZrri) ? X86::VINSERTPSZrmi
7396 : (MI.getOpcode() == X86::VINSERTPSrri) ? X86::VINSERTPSrmi
7397 : X86::INSERTPSrmi;
7398 MachineInstr *NewMI =
7399 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7400 NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
7401 return NewMI;
7402 }
7403 }
7404 break;
7405 case X86::MOVHLPSrr:
7406 case X86::VMOVHLPSrr:
7407 case X86::VMOVHLPSZrr:
7408 // Move the upper 64-bits of the second operand to the lower 64-bits.
7409 // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
7410 // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
7411 if (OpNum == 2) {
7412 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7413 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7414 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7415 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7416 unsigned NewOpCode =
7417 (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm
7418 : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7419 : X86::MOVLPSrm;
 // The trailing 8 is the pointer offset selecting the upper 64 bits.
7420 MachineInstr *NewMI =
7421 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7422 return NewMI;
7423 }
7424 }
7425 break;
7426 case X86::UNPCKLPDrr:
7427 // If we won't be able to fold this to the memory form of UNPCKL, use
7428 // MOVHPD instead. Done as custom because we can't have this in the load
7429 // table twice.
7430 if (OpNum == 2) {
7431 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7432 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7433 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
 // Taken only when the object is aligned to less than 16 bytes.
7434 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7435 MachineInstr *NewMI =
7436 fuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7437 return NewMI;
7438 }
7439 }
7440 break;
7441 case X86::MOV32r0:
 // Replace the register zeroing with a store of immediate 0 to memory; any
 // Size other than 4 selects MOV64mi32.
7442 if (auto *NewMI =
7443 makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,
7444 InsertPt, MI))
7445 return NewMI;
7446 break;
7447 }
7448
7449 return nullptr;
7450}
7451
// Returns true when operand 1 of MI is a register for which
// hasUndefRegUpdate() (queried with ForLoadFold set) reports an undef register
// update and that register is in fact undefined: either the operand carries
// the undef flag, or its unique virtual-register definition is an
// IMPLICIT_DEF. Returns false in every other case.
// NOTE(review): listing line 7452 (return type, function name and leading
// parameters) is absent from this extract.
7453 MachineInstr &MI) {
7454 if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7455 !MI.getOperand(1).isReg())
7456 return false;
7457
7458 // There are two cases we need to handle depending on where in the pipeline
7459 // the folding attempt is being made.
7460 // -Register has the undef flag set.
7461 // -Register is produced by the IMPLICIT_DEF instruction.
7462
7463 if (MI.getOperand(1).isUndef())
7464 return true;
7465
 // NOTE(review): listing line 7466, which should declare RegInfo, is absent
 // from this extract.
7467 MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7468 return VRegDef && VRegDef->isImplicitDef();
7469}
7470
7471unsigned X86InstrInfo::commuteOperandsForFold(MachineInstr &MI,
7472 unsigned Idx1) const {
7473 unsigned Idx2 = CommuteAnyOperandIndex;
7474 if (!findCommutedOpIndices(MI, Idx1, Idx2))
7475 return Idx1;
7476
7477 bool HasDef = MI.getDesc().getNumDefs();
7478 Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7479 Register Reg1 = MI.getOperand(Idx1).getReg();
7480 Register Reg2 = MI.getOperand(Idx2).getReg();
7481 bool Tied1 = 0 == MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO);
7482 bool Tied2 = 0 == MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO);
7483
7484 // If either of the commutable operands are tied to the destination
7485 // then we can not commute + fold.
7486 if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2))
7487 return Idx1;
7488
7489 return commuteInstruction(MI, false, Idx1, Idx2) ? Idx2 : Idx1;
7490}
7491
7492static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx) {
7493 if (PrintFailedFusing && !MI.isCopy())
7494 dbgs() << "We failed to fuse operand " << Idx << " in " << MI;
7495}
7496
// Tries to build a form of MI in which operand OpNum is replaced by the memory
// reference described by MOs, inserting it before InsertPt. The steps are:
//   1. bail out (nullptr) for the call/push, partial-register, relocation and
//      KCFI cases checked at the top;
//   2. try the opcode-specific folds in foldMemoryOperandCustom();
//   3. look the opcode up in the fold tables and, on a hit, validate the
//      alignment and size before building the instruction with
//      fuseTwoAddrInst() or fuseInst();
//   4. if there was no table hit and AllowCommute is set, commute MI and
//      retry once with commuting disabled, undoing the commute on failure.
// Size and Alignment describe the memory object; a Size of 0 skips the size
// checks. CopyMI receives the COPY built for the non-NDD rewrite, when one is
// needed. Returns the new instruction, or nullptr when no fold was made.
// NOTE(review): listing line 7497 (return type and function name) is absent
// from this extract.
7498 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
// NOTE(review): listing line 7499 (the parameters that declare MOs and
// InsertPt) is absent from this extract.
7500 unsigned Size, Align Alignment, bool AllowCommute, MachineInstr *&CopyMI,
7501 VirtRegMap *VRM) const {
7502 bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
7503 bool isSlowIndirectCall = Subtarget.slowIndirectCall();
7504 unsigned Opc = MI.getOpcode();
7505
7506 // For CPUs that favor the register form of a call,
7507 // do not fold loads into calls, unless optimizing for size aggressively.
7508 if ((isSlowTwoMemOps || isSlowIndirectCall) &&
7509 !MF.getFunction().hasMinSize() &&
7510 (Opc == X86::CALL32r || Opc == X86::CALL64r ||
7511 Opc == X86::CALL64r_ImpCall))
7512 return nullptr;
7513
7514 // For CPUs that favor the register form of a push,
7515 // do not fold loads into pushes, unless optimizing for size aggressively.
7516 if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
7517 (Opc == X86::PUSH16r || Opc == X86::PUSH32r || Opc == X86::PUSH64r))
7518 return nullptr;
7519
7520 // Avoid partial and undef register update stalls unless optimizing for size.
7521 if (!MF.getFunction().hasOptSize() &&
7522 (hasPartialRegUpdate(Opc, Subtarget, /*ForLoadFold*/ true) ||
 // NOTE(review): listing line 7523, the second half of the condition above,
 // is absent from this extract.
7524 return nullptr;
7525
 // Treated as two-address when the folded operand is operand 0 or 1 and both
 // of those operands are the same register.
7526 unsigned NumOps = MI.getDesc().getNumOperands();
7527 bool IsTwoAddr = NumOps > 1 && OpNum < 2 && MI.getOperand(0).isReg() &&
7528 MI.getOperand(1).isReg() &&
7529 MI.getOperand(0).getReg() == MI.getOperand(1).getReg();
7530
7531 // FIXME: AsmPrinter doesn't know how to handle
7532 // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
7533 if (Opc == X86::ADD32ri &&
7534 MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
7535 return nullptr;
7536
7537 // GOTTPOFF relocation loads can only be folded into add instructions.
7538 // FIXME: Need to exclude other relocations that only support specific
7539 // instructions.
7540 if (MOs.size() == X86::AddrNumOperands &&
7541 MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
7542 Opc != X86::ADD64rr)
7543 return nullptr;
7544
7545 // Don't fold loads into indirect calls that need a KCFI check as we'll
7546 // have to unfold these in X86TargetLowering::EmitKCFICheck anyway.
7547 if (MI.isCall() && MI.getCFIType())
7548 return nullptr;
7549
7550 // Attempt to fold any custom cases we have.
7551 if (auto *CustomMI = foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt,
7552 Size, Alignment))
7553 return CustomMI;
7554
7555 // Folding a memory location into the two-address part of a two-address
7556 // instruction is different than folding it other places. It requires
7557 // replacing the *two* registers with the memory location.
7558 //
7559 // Utilize the mapping NonNDD -> RMW for the NDD variant.
 // NonNDOpc is 0 when the subtarget lacks NDD or Opc has no non-NDD variant.
7560 unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U;
7561 // Utilize the mapping NonNDD if NDD memory variant is not preferred.
7562 bool NoNDDM = NonNDOpc && !Subtarget.hasNDDM();
7563
7564 MachineRegisterInfo &MRI = MF.getRegInfo();
7565 if (NoNDDM && !IsTwoAddr && !MRI.isSSA()) {
7566 // Bail out if dst has subreg. It happens during register-coalescer from
7567 // 704B %19:gr32 = SUB32rr_ND killed %0:gr32, killed %7:gr32, ...
7568 // 752B undef %23.sub_32bit:gr64 = COPY killed %19:gr32
7569 // 768B %25:gr32 = LEA64_32r killed %23:gr64, 1, killed %21:gr64_nosp, ...
7570 // to
7571 // 704B undef %23.sub_32bit:gr64_with_sub_8bit = SUB32rr_ND %0:gr32, ...
7572 // 768B %25:gr32 = LEA64_32r %23:gr64_with_sub_8bit, 1, %21:gr64_nosp, ...
7573 // Machine verifier fails if we try to tie %23 to the source.
7574 if (MI.getOperand(0).getSubReg())
7575 return nullptr;
7576
7577 // Bail out if dst has been assigned a physical register. Otherwise, we
7578 // cannot update LiveRegMatrix properly.
7579 Register Dst = MI.getOperand(0).getReg();
7580 if (VRM && Dst != MI.getOperand(1).getReg() &&
7581 (!Dst.isVirtual() || VRM->getPhys(Dst)))
7582 return nullptr;
7583 }
7584
 // Two-address folds use the two-address table (keyed by the non-NDD opcode
 // when there is one); everything else uses the per-operand table.
7585 const X86FoldTableEntry *I =
7586 IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
7587 : lookupFoldTable(NoNDDM ? NonNDOpc : Opc, OpNum);
7588
7589 MachineInstr *NewMI = nullptr;
7590 if (I) {
7591 unsigned Opcode = I->DstOp;
 // The table entry encodes its required alignment as a power of two in the
 // TB_ALIGN_MASK bits of Flags; refuse objects that are less aligned.
7592 if (Alignment <
7593 Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
7594 return nullptr;
7595 bool NarrowToMOV32rm = false;
7596 if (Size) {
 // NOTE(review): listing line 7597, which should declare the TRI used two
 // lines below, is absent from this extract.
7598 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7599 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7600 // Check if it's safe to fold the load. If the size of the object is
7601 // narrower than the load width, then it's not.
7602 // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7603 if ((I->Flags & TB_FOLDED_LOAD) && Size < RCSize) {
7604 // If this is a 64-bit load, but the spill slot is 32, then we can do
7605 // a 32-bit load which is implicitly zero-extended. This likely is
7606 // due to live interval analysis remat'ing a load from stack slot.
7607 if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7608 return nullptr;
7609 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7610 return nullptr;
7611 Opcode = X86::MOV32rm;
7612 NarrowToMOV32rm = true;
7613 }
7614 // For stores, make sure the size of the object is equal to the size of
7615 // the store. If the object is larger, the extra bits would be garbage. If
7616 // the object is smaller we might overwrite another object or fault.
7617 if ((I->Flags & TB_FOLDED_STORE) && Size != RCSize)
7618 return nullptr;
7619 }
7620
7621 NewMI = IsTwoAddr ? fuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this)
7622 : fuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7623
7624 if (NarrowToMOV32rm) {
7625 // If this is the special case where we use a MOV32rm to load a 32-bit
7626 // value and zero-extend the top bits. Change the destination register
7627 // to a 32-bit one.
7628 Register DstReg = NewMI->getOperand(0).getReg();
7629 if (DstReg.isPhysical())
7630 NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7631 else
7632 NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7633 }
7634
 // An NDD instruction was folded through its non-NDD opcode. Unless MI
 // kills the source or the source already is the destination, copy the
 // source into a register and make it operand 1 of the new instruction.
 // NOTE(review): presumably needed because the non-NDD form overwrites its
 // first source - confirm against the instruction definitions.
7635 if (NoNDDM && !IsTwoAddr) {
7636 Register SrcReg = MI.getOperand(1).getReg();
7637 unsigned SrcSub = MI.getOperand(1).getSubReg();
7638 if (MI.killsRegister(SrcReg, /*TRI=*/nullptr) ||
7639 MI.getOperand(0).getReg() == SrcReg)
7640 return NewMI;
7641
 // Outside SSA the copy targets MI's destination register; in SSA form a
 // fresh virtual register of the source's class is created instead.
7642 Register NewSrc = MI.getOperand(0).getReg();
7643 if (MRI.isSSA()) {
7644 const TargetRegisterClass &RC = *MF.getRegInfo().getRegClass(SrcReg);
7645 NewSrc = MRI.createVirtualRegister(&RC);
7646 }
7647
7648 CopyMI = BuildMI(*NewMI->getParent(), *NewMI, MI.getDebugLoc(),
7649 get(TargetOpcode::COPY))
7650 .addDef(NewSrc)
7651 .addReg(SrcReg, {}, SrcSub);
7652 NewMI->getOperand(1).setReg(NewSrc);
7653 NewMI->getOperand(1).setSubReg(0);
7654 }
7655 return NewMI;
7656 }
7657
7658 if (AllowCommute) {
7659 // If the instruction and target operand are commutable, commute the
7660 // instruction and try again.
7661 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
 // Getting OpNum back means MI was not commuted.
7662 if (CommuteOpIdx2 == OpNum) {
7663 printFailMsgforFold(MI, OpNum);
7664 return nullptr;
7665 }
7666 // Attempt to fold with the commuted version of the instruction.
 // NOTE(review): VRM is not forwarded to this retry - confirm that relying
 // on the parameter's default is intended here.
7667 NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7668 Alignment, /*AllowCommute=*/false, CopyMI);
7669 if (NewMI)
7670 return NewMI;
7671 // Folding failed again - undo the commute before returning.
7672 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
7673 }
7674
7675 printFailMsgforFold(MI, OpNum);
7676 return nullptr;
7677}
7678
// Frame-index entry point for folding: tries to fold stack slot FrameIndex
// into the operand(s) of MI listed in Ops. The object's size and alignment
// are read from the frame info, and the work is delegated to
// foldMemoryOperandImpl() with a frame-index operand and commuting allowed.
// Ops must hold one index, or exactly {0, 1}; the {0, 1} form is accepted for
// TEST8rr/16rr/32rr/64rr (rewritten to a compare with 0 first) and for opcodes
// that have a non-NDD variant on an NDD subtarget. Returns the folded
// instruction or nullptr.
// NOTE(review): listing lines 7679-7680 (return type, function name and the
// parameters that declare MF and MI) and 7684 are absent from this extract;
// InsertPt is presumably declared on 7684 - confirm.
7681 ArrayRef<unsigned> Ops, int FrameIndex,
7682 MachineInstr *&CopyMI, LiveIntervals *LIS,
7683 VirtRegMap *VRM) const {
7685 // Check switch flag
7686 if (NoFusing)
7687 return nullptr;
7688
7689 // Avoid partial and undef register update stalls unless optimizing for size.
7690 if (!MF.getFunction().hasOptSize() &&
7691 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
 // NOTE(review): listing line 7692, the second half of the condition above,
 // is absent from this extract.
7693 return nullptr;
7694
7695 // Don't fold subreg spills, or reloads that use a high subreg.
7696 for (auto Op : Ops) {
7697 MachineOperand &MO = MI.getOperand(Op);
7698 auto SubReg = MO.getSubReg();
7699 // MOV32r0 is special b/c it's used to clear a 64-bit register too.
7700 // (See patterns for MOV32r0 in TD files).
7701 if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit)
7702 continue;
7703 if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
7704 return nullptr;
7705 }
7706
7707 const MachineFrameInfo &MFI = MF.getFrameInfo();
7708 unsigned Size = MFI.getObjectSize(FrameIndex);
7709 Align Alignment = MFI.getObjectAlign(FrameIndex);
7710 // If the function stack isn't realigned we don't want to fold instructions
7711 // that need increased alignment.
7712 if (!RI.hasStackRealignment(MF))
7713 Alignment =
7714 std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
7715
 // Shared tail: fold the frame index into operand Ops[0], allowing a commute.
7716 auto Impl = [&]() {
7717 return foldMemoryOperandImpl(
7718 MF, MI, Ops[0], MachineOperand::CreateFI(FrameIndex), InsertPt, Size,
7719 Alignment, /*AllowCommute=*/true, CopyMI, VRM);
7720 };
7721 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7722 unsigned NewOpc = 0;
7723 unsigned RCSize = 0;
7724 unsigned Opc = MI.getOpcode();
7725 switch (Opc) {
7726 default:
7727 // NDD can be folded into RMW though its Op0 and Op1 are not tied.
7728 return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl()
7729 : nullptr;
 // TEST reg, reg maps to a compare of the same width against an immediate;
 // RCSize is that width in bytes.
7730 case X86::TEST8rr:
7731 NewOpc = X86::CMP8ri;
7732 RCSize = 1;
7733 break;
7734 case X86::TEST16rr:
7735 NewOpc = X86::CMP16ri;
7736 RCSize = 2;
7737 break;
7738 case X86::TEST32rr:
7739 NewOpc = X86::CMP32ri;
7740 RCSize = 4;
7741 break;
7742 case X86::TEST64rr:
7743 NewOpc = X86::CMP64ri32;
7744 RCSize = 8;
7745 break;
7746 }
7747 // Check if it's safe to fold the load. If the size of the object is
7748 // narrower than the load width, then it's not.
7749 if (Size < RCSize)
7750 return nullptr;
7751 // Change to CMPXXri r, 0 first.
 // NOTE(review): MI is rewritten in place before the fold is attempted, and
 // nothing in this function restores it if Impl() returns nullptr.
7752 MI.setDesc(get(NewOpc));
7753 MI.getOperand(1).ChangeToImmediate(0);
7754 } else if (Ops.size() != 1)
7755 return nullptr;
7756
7757 return Impl();
7758}
7759
7760/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
7761/// because the latter uses contents that wouldn't be defined in the folded
7762/// version. For instance, this transformation isn't legal:
7763/// movss (%rdi), %xmm0
7764/// addps %xmm0, %xmm0
7765/// ->
7766/// addps (%rdi), %xmm0
7767///
7768/// But this one is:
7769/// movss (%rdi), %xmm0
7770/// addss %xmm0, %xmm0
7771/// ->
7772/// addss (%rdi), %xmm0
7773///
7775 const MachineInstr &UserMI,
7776 const MachineFunction &MF) {
7777 unsigned Opc = LoadMI.getOpcode();
7778 unsigned UserOpc = UserMI.getOpcode();
7780 const TargetRegisterClass *RC =
7781 MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
7782 unsigned RegSize = TRI.getRegSizeInBits(*RC);
7783
7784 if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
7785 Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
7786 Opc == X86::VMOVSSZrm_alt) &&
7787 RegSize > 32) {
7788 // These instructions only load 32 bits, we can't fold them if the
7789 // destination register is wider than 32 bits (4 bytes), and its user
7790 // instruction isn't scalar (SS).
7791 switch (UserOpc) {
7792 case X86::CVTSS2SDrr_Int:
7793 case X86::VCVTSS2SDrr_Int:
7794 case X86::VCVTSS2SDZrr_Int:
7795 case X86::VCVTSS2SDZrrk_Int:
7796 case X86::VCVTSS2SDZrrkz_Int:
7797 case X86::CVTSS2SIrr_Int:
7798 case X86::CVTSS2SI64rr_Int:
7799 case X86::VCVTSS2SIrr_Int:
7800 case X86::VCVTSS2SI64rr_Int:
7801 case X86::VCVTSS2SIZrr_Int:
7802 case X86::VCVTSS2SI64Zrr_Int:
7803 case X86::CVTTSS2SIrr_Int:
7804 case X86::CVTTSS2SI64rr_Int:
7805 case X86::VCVTTSS2SIrr_Int:
7806 case X86::VCVTTSS2SI64rr_Int:
7807 case X86::VCVTTSS2SIZrr_Int:
7808 case X86::VCVTTSS2SI64Zrr_Int:
7809 case X86::VCVTSS2USIZrr_Int:
7810 case X86::VCVTSS2USI64Zrr_Int:
7811 case X86::VCVTTSS2USIZrr_Int:
7812 case X86::VCVTTSS2USI64Zrr_Int:
7813 case X86::RCPSSr_Int:
7814 case X86::VRCPSSr_Int:
7815 case X86::RSQRTSSr_Int:
7816 case X86::VRSQRTSSr_Int:
7817 case X86::ROUNDSSri_Int:
7818 case X86::VROUNDSSri_Int:
7819 case X86::COMISSrr_Int:
7820 case X86::VCOMISSrr_Int:
7821 case X86::VCOMISSZrr_Int:
7822 case X86::UCOMISSrr_Int:
7823 case X86::VUCOMISSrr_Int:
7824 case X86::VUCOMISSZrr_Int:
7825 case X86::ADDSSrr_Int:
7826 case X86::VADDSSrr_Int:
7827 case X86::VADDSSZrr_Int:
7828 case X86::CMPSSrri_Int:
7829 case X86::VCMPSSrri_Int:
7830 case X86::VCMPSSZrri_Int:
7831 case X86::DIVSSrr_Int:
7832 case X86::VDIVSSrr_Int:
7833 case X86::VDIVSSZrr_Int:
7834 case X86::MAXSSrr_Int:
7835 case X86::VMAXSSrr_Int:
7836 case X86::VMAXSSZrr_Int:
7837 case X86::MINSSrr_Int:
7838 case X86::VMINSSrr_Int:
7839 case X86::VMINSSZrr_Int:
7840 case X86::MULSSrr_Int:
7841 case X86::VMULSSrr_Int:
7842 case X86::VMULSSZrr_Int:
7843 case X86::SQRTSSr_Int:
7844 case X86::VSQRTSSr_Int:
7845 case X86::VSQRTSSZr_Int:
7846 case X86::SUBSSrr_Int:
7847 case X86::VSUBSSrr_Int:
7848 case X86::VSUBSSZrr_Int:
7849 case X86::VADDSSZrrk_Int:
7850 case X86::VADDSSZrrkz_Int:
7851 case X86::VCMPSSZrrik_Int:
7852 case X86::VDIVSSZrrk_Int:
7853 case X86::VDIVSSZrrkz_Int:
7854 case X86::VMAXSSZrrk_Int:
7855 case X86::VMAXSSZrrkz_Int:
7856 case X86::VMINSSZrrk_Int:
7857 case X86::VMINSSZrrkz_Int:
7858 case X86::VMULSSZrrk_Int:
7859 case X86::VMULSSZrrkz_Int:
7860 case X86::VSQRTSSZrk_Int:
7861 case X86::VSQRTSSZrkz_Int:
7862 case X86::VSUBSSZrrk_Int:
7863 case X86::VSUBSSZrrkz_Int:
7864 case X86::VFMADDSS4rr_Int:
7865 case X86::VFNMADDSS4rr_Int:
7866 case X86::VFMSUBSS4rr_Int:
7867 case X86::VFNMSUBSS4rr_Int:
7868 case X86::VFMADD132SSr_Int:
7869 case X86::VFNMADD132SSr_Int:
7870 case X86::VFMADD213SSr_Int:
7871 case X86::VFNMADD213SSr_Int:
7872 case X86::VFMADD231SSr_Int:
7873 case X86::VFNMADD231SSr_Int:
7874 case X86::VFMSUB132SSr_Int:
7875 case X86::VFNMSUB132SSr_Int:
7876 case X86::VFMSUB213SSr_Int:
7877 case X86::VFNMSUB213SSr_Int:
7878 case X86::VFMSUB231SSr_Int:
7879 case X86::VFNMSUB231SSr_Int:
7880 case X86::VFMADD132SSZr_Int:
7881 case X86::VFNMADD132SSZr_Int:
7882 case X86::VFMADD213SSZr_Int:
7883 case X86::VFNMADD213SSZr_Int:
7884 case X86::VFMADD231SSZr_Int:
7885 case X86::VFNMADD231SSZr_Int:
7886 case X86::VFMSUB132SSZr_Int:
7887 case X86::VFNMSUB132SSZr_Int:
7888 case X86::VFMSUB213SSZr_Int:
7889 case X86::VFNMSUB213SSZr_Int:
7890 case X86::VFMSUB231SSZr_Int:
7891 case X86::VFNMSUB231SSZr_Int:
7892 case X86::VFMADD132SSZrk_Int:
7893 case X86::VFNMADD132SSZrk_Int:
7894 case X86::VFMADD213SSZrk_Int:
7895 case X86::VFNMADD213SSZrk_Int:
7896 case X86::VFMADD231SSZrk_Int:
7897 case X86::VFNMADD231SSZrk_Int:
7898 case X86::VFMSUB132SSZrk_Int:
7899 case X86::VFNMSUB132SSZrk_Int:
7900 case X86::VFMSUB213SSZrk_Int:
7901 case X86::VFNMSUB213SSZrk_Int:
7902 case X86::VFMSUB231SSZrk_Int:
7903 case X86::VFNMSUB231SSZrk_Int:
7904 case X86::VFMADD132SSZrkz_Int:
7905 case X86::VFNMADD132SSZrkz_Int:
7906 case X86::VFMADD213SSZrkz_Int:
7907 case X86::VFNMADD213SSZrkz_Int:
7908 case X86::VFMADD231SSZrkz_Int:
7909 case X86::VFNMADD231SSZrkz_Int:
7910 case X86::VFMSUB132SSZrkz_Int:
7911 case X86::VFNMSUB132SSZrkz_Int:
7912 case X86::VFMSUB213SSZrkz_Int:
7913 case X86::VFNMSUB213SSZrkz_Int:
7914 case X86::VFMSUB231SSZrkz_Int:
7915 case X86::VFNMSUB231SSZrkz_Int:
7916 case X86::VFIXUPIMMSSZrri:
7917 case X86::VFIXUPIMMSSZrrik:
7918 case X86::VFIXUPIMMSSZrrikz:
7919 case X86::VFPCLASSSSZri:
7920 case X86::VFPCLASSSSZrik:
7921 case X86::VGETEXPSSZr:
7922 case X86::VGETEXPSSZrk:
7923 case X86::VGETEXPSSZrkz:
7924 case X86::VGETMANTSSZrri:
7925 case X86::VGETMANTSSZrrik:
7926 case X86::VGETMANTSSZrrikz:
7927 case X86::VRANGESSZrri:
7928 case X86::VRANGESSZrrik:
7929 case X86::VRANGESSZrrikz:
7930 case X86::VRCP14SSZrr:
7931 case X86::VRCP14SSZrrk:
7932 case X86::VRCP14SSZrrkz:
7933 case X86::VRCP28SSZr:
7934 case X86::VRCP28SSZrk:
7935 case X86::VRCP28SSZrkz:
7936 case X86::VREDUCESSZrri:
7937 case X86::VREDUCESSZrrik:
7938 case X86::VREDUCESSZrrikz:
7939 case X86::VRNDSCALESSZrri_Int:
7940 case X86::VRNDSCALESSZrrik_Int:
7941 case X86::VRNDSCALESSZrrikz_Int:
7942 case X86::VRSQRT14SSZrr:
7943 case X86::VRSQRT14SSZrrk:
7944 case X86::VRSQRT14SSZrrkz:
7945 case X86::VRSQRT28SSZr:
7946 case X86::VRSQRT28SSZrk:
7947 case X86::VRSQRT28SSZrkz:
7948 case X86::VSCALEFSSZrr:
7949 case X86::VSCALEFSSZrrk:
7950 case X86::VSCALEFSSZrrkz:
7951 return false;
7952 default:
7953 return true;
7954 }
7955 }
7956
7957 if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
7958 Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
7959 Opc == X86::VMOVSDZrm_alt) &&
7960 RegSize > 64) {
7961 // These instructions only load 64 bits, we can't fold them if the
7962 // destination register is wider than 64 bits (8 bytes), and its user
7963 // instruction isn't scalar (SD).
7964 switch (UserOpc) {
7965 case X86::CVTSD2SSrr_Int:
7966 case X86::VCVTSD2SSrr_Int:
7967 case X86::VCVTSD2SSZrr_Int:
7968 case X86::VCVTSD2SSZrrk_Int:
7969 case X86::VCVTSD2SSZrrkz_Int:
7970 case X86::CVTSD2SIrr_Int:
7971 case X86::CVTSD2SI64rr_Int:
7972 case X86::VCVTSD2SIrr_Int:
7973 case X86::VCVTSD2SI64rr_Int:
7974 case X86::VCVTSD2SIZrr_Int:
7975 case X86::VCVTSD2SI64Zrr_Int:
7976 case X86::CVTTSD2SIrr_Int:
7977 case X86::CVTTSD2SI64rr_Int:
7978 case X86::VCVTTSD2SIrr_Int:
7979 case X86::VCVTTSD2SI64rr_Int:
7980 case X86::VCVTTSD2SIZrr_Int:
7981 case X86::VCVTTSD2SI64Zrr_Int:
7982 case X86::VCVTSD2USIZrr_Int:
7983 case X86::VCVTSD2USI64Zrr_Int:
7984 case X86::VCVTTSD2USIZrr_Int:
7985 case X86::VCVTTSD2USI64Zrr_Int:
7986 case X86::ROUNDSDri_Int:
7987 case X86::VROUNDSDri_Int:
7988 case X86::COMISDrr_Int:
7989 case X86::VCOMISDrr_Int:
7990 case X86::VCOMISDZrr_Int:
7991 case X86::UCOMISDrr_Int:
7992 case X86::VUCOMISDrr_Int:
7993 case X86::VUCOMISDZrr_Int:
7994 case X86::ADDSDrr_Int:
7995 case X86::VADDSDrr_Int:
7996 case X86::VADDSDZrr_Int:
7997 case X86::CMPSDrri_Int:
7998 case X86::VCMPSDrri_Int:
7999 case X86::VCMPSDZrri_Int:
8000 case X86::DIVSDrr_Int:
8001 case X86::VDIVSDrr_Int:
8002 case X86::VDIVSDZrr_Int:
8003 case X86::MAXSDrr_Int:
8004 case X86::VMAXSDrr_Int:
8005 case X86::VMAXSDZrr_Int:
8006 case X86::MINSDrr_Int:
8007 case X86::VMINSDrr_Int:
8008 case X86::VMINSDZrr_Int:
8009 case X86::MULSDrr_Int:
8010 case X86::VMULSDrr_Int:
8011 case X86::VMULSDZrr_Int:
8012 case X86::SQRTSDr_Int:
8013 case X86::VSQRTSDr_Int:
8014 case X86::VSQRTSDZr_Int:
8015 case X86::SUBSDrr_Int:
8016 case X86::VSUBSDrr_Int:
8017 case X86::VSUBSDZrr_Int:
8018 case X86::VADDSDZrrk_Int:
8019 case X86::VADDSDZrrkz_Int:
8020 case X86::VCMPSDZrrik_Int:
8021 case X86::VDIVSDZrrk_Int:
8022 case X86::VDIVSDZrrkz_Int:
8023 case X86::VMAXSDZrrk_Int:
8024 case X86::VMAXSDZrrkz_Int:
8025 case X86::VMINSDZrrk_Int:
8026 case X86::VMINSDZrrkz_Int:
8027 case X86::VMULSDZrrk_Int:
8028 case X86::VMULSDZrrkz_Int:
8029 case X86::VSQRTSDZrk_Int:
8030 case X86::VSQRTSDZrkz_Int:
8031 case X86::VSUBSDZrrk_Int:
8032 case X86::VSUBSDZrrkz_Int:
8033 case X86::VFMADDSD4rr_Int:
8034 case X86::VFNMADDSD4rr_Int:
8035 case X86::VFMSUBSD4rr_Int:
8036 case X86::VFNMSUBSD4rr_Int:
8037 case X86::VFMADD132SDr_Int:
8038 case X86::VFNMADD132SDr_Int:
8039 case X86::VFMADD213SDr_Int:
8040 case X86::VFNMADD213SDr_Int:
8041 case X86::VFMADD231SDr_Int:
8042 case X86::VFNMADD231SDr_Int:
8043 case X86::VFMSUB132SDr_Int:
8044 case X86::VFNMSUB132SDr_Int:
8045 case X86::VFMSUB213SDr_Int:
8046 case X86::VFNMSUB213SDr_Int:
8047 case X86::VFMSUB231SDr_Int:
8048 case X86::VFNMSUB231SDr_Int:
8049 case X86::VFMADD132SDZr_Int:
8050 case X86::VFNMADD132SDZr_Int:
8051 case X86::VFMADD213SDZr_Int:
8052 case X86::VFNMADD213SDZr_Int:
8053 case X86::VFMADD231SDZr_Int:
8054 case X86::VFNMADD231SDZr_Int:
8055 case X86::VFMSUB132SDZr_Int:
8056 case X86::VFNMSUB132SDZr_Int:
8057 case X86::VFMSUB213SDZr_Int:
8058 case X86::VFNMSUB213SDZr_Int:
8059 case X86::VFMSUB231SDZr_Int:
8060 case X86::VFNMSUB231SDZr_Int:
8061 case X86::VFMADD132SDZrk_Int:
8062 case X86::VFNMADD132SDZrk_Int:
8063 case X86::VFMADD213SDZrk_Int:
8064 case X86::VFNMADD213SDZrk_Int:
8065 case X86::VFMADD231SDZrk_Int:
8066 case X86::VFNMADD231SDZrk_Int:
8067 case X86::VFMSUB132SDZrk_Int:
8068 case X86::VFNMSUB132SDZrk_Int:
8069 case X86::VFMSUB213SDZrk_Int:
8070 case X86::VFNMSUB213SDZrk_Int:
8071 case X86::VFMSUB231SDZrk_Int:
8072 case X86::VFNMSUB231SDZrk_Int:
8073 case X86::VFMADD132SDZrkz_Int:
8074 case X86::VFNMADD132SDZrkz_Int:
8075 case X86::VFMADD213SDZrkz_Int:
8076 case X86::VFNMADD213SDZrkz_Int:
8077 case X86::VFMADD231SDZrkz_Int:
8078 case X86::VFNMADD231SDZrkz_Int:
8079 case X86::VFMSUB132SDZrkz_Int:
8080 case X86::VFNMSUB132SDZrkz_Int:
8081 case X86::VFMSUB213SDZrkz_Int:
8082 case X86::VFNMSUB213SDZrkz_Int:
8083 case X86::VFMSUB231SDZrkz_Int:
8084 case X86::VFNMSUB231SDZrkz_Int:
8085 case X86::VFIXUPIMMSDZrri:
8086 case X86::VFIXUPIMMSDZrrik:
8087 case X86::VFIXUPIMMSDZrrikz:
8088 case X86::VFPCLASSSDZri:
8089 case X86::VFPCLASSSDZrik:
8090 case X86::VGETEXPSDZr:
8091 case X86::VGETEXPSDZrk:
8092 case X86::VGETEXPSDZrkz:
8093 case X86::VGETMANTSDZrri:
8094 case X86::VGETMANTSDZrrik:
8095 case X86::VGETMANTSDZrrikz:
8096 case X86::VRANGESDZrri:
8097 case X86::VRANGESDZrrik:
8098 case X86::VRANGESDZrrikz:
8099 case X86::VRCP14SDZrr:
8100 case X86::VRCP14SDZrrk:
8101 case X86::VRCP14SDZrrkz:
8102 case X86::VRCP28SDZr:
8103 case X86::VRCP28SDZrk:
8104 case X86::VRCP28SDZrkz:
8105 case X86::VREDUCESDZrri:
8106 case X86::VREDUCESDZrrik:
8107 case X86::VREDUCESDZrrikz:
8108 case X86::VRNDSCALESDZrri_Int:
8109 case X86::VRNDSCALESDZrrik_Int:
8110 case X86::VRNDSCALESDZrrikz_Int:
8111 case X86::VRSQRT14SDZrr:
8112 case X86::VRSQRT14SDZrrk:
8113 case X86::VRSQRT14SDZrrkz:
8114 case X86::VRSQRT28SDZr:
8115 case X86::VRSQRT28SDZrk:
8116 case X86::VRSQRT28SDZrkz:
8117 case X86::VSCALEFSDZrr:
8118 case X86::VSCALEFSDZrrk:
8119 case X86::VSCALEFSDZrrkz:
8120 return false;
8121 default:
8122 return true;
8123 }
8124 }
8125
8126 if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
8127 // These instructions only load 16 bits, we can't fold them if the
8128 // destination register is wider than 16 bits (2 bytes), and its user
8129 // instruction isn't scalar (SH).
8130 switch (UserOpc) {
8131 case X86::VADDSHZrr_Int:
8132 case X86::VCMPSHZrri_Int:
8133 case X86::VDIVSHZrr_Int:
8134 case X86::VMAXSHZrr_Int:
8135 case X86::VMINSHZrr_Int:
8136 case X86::VMULSHZrr_Int:
8137 case X86::VSUBSHZrr_Int:
8138 case X86::VADDSHZrrk_Int:
8139 case X86::VADDSHZrrkz_Int:
8140 case X86::VCMPSHZrrik_Int:
8141 case X86::VDIVSHZrrk_Int:
8142 case X86::VDIVSHZrrkz_Int:
8143 case X86::VMAXSHZrrk_Int:
8144 case X86::VMAXSHZrrkz_Int:
8145 case X86::VMINSHZrrk_Int:
8146 case X86::VMINSHZrrkz_Int:
8147 case X86::VMULSHZrrk_Int:
8148 case X86::VMULSHZrrkz_Int:
8149 case X86::VSUBSHZrrk_Int:
8150 case X86::VSUBSHZrrkz_Int:
8151 case X86::VFMADD132SHZr_Int:
8152 case X86::VFNMADD132SHZr_Int:
8153 case X86::VFMADD213SHZr_Int:
8154 case X86::VFNMADD213SHZr_Int:
8155 case X86::VFMADD231SHZr_Int:
8156 case X86::VFNMADD231SHZr_Int:
8157 case X86::VFMSUB132SHZr_Int:
8158 case X86::VFNMSUB132SHZr_Int:
8159 case X86::VFMSUB213SHZr_Int:
8160 case X86::VFNMSUB213SHZr_Int:
8161 case X86::VFMSUB231SHZr_Int:
8162 case X86::VFNMSUB231SHZr_Int:
8163 case X86::VFMADD132SHZrk_Int:
8164 case X86::VFNMADD132SHZrk_Int:
8165 case X86::VFMADD213SHZrk_Int:
8166 case X86::VFNMADD213SHZrk_Int:
8167 case X86::VFMADD231SHZrk_Int:
8168 case X86::VFNMADD231SHZrk_Int:
8169 case X86::VFMSUB132SHZrk_Int:
8170 case X86::VFNMSUB132SHZrk_Int:
8171 case X86::VFMSUB213SHZrk_Int:
8172 case X86::VFNMSUB213SHZrk_Int:
8173 case X86::VFMSUB231SHZrk_Int:
8174 case X86::VFNMSUB231SHZrk_Int:
8175 case X86::VFMADD132SHZrkz_Int:
8176 case X86::VFNMADD132SHZrkz_Int:
8177 case X86::VFMADD213SHZrkz_Int:
8178 case X86::VFNMADD213SHZrkz_Int:
8179 case X86::VFMADD231SHZrkz_Int:
8180 case X86::VFNMADD231SHZrkz_Int:
8181 case X86::VFMSUB132SHZrkz_Int:
8182 case X86::VFNMSUB132SHZrkz_Int:
8183 case X86::VFMSUB213SHZrkz_Int:
8184 case X86::VFNMSUB213SHZrkz_Int:
8185 case X86::VFMSUB231SHZrkz_Int:
8186 case X86::VFNMSUB231SHZrkz_Int:
8187 return false;
8188 default:
8189 return true;
8190 }
8191 }
8192
8193 return false;
8194}
8195
// NOTE(review): the opening lines of this definition (return type, function
// name and leading parameters) and a number of interior lines are absent from
// this listing. Judging by the calls at the end of the body this is presumably
// the LoadMI overload of X86InstrInfo::foldMemoryOperandImpl -- confirm
// against the real source.
//
// Tries to fold the load performed by LoadMI into the operand(s) of MI listed
// in Ops. Every rejection path returns nullptr; the accepting paths forward to
// another foldMemoryOperandImpl overload or to foldMemoryBroadcast.
8199 MachineInstr &LoadMI, MachineInstr *&CopyMI,
8200 LiveIntervals *LIS, VirtRegMap *VRM) const {
8202
8203 // If LoadMI is a masked load, check that MI uses the same mask.
8204 const MCInstrDesc &MCID = get(LoadMI.getOpcode());
8205 unsigned NumOps = MCID.getNumOperands();
8206 if (NumOps >= 3) {
8207 Register MaskReg;
8208 const MachineOperand &Op1 = LoadMI.getOperand(1);
8209 const MachineOperand &Op2 = LoadMI.getOperand(2);
8210
// True for the VK*WM register classes (2 to 64 bits wide).
8211 auto IsVKWMClass = [](const TargetRegisterClass *RC) {
8212 return RC == &X86::VK2WMRegClass || RC == &X86::VK4WMRegClass ||
8213 RC == &X86::VK8WMRegClass || RC == &X86::VK16WMRegClass ||
8214 RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass;
8215 };
8216
// The mask, if present, is operand 1 or operand 2 of LoadMI; it is recognised
// by the register class the instruction description assigns to that operand.
8217 if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1)))
8218 MaskReg = Op1.getReg();
8219 else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2)))
8220 MaskReg = Op2.getReg();
8221
8222 if (MaskReg) {
8223 // Some instructions are invalid to fold into even with the same mask.
8224 // Folding is unsafe if an active destination element may read from a
8225 // source element that is masked off.
8226 if (isNonFoldableWithSameMask(MI.getOpcode()))
8227 return nullptr;
// Scan MI's declared operands (index 0 is not examined) for the same mask
// register; without it the fold is rejected.
8228 bool HasSameMask = false;
8229 for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) {
8230 const MachineOperand &Op = MI.getOperand(I);
8231 if (Op.isReg() && Op.getReg() == MaskReg) {
8232 HasSameMask = true;
8233 break;
8234 }
8235 }
8236 if (!HasSameMask)
8237 return nullptr;
8238 }
8239 }
8240
8241 // TODO: Support the case where LoadMI loads a wide register, but MI
8242 // only uses a subreg.
8243 for (auto Op : Ops) {
8244 if (MI.getOperand(Op).getSubReg())
8245 return nullptr;
8246 }
8247
8248 // If loading from a FrameIndex, fold directly from the FrameIndex.
8249 int FrameIndex;
8250 if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
8251 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8252 return nullptr;
8253 return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex, CopyMI, LIS, VRM);
8254 }
8255
8256 // Check switch flag
8257 if (NoFusing)
8258 return nullptr;
8259
8260 // Avoid partial and undef register update stalls unless optimizing for size.
// NOTE(review): the second half of this condition is missing from the listing.
8261 if (!MF.getFunction().hasOptSize() &&
8262 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
8264 return nullptr;
8265
8266 // Do not fold a NDD instruction and a memory instruction with relocation to
8267 // avoid emitting an APX relocation when the flag is disabled for backward
8268 // compatibility.
// NOTE(review): the first half of this condition is missing from the listing.
8269 uint64_t TSFlags = MI.getDesc().TSFlags;
8271 X86II::hasNewDataDest(TSFlags))
8272 return nullptr;
8273
8274 // Determine the alignment of the load.
8275 Align Alignment;
8276 unsigned LoadOpc = LoadMI.getOpcode();
8277 if (LoadMI.hasOneMemOperand())
8278 Alignment = (*LoadMI.memoperands_begin())->getAlign();
8279 else
// Without exactly one memory operand the alignment is picked from the opcode;
// any opcode not listed below makes the fold fail.
8280 switch (LoadOpc) {
8281 case X86::AVX512_512_SET0:
8282 case X86::AVX512_512_SETALLONES:
8283 Alignment = Align(64);
8284 break;
8285 case X86::AVX2_SETALLONES:
8286 case X86::AVX1_SETALLONES:
8287 case X86::AVX_SET0:
8288 case X86::AVX512_256_SET0:
8289 case X86::AVX512_256_SETALLONES:
8290 Alignment = Align(32);
8291 break;
8292 case X86::V_SET0:
8293 case X86::V_SETALLONES:
8294 case X86::AVX512_128_SET0:
8295 case X86::FsFLD0F128:
8296 case X86::AVX512_FsFLD0F128:
8297 case X86::AVX512_128_SETALLONES:
8298 Alignment = Align(16);
8299 break;
8300 case X86::MMX_SET0:
8301 case X86::FsFLD0SD:
8302 case X86::AVX512_FsFLD0SD:
8303 Alignment = Align(8);
8304 break;
8305 case X86::FsFLD0SS:
8306 case X86::AVX512_FsFLD0SS:
8307 Alignment = Align(4);
8308 break;
8309 case X86::FsFLD0SH:
8310 case X86::AVX512_FsFLD0SH:
8311 Alignment = Align(2);
8312 break;
8313 default:
8314 return nullptr;
8315 }
// Folding into both operand 0 and operand 1 is only accepted for TESTrr, which
// is first rewritten in place as CMPri r, 0. Apart from that case exactly one
// operand index must be supplied.
8316 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
8317 unsigned NewOpc = 0;
8318 switch (MI.getOpcode()) {
8319 default:
8320 return nullptr;
8321 case X86::TEST8rr:
8322 NewOpc = X86::CMP8ri;
8323 break;
8324 case X86::TEST16rr:
8325 NewOpc = X86::CMP16ri;
8326 break;
8327 case X86::TEST32rr:
8328 NewOpc = X86::CMP32ri;
8329 break;
8330 case X86::TEST64rr:
8331 NewOpc = X86::CMP64ri32;
8332 break;
8333 }
8334 // Change to CMPXXri r, 0 first.
8335 MI.setDesc(get(NewOpc));
8336 MI.getOperand(1).ChangeToImmediate(0);
8337 } else if (Ops.size() != 1)
8338 return nullptr;
8339
8340 // Make sure the subregisters match.
8341 // Otherwise we risk changing the size of the load.
8342 if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
8343 return nullptr;
8344
// NOTE(review): the declaration of MOs (the address operands collected below)
// is on a line missing from this listing.
8346 switch (LoadOpc) {
8347 case X86::MMX_SET0:
8348 case X86::V_SET0:
8349 case X86::V_SETALLONES:
8350 case X86::AVX2_SETALLONES:
8351 case X86::AVX1_SETALLONES:
8352 case X86::AVX_SET0:
8353 case X86::AVX512_128_SET0:
8354 case X86::AVX512_256_SET0:
8355 case X86::AVX512_512_SET0:
8356 case X86::AVX512_128_SETALLONES:
8357 case X86::AVX512_256_SETALLONES:
8358 case X86::AVX512_512_SETALLONES:
8359 case X86::FsFLD0SH:
8360 case X86::AVX512_FsFLD0SH:
8361 case X86::FsFLD0SD:
8362 case X86::AVX512_FsFLD0SD:
8363 case X86::FsFLD0SS:
8364 case X86::AVX512_FsFLD0SS:
8365 case X86::FsFLD0F128:
8366 case X86::AVX512_FsFLD0F128: {
8367 // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
8368 // Create a constant-pool entry and operands to load from it.
8369
8370 // Large code model can't fold loads this way.
// NOTE(review): the code-model test guarding this return is missing here.
8372 return nullptr;
8373
8374 // x86-32 PIC requires a PIC base register for constant pools.
8375 unsigned PICBase = 0;
8376 // Since we're using Small or Kernel code model, we can always use
8377 // RIP-relative addressing for a smaller encoding.
8378 if (Subtarget.is64Bit()) {
8379 PICBase = X86::RIP;
8380 } else if (MF.getTarget().isPositionIndependent()) {
8381 // FIXME: PICBase = getGlobalBaseReg(&MF);
8382 // This doesn't work for several reasons.
8383 // 1. GlobalBaseReg may have been spilled.
8384 // 2. It may not be live at MI.
8385 return nullptr;
8386 }
8387
8388 // Create a constant-pool entry.
// NOTE(review): the assignments to Ty in the cases below (and the MCP
// declaration) are on lines missing from this listing; only IsAllOnes and the
// trailing element counts are visible.
8390 Type *Ty;
8391 bool IsAllOnes = false;
8392 switch (LoadOpc) {
8393 case X86::FsFLD0SS:
8394 case X86::AVX512_FsFLD0SS:
8396 break;
8397 case X86::FsFLD0SD:
8398 case X86::AVX512_FsFLD0SD:
8400 break;
8401 case X86::FsFLD0F128:
8402 case X86::AVX512_FsFLD0F128:
8404 break;
8405 case X86::FsFLD0SH:
8406 case X86::AVX512_FsFLD0SH:
8408 break;
8409 case X86::AVX512_512_SETALLONES:
8410 IsAllOnes = true;
8411 [[fallthrough]];
8412 case X86::AVX512_512_SET0:
8414 16);
8415 break;
8416 case X86::AVX1_SETALLONES:
8417 case X86::AVX2_SETALLONES:
8418 case X86::AVX512_256_SETALLONES:
8419 IsAllOnes = true;
8420 [[fallthrough]];
8421 case X86::AVX512_256_SET0:
8422 case X86::AVX_SET0:
8424 8);
8425
8426 break;
8427 case X86::MMX_SET0:
8429 2);
8430 break;
8431 case X86::V_SETALLONES:
8432 case X86::AVX512_128_SETALLONES:
8433 IsAllOnes = true;
8434 [[fallthrough]];
8435 case X86::V_SET0:
8436 case X86::AVX512_128_SET0:
8438 4);
8439 break;
8440 }
8441
// NOTE(review): the initializer of C is missing; presumably an all-ones or
// zero constant of type Ty selected by IsAllOnes -- confirm.
8442 const Constant *C =
8444 unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
8445
8446 // Create operands to load from the constant pool entry.
// NOTE(review): two of the pushed operands are on lines missing from this
// listing; only the three register operands are visible.
8447 MOs.push_back(MachineOperand::CreateReg(PICBase, false));
8449 MOs.push_back(MachineOperand::CreateReg(0, false));
8451 MOs.push_back(MachineOperand::CreateReg(0, false));
8452 break;
8453 }
8454 case X86::VPBROADCASTBZ128rm:
8455 case X86::VPBROADCASTBZ256rm:
8456 case X86::VPBROADCASTBZrm:
8457 case X86::VBROADCASTF32X2Z256rm:
8458 case X86::VBROADCASTF32X2Zrm:
8459 case X86::VBROADCASTI32X2Z128rm:
8460 case X86::VBROADCASTI32X2Z256rm:
8461 case X86::VBROADCASTI32X2Zrm:
8462 // No instructions currently fuse with 8bits or 32bits x 2.
8463 return nullptr;
8464
// FOLD_BROADCAST copies the last X86::AddrNumOperands operands of LoadMI into
// MOs and returns the result of foldMemoryBroadcast with SIZE as the bit size.
// Because the macro returns, the case groups below never fall through.
8465#define FOLD_BROADCAST(SIZE) \
8466 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
8467 LoadMI.operands_begin() + NumOps); \
8468 return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
8469 /*AllowCommute=*/true);
8470 case X86::VPBROADCASTWZ128rm:
8471 case X86::VPBROADCASTWZ256rm:
8472 case X86::VPBROADCASTWZrm:
8473 FOLD_BROADCAST(16);
8474 case X86::VPBROADCASTDZ128rm:
8475 case X86::VPBROADCASTDZ256rm:
8476 case X86::VPBROADCASTDZrm:
8477 case X86::VBROADCASTSSZ128rm:
8478 case X86::VBROADCASTSSZ256rm:
8479 case X86::VBROADCASTSSZrm:
8480 FOLD_BROADCAST(32);
8481 case X86::VPBROADCASTQZ128rm:
8482 case X86::VPBROADCASTQZ256rm:
8483 case X86::VPBROADCASTQZrm:
8484 case X86::VBROADCASTSDZ256rm:
8485 case X86::VBROADCASTSDZrm:
8486 FOLD_BROADCAST(64);
8487 default: {
8488 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8489 return nullptr;
8490
8491 // Folding a normal load. Just copy the load's address operands.
// NOTE(review): the first line of the copy statement is missing here.
8493 LoadMI.operands_begin() + NumOps);
8494 break;
8495 }
8496 }
// Constant-pool and plain-load paths end here: fold the collected address
// operands into operand Ops[0] of MI, allowing a commute.
8497 return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
8498 /*Size=*/0, Alignment, /*AllowCommute=*/true,
8499 CopyMI);
8500}
8501
// Folds a broadcast load, whose address operands are MOs, into operand OpNum
// of MI.
//
// If the broadcast fold table has an entry for (opcode, OpNum), the result is
// the fused instruction when matchBroadcastSize accepts BitsSize for that
// entry, and nullptr otherwise. With no entry and AllowCommute set, MI is
// commuted and the fold retried once with commuting disabled; a failed retry
// undoes the commute. All remaining paths report the failure and return
// nullptr.
//
// NOTE(review): the return-type line and the parameter line declaring
// InsertPt are missing from this listing.
8503X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
8504 unsigned OpNum, ArrayRef<MachineOperand> MOs,
8506 unsigned BitsSize, bool AllowCommute) const {
8507
// A table hit is final: no commute is attempted even if the size mismatches.
8508 if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
8509 return matchBroadcastSize(*I, BitsSize)
8510 ? fuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
8511 : nullptr;
8512
8513 if (AllowCommute) {
8514 // If the instruction and target operand are commutable, commute the
8515 // instruction and try again.
8516 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
// The helper handing back OpNum itself is treated as "nothing was commuted".
8517 if (CommuteOpIdx2 == OpNum) {
8518 printFailMsgforFold(MI, OpNum);
8519 return nullptr;
8520 }
// Retry on the operand the target was swapped with; AllowCommute=false keeps
// the recursion to a single level.
8521 MachineInstr *NewMI =
8522 foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize,
8523 /*AllowCommute=*/false);
8524 if (NewMI)
8525 return NewMI;
8526 // Folding failed again - undo the commute before returning.
8527 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
8528 }
8529
8530 printFailMsgforFold(MI, OpNum);
8531 return nullptr;
8532}
8533
8537
// NOTE(review): the signature and the declaration of LoadMMOs are missing from
// this listing; going by the call sites this is presumably the body of
// extractLoadMMOs(MMOs, MF) -- confirm.
//
// Collects the load part of every memory operand in MMOs: store-only operands
// are dropped, load-only operands are reused as they are, and load+store
// operands are cloned with the store flag cleared.
8538 for (MachineMemOperand *MMO : MMOs) {
8539 if (!MMO->isLoad())
8540 continue;
8541
8542 if (!MMO->isStore()) {
8543 // Reuse the MMO.
8544 LoadMMOs.push_back(MMO);
8545 } else {
8546 // Clone the MMO and unset the store flag.
8547 LoadMMOs.push_back(MF.getMachineMemOperand(
8548 MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
8549 }
8550 }
8551
8552 return LoadMMOs;
8553}
8554
8558
// NOTE(review): the signature and the declaration of StoreMMOs are missing
// from this listing; going by the call sites this is presumably the body of
// extractStoreMMOs(MMOs, MF) -- confirm.
//
// Collects the store part of every memory operand in MMOs: load-only operands
// are dropped, store-only operands are reused as they are, and load+store
// operands are cloned with the load flag cleared.
8559 for (MachineMemOperand *MMO : MMOs) {
8560 if (!MMO->isStore())
8561 continue;
8562
8563 if (!MMO->isLoad()) {
8564 // Reuse the MMO.
8565 StoreMMOs.push_back(MMO);
8566 } else {
8567 // Clone the MMO and unset the load flag.
8568 StoreMMOs.push_back(MF.getMachineMemOperand(
8569 MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
8570 }
8571 }
8572
8573 return StoreMMOs;
8574}
8575
// NOTE(review): the first signature line (return type, name, and the fold
// table entry parameter I) is missing from this listing; the call sites name
// this function getBroadcastOpcode(I, RC, Subtarget).
//
// Picks the broadcast-load opcode for fold-table entry I: the broadcast kind
// comes from I->Flags & TB_BCAST_MASK and the 128/256/512-bit variant from the
// spill size of RC (16, 32 or 64). Requires AVX512, and AVX512VL unless the
// spill size is 64.
8577 const TargetRegisterClass *RC,
8578 const X86Subtarget &STI) {
8579 assert(STI.hasAVX512() && "Expected at least AVX512!");
8580 unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
8581 assert((SpillSize == 64 || STI.hasVLX()) &&
8582 "Can't broadcast less than 64 bytes without AVX512VL!");
8583
// Expands to one `case TYPE:` that returns OP16/OP32/OP64 according to
// SpillSize; any other spill size is llvm_unreachable.
8584#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64) \
8585 case TYPE: \
8586 switch (SpillSize) { \
8587 default: \
8588 llvm_unreachable("Unknown spill size"); \
8589 case 16: \
8590 return X86::OP16; \
8591 case 32: \
8592 return X86::OP32; \
8593 case 64: \
8594 return X86::OP64; \
8595 } \
8596 break;
8597
// TB_BCAST_SH maps to the same VPBROADCASTW opcodes as TB_BCAST_W, and the
// 16-byte TB_BCAST_SD case maps to VMOVDDUPZ128rm.
8598 switch (I->Flags & TB_BCAST_MASK) {
8599 default:
8600 llvm_unreachable("Unexpected broadcast type!");
8601 CASE_BCAST_TYPE_OPC(TB_BCAST_W, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8602 VPBROADCASTWZrm)
8603 CASE_BCAST_TYPE_OPC(TB_BCAST_D, VPBROADCASTDZ128rm, VPBROADCASTDZ256rm,
8604 VPBROADCASTDZrm)
8605 CASE_BCAST_TYPE_OPC(TB_BCAST_Q, VPBROADCASTQZ128rm, VPBROADCASTQZ256rm,
8606 VPBROADCASTQZrm)
8607 CASE_BCAST_TYPE_OPC(TB_BCAST_SH, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8608 VPBROADCASTWZrm)
8609 CASE_BCAST_TYPE_OPC(TB_BCAST_SS, VBROADCASTSSZ128rm, VBROADCASTSSZ256rm,
8610 VBROADCASTSSZrm)
8611 CASE_BCAST_TYPE_OPC(TB_BCAST_SD, VMOVDDUPZ128rm, VBROADCASTSDZ256rm,
8612 VBROADCASTSDZrm)
8613 }
8614}
8615
// NOTE(review): the first line of this definition (return type and name) is
// missing from this listing, as are the declarations of TRI, AddrOps,
// BeforeOps, AfterOps and ImpOps; presumably this is the MachineInstr form of
// X86InstrInfo::unfoldMemoryOperand -- confirm.
//
// Splits the memory-folded instruction MI into separate instructions appended
// to NewMIs, in this order: a load (or broadcast) into Reg when UnfoldLoad, the
// register-form data instruction, and a store from Reg when UnfoldStore.
// Returns false, with NewMIs untouched, when MI has no unfold-table entry, when
// a requested unfold is not one the entry folded, or when the slow-unaligned
// check below triggers. Returns true otherwise.
8617 MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad,
8618 bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
8619 const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
8620 if (I == nullptr)
8621 return false;
// Opc is the register-form opcode; Index is the position of the folded memory
// operand as recorded in the table flags.
8622 unsigned Opc = I->DstOp;
8623 unsigned Index = I->Flags & TB_INDEX_MASK;
8624 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8625 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8626 if (UnfoldLoad && !FoldedLoad)
8627 return false;
8628 UnfoldLoad &= FoldedLoad;
8629 if (UnfoldStore && !FoldedStore)
8630 return false;
8631 UnfoldStore &= FoldedStore;
8632
8633 const MCInstrDesc &MCID = get(Opc);
8634
8635 const TargetRegisterClass *RC = getRegClass(MCID, Index);
8637 // TODO: Check if 32-byte or greater accesses are slow too?
8638 if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
8639 Subtarget.isUnalignedMem16Slow())
8640 // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
8641 // conservatively assume the address is unaligned. That's bad for
8642 // performance.
8643 return false;
// Partition MI's operands: the X86::AddrNumOperands operands starting at Index
// form the address, implicit register operands are kept aside, and the rest
// are split by whether they come before or after Index.
8648 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
8649 MachineOperand &Op = MI.getOperand(i);
8650 if (i >= Index && i < Index + X86::AddrNumOperands)
8651 AddrOps.push_back(Op);
8652 else if (Op.isReg() && Op.isImplicit())
8653 ImpOps.push_back(Op);
8654 else if (i < Index)
8655 BeforeOps.push_back(Op);
8656 else if (i > Index)
8657 AfterOps.push_back(Op);
8658 }
8659
8660 // Emit the load or broadcast instruction.
8661 if (UnfoldLoad) {
8662 auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
8663
// This Opc shadows the outer one and holds the load/broadcast opcode only.
8664 unsigned Opc;
8665 if (I->Flags & TB_BCAST_MASK) {
8666 Opc = getBroadcastOpcode(I, RC, Subtarget);
8667 } else {
// The load counts as aligned only if its first memory operand is aligned to at
// least max(spill size of RC, 16).
8668 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8669 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8670 Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
8671 }
8672
8673 DebugLoc DL;
8674 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
8675 for (const MachineOperand &AddrOp : AddrOps)
8676 MIB.add(AddrOp);
8677 MIB.setMemRefs(MMOs);
8678 NewMIs.push_back(MIB);
8679
8680 if (UnfoldStore) {
8681 // Address operands cannot be marked isKill.
// The same address operands are added again to the store emitted further down.
8682 for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
8683 MachineOperand &MO = NewMIs[0]->getOperand(i);
8684 if (MO.isReg())
8685 MO.setIsKill(false);
8686 }
8687 }
8688 }
8689
8690 // Emit the data processing instruction.
8691 MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
8692 MachineInstrBuilder MIB(MF, DataMI);
8693
// Operand layout is driven by the table flags (FoldedStore/FoldedLoad), not by
// the UnfoldStore/UnfoldLoad requests: Reg is added first as a def when a store
// was folded, and as a use in place of the memory operand when a load was.
8694 if (FoldedStore)
8695 MIB.addReg(Reg, RegState::Define);
8696 for (MachineOperand &BeforeOp : BeforeOps)
8697 MIB.add(BeforeOp);
8698 if (FoldedLoad)
8699 MIB.addReg(Reg);
8700 for (MachineOperand &AfterOp : AfterOps)
8701 MIB.add(AfterOp);
// Re-add the implicit register operands with their def/kill/dead/undef state.
// NOTE(review): one term of the flag expression is missing from this listing.
8702 for (MachineOperand &ImpOp : ImpOps) {
8703 MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
8705 getKillRegState(ImpOp.isKill()) |
8706 getDeadRegState(ImpOp.isDead()) |
8707 getUndefRegState(ImpOp.isUndef()));
8708 }
8709 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8710 switch (DataMI->getOpcode()) {
8711 default:
8712 break;
8713 case X86::CMP64ri32:
8714 case X86::CMP32ri:
8715 case X86::CMP16ri:
8716 case X86::CMP8ri: {
8717 MachineOperand &MO0 = DataMI->getOperand(0);
8718 MachineOperand &MO1 = DataMI->getOperand(1);
// Only a compare against the immediate 0 is rewritten; operand 1 then becomes
// the same register as operand 0.
8719 if (MO1.isImm() && MO1.getImm() == 0) {
8720 unsigned NewOpc;
8721 switch (DataMI->getOpcode()) {
8722 default:
8723 llvm_unreachable("Unreachable!");
8724 case X86::CMP64ri32:
8725 NewOpc = X86::TEST64rr;
8726 break;
8727 case X86::CMP32ri:
8728 NewOpc = X86::TEST32rr;
8729 break;
8730 case X86::CMP16ri:
8731 NewOpc = X86::TEST16rr;
8732 break;
8733 case X86::CMP8ri:
8734 NewOpc = X86::TEST8rr;
8735 break;
8736 }
8737 DataMI->setDesc(get(NewOpc));
8738 MO1.ChangeToRegister(MO0.getReg(), false);
8739 }
8740 }
8741 }
8742 NewMIs.push_back(DataMI);
8743
8744 // Emit the store instruction.
8745 if (UnfoldStore) {
// The store opcode and its alignment threshold both come from the register
// class of operand 0 of the register-form instruction.
8746 const TargetRegisterClass *DstRC = getRegClass(MCID, 0);
8747 auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
8748 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
8749 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8750 unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
8751 DebugLoc DL;
8752 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
8753 for (const MachineOperand &AddrOp : AddrOps)
8754 MIB.add(AddrOp);
8755 MIB.addReg(Reg, RegState::Kill);
8756 MIB.setMemRefs(MMOs);
8757 NewMIs.push_back(MIB);
8758 }
8759
8760 return true;
8761}
8762
// NOTE(review): the first line of this definition (return type and name) and
// the declarations of MF and TRI are missing from this listing; presumably
// this is the SelectionDAG form of X86InstrInfo::unfoldMemoryOperand --
// confirm.
//
// Splits the memory-folded machine node N into new nodes appended to NewNodes:
// a load (or broadcast) when the table entry folded a load, the register-form
// data node, and a store when the entry folded a store.
// Returns false when N is not a machine node, has no unfold-table entry, or a
// slow-unaligned check rejects the load or the store. On the store rejection
// NewNodes already holds the data node (and the load node, if any), because
// both are pushed before that check.
8764 SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
8765 if (!N->isMachineOpcode())
8766 return false;
8767
8768 const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
8769 if (I == nullptr)
8770 return false;
8771 unsigned Opc = I->DstOp;
8772 unsigned Index = I->Flags & TB_INDEX_MASK;
8773 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8774 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8775 const MCInstrDesc &MCID = get(Opc);
8778 const TargetRegisterClass *RC = getRegClass(MCID, Index);
8779 unsigned NumDefs = MCID.NumDefs;
8780 std::vector<SDValue> AddrOps;
8781 std::vector<SDValue> BeforeOps;
8782 std::vector<SDValue> AfterOps;
8783 SDLoc dl(N);
8784 unsigned NumOps = N->getNumOperands();
// Partition every operand except the last, which is handled as the chain
// below. Positions are compared against Index - NumDefs; presumably this is
// because node operands do not include the instruction's defs -- confirm.
8785 for (unsigned i = 0; i != NumOps - 1; ++i) {
8786 SDValue Op = N->getOperand(i);
8787 if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
8788 AddrOps.push_back(Op);
8789 else if (i < Index - NumDefs)
8790 BeforeOps.push_back(Op);
8791 else if (i > Index - NumDefs)
8792 AfterOps.push_back(Op);
8793 }
8794 SDValue Chain = N->getOperand(NumOps - 1);
8795 AddrOps.push_back(Chain);
8796
8797 // Emit the load instruction.
8798 SDNode *Load = nullptr;
8799 if (FoldedLoad) {
8800 EVT VT = *TRI.legalclasstypes_begin(*RC);
8801 auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8802 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8803 Subtarget.isUnalignedMem16Slow())
8804 // Do not introduce a slow unaligned load.
8805 return false;
8806 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8807 // memory access is slow above.
8808
// This Opc shadows the outer one and holds the load/broadcast opcode only.
8809 unsigned Opc;
8810 if (I->Flags & TB_BCAST_MASK) {
8811 Opc = getBroadcastOpcode(I, RC, Subtarget);
8812 } else {
8813 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8814 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8815 Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
8816 }
8817
// The load node takes the address operands plus the chain, and produces VT and
// MVT::Other.
8818 Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
8819 NewNodes.push_back(Load);
8820
8821 // Preserve memory reference information.
8822 DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
8823 }
8824
8825 // Emit the data processing instruction.
// Result types: the first legal type of operand 0's register class when the
// register form has defs, followed by every non-chain result type of N whose
// index is at or beyond the number of defs.
8826 std::vector<EVT> VTs;
8827 const TargetRegisterClass *DstRC = nullptr;
8828 if (MCID.getNumDefs() > 0) {
8829 DstRC = getRegClass(MCID, 0);
8830 VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
8831 }
8832 for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
8833 EVT VT = N->getValueType(i);
8834 if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
8835 VTs.push_back(VT);
8836 }
// From here on BeforeOps is the full operand list of the data node: the
// operands before the address, the loaded value (if any), then the operands
// after the address.
8837 if (Load)
8838 BeforeOps.push_back(SDValue(Load, 0));
8839 llvm::append_range(BeforeOps, AfterOps);
8840 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8841 switch (Opc) {
8842 default:
8843 break;
8844 case X86::CMP64ri32:
8845 case X86::CMP32ri:
8846 case X86::CMP16ri:
8847 case X86::CMP8ri:
8848 if (isNullConstant(BeforeOps[1])) {
8849 switch (Opc) {
8850 default:
8851 llvm_unreachable("Unreachable!");
8852 case X86::CMP64ri32:
8853 Opc = X86::TEST64rr;
8854 break;
8855 case X86::CMP32ri:
8856 Opc = X86::TEST32rr;
8857 break;
8858 case X86::CMP16ri:
8859 Opc = X86::TEST16rr;
8860 break;
8861 case X86::CMP8ri:
8862 Opc = X86::TEST8rr;
8863 break;
8864 }
8865 BeforeOps[1] = BeforeOps[0];
8866 }
8867 }
8868 SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
8869 NewNodes.push_back(NewNode);
8870
8871 // Emit the store instruction.
8872 if (FoldedStore) {
// Rebuild the store operands as: address operands, stored value (result 0 of
// the data node), chain.
8873 AddrOps.pop_back();
8874 AddrOps.push_back(SDValue(NewNode, 0));
8875 AddrOps.push_back(Chain);
8876 auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8877 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8878 Subtarget.isUnalignedMem16Slow())
8879 // Do not introduce a slow unaligned store.
8880 return false;
8881 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8882 // memory access is slow above.
// NOTE(review): the alignment threshold uses RC while the store opcode is
// chosen from DstRC; the MachineInstr variant of this routine uses DstRC for
// both -- confirm this difference is intended.
8883 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8884 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8885 SDNode *Store =
8886 DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
8887 dl, MVT::Other, AddrOps);
8888 NewNodes.push_back(Store);
8889
8890 // Preserve memory reference information.
8891 DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
8892 }
8893
8894 return true;
8895}
8896
// NOTE(review): the line carrying the function name and leading parameters,
// and the line that initialises I, are missing from this listing; presumably
// this is X86InstrInfo::getOpcodeAfterMemoryUnfold and I is the result of an
// unfold-table lookup -- confirm.
//
// Returns the opcode recorded in table entry I (I->DstOp), or 0 when there is
// no entry or when a requested load/store unfold is not one the entry folded.
// On success, a non-null LoadRegIndex receives the operand index stored in the
// entry's flags.
8897unsigned
8899 bool UnfoldStore,
8900 unsigned *LoadRegIndex) const {
8902 if (I == nullptr)
8903 return 0;
8904 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8905 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8906 if (UnfoldLoad && !FoldedLoad)
8907 return 0;
8908 if (UnfoldStore && !FoldedStore)
8909 return 0;
8910 if (LoadRegIndex)
8911 *LoadRegIndex = I->Flags & TB_INDEX_MASK;
8912 return I->DstOp;
8913}
8914
// NOTE(review): the first signature line (return type, name, and the Load1 /
// Load2 parameters) and the two lines defining Disp1 / Disp2 are missing from
// this listing; presumably this is X86InstrInfo::areLoadsFromSameBasePtr --
// confirm.
//
// Returns true when Load1 and Load2 are both machine nodes with one of the
// load opcodes listed below, agree on base register, scale, index register,
// segment register and chain, and both have usable displacements. In that case
// the sign-extended displacements are written to Offset1 and Offset2; on every
// false return the two outputs are left untouched.
8916 int64_t &Offset1,
8917 int64_t &Offset2) const {
8918 if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
8919 return false;
8920
// Accepts only the load opcodes listed in this switch.
8921 auto IsLoadOpcode = [&](unsigned Opcode) {
8922 switch (Opcode) {
8923 default:
8924 return false;
8925 case X86::MOV8rm:
8926 case X86::MOV16rm:
8927 case X86::MOV32rm:
8928 case X86::MOV64rm:
8929 case X86::LD_Fp32m:
8930 case X86::LD_Fp64m:
8931 case X86::LD_Fp80m:
8932 case X86::MOVSSrm:
8933 case X86::MOVSSrm_alt:
8934 case X86::MOVSDrm:
8935 case X86::MOVSDrm_alt:
8936 case X86::MMX_MOVD64rm:
8937 case X86::MMX_MOVQ64rm:
8938 case X86::MOVAPSrm:
8939 case X86::MOVUPSrm:
8940 case X86::MOVAPDrm:
8941 case X86::MOVUPDrm:
8942 case X86::MOVDQArm:
8943 case X86::MOVDQUrm:
8944 // AVX load instructions
8945 case X86::VMOVSSrm:
8946 case X86::VMOVSSrm_alt:
8947 case X86::VMOVSDrm:
8948 case X86::VMOVSDrm_alt:
8949 case X86::VMOVAPSrm:
8950 case X86::VMOVUPSrm:
8951 case X86::VMOVAPDrm:
8952 case X86::VMOVUPDrm:
8953 case X86::VMOVDQArm:
8954 case X86::VMOVDQUrm:
8955 case X86::VMOVAPSYrm:
8956 case X86::VMOVUPSYrm:
8957 case X86::VMOVAPDYrm:
8958 case X86::VMOVUPDYrm:
8959 case X86::VMOVDQAYrm:
8960 case X86::VMOVDQUYrm:
8961 // AVX512 load instructions
8962 case X86::VMOVSSZrm:
8963 case X86::VMOVSSZrm_alt:
8964 case X86::VMOVSDZrm:
8965 case X86::VMOVSDZrm_alt:
8966 case X86::VMOVAPSZ128rm:
8967 case X86::VMOVUPSZ128rm:
8968 case X86::VMOVAPSZ128rm_NOVLX:
8969 case X86::VMOVUPSZ128rm_NOVLX:
8970 case X86::VMOVAPDZ128rm:
8971 case X86::VMOVUPDZ128rm:
8972 case X86::VMOVDQU8Z128rm:
8973 case X86::VMOVDQU16Z128rm:
8974 case X86::VMOVDQA32Z128rm:
8975 case X86::VMOVDQU32Z128rm:
8976 case X86::VMOVDQA64Z128rm:
8977 case X86::VMOVDQU64Z128rm:
8978 case X86::VMOVAPSZ256rm:
8979 case X86::VMOVUPSZ256rm:
8980 case X86::VMOVAPSZ256rm_NOVLX:
8981 case X86::VMOVUPSZ256rm_NOVLX:
8982 case X86::VMOVAPDZ256rm:
8983 case X86::VMOVUPDZ256rm:
8984 case X86::VMOVDQU8Z256rm:
8985 case X86::VMOVDQU16Z256rm:
8986 case X86::VMOVDQA32Z256rm:
8987 case X86::VMOVDQU32Z256rm:
8988 case X86::VMOVDQA64Z256rm:
8989 case X86::VMOVDQU64Z256rm:
8990 case X86::VMOVAPSZrm:
8991 case X86::VMOVUPSZrm:
8992 case X86::VMOVAPDZrm:
8993 case X86::VMOVUPDZrm:
8994 case X86::VMOVDQU8Zrm:
8995 case X86::VMOVDQU16Zrm:
8996 case X86::VMOVDQA32Zrm:
8997 case X86::VMOVDQU32Zrm:
8998 case X86::VMOVDQA64Zrm:
8999 case X86::VMOVDQU64Zrm:
9000 case X86::KMOVBkm:
9001 case X86::KMOVBkm_EVEX:
9002 case X86::KMOVWkm:
9003 case X86::KMOVWkm_EVEX:
9004 case X86::KMOVDkm:
9005 case X86::KMOVDkm_EVEX:
9006 case X86::KMOVQkm:
9007 case X86::KMOVQkm_EVEX:
9008 return true;
9009 }
9010 };
9011
9012 if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
9013 !IsLoadOpcode(Load2->getMachineOpcode()))
9014 return false;
9015
9016 // Lambda to check if both the loads have the same value for an operand index.
9017 auto HasSameOp = [&](int I) {
9018 return Load1->getOperand(I) == Load2->getOperand(I);
9019 };
9020
9021 // All operands except the displacement should match.
9022 if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
9023 !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
9024 return false;
9025
9026 // Chain Operand must be the same.
// The chain is taken to be operand 5 of both nodes.
9027 if (!HasSameOp(5))
9028 return false;
9029
9030 // Now let's examine if the displacements are constants.
// NOTE(review): Disp1 / Disp2 are defined on lines missing from this listing;
// presumably they are the displacement operands cast to constant nodes, null
// when the displacement is not a constant -- confirm.
9033 if (!Disp1 || !Disp2)
9034 return false;
9035
9036 Offset1 = Disp1->getSExtValue();
9037 Offset2 = Disp2->getSExtValue();
9038 return true;
9039}
9040
// NOTE(review): the first signature line (return type, name, and the Load1 /
// Load2 parameters) is missing from this listing; presumably this is
// X86InstrInfo::shouldScheduleLoadsNear -- confirm.
//
// Decides from the two offsets, the opcodes, the value type and NumLoads
// whether to answer true. Requires Offset2 > Offset1 (asserted). Answers false
// when:
//   - (Offset2 - Offset1) / 8 exceeds 64,
//   - the two loads have different machine opcodes,
//   - the opcode is one of the LD_Fp* or MMX loads listed below,
//   - the value type is i8/i16/i32/i64/f32/f64 and NumLoads is non-zero,
//   - the value type is anything else and NumLoads >= 3 on a 64-bit subtarget,
//     or NumLoads is non-zero on a non-64-bit one.
9042 int64_t Offset1, int64_t Offset2,
9043 unsigned NumLoads) const {
9044 assert(Offset2 > Offset1);
9045 if ((Offset2 - Offset1) / 8 > 64)
9046 return false;
9047
9048 unsigned Opc1 = Load1->getMachineOpcode();
9049 unsigned Opc2 = Load2->getMachineOpcode();
9050 if (Opc1 != Opc2)
9051 return false; // FIXME: overly conservative?
9052
// These opcodes always yield false.
9053 switch (Opc1) {
9054 default:
9055 break;
9056 case X86::LD_Fp32m:
9057 case X86::LD_Fp64m:
9058 case X86::LD_Fp80m:
9059 case X86::MMX_MOVD64rm:
9060 case X86::MMX_MOVQ64rm:
9061 return false;
9062 }
9063
// The NumLoads limit depends on the result type of Load1 (Load2 has the same
// opcode at this point).
9064 EVT VT = Load1->getValueType(0);
9065 switch (VT.getSimpleVT().SimpleTy) {
9066 default:
9067 // XMM registers. In 64-bit mode we can be a bit more aggressive since we
9068 // have 16 of them to play with.
9069 if (Subtarget.is64Bit()) {
9070 if (NumLoads >= 3)
9071 return false;
9072 } else if (NumLoads) {
9073 return false;
9074 }
9075 break;
9076 case MVT::i8:
9077 case MVT::i16:
9078 case MVT::i32:
9079 case MVT::i64:
9080 case MVT::f32:
9081 case MVT::f64:
9082 if (NumLoads)
9083 return false;
9084 break;
9085 }
9086
9087 return true;
9088}
9089
9091 const MachineBasicBlock *MBB,
9092 const MachineFunction &MF) const {
9093
9094 // ENDBR instructions should not be scheduled around.
9095 unsigned Opcode = MI.getOpcode();
9096 if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
9097 Opcode == X86::PLDTILECFGV)
9098 return true;
9099
9100 // Frame setup and destroy can't be scheduled around.
9101 if (MI.getFlag(MachineInstr::FrameSetup) ||
9103 return true;
9104
9106}
9107
9110 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
9111 X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
9112 Cond[0].setImm(GetOppositeBranchCondition(CC));
9113 return false;
9114}
9115
9117 const TargetRegisterClass *RC) const {
9118 // FIXME: Return false for x87 stack register classes for now. We can't
9119 // allow any loads of these registers before FpGet_ST0_80.
9120 return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
9121 RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
9122 RC == &X86::RFP80RegClass);
9123}
9124
9125/// Return a virtual register initialized with the
9126/// global base register value. Output instructions required to
9127/// initialize the register in the function entry block, if necessary.
9128///
9129/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
9130///
9133 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
9134 if (GlobalBaseReg)
9135 return GlobalBaseReg;
9136
9137 // Create the register. The code to initialize it is inserted
9138 // later, by the CGBR pass (below).
9139 MachineRegisterInfo &RegInfo = MF->getRegInfo();
9140 GlobalBaseReg = RegInfo.createVirtualRegister(
9141 Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
9142 X86FI->setGlobalBaseReg(GlobalBaseReg);
9143 return GlobalBaseReg;
9144}
9145
9146// FIXME: Some shuffle and unpack instructions have equivalents in different
9147// domains, but they require a bit more work than just switching opcodes.
9148
9149static const uint16_t *lookup(unsigned opcode, unsigned domain,
9150 ArrayRef<uint16_t[3]> Table) {
9151 for (const uint16_t(&Row)[3] : Table)
9152 if (Row[domain - 1] == opcode)
9153 return Row;
9154 return nullptr;
9155}
9156
9157static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
9158 ArrayRef<uint16_t[4]> Table) {
9159 // If this is the integer domain make sure to check both integer columns.
9160 for (const uint16_t(&Row)[4] : Table)
9161 if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
9162 return Row;
9163 return nullptr;
9164}
9165
/// Helper to attempt to widen/narrow blend masks.
///
/// Rescales the blend immediate \p OldMask, which has \p OldWidth meaningful
/// bits, to a mask with \p NewWidth bits. One width must be a multiple of the
/// other.
///
///  - Narrowing (OldWidth >= NewWidth): each new bit stands for a group of
///    `OldWidth / NewWidth` adjacent old bits. The conversion succeeds only if
///    every group is all ones or all zeros.
///  - Widening (OldWidth < NewWidth): each old bit is replicated
///    `NewWidth / OldWidth` times. This always succeeds.
///
/// \param pNewMask optional output; written only when the function returns
///        true, and left untouched on failure.
/// \returns true if \p OldMask can be expressed at the new width.
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
                            unsigned NewWidth, unsigned *pNewMask = nullptr) {
  // Checked first so the modulo/divide operations below never see a zero.
  assert(OldWidth != 0 && NewWidth != 0 && "Blend mask width must be nonzero");
  assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
         "Illegal blend mask scale");
  unsigned NewMask = 0;

  if ((OldWidth % NewWidth) == 0) {
    // Narrow: collapse each group of Scale old bits into a single new bit.
    const unsigned Scale = OldWidth / NewWidth;
    const unsigned SubMask = (1u << Scale) - 1;
    for (unsigned i = 0; i != NewWidth; ++i) {
      const unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
      if (Sub == SubMask)
        NewMask |= (1u << i);
      else if (Sub != 0x0)
        return false; // Partially-set group: not representable when narrowed.
    }
  } else {
    // Widen: replicate every set old bit across Scale new bits.
    const unsigned Scale = NewWidth / OldWidth;
    const unsigned SubMask = (1u << Scale) - 1;
    for (unsigned i = 0; i != OldWidth; ++i) {
      // Unsigned literal keeps the shift in the same type as the mask.
      if (OldMask & (1u << i))
        NewMask |= (SubMask << (i * Scale));
    }
  }

  if (pNewMask)
    *pNewMask = NewMask;
  return true;
}
9197
9199 unsigned Opcode = MI.getOpcode();
9200 unsigned NumOperands = MI.getDesc().getNumOperands();
9201
9202 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
9203 uint16_t validDomains = 0;
9204 if (MI.getOperand(NumOperands - 1).isImm()) {
9205 unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
9206 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
9207 validDomains |= 0x2; // PackedSingle
9208 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
9209 validDomains |= 0x4; // PackedDouble
9210 if (!Is256 || Subtarget.hasAVX2())
9211 validDomains |= 0x8; // PackedInt
9212 }
9213 return validDomains;
9214 };
9215
9216 switch (Opcode) {
9217 case X86::BLENDPDrmi:
9218 case X86::BLENDPDrri:
9219 case X86::VBLENDPDrmi:
9220 case X86::VBLENDPDrri:
9221 return GetBlendDomains(2, false);
9222 case X86::VBLENDPDYrmi:
9223 case X86::VBLENDPDYrri:
9224 return GetBlendDomains(4, true);
9225 case X86::BLENDPSrmi:
9226 case X86::BLENDPSrri:
9227 case X86::VBLENDPSrmi:
9228 case X86::VBLENDPSrri:
9229 case X86::VPBLENDDrmi:
9230 case X86::VPBLENDDrri:
9231 return GetBlendDomains(4, false);
9232 case X86::VBLENDPSYrmi:
9233 case X86::VBLENDPSYrri:
9234 case X86::VPBLENDDYrmi:
9235 case X86::VPBLENDDYrri:
9236 return GetBlendDomains(8, true);
9237 case X86::PBLENDWrmi:
9238 case X86::PBLENDWrri:
9239 case X86::VPBLENDWrmi:
9240 case X86::VPBLENDWrri:
9241 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
9242 case X86::VPBLENDWYrmi:
9243 case X86::VPBLENDWYrri:
9244 return GetBlendDomains(8, false);
9245 case X86::VPANDDZ128rr:
9246 case X86::VPANDDZ128rm:
9247 case X86::VPANDDZ256rr:
9248 case X86::VPANDDZ256rm:
9249 case X86::VPANDQZ128rr:
9250 case X86::VPANDQZ128rm:
9251 case X86::VPANDQZ256rr:
9252 case X86::VPANDQZ256rm:
9253 case X86::VPANDNDZ128rr:
9254 case X86::VPANDNDZ128rm:
9255 case X86::VPANDNDZ256rr:
9256 case X86::VPANDNDZ256rm:
9257 case X86::VPANDNQZ128rr:
9258 case X86::VPANDNQZ128rm:
9259 case X86::VPANDNQZ256rr:
9260 case X86::VPANDNQZ256rm:
9261 case X86::VPORDZ128rr:
9262 case X86::VPORDZ128rm:
9263 case X86::VPORDZ256rr:
9264 case X86::VPORDZ256rm:
9265 case X86::VPORQZ128rr:
9266 case X86::VPORQZ128rm:
9267 case X86::VPORQZ256rr:
9268 case X86::VPORQZ256rm:
9269 case X86::VPXORDZ128rr:
9270 case X86::VPXORDZ128rm:
9271 case X86::VPXORDZ256rr:
9272 case X86::VPXORDZ256rm:
9273 case X86::VPXORQZ128rr:
9274 case X86::VPXORQZ128rm:
9275 case X86::VPXORQZ256rr:
9276 case X86::VPXORQZ256rm:
9277 // If we don't have DQI see if we can still switch from an EVEX integer
9278 // instruction to a VEX floating point instruction.
9279 if (Subtarget.hasDQI())
9280 return 0;
9281
9282 if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
9283 return 0;
9284 if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
9285 return 0;
9286 // Register forms will have 3 operands. Memory form will have more.
9287 if (NumOperands == 3 &&
9288 RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
9289 return 0;
9290
9291 // All domains are valid.
9292 return 0xe;
9293 case X86::MOVHLPSrr:
9294 // We can swap domains when both inputs are the same register.
9295 // FIXME: This doesn't catch all the cases we would like. If the input
9296 // register isn't KILLed by the instruction, the two address instruction
9297 // pass puts a COPY on one input. The other input uses the original
9298 // register. This prevents the same physical register from being used by
9299 // both inputs.
9300 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9301 MI.getOperand(0).getSubReg() == 0 &&
9302 MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
9303 return 0x6;
9304 return 0;
9305 case X86::SHUFPDrri:
9306 return 0x6;
9307 }
9308 return 0;
9309}
9310
9311#include "X86ReplaceableInstrs.def"
9312
9314 unsigned Domain) const {
9315 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9316 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9317 assert(dom && "Not an SSE instruction");
9318
9319 unsigned Opcode = MI.getOpcode();
9320 unsigned NumOperands = MI.getDesc().getNumOperands();
9321
9322 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
9323 if (MI.getOperand(NumOperands - 1).isImm()) {
9324 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
9325 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
9326 unsigned NewImm = Imm;
9327
9328 const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
9329 if (!table)
9330 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9331
9332 if (Domain == 1) { // PackedSingle
9333 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9334 } else if (Domain == 2) { // PackedDouble
9335 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
9336 } else if (Domain == 3) { // PackedInt
9337 if (Subtarget.hasAVX2()) {
9338 // If we are already VPBLENDW use that, else use VPBLENDD.
9339 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
9340 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9341 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9342 }
9343 } else {
9344 assert(!Is256 && "128-bit vector expected");
9345 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
9346 }
9347 }
9348
9349 assert(table && table[Domain - 1] && "Unknown domain op");
9350 MI.setDesc(get(table[Domain - 1]));
9351 MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
9352 }
9353 return true;
9354 };
9355
9356 switch (Opcode) {
9357 case X86::BLENDPDrmi:
9358 case X86::BLENDPDrri:
9359 case X86::VBLENDPDrmi:
9360 case X86::VBLENDPDrri:
9361 return SetBlendDomain(2, false);
9362 case X86::VBLENDPDYrmi:
9363 case X86::VBLENDPDYrri:
9364 return SetBlendDomain(4, true);
9365 case X86::BLENDPSrmi:
9366 case X86::BLENDPSrri:
9367 case X86::VBLENDPSrmi:
9368 case X86::VBLENDPSrri:
9369 case X86::VPBLENDDrmi:
9370 case X86::VPBLENDDrri:
9371 return SetBlendDomain(4, false);
9372 case X86::VBLENDPSYrmi:
9373 case X86::VBLENDPSYrri:
9374 case X86::VPBLENDDYrmi:
9375 case X86::VPBLENDDYrri:
9376 return SetBlendDomain(8, true);
9377 case X86::PBLENDWrmi:
9378 case X86::PBLENDWrri:
9379 case X86::VPBLENDWrmi:
9380 case X86::VPBLENDWrri:
9381 return SetBlendDomain(8, false);
9382 case X86::VPBLENDWYrmi:
9383 case X86::VPBLENDWYrri:
9384 return SetBlendDomain(16, true);
9385 case X86::VPANDDZ128rr:
9386 case X86::VPANDDZ128rm:
9387 case X86::VPANDDZ256rr:
9388 case X86::VPANDDZ256rm:
9389 case X86::VPANDQZ128rr:
9390 case X86::VPANDQZ128rm:
9391 case X86::VPANDQZ256rr:
9392 case X86::VPANDQZ256rm:
9393 case X86::VPANDNDZ128rr:
9394 case X86::VPANDNDZ128rm:
9395 case X86::VPANDNDZ256rr:
9396 case X86::VPANDNDZ256rm:
9397 case X86::VPANDNQZ128rr:
9398 case X86::VPANDNQZ128rm:
9399 case X86::VPANDNQZ256rr:
9400 case X86::VPANDNQZ256rm:
9401 case X86::VPORDZ128rr:
9402 case X86::VPORDZ128rm:
9403 case X86::VPORDZ256rr:
9404 case X86::VPORDZ256rm:
9405 case X86::VPORQZ128rr:
9406 case X86::VPORQZ128rm:
9407 case X86::VPORQZ256rr:
9408 case X86::VPORQZ256rm:
9409 case X86::VPXORDZ128rr:
9410 case X86::VPXORDZ128rm:
9411 case X86::VPXORDZ256rr:
9412 case X86::VPXORDZ256rm:
9413 case X86::VPXORQZ128rr:
9414 case X86::VPXORQZ128rm:
9415 case X86::VPXORQZ256rr:
9416 case X86::VPXORQZ256rm: {
9417 // Without DQI, convert EVEX instructions to VEX instructions.
9418 if (Subtarget.hasDQI())
9419 return false;
9420
9421 const uint16_t *table =
9422 lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
9423 assert(table && "Instruction not found in table?");
9424 // Don't change integer Q instructions to D instructions and
9425 // use D instructions if we started with a PS instruction.
9426 if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9427 Domain = 4;
9428 MI.setDesc(get(table[Domain - 1]));
9429 return true;
9430 }
9431 case X86::UNPCKHPDrr:
9432 case X86::MOVHLPSrr:
9433 // We just need to commute the instruction which will switch the domains.
9434 if (Domain != dom && Domain != 3 &&
9435 MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9436 MI.getOperand(0).getSubReg() == 0 &&
9437 MI.getOperand(1).getSubReg() == 0 &&
9438 MI.getOperand(2).getSubReg() == 0) {
9439 commuteInstruction(MI, false);
9440 return true;
9441 }
9442 // We must always return true for MOVHLPSrr.
9443 if (Opcode == X86::MOVHLPSrr)
9444 return true;
9445 break;
9446 case X86::SHUFPDrri: {
9447 if (Domain == 1) {
9448 unsigned Imm = MI.getOperand(3).getImm();
9449 unsigned NewImm = 0x44;
9450 if (Imm & 1)
9451 NewImm |= 0x0a;
9452 if (Imm & 2)
9453 NewImm |= 0xa0;
9454 MI.getOperand(3).setImm(NewImm);
9455 MI.setDesc(get(X86::SHUFPSrri));
9456 }
9457 return true;
9458 }
9459 }
9460 return false;
9461}
9462
9463std::pair<uint16_t, uint16_t>
9465 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9466 unsigned opcode = MI.getOpcode();
9467 uint16_t validDomains = 0;
9468 if (domain) {
9469 // Attempt to match for custom instructions.
9470 validDomains = getExecutionDomainCustom(MI);
9471 if (validDomains)
9472 return std::make_pair(domain, validDomains);
9473
9474 if (lookup(opcode, domain, ReplaceableInstrs)) {
9475 validDomains = 0xe;
9476 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9477 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
9478 } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
9479 validDomains = 0x6;
9480 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
9481 // Insert/extract instructions should only affect domain if AVX2
9482 // is enabled.
9483 if (!Subtarget.hasAVX2())
9484 return std::make_pair(0, 0);
9485 validDomains = 0xe;
9486 } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
9487 validDomains = 0xe;
9488 } else if (Subtarget.hasDQI() &&
9489 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
9490 validDomains = 0xe;
9491 } else if (Subtarget.hasDQI()) {
9492 if (const uint16_t *table =
9493 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
9494 if (domain == 1 || (domain == 3 && table[3] == opcode))
9495 validDomains = 0xa;
9496 else
9497 validDomains = 0xc;
9498 }
9499 }
9500 }
9501 return std::make_pair(domain, validDomains);
9502}
9503
9505 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9506 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9507 assert(dom && "Not an SSE instruction");
9508
9509 // Attempt to match for custom instructions.
9511 return;
9512
9513 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
9514 if (!table) { // try the other table
9515 assert((Subtarget.hasAVX2() || Domain < 3) &&
9516 "256-bit vector operations only available in AVX2");
9517 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
9518 }
9519 if (!table) { // try the FP table
9520 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
9521 assert((!table || Domain < 3) &&
9522 "Can only select PackedSingle or PackedDouble");
9523 }
9524 if (!table) { // try the other table
9525 assert(Subtarget.hasAVX2() &&
9526 "256-bit insert/extract only available in AVX2");
9527 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
9528 }
9529 if (!table) { // try the AVX512 table
9530 assert(Subtarget.hasAVX512() && "Requires AVX-512");
9531 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
9532 // Don't change integer Q instructions to D instructions.
9533 if (table && Domain == 3 && table[3] == MI.getOpcode())
9534 Domain = 4;
9535 }
9536 if (!table) { // try the AVX512DQ table
9537 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9538 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
9539 // Don't change integer Q instructions to D instructions and
9540 // use D instructions if we started with a PS instruction.
9541 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9542 Domain = 4;
9543 }
9544 if (!table) { // try the AVX512DQMasked table
9545 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9546 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
9547 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9548 Domain = 4;
9549 }
9550 assert(table && "Cannot change domain");
9551 MI.setDesc(get(table[Domain - 1]));
9552}
9553
9559
9560/// Return the noop instruction to use for a noop.
9562 MCInst Nop;
9563 Nop.setOpcode(X86::NOOP);
9564 return Nop;
9565}
9566
9568 switch (opc) {
9569 default:
9570 return false;
9571 case X86::DIVPDrm:
9572 case X86::DIVPDrr:
9573 case X86::DIVPSrm:
9574 case X86::DIVPSrr:
9575 case X86::DIVSDrm:
9576 case X86::DIVSDrm_Int:
9577 case X86::DIVSDrr:
9578 case X86::DIVSDrr_Int:
9579 case X86::DIVSSrm:
9580 case X86::DIVSSrm_Int:
9581 case X86::DIVSSrr:
9582 case X86::DIVSSrr_Int:
9583 case X86::SQRTPDm:
9584 case X86::SQRTPDr:
9585 case X86::SQRTPSm:
9586 case X86::SQRTPSr:
9587 case X86::SQRTSDm:
9588 case X86::SQRTSDm_Int:
9589 case X86::SQRTSDr:
9590 case X86::SQRTSDr_Int:
9591 case X86::SQRTSSm:
9592 case X86::SQRTSSm_Int:
9593 case X86::SQRTSSr:
9594 case X86::SQRTSSr_Int:
9595 // AVX instructions with high latency
9596 case X86::VDIVPDrm:
9597 case X86::VDIVPDrr:
9598 case X86::VDIVPDYrm:
9599 case X86::VDIVPDYrr:
9600 case X86::VDIVPSrm:
9601 case X86::VDIVPSrr:
9602 case X86::VDIVPSYrm:
9603 case X86::VDIVPSYrr:
9604 case X86::VDIVSDrm:
9605 case X86::VDIVSDrm_Int:
9606 case X86::VDIVSDrr:
9607 case X86::VDIVSDrr_Int:
9608 case X86::VDIVSSrm:
9609 case X86::VDIVSSrm_Int:
9610 case X86::VDIVSSrr:
9611 case X86::VDIVSSrr_Int:
9612 case X86::VSQRTPDm:
9613 case X86::VSQRTPDr:
9614 case X86::VSQRTPDYm:
9615 case X86::VSQRTPDYr:
9616 case X86::VSQRTPSm:
9617 case X86::VSQRTPSr:
9618 case X86::VSQRTPSYm:
9619 case X86::VSQRTPSYr:
9620 case X86::VSQRTSDm:
9621 case X86::VSQRTSDm_Int:
9622 case X86::VSQRTSDr:
9623 case X86::VSQRTSDr_Int:
9624 case X86::VSQRTSSm:
9625 case X86::VSQRTSSm_Int:
9626 case X86::VSQRTSSr:
9627 case X86::VSQRTSSr_Int:
9628 // AVX512 instructions with high latency
9629 case X86::VDIVPDZ128rm:
9630 case X86::VDIVPDZ128rmb:
9631 case X86::VDIVPDZ128rmbk:
9632 case X86::VDIVPDZ128rmbkz:
9633 case X86::VDIVPDZ128rmk:
9634 case X86::VDIVPDZ128rmkz:
9635 case X86::VDIVPDZ128rr:
9636 case X86::VDIVPDZ128rrk:
9637 case X86::VDIVPDZ128rrkz:
9638 case X86::VDIVPDZ256rm:
9639 case X86::VDIVPDZ256rmb:
9640 case X86::VDIVPDZ256rmbk:
9641 case X86::VDIVPDZ256rmbkz:
9642 case X86::VDIVPDZ256rmk:
9643 case X86::VDIVPDZ256rmkz:
9644 case X86::VDIVPDZ256rr:
9645 case X86::VDIVPDZ256rrk:
9646 case X86::VDIVPDZ256rrkz:
9647 case X86::VDIVPDZrrb:
9648 case X86::VDIVPDZrrbk:
9649 case X86::VDIVPDZrrbkz:
9650 case X86::VDIVPDZrm:
9651 case X86::VDIVPDZrmb:
9652 case X86::VDIVPDZrmbk:
9653 case X86::VDIVPDZrmbkz:
9654 case X86::VDIVPDZrmk:
9655 case X86::VDIVPDZrmkz:
9656 case X86::VDIVPDZrr:
9657 case X86::VDIVPDZrrk:
9658 case X86::VDIVPDZrrkz:
9659 case X86::VDIVPSZ128rm:
9660 case X86::VDIVPSZ128rmb:
9661 case X86::VDIVPSZ128rmbk:
9662 case X86::VDIVPSZ128rmbkz:
9663 case X86::VDIVPSZ128rmk:
9664 case X86::VDIVPSZ128rmkz:
9665 case X86::VDIVPSZ128rr:
9666 case X86::VDIVPSZ128rrk:
9667 case X86::VDIVPSZ128rrkz:
9668 case X86::VDIVPSZ256rm:
9669 case X86::VDIVPSZ256rmb:
9670 case X86::VDIVPSZ256rmbk:
9671 case X86::VDIVPSZ256rmbkz:
9672 case X86::VDIVPSZ256rmk:
9673 case X86::VDIVPSZ256rmkz:
9674 case X86::VDIVPSZ256rr:
9675 case X86::VDIVPSZ256rrk:
9676 case X86::VDIVPSZ256rrkz:
9677 case X86::VDIVPSZrrb:
9678 case X86::VDIVPSZrrbk:
9679 case X86::VDIVPSZrrbkz:
9680 case X86::VDIVPSZrm:
9681 case X86::VDIVPSZrmb:
9682 case X86::VDIVPSZrmbk:
9683 case X86::VDIVPSZrmbkz:
9684 case X86::VDIVPSZrmk:
9685 case X86::VDIVPSZrmkz:
9686 case X86::VDIVPSZrr:
9687 case X86::VDIVPSZrrk:
9688 case X86::VDIVPSZrrkz:
9689 case X86::VDIVSDZrm:
9690 case X86::VDIVSDZrr:
9691 case X86::VDIVSDZrm_Int:
9692 case X86::VDIVSDZrmk_Int:
9693 case X86::VDIVSDZrmkz_Int:
9694 case X86::VDIVSDZrr_Int:
9695 case X86::VDIVSDZrrk_Int:
9696 case X86::VDIVSDZrrkz_Int:
9697 case X86::VDIVSDZrrb_Int:
9698 case X86::VDIVSDZrrbk_Int:
9699 case X86::VDIVSDZrrbkz_Int:
9700 case X86::VDIVSSZrm:
9701 case X86::VDIVSSZrr:
9702 case X86::VDIVSSZrm_Int:
9703 case X86::VDIVSSZrmk_Int:
9704 case X86::VDIVSSZrmkz_Int:
9705 case X86::VDIVSSZrr_Int:
9706 case X86::VDIVSSZrrk_Int:
9707 case X86::VDIVSSZrrkz_Int:
9708 case X86::VDIVSSZrrb_Int:
9709 case X86::VDIVSSZrrbk_Int:
9710 case X86::VDIVSSZrrbkz_Int:
9711 case X86::VSQRTPDZ128m:
9712 case X86::VSQRTPDZ128mb:
9713 case X86::VSQRTPDZ128mbk:
9714 case X86::VSQRTPDZ128mbkz:
9715 case X86::VSQRTPDZ128mk:
9716 case X86::VSQRTPDZ128mkz:
9717 case X86::VSQRTPDZ128r:
9718 case X86::VSQRTPDZ128rk:
9719 case X86::VSQRTPDZ128rkz:
9720 case X86::VSQRTPDZ256m:
9721 case X86::VSQRTPDZ256mb:
9722 case X86::VSQRTPDZ256mbk:
9723 case X86::VSQRTPDZ256mbkz:
9724 case X86::VSQRTPDZ256mk:
9725 case X86::VSQRTPDZ256mkz:
9726 case X86::VSQRTPDZ256r:
9727 case X86::VSQRTPDZ256rk:
9728 case X86::VSQRTPDZ256rkz:
9729 case X86::VSQRTPDZm:
9730 case X86::VSQRTPDZmb:
9731 case X86::VSQRTPDZmbk:
9732 case X86::VSQRTPDZmbkz:
9733 case X86::VSQRTPDZmk:
9734 case X86::VSQRTPDZmkz:
9735 case X86::VSQRTPDZr:
9736 case X86::VSQRTPDZrb:
9737 case X86::VSQRTPDZrbk:
9738 case X86::VSQRTPDZrbkz:
9739 case X86::VSQRTPDZrk:
9740 case X86::VSQRTPDZrkz:
9741 case X86::VSQRTPSZ128m:
9742 case X86::VSQRTPSZ128mb:
9743 case X86::VSQRTPSZ128mbk:
9744 case X86::VSQRTPSZ128mbkz:
9745 case X86::VSQRTPSZ128mk:
9746 case X86::VSQRTPSZ128mkz:
9747 case X86::VSQRTPSZ128r:
9748 case X86::VSQRTPSZ128rk:
9749 case X86::VSQRTPSZ128rkz:
9750 case X86::VSQRTPSZ256m:
9751 case X86::VSQRTPSZ256mb:
9752 case X86::VSQRTPSZ256mbk:
9753 case X86::VSQRTPSZ256mbkz:
9754 case X86::VSQRTPSZ256mk:
9755 case X86::VSQRTPSZ256mkz:
9756 case X86::VSQRTPSZ256r:
9757 case X86::VSQRTPSZ256rk:
9758 case X86::VSQRTPSZ256rkz:
9759 case X86::VSQRTPSZm:
9760 case X86::VSQRTPSZmb:
9761 case X86::VSQRTPSZmbk:
9762 case X86::VSQRTPSZmbkz:
9763 case X86::VSQRTPSZmk:
9764 case X86::VSQRTPSZmkz:
9765 case X86::VSQRTPSZr:
9766 case X86::VSQRTPSZrb:
9767 case X86::VSQRTPSZrbk:
9768 case X86::VSQRTPSZrbkz:
9769 case X86::VSQRTPSZrk:
9770 case X86::VSQRTPSZrkz:
9771 case X86::VSQRTSDZm:
9772 case X86::VSQRTSDZm_Int:
9773 case X86::VSQRTSDZmk_Int:
9774 case X86::VSQRTSDZmkz_Int:
9775 case X86::VSQRTSDZr:
9776 case X86::VSQRTSDZr_Int:
9777 case X86::VSQRTSDZrk_Int:
9778 case X86::VSQRTSDZrkz_Int:
9779 case X86::VSQRTSDZrb_Int:
9780 case X86::VSQRTSDZrbk_Int:
9781 case X86::VSQRTSDZrbkz_Int:
9782 case X86::VSQRTSSZm:
9783 case X86::VSQRTSSZm_Int:
9784 case X86::VSQRTSSZmk_Int:
9785 case X86::VSQRTSSZmkz_Int:
9786 case X86::VSQRTSSZr:
9787 case X86::VSQRTSSZr_Int:
9788 case X86::VSQRTSSZrk_Int:
9789 case X86::VSQRTSSZrkz_Int:
9790 case X86::VSQRTSSZrb_Int:
9791 case X86::VSQRTSSZrbk_Int:
9792 case X86::VSQRTSSZrbkz_Int:
9793
9794 case X86::VGATHERDPDYrm:
9795 case X86::VGATHERDPDZ128rm:
9796 case X86::VGATHERDPDZ256rm:
9797 case X86::VGATHERDPDZrm:
9798 case X86::VGATHERDPDrm:
9799 case X86::VGATHERDPSYrm:
9800 case X86::VGATHERDPSZ128rm:
9801 case X86::VGATHERDPSZ256rm:
9802 case X86::VGATHERDPSZrm:
9803 case X86::VGATHERDPSrm:
9804 case X86::VGATHERPF0DPDm:
9805 case X86::VGATHERPF0DPSm:
9806 case X86::VGATHERPF0QPDm:
9807 case X86::VGATHERPF0QPSm:
9808 case X86::VGATHERPF1DPDm:
9809 case X86::VGATHERPF1DPSm:
9810 case X86::VGATHERPF1QPDm:
9811 case X86::VGATHERPF1QPSm:
9812 case X86::VGATHERQPDYrm:
9813 case X86::VGATHERQPDZ128rm:
9814 case X86::VGATHERQPDZ256rm:
9815 case X86::VGATHERQPDZrm:
9816 case X86::VGATHERQPDrm:
9817 case X86::VGATHERQPSYrm:
9818 case X86::VGATHERQPSZ128rm:
9819 case X86::VGATHERQPSZ256rm:
9820 case X86::VGATHERQPSZrm:
9821 case X86::VGATHERQPSrm:
9822 case X86::VPGATHERDDYrm:
9823 case X86::VPGATHERDDZ128rm:
9824 case X86::VPGATHERDDZ256rm:
9825 case X86::VPGATHERDDZrm:
9826 case X86::VPGATHERDDrm:
9827 case X86::VPGATHERDQYrm:
9828 case X86::VPGATHERDQZ128rm:
9829 case X86::VPGATHERDQZ256rm:
9830 case X86::VPGATHERDQZrm:
9831 case X86::VPGATHERDQrm:
9832 case X86::VPGATHERQDYrm:
9833 case X86::VPGATHERQDZ128rm:
9834 case X86::VPGATHERQDZ256rm:
9835 case X86::VPGATHERQDZrm:
9836 case X86::VPGATHERQDrm:
9837 case X86::VPGATHERQQYrm:
9838 case X86::VPGATHERQQZ128rm:
9839 case X86::VPGATHERQQZ256rm:
9840 case X86::VPGATHERQQZrm:
9841 case X86::VPGATHERQQrm:
9842 case X86::VSCATTERDPDZ128mr:
9843 case X86::VSCATTERDPDZ256mr:
9844 case X86::VSCATTERDPDZmr:
9845 case X86::VSCATTERDPSZ128mr:
9846 case X86::VSCATTERDPSZ256mr:
9847 case X86::VSCATTERDPSZmr:
9848 case X86::VSCATTERPF0DPDm:
9849 case X86::VSCATTERPF0DPSm:
9850 case X86::VSCATTERPF0QPDm:
9851 case X86::VSCATTERPF0QPSm:
9852 case X86::VSCATTERPF1DPDm:
9853 case X86::VSCATTERPF1DPSm:
9854 case X86::VSCATTERPF1QPDm:
9855 case X86::VSCATTERPF1QPSm:
9856 case X86::VSCATTERQPDZ128mr:
9857 case X86::VSCATTERQPDZ256mr:
9858 case X86::VSCATTERQPDZmr:
9859 case X86::VSCATTERQPSZ128mr:
9860 case X86::VSCATTERQPSZ256mr:
9861 case X86::VSCATTERQPSZmr:
9862 case X86::VPSCATTERDDZ128mr:
9863 case X86::VPSCATTERDDZ256mr:
9864 case X86::VPSCATTERDDZmr:
9865 case X86::VPSCATTERDQZ128mr:
9866 case X86::VPSCATTERDQZ256mr:
9867 case X86::VPSCATTERDQZmr:
9868 case X86::VPSCATTERQDZ128mr:
9869 case X86::VPSCATTERQDZ256mr:
9870 case X86::VPSCATTERQDZmr:
9871 case X86::VPSCATTERQQZ128mr:
9872 case X86::VPSCATTERQQZ256mr:
9873 case X86::VPSCATTERQQZmr:
9874 return true;
9875 }
9876}
9877
9879 const MachineRegisterInfo *MRI,
9880 const MachineInstr &DefMI,
9881 unsigned DefIdx,
9882 const MachineInstr &UseMI,
9883 unsigned UseIdx) const {
9884 return isHighLatencyDef(DefMI.getOpcode());
9885}
9886
9888 const MachineBasicBlock *MBB) const {
9889 assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
9890 Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
9891
9892 // Integer binary math/logic instructions have a third source operand:
9893 // the EFLAGS register. That operand must be both defined here and never
9894 // used; ie, it must be dead. If the EFLAGS operand is live, then we can
9895 // not change anything because rearranging the operands could affect other
9896 // instructions that depend on the exact status flags (zero, sign, etc.)
9897 // that are set by using these particular operands with this operation.
9898 const MachineOperand *FlagDef =
9899 Inst.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
9900 assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
9901 if (FlagDef && !FlagDef->isDead())
9902 return false;
9903
9905}
9906
9907// TODO: There are many more machine instruction opcodes to match:
9908// 1. Other data types (integer, vectors)
9909// 2. Other math / logic operations (xor, or)
9910// 3. Other forms of the same operation (intrinsics and other variants)
9912 bool Invert) const {
9913 if (Invert)
9914 return false;
9915 switch (Inst.getOpcode()) {
9916 CASE_ND(ADD8rr)
9917 CASE_ND(ADD16rr)
9918 CASE_ND(ADD32rr)
9919 CASE_ND(ADD64rr)
9920 CASE_ND(AND8rr)
9921 CASE_ND(AND16rr)
9922 CASE_ND(AND32rr)
9923 CASE_ND(AND64rr)
9924 CASE_ND(OR8rr)
9925 CASE_ND(OR16rr)
9926 CASE_ND(OR32rr)
9927 CASE_ND(OR64rr)
9928 CASE_ND(XOR8rr)
9929 CASE_ND(XOR16rr)
9930 CASE_ND(XOR32rr)
9931 CASE_ND(XOR64rr)
9932 CASE_ND(IMUL16rr)
9933 CASE_ND(IMUL32rr)
9934 CASE_ND(IMUL64rr)
9935 case X86::PANDrr:
9936 case X86::PORrr:
9937 case X86::PXORrr:
9938 case X86::ANDPDrr:
9939 case X86::ANDPSrr:
9940 case X86::ORPDrr:
9941 case X86::ORPSrr:
9942 case X86::XORPDrr:
9943 case X86::XORPSrr:
9944 case X86::PADDBrr:
9945 case X86::PADDWrr:
9946 case X86::PADDDrr:
9947 case X86::PADDQrr:
9948 case X86::PMULLWrr:
9949 case X86::PMULLDrr:
9950 case X86::PMAXSBrr:
9951 case X86::PMAXSDrr:
9952 case X86::PMAXSWrr:
9953 case X86::PMAXUBrr:
9954 case X86::PMAXUDrr:
9955 case X86::PMAXUWrr:
9956 case X86::PMINSBrr:
9957 case X86::PMINSDrr:
9958 case X86::PMINSWrr:
9959 case X86::PMINUBrr:
9960 case X86::PMINUDrr:
9961 case X86::PMINUWrr:
9962 case X86::VPANDrr:
9963 case X86::VPANDYrr:
9964 case X86::VPANDDZ128rr:
9965 case X86::VPANDDZ256rr:
9966 case X86::VPANDDZrr:
9967 case X86::VPANDQZ128rr:
9968 case X86::VPANDQZ256rr:
9969 case X86::VPANDQZrr:
9970 case X86::VPORrr:
9971 case X86::VPORYrr:
9972 case X86::VPORDZ128rr:
9973 case X86::VPORDZ256rr:
9974 case X86::VPORDZrr:
9975 case X86::VPORQZ128rr:
9976 case X86::VPORQZ256rr:
9977 case X86::VPORQZrr:
9978 case X86::VPXORrr:
9979 case X86::VPXORYrr:
9980 case X86::VPXORDZ128rr:
9981 case X86::VPXORDZ256rr:
9982 case X86::VPXORDZrr:
9983 case X86::VPXORQZ128rr:
9984 case X86::VPXORQZ256rr:
9985 case X86::VPXORQZrr:
9986 case X86::VANDPDrr:
9987 case X86::VANDPSrr:
9988 case X86::VANDPDYrr:
9989 case X86::VANDPSYrr:
9990 case X86::VANDPDZ128rr:
9991 case X86::VANDPSZ128rr:
9992 case X86::VANDPDZ256rr:
9993 case X86::VANDPSZ256rr:
9994 case X86::VANDPDZrr:
9995 case X86::VANDPSZrr:
9996 case X86::VORPDrr:
9997 case X86::VORPSrr:
9998 case X86::VORPDYrr:
9999 case X86::VORPSYrr:
10000 case X86::VORPDZ128rr:
10001 case X86::VORPSZ128rr:
10002 case X86::VORPDZ256rr:
10003 case X86::VORPSZ256rr:
10004 case X86::VORPDZrr:
10005 case X86::VORPSZrr:
10006 case X86::VXORPDrr:
10007 case X86::VXORPSrr:
10008 case X86::VXORPDYrr:
10009 case X86::VXORPSYrr:
10010 case X86::VXORPDZ128rr:
10011 case X86::VXORPSZ128rr:
10012 case X86::VXORPDZ256rr:
10013 case X86::VXORPSZ256rr:
10014 case X86::VXORPDZrr:
10015 case X86::VXORPSZrr:
10016 case X86::KADDBkk:
10017 case X86::KADDWkk:
10018 case X86::KADDDkk:
10019 case X86::KADDQkk:
10020 case X86::KANDBkk:
10021 case X86::KANDWkk:
10022 case X86::KANDDkk:
10023 case X86::KANDQkk:
10024 case X86::KORBkk:
10025 case X86::KORWkk:
10026 case X86::KORDkk:
10027 case X86::KORQkk:
10028 case X86::KXORBkk:
10029 case X86::KXORWkk:
10030 case X86::KXORDkk:
10031 case X86::KXORQkk:
10032 case X86::VPADDBrr:
10033 case X86::VPADDWrr:
10034 case X86::VPADDDrr:
10035 case X86::VPADDQrr:
10036 case X86::VPADDBYrr:
10037 case X86::VPADDWYrr:
10038 case X86::VPADDDYrr:
10039 case X86::VPADDQYrr:
10040 case X86::VPADDBZ128rr:
10041 case X86::VPADDWZ128rr:
10042 case X86::VPADDDZ128rr:
10043 case X86::VPADDQZ128rr:
10044 case X86::VPADDBZ256rr:
10045 case X86::VPADDWZ256rr:
10046 case X86::VPADDDZ256rr:
10047 case X86::VPADDQZ256rr:
10048 case X86::VPADDBZrr:
10049 case X86::VPADDWZrr:
10050 case X86::VPADDDZrr:
10051 case X86::VPADDQZrr:
10052 case X86::VPMULLWrr:
10053 case X86::VPMULLWYrr:
10054 case X86::VPMULLWZ128rr:
10055 case X86::VPMULLWZ256rr:
10056 case X86::VPMULLWZrr:
10057 case X86::VPMULLDrr:
10058 case X86::VPMULLDYrr:
10059 case X86::VPMULLDZ128rr:
10060 case X86::VPMULLDZ256rr:
10061 case X86::VPMULLDZrr:
10062 case X86::VPMULLQZ128rr:
10063 case X86::VPMULLQZ256rr:
10064 case X86::VPMULLQZrr:
10065 case X86::VPMAXSBrr:
10066 case X86::VPMAXSBYrr:
10067 case X86::VPMAXSBZ128rr:
10068 case X86::VPMAXSBZ256rr:
10069 case X86::VPMAXSBZrr:
10070 case X86::VPMAXSDrr:
10071 case X86::VPMAXSDYrr:
10072 case X86::VPMAXSDZ128rr:
10073 case X86::VPMAXSDZ256rr:
10074 case X86::VPMAXSDZrr:
10075 case X86::VPMAXSQZ128rr:
10076 case X86::VPMAXSQZ256rr:
10077 case X86::VPMAXSQZrr:
10078 case X86::VPMAXSWrr:
10079 case X86::VPMAXSWYrr:
10080 case X86::VPMAXSWZ128rr:
10081 case X86::VPMAXSWZ256rr:
10082 case X86::VPMAXSWZrr:
10083 case X86::VPMAXUBrr:
10084 case X86::VPMAXUBYrr:
10085 case X86::VPMAXUBZ128rr:
10086 case X86::VPMAXUBZ256rr:
10087 case X86::VPMAXUBZrr:
10088 case X86::VPMAXUDrr:
10089 case X86::VPMAXUDYrr:
10090 case X86::VPMAXUDZ128rr:
10091 case X86::VPMAXUDZ256rr:
10092 case X86::VPMAXUDZrr:
10093 case X86::VPMAXUQZ128rr:
10094 case X86::VPMAXUQZ256rr:
10095 case X86::VPMAXUQZrr:
10096 case X86::VPMAXUWrr:
10097 case X86::VPMAXUWYrr:
10098 case X86::VPMAXUWZ128rr:
10099 case X86::VPMAXUWZ256rr:
10100 case X86::VPMAXUWZrr:
10101 case X86::VPMINSBrr:
10102 case X86::VPMINSBYrr:
10103 case X86::VPMINSBZ128rr:
10104 case X86::VPMINSBZ256rr:
10105 case X86::VPMINSBZrr:
10106 case X86::VPMINSDrr:
10107 case X86::VPMINSDYrr:
10108 case X86::VPMINSDZ128rr:
10109 case X86::VPMINSDZ256rr:
10110 case X86::VPMINSDZrr:
10111 case X86::VPMINSQZ128rr:
10112 case X86::VPMINSQZ256rr:
10113 case X86::VPMINSQZrr:
10114 case X86::VPMINSWrr:
10115 case X86::VPMINSWYrr:
10116 case X86::VPMINSWZ128rr:
10117 case X86::VPMINSWZ256rr:
10118 case X86::VPMINSWZrr:
10119 case X86::VPMINUBrr:
10120 case X86::VPMINUBYrr:
10121 case X86::VPMINUBZ128rr:
10122 case X86::VPMINUBZ256rr:
10123 case X86::VPMINUBZrr:
10124 case X86::VPMINUDrr:
10125 case X86::VPMINUDYrr:
10126 case X86::VPMINUDZ128rr:
10127 case X86::VPMINUDZ256rr:
10128 case X86::VPMINUDZrr:
10129 case X86::VPMINUQZ128rr:
10130 case X86::VPMINUQZ256rr:
10131 case X86::VPMINUQZrr:
10132 case X86::VPMINUWrr:
10133 case X86::VPMINUWYrr:
10134 case X86::VPMINUWZ128rr:
10135 case X86::VPMINUWZ256rr:
10136 case X86::VPMINUWZrr:
10137 // Normal min/max instructions are not commutative because of NaN and signed
10138 // zero semantics, but these are. Thus, there's no need to check for global
10139 // relaxed math; the instructions themselves have the properties we need.
10140 case X86::MAXCPDrr:
10141 case X86::MAXCPSrr:
10142 case X86::MAXCSDrr:
10143 case X86::MAXCSSrr:
10144 case X86::MINCPDrr:
10145 case X86::MINCPSrr:
10146 case X86::MINCSDrr:
10147 case X86::MINCSSrr:
10148 case X86::VMAXCPDrr:
10149 case X86::VMAXCPSrr:
10150 case X86::VMAXCPDYrr:
10151 case X86::VMAXCPSYrr:
10152 case X86::VMAXCPDZ128rr:
10153 case X86::VMAXCPSZ128rr:
10154 case X86::VMAXCPDZ256rr:
10155 case X86::VMAXCPSZ256rr:
10156 case X86::VMAXCPDZrr:
10157 case X86::VMAXCPSZrr:
10158 case X86::VMAXCSDrr:
10159 case X86::VMAXCSSrr:
10160 case X86::VMAXCSDZrr:
10161 case X86::VMAXCSSZrr:
10162 case X86::VMINCPDrr:
10163 case X86::VMINCPSrr:
10164 case X86::VMINCPDYrr:
10165 case X86::VMINCPSYrr:
10166 case X86::VMINCPDZ128rr:
10167 case X86::VMINCPSZ128rr:
10168 case X86::VMINCPDZ256rr:
10169 case X86::VMINCPSZ256rr:
10170 case X86::VMINCPDZrr:
10171 case X86::VMINCPSZrr:
10172 case X86::VMINCSDrr:
10173 case X86::VMINCSSrr:
10174 case X86::VMINCSDZrr:
10175 case X86::VMINCSSZrr:
10176 case X86::VMAXCPHZ128rr:
10177 case X86::VMAXCPHZ256rr:
10178 case X86::VMAXCPHZrr:
10179 case X86::VMAXCSHZrr:
10180 case X86::VMINCPHZ128rr:
10181 case X86::VMINCPHZ256rr:
10182 case X86::VMINCPHZrr:
10183 case X86::VMINCSHZrr:
10184 return true;
10185 case X86::ADDPDrr:
10186 case X86::ADDPSrr:
10187 case X86::ADDSDrr:
10188 case X86::ADDSSrr:
10189 case X86::MULPDrr:
10190 case X86::MULPSrr:
10191 case X86::MULSDrr:
10192 case X86::MULSSrr:
10193 case X86::VADDPDrr:
10194 case X86::VADDPSrr:
10195 case X86::VADDPDYrr:
10196 case X86::VADDPSYrr:
10197 case X86::VADDPDZ128rr:
10198 case X86::VADDPSZ128rr:
10199 case X86::VADDPDZ256rr:
10200 case X86::VADDPSZ256rr:
10201 case X86::VADDPDZrr:
10202 case X86::VADDPSZrr:
10203 case X86::VADDSDrr:
10204 case X86::VADDSSrr:
10205 case X86::VADDSDZrr:
10206 case X86::VADDSSZrr:
10207 case X86::VMULPDrr:
10208 case X86::VMULPSrr:
10209 case X86::VMULPDYrr:
10210 case X86::VMULPSYrr:
10211 case X86::VMULPDZ128rr:
10212 case X86::VMULPSZ128rr:
10213 case X86::VMULPDZ256rr:
10214 case X86::VMULPSZ256rr:
10215 case X86::VMULPDZrr:
10216 case X86::VMULPSZrr:
10217 case X86::VMULSDrr:
10218 case X86::VMULSSrr:
10219 case X86::VMULSDZrr:
10220 case X86::VMULSSZrr:
10221 case X86::VADDPHZ128rr:
10222 case X86::VADDPHZ256rr:
10223 case X86::VADDPHZrr:
10224 case X86::VADDSHZrr:
10225 case X86::VMULPHZ128rr:
10226 case X86::VMULPHZ256rr:
10227 case X86::VMULPHZrr:
10228 case X86::VMULSHZrr:
10231 default:
10232 return false;
10233 }
10234}
10235
10236 /// If \p DescribedReg overlaps with the MOVrr instruction's destination
10237 /// register then, if possible, describe the value in terms of the source
10238 /// register.
// Returns std::nullopt when neither the exact-match nor the sub-register case
// below applies and either the opcode is MOV8rr/MOV16rr or
// TRI->isSuperRegister(DestReg, DescribedReg) is false.
// NOTE(review): the line holding the function name and its leading parameters
// is not present in this listing (the embedded numbering skips 10240).
10239 static std::optional<ParamLoadedValue>
10241 const TargetRegisterInfo *TRI) {
10242 Register DestReg = MI.getOperand(0).getReg();
10243 Register SrcReg = MI.getOperand(1).getReg();
10244
// An expression built from an empty operation list; every successful return
// below pairs it with a register operand.
10245 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10246
10247 // If the described register is the destination, just return the source.
10248 if (DestReg == DescribedReg)
10249 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10250
10251 // If the described register is a sub-register of the destination register,
10252 // then pick out the source register's corresponding sub-register.
10253 if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
10254 Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
10255 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10256 }
10257
10258 // The remaining case to consider is when the described register is a
10259 // super-register of the destination register. MOV8rr and MOV16rr do not
10260 // write to any of the other bytes in the register, meaning that we'd have to
10261 // describe the value using a combination of the source register and the
10262 // non-overlapping bits in the described register, which is not currently
10263 // possible.
10264 if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
10265 !TRI->isSuperRegister(DestReg, DescribedReg))
10266 return std::nullopt;
10267
// Only MOV32rr is expected to reach this point; the value is described by the
// full source register.
10268 assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
10269 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10270 }
10271
// Describes the value that MI places in Reg as a (MachineOperand,
// DIExpression) pair, or returns std::nullopt when the visible cases cannot
// describe it. Dispatches on MI's opcode.
// NOTE(review): several lines are absent from this listing (the embedded
// numbering skips 10273, 10277, 10310, 10352, 10380 and 10410). These
// presumably hold the signature, the definitions of TRI and Ops, and some
// statements; confirm against the real source before relying on the control
// flow as printed here.
10272 std::optional<ParamLoadedValue>
10274 const MachineOperand *Op = nullptr;
10275 DIExpression *Expr = nullptr;
10276
10278
10279 switch (MI.getOpcode()) {
10280 case X86::LEA32r:
10281 case X86::LEA64r:
10282 case X86::LEA64_32r: {
10283 // We may need to describe a 64-bit parameter with a 32-bit LEA.
10284 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10285 return std::nullopt;
10286
10287 // Operand 4 could be global address. For now we do not support
10288 // such situation.
10289 if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
10290 return std::nullopt;
10291
// Op1 is operand 1 and Op2 is operand 3; operand 2 (Coef) multiplies Op2 in
// the expression built below.
10292 const MachineOperand &Op1 = MI.getOperand(1);
10293 const MachineOperand &Op2 = MI.getOperand(3);
10294 assert(Op2.isReg() &&
10295 (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical()));
10296
10297 // Omit situations like:
10298 // %rsi = lea %rsi, 4, ...
// The second condition also rejects inputs that merely overlap the
// destination register rather than being equal to it.
10299 if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
10300 Op2.getReg() == MI.getOperand(0).getReg())
10301 return std::nullopt;
10302 else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
10303 TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
10304 (Op2.getReg() != X86::NoRegister &&
10305 TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
10306 return std::nullopt;
10307
10308 int64_t Coef = MI.getOperand(2).getImm();
10309 int64_t Offset = MI.getOperand(4).getImm();
// NOTE(review): Offset is not consumed by any line visible in this listing;
// the missing line 10352 presumably uses it.
10311
// Prefer operand 1 as the described operand when it is a real register or a
// frame index.
10312 if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
10313 Op = &Op1;
10314 } else if (Op1.isFI())
10315 Op = &Op1;
10316
// When operand 1 and operand 3 are the same register, a single multiply by
// (Coef + 1) is emitted.
10317 if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
10318 Ops.push_back(dwarf::DW_OP_constu);
10319 Ops.push_back(Coef + 1);
10320 Ops.push_back(dwarf::DW_OP_mul);
10321 } else {
// Otherwise operand 3 is pushed as a DW_OP_breg*/DW_OP_bregx with offset 0
// when an operand was already chosen, or becomes the described operand itself.
10322 if (Op && Op2.getReg() != X86::NoRegister) {
10323 int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
10324 if (dwarfReg < 0)
10325 return std::nullopt;
10326 else if (dwarfReg < 32) {
10327 Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
10328 Ops.push_back(0);
10329 } else {
10330 Ops.push_back(dwarf::DW_OP_bregx);
10331 Ops.push_back(dwarfReg);
10332 Ops.push_back(0);
10333 }
10334 } else if (!Op) {
10335 assert(Op2.getReg() != X86::NoRegister);
10336 Op = &Op2;
10337 }
10338
// A coefficient of 1 needs no multiply.
10339 if (Coef > 1) {
10340 assert(Op2.getReg() != X86::NoRegister);
10341 Ops.push_back(dwarf::DW_OP_constu);
10342 Ops.push_back(Coef);
10343 Ops.push_back(dwarf::DW_OP_mul);
10344 }
10345
// Add the two parts together only when both operand 1 and operand 3
// contribute.
10346 if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
10347 Op2.getReg() != X86::NoRegister) {
10348 Ops.push_back(dwarf::DW_OP_plus);
10349 }
10350 }
10351
10353 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
10354
10355 return ParamLoadedValue(*Op, Expr);
10356 }
10357 case X86::MOV8ri:
10358 case X86::MOV16ri:
10359 // TODO: Handle MOV8ri and MOV16ri.
10360 return std::nullopt;
10361 case X86::MOV32ri:
10362 case X86::MOV64ri:
10363 case X86::MOV64ri32:
10364 // MOV32ri may be used for producing zero-extended 32-bit immediates in
10365 // 64-bit parameters, so we need to consider super-registers.
10366 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10367 return std::nullopt;
// Expr is still the nullptr it was initialised to at this point.
10368 return ParamLoadedValue(MI.getOperand(1), Expr);
10369 case X86::MOV8rr:
10370 case X86::MOV16rr:
10371 case X86::MOV32rr:
10372 case X86::MOV64rr:
10373 return describeMOVrrLoadedValue(MI, Reg, TRI);
10374 case X86::XOR32rr: {
10375 // 64-bit parameters are zero-materialized using XOR32rr, so also consider
10376 // super-registers.
10377 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10378 return std::nullopt;
// NOTE(review): line 10380, the statement controlled by the `if` below, is
// missing from this listing; as printed, the `if` appears to guard the
// `return std::nullopt`, which is presumably not the real control flow.
10379 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
10381 return std::nullopt;
10382 }
10383 case X86::MOVSX64rr32: {
10384 // We may need to describe the lower 32 bits of the MOVSX; for example, in
10385 // cases like this:
10386 //
10387 // $ebx = [...]
10388 // $rdi = MOVSX64rr32 $ebx
10389 // $esi = MOV32rr $edi
10390 if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
10391 return std::nullopt;
10392
10393 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10394
10395 // If the described register is the destination register we need to
10396 // sign-extend the source register from 32 bits. The other case we handle
10397 // is when the described register is the 32-bit sub-register of the
10398 // destination register, in case we just need to return the source
10399 // register.
10400 if (Reg == MI.getOperand(0).getReg())
10401 Expr = DIExpression::appendExt(Expr, 32, 64, true);
10402 else
10403 assert(getX86MCRegisterClass(X86::GR32RegClassID).contains(Reg) &&
10404 "Unhandled sub-register case for MOVSX64rr32");
10405
10406 return ParamLoadedValue(MI.getOperand(1), Expr);
10407 }
10408 default:
10409 assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
// NOTE(review): the statement on line 10410 (presumably the default-case
// return) is missing from this listing.
10411 }
10412 }
10413
10414 /// This is an architecture-specific helper function of reassociateOps.
10415 /// Set special operand attributes for new instructions after reassociation.
// Concretely, when both original instructions define EFLAGS, the EFLAGS
// definitions on NewMI1 and NewMI2 are marked dead; otherwise nothing is
// changed.
// NOTE(review): the line holding the function name and its first parameter
// is not present in this listing (the embedded numbering skips 10416).
10417 MachineInstr &OldMI2,
10418 MachineInstr &NewMI1,
10419 MachineInstr &NewMI2) const {
10420 // Integer instructions may define an implicit EFLAGS dest register operand.
10421 MachineOperand *OldFlagDef1 =
10422 OldMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10423 MachineOperand *OldFlagDef2 =
10424 OldMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10425
// Either both original instructions define EFLAGS or neither does.
10426 assert(!OldFlagDef1 == !OldFlagDef2 &&
10427 "Unexpected instruction type for reassociation");
10428
// No EFLAGS definition: there is no attribute to transfer.
10429 if (!OldFlagDef1 || !OldFlagDef2)
10430 return;
10431
10432 assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
10433 "Must have dead EFLAGS operand in reassociable instruction");
10434
10435 MachineOperand *NewFlagDef1 =
10436 NewMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10437 MachineOperand *NewFlagDef2 =
10438 NewMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10439
10440 assert(NewFlagDef1 && NewFlagDef2 &&
10441 "Unexpected operand in reassociable instruction");
10442
10443 // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
10444 // of this pass or other passes. The EFLAGS operands must be dead in these new
10445 // instructions because the EFLAGS operands in the original instructions must
10446 // be dead in order for reassociation to occur.
10447 NewFlagDef1->setIsDead();
10448 NewFlagDef2->setIsDead();
10449 }
10450
// Returns the pair (TF, 0u): TF is passed through unchanged as the first
// element and the second element is always zero.
// NOTE(review): the line holding the function name and the declaration of TF
// is not present in this listing (the embedded numbering skips 10452).
10451 std::pair<unsigned, unsigned>
10453 return std::make_pair(TF, 0u);
10454 }
10455
// Returns an ArrayRef over a function-local static table that pairs each
// X86II::MO_* operand flag value with a textual name.
// NOTE(review): the return type and function-name lines are not present in
// this listing (the embedded numbering skips 10456-10457).
10458 using namespace X86II;
10459 static const std::pair<unsigned, const char *> TargetFlags[] = {
10460 {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
10461 {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
10462 {MO_GOT, "x86-got"},
10463 {MO_GOTOFF, "x86-gotoff"},
10464 {MO_GOTPCREL, "x86-gotpcrel"},
10465 {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
10466 {MO_PLT, "x86-plt"},
10467 {MO_TLSGD, "x86-tlsgd"},
10468 {MO_TLSLD, "x86-tlsld"},
10469 {MO_TLSLDM, "x86-tlsldm"},
10470 {MO_GOTTPOFF, "x86-gottpoff"},
10471 {MO_INDNTPOFF, "x86-indntpoff"},
10472 {MO_TPOFF, "x86-tpoff"},
10473 {MO_DTPOFF, "x86-dtpoff"},
10474 {MO_NTPOFF, "x86-ntpoff"},
10475 {MO_GOTNTPOFF, "x86-gotntpoff"},
10476 {MO_DLLIMPORT, "x86-dllimport"},
10477 {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
10478 {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
10479 {MO_TLVP, "x86-tlvp"},
10480 {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
10481 {MO_SECREL, "x86-secrel"},
10482 {MO_COFFSTUB, "x86-coffstub"}};
// The array has static storage duration, so the returned view stays valid
// after the function returns.
10483 return ArrayRef(TargetFlags);
10484 }
10485
10486/// Constants defining how certain sequences should be outlined.
10487///
10488/// \p MachineOutlinerDefault implies that the function is called with a call
10489/// instruction, and a return must be emitted for the outlined function frame.
10490///
10491/// That is,
10492///
10493/// I1 OUTLINED_FUNCTION:
10494/// I2 --> call OUTLINED_FUNCTION I1
10495/// I3 I2
10496/// I3
10497/// ret
10498///
10499/// * Call construction overhead: 1 (call instruction)
10500/// * Frame construction overhead: 1 (return instruction)
10501///
10502/// \p MachineOutlinerTailCall implies that the function is being tail called.
10503/// A jump is emitted instead of a call, and the return is already present in
10504/// the outlined sequence. That is,
10505///
10506/// I1 OUTLINED_FUNCTION:
10507/// I2 --> jmp OUTLINED_FUNCTION I1
10508/// ret I2
10509/// ret
10510///
10511/// * Call construction overhead: 1 (jump instruction)
10512/// * Frame construction overhead: 0 (don't need to return)
10513///
10515
// Builds the outlined-function description for a set of repeated instruction
// sequences, or returns std::nullopt when CFI instructions make outlining
// unsafe. All sizes are in units of "one per instruction" (see the FIXMEs).
// MMI and MinRepeats are not referenced by the visible body.
// NOTE(review): the line holding the function name is not present in this
// listing (the embedded numbering skips 10517).
10516 std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10518 const MachineModuleInfo &MMI,
10519 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10520 unsigned MinRepeats) const {
// Only the first candidate is measured; the others are treated as having the
// same sequence.
10521 unsigned SequenceSize = 0;
10522 for (auto &MI : RepeatedSequenceLocs[0]) {
10523 // FIXME: x86 doesn't implement getInstSizeInBytes, so
10524 // we can't tell the cost. Just assume each instruction
10525 // is one byte.
10526 if (MI.isDebugInstr() || MI.isKill())
10527 continue;
10528 SequenceSize += 1;
10529 }
10530
10531 // We check to see if CFI Instructions are present, and if they are
10532 // we find the number of CFI Instructions in the candidates.
10533 unsigned CFICount = 0;
10534 for (auto &I : RepeatedSequenceLocs[0]) {
10535 if (I.isCFIInstruction())
10536 CFICount++;
10537 }
10538
10539 // We compare the number of found CFI Instructions to the number of CFI
10540 // instructions in the parent function for each candidate. We must check this
10541 // since if we outline one of the CFI instructions in a function, we have to
10542 // outline them all for correctness. If we do not, the address offsets will be
10543 // incorrect between the two sections of the program.
10544 for (outliner::Candidate &C : RepeatedSequenceLocs) {
// NOTE(review): CFIInstructions is declared by value and only its size() is
// read; if getFrameInstructions() returns a reference this copies the whole
// vector once per candidate. Confirm, and consider binding a const reference.
10545 std::vector<MCCFIInstruction> CFIInstructions =
10546 C.getMF()->getFrameInstructions();
10547
10548 if (CFICount > 0 && CFICount != CFIInstructions.size())
10549 return std::nullopt;
10550 }
10551
10552 // FIXME: Use real size in bytes for call and ret instructions.
// A sequence ending in a terminator is outlined as a tail call: call overhead
// 1, frame overhead 0.
10553 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10554 for (outliner::Candidate &C : RepeatedSequenceLocs)
10555 C.setCallInfo(MachineOutlinerTailCall, 1);
10556
10557 return std::make_unique<outliner::OutlinedFunction>(
10558 RepeatedSequenceLocs, SequenceSize,
10559 0, // Number of bytes to emit frame.
10560 MachineOutlinerTailCall // Type of frame.
10561 );
10562 }
10563
// Sequences containing CFI instructions are only accepted on the tail-call
// path above.
10564 if (CFICount > 0)
10565 return std::nullopt;
10566
// Default outlining: call overhead 1 and frame overhead 1.
10567 for (outliner::Candidate &C : RepeatedSequenceLocs)
10568 C.setCallInfo(MachineOutlinerDefault, 1);
10569
10570 return std::make_unique<outliner::OutlinedFunction>(
10571 RepeatedSequenceLocs, SequenceSize, 1, MachineOutlinerDefault);
10572 }
10573
// Returns true when MF may be used as a source for outlining. Rejects
// functions that may use a red zone and, unless OutlineFromLinkOnceODRs is
// set, functions with linkonce_odr linkage.
// NOTE(review): the function-name line (10574) and the line defining X86FI
// (10582) are not present in this listing.
10575 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10576 const Function &F = MF.getFunction();
10577
10578 // Does the function use a red zone? If it does, then we can't risk messing
10579 // with the stack.
10580 if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
10581 // It could have a red zone. If it does, then we don't want to touch it.
// A missing X86FI is treated the same as a function that uses the red zone.
10583 if (!X86FI || X86FI->getUsesRedZone())
10584 return false;
10585 }
10586
10587 // If we *don't* want to outline from things that could potentially be deduped
10588 // then return false.
10589 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10590 return false;
10591
10592 // This function is viable for outlining, so return true.
10593 return true;
10594 }
10595
// Classifies a single instruction for the outliner by testing, in order:
// terminators, stack-pointer reads or writes, instruction-pointer accesses
// and CFI instructions.
// NOTE(review): the signature lines (10596-10598) and the statement
// controlled by each `if` below (10606, 10620, 10626, 10630), as well as the
// final statement (10632), are not present in this listing, so the result
// produced in each case cannot be read from the text here.
10599 unsigned Flags) const {
10600 MachineInstr &MI = *MIT;
10601
10602 // Is this a terminator for a basic block?
10603 if (MI.isTerminator())
10604 // TargetInstrInfo::getOutliningType has already filtered out anything
10605 // that would break this, so we can allow it here.
10607
10608 // Don't outline anything that modifies or reads from the stack pointer.
10609 //
10610 // FIXME: There are instructions which are being manually built without
10611 // explicit uses/defs so we also have to check the MCInstrDesc. We should be
10612 // able to remove the extra checks once those are fixed up. For example,
10613 // sometimes we might get something like %rax = POP64r 1. This won't be
10614 // caught by modifiesRegister or readsRegister even though the instruction
10615 // really ought to be formed so that modifiesRegister/readsRegister would
10616 // catch it.
10617 if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
10618 MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
10619 MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
10621
10622 // Outlined calls change the instruction pointer, so don't read from it.
// The check also covers implicit definitions of RIP in the instruction
// description, not only reads.
10623 if (MI.readsRegister(X86::RIP, &RI) ||
10624 MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
10625 MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
10627
10628 // Don't outline CFI instructions.
10629 if (MI.isCFIInstruction())
10631
10633 }
10634
// Finishes the body of an outlined function: appends a RET64 at the end of
// MBB unless the function was built for tail calling.
// NOTE(review): the function-name line and its first parameters (10635-10636)
// are not present in this listing.
10637 const outliner::OutlinedFunction &OF) const {
10638 // If we're a tail call, we already have a return, so don't do anything.
10639 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10640 return;
10641
10642 // We're a normal call, so our sequence doesn't have a return instruction.
10643 // Add it in.
// The return is created with an empty DebugLoc.
10644 MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
10645 MBB.insert(MBB.end(), retq);
10646 }
10647
// Inserts the transfer of control to the outlined function at It and returns
// the iterator to the inserted instruction: a TAILJMPd64 for tail-call
// candidates, a CALL64pcrel32 otherwise. The target is looked up in module M
// by MF's name.
// NOTE(review): the signature lines (10648-10650) are not present in this
// listing.
10651 // Is it a tail call?
10652 if (C.CallConstructionID == MachineOutlinerTailCall) {
10653 // Yes, just insert a JMP.
10654 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
10655 .addGlobalAddress(M.getNamedValue(MF.getName())));
10656 } else {
10657 // No, insert a call.
10658 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
10659 .addGlobalAddress(M.getNamedValue(MF.getName())));
10660 }
10661
10662 return It;
10663 }
10664
// Emits, before Iter in MBB, an instruction that zeroes Reg. The instruction
// is chosen by register class; nothing is emitted when the subtarget lacks
// the feature checked in the matching branch, for MMX registers, or for a
// register matching none of the branches.
// NOTE(review): the signature lines (10665-10666) and the line defining TRI
// (10671) are not present in this listing.
10667 DebugLoc &DL,
10668 bool AllowSideEffects) const {
10669 const MachineFunction &MF = *MBB.getParent();
10670 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
10672
10673 if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
10674 // FIXME: Should we ignore MMX registers?
10675 return;
10676
10677 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10678 // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
10679 // upper bits of a 64-bit register automagically.
10680 Reg = getX86SubSuperRegister(Reg, 32);
10681
10682 if (!AllowSideEffects)
10683 // XOR affects flags, so use a MOV instead.
10684 BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
10685 else
// Both XOR inputs are flagged undef: the result does not depend on the
// register's previous contents.
10686 BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
10687 .addReg(Reg, RegState::Undef)
10688 .addReg(Reg, RegState::Undef);
10689 } else if (X86::VR128RegClass.contains(Reg)) {
10690 // XMM#
10691 if (!ST.hasSSE1())
10692 return;
10693
10694 BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
10695 } else if (X86::VR256RegClass.contains(Reg)) {
10696 // YMM#
10697 if (!ST.hasAVX())
10698 return;
10699
10700 BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
10701 } else if (X86::VR512RegClass.contains(Reg)) {
10702 // ZMM#
10703 if (!ST.hasAVX512())
10704 return;
10705
10706 BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
10707 } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
10708 X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
10709 X86::VK16RegClass.contains(Reg)) {
// Mask registers.
// NOTE(review): clearing is gated on hasVLX() here; confirm that this is the
// intended feature check for the KSET0* pseudos.
10710 if (!ST.hasVLX())
10711 return;
10712
// KSET0Q is used when BWI is available, KSET0W otherwise.
10713 unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
10714 BuildMI(MBB, Iter, DL, get(Op), Reg);
10715 }
10716 }
10717
// Collects machine-combiner patterns for Root. For the VPDPWSSD opcodes
// listed below it returns true early when the subtarget lacks fast DPWSSD
// (and, for the EVEX-encoded Z forms, also has BWI); every other case falls
// through to the call that ends the function.
// NOTE(review): the function-name line (10718), the statements on lines 10728
// and 10740 (presumably the ones that add a pattern to Patterns) and the
// start of the final call (10746) are not present in this listing.
10719 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
10720 bool DoRegPressureReduce) const {
10721 unsigned Opc = Root.getOpcode();
10722 switch (Opc) {
10723 case X86::VPDPWSSDrr:
10724 case X86::VPDPWSSDrm:
10725 case X86::VPDPWSSDYrr:
10726 case X86::VPDPWSSDYrm: {
10727 if (!Subtarget.hasFastDPWSSD()) {
10729 return true;
10730 }
10731 break;
10732 }
10733 case X86::VPDPWSSDZ128rr:
10734 case X86::VPDPWSSDZ128rm:
10735 case X86::VPDPWSSDZ256rr:
10736 case X86::VPDPWSSDZ256rm:
10737 case X86::VPDPWSSDZrr:
10738 case X86::VPDPWSSDZrm: {
10739 if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
10741 return true;
10742 }
10743 break;
10744 }
10745 }
10747 Patterns, DoRegPressureReduce);
10748 }
10749
// Rewrites a VPDPWSSD instruction (Root) as a VPMADDWD followed by a VPADDD.
// The two new instructions are appended to InsInstrs, Root is appended to
// DelInstrs, and the new virtual register is recorded in InstrIdxForVirtReg.
// NOTE(review): the function-name line and leading parameters (10751-10753)
// and the line defining RegInfo (10756) are not present in this listing.
10750 static void
10754 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
10755 MachineFunction *MF = Root.getMF();
10757
10758 unsigned Opc = Root.getOpcode();
10759 unsigned AddOpc = 0;
10760 unsigned MaddOpc = 0;
// Choose the multiply-add and add opcodes matching Root's vector width and
// encoding; the add is always the register-register form, even for the rm
// variants.
10761 switch (Opc) {
10762 default:
10763 assert(false && "It should not reach here");
10764 break;
10765 // vpdpwssd xmm2,xmm3,xmm1
10766 // -->
10767 // vpmaddwd xmm3,xmm3,xmm1
10768 // vpaddd xmm2,xmm2,xmm3
10769 case X86::VPDPWSSDrr:
10770 MaddOpc = X86::VPMADDWDrr;
10771 AddOpc = X86::VPADDDrr;
10772 break;
10773 case X86::VPDPWSSDrm:
10774 MaddOpc = X86::VPMADDWDrm;
10775 AddOpc = X86::VPADDDrr;
10776 break;
10777 case X86::VPDPWSSDZ128rr:
10778 MaddOpc = X86::VPMADDWDZ128rr;
10779 AddOpc = X86::VPADDDZ128rr;
10780 break;
10781 case X86::VPDPWSSDZ128rm:
10782 MaddOpc = X86::VPMADDWDZ128rm;
10783 AddOpc = X86::VPADDDZ128rr;
10784 break;
10785 // vpdpwssd ymm2,ymm3,ymm1
10786 // -->
10787 // vpmaddwd ymm3,ymm3,ymm1
10788 // vpaddd ymm2,ymm2,ymm3
10789 case X86::VPDPWSSDYrr:
10790 MaddOpc = X86::VPMADDWDYrr;
10791 AddOpc = X86::VPADDDYrr;
10792 break;
10793 case X86::VPDPWSSDYrm:
10794 MaddOpc = X86::VPMADDWDYrm;
10795 AddOpc = X86::VPADDDYrr;
10796 break;
10797 case X86::VPDPWSSDZ256rr:
10798 MaddOpc = X86::VPMADDWDZ256rr;
10799 AddOpc = X86::VPADDDZ256rr;
10800 break;
10801 case X86::VPDPWSSDZ256rm:
10802 MaddOpc = X86::VPMADDWDZ256rm;
10803 AddOpc = X86::VPADDDZ256rr;
10804 break;
10805 // vpdpwssd zmm2,zmm3,zmm1
10806 // -->
10807 // vpmaddwd zmm3,zmm3,zmm1
10808 // vpaddd zmm2,zmm2,zmm3
10809 case X86::VPDPWSSDZrr:
10810 MaddOpc = X86::VPMADDWDZrr;
10811 AddOpc = X86::VPADDDZrr;
10812 break;
10813 case X86::VPDPWSSDZrm:
10814 MaddOpc = X86::VPMADDWDZrm;
10815 AddOpc = X86::VPADDDZrr;
10816 break;
10817 }
10818 // Create vpmaddwd.
// The multiply-add is a clone of Root with its opcode replaced, operand 1
// untied and removed, and its result redirected to a fresh virtual register
// of the same class as Root's result.
10819 const TargetRegisterClass *RC =
10820 RegInfo.getRegClass(Root.getOperand(0).getReg());
10821 Register NewReg = RegInfo.createVirtualRegister(RC);
10822 MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
10823 Madd->setDesc(TII.get(MaddOpc));
10824 Madd->untieRegOperand(1);
10825 Madd->removeOperand(1);
10826 Madd->getOperand(0).setReg(NewReg);
// Index 0 matches Madd's position in InsInstrs, assuming InsInstrs is empty
// on entry (Madd is the first instruction pushed below).
10827 InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
10828 // Create vpaddd.
// The add writes Root's original destination from Root's operand 1 (keeping
// its kill state) and the multiply-add result, which is killed here.
10829 Register DstReg = Root.getOperand(0).getReg();
10830 bool IsKill = Root.getOperand(1).isKill();
10831 MachineInstr *Add =
10832 BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
10833 .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
10834 .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
10835 InsInstrs.push_back(Madd);
10836 InsInstrs.push_back(Add);
10837 DelInstrs.push_back(&Root);
10838 }
10839
// Generates the replacement instruction sequence for Root according to
// Pattern: one pattern value is handled by genAlternativeDpCodeSequence and
// all others go to the default branch.
// NOTE(review): the function-name line (10840), two parameter lines
// (10842-10843), the start of the call in the default branch (10848) and the
// case label before the second branch (10851) are not present in this
// listing.
10841 MachineInstr &Root, unsigned Pattern,
10844 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
10845 switch (Pattern) {
10846 default:
10847 // Reassociate instructions.
10849 DelInstrs, InstrIdxForVirtReg);
10850 return;
10852 genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
10853 InstrIdxForVirtReg);
10854 return;
10855 }
10856 }
10857
10858 // See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
// Sets the base of an address object M to frame index FI and expands it into
// Ops through M.getFullAddress.
// NOTE(review): the function-name line (10859) and the lines declaring and
// initialising M (10861-10862) are not present in this listing.
10860 int FI) const {
10863 M.Base.FrameIndex = FI;
10864 M.getFullAddress(Ops);
10865 }
10866
// Creates a PREFETCHIT1 instruction that refers to GV, inserts it into MBB
// before InsertBefore and returns it.
// NOTE(review): the return-type line (10867) and the start of the statement
// ending on line 10880 (10878-10879) are not present in this listing.
10868 X86InstrInfo::insertCodePrefetchInstr(MachineBasicBlock &MBB,
10869 MachineBasicBlock::iterator InsertBefore,
10870 const GlobalValue *GV) const {
10871 MachineFunction &MF = *MBB.getParent();
// Take the debug location from the instruction at the insertion point, or
// from MBB.findPrevDebugLoc when inserting at the end of the block.
10872 MachineInstr *PrefetchInstr = MF.CreateMachineInstr(
10873 get(X86::PREFETCHIT1),
10874 InsertBefore == MBB.instr_end() ? MBB.findPrevDebugLoc(InsertBefore)
10875 : InsertBefore->getDebugLoc(),
10876 true);
10877 MachineInstrBuilder MIB(MF, PrefetchInstr);
10880 /*base_alignment=*/llvm::Align(1)));
// Five address operands follow, presumably base (RIP), scale (1), index
// (none), displacement (GV) and segment (none); confirm against the X86
// memory operand layout.
10881 MIB.addReg(X86::RIP).addImm(1).addReg(X86::NoRegister);
10882 MIB.addGlobalAddress(GV);
10883 MIB.addReg(X86::NoRegister);
10884 MBB.insert(InsertBefore, PrefetchInstr);
10885 return PrefetchInstr;
10886 }
10887
10888#define GET_INSTRINFO_HELPERS
10889#include "X86GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
return SDValue()
static bool isFrameStoreOpcode(int Opcode)
static bool isFrameLoadOpcode(int Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerDefault
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
static bool lookup(const GsymReader &GR, GsymDataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
static SDValue isNOT(SDValue V, SelectionDAG &DAG)
static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
Expand a single-def pseudo instruction to a two-addr instruction with two undef reads of the register...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
Provides some synthesis utilities to produce sequences of values.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
#define LLVM_DEBUG(...)
Definition Debug.h:119
#define FROM_TO(FROM, TO)
cl::opt< bool > X86EnableAPXForRelocation
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static bool isLEA(unsigned Opcode)
static void addOperands(MachineInstrBuilder &MIB, ArrayRef< MachineOperand > MOs, int PtrOffset=0)
static std::optional< ParamLoadedValue > describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetRegisterInfo *TRI)
If DescribedReg overlaps with the MOVrr instruction's destination register then, if possible,...
static cl::opt< unsigned > PartialRegUpdateClearance("partial-reg-update-clearance", cl::desc("Clearance between two register writes " "for inserting XOR to avoid partial " "register update"), cl::init(64), cl::Hidden)
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI)
static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg, const X86Subtarget &Subtarget)
static bool isConvertibleLEA(MachineInstr *MI)
static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, const X86Subtarget &Subtarget)
static bool isAMXOpcode(unsigned Opc)
static int getJumpTableIndexFromReg(const MachineRegisterInfo &MRI, Register Reg)
static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII)
static int getJumpTableIndexFromAddr(const MachineInstr &MI)
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth, unsigned NewWidth, unsigned *pNewMask=nullptr)
static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne)
static unsigned getNewOpcFromTable(ArrayRef< X86TableEntry > Table, unsigned Opc)
static unsigned getStoreRegOpcode(Register SrcReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
#define FOLD_BROADCAST(SIZE)
static cl::opt< unsigned > UndefRegClearance("undef-reg-clearance", cl::desc("How many idle instructions we would like before " "certain undef register reads"), cl::init(128), cl::Hidden)
#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64)
static bool isTruncatedShiftCountForLEA(unsigned ShAmt)
Check whether the given shift count is appropriate can be represented by a LEA instruction.
static cl::opt< bool > ReMatPICStubLoad("remat-pic-stub-load", cl::desc("Re-materialize load from stub in PIC mode"), cl::init(false), cl::Hidden)
static SmallVector< MachineMemOperand *, 2 > extractLoadMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static MachineInstr * fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII)
static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx)
static bool canConvert2Copy(unsigned Opc)
static cl::opt< bool > NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions"), cl::Hidden)
static bool expandNOVLXStore(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx)
static bool isX87Reg(Register Reg)
Return true if the Reg is X87 register.
static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, Register Reg)
Expand a single-def pseudo instruction to a two-addr instruction with two k0 reads.
#define VPERM_CASES_BROADCAST(Suffix)
static std::pair< X86::CondCode, unsigned > isUseDefConvertible(const MachineInstr &MI)
Check whether the use can be converted to remove a comparison against zero.
static bool findRedundantFlagInstr(MachineInstr &CmpInstr, MachineInstr &CmpValDefInstr, const MachineRegisterInfo *MRI, MachineInstr **AndInstr, const TargetRegisterInfo *TRI, const X86Subtarget &ST, bool &NoSignFlag, bool &ClearsOverflowFlag)
static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
static unsigned getLoadRegOpcode(Register DestReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold=false)
static MachineInstr * makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI)
#define GET_ND_IF_ENABLED(OPC)
static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI, const TargetInstrInfo &TII, bool HasAVX)
static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, bool ForLoadFold=false)
Return true for all instructions that only update the first 32 or 64-bits of the destination register...
#define CASE_NF(OP)
static const uint16_t * lookupAVX512(unsigned opcode, unsigned domain, ArrayRef< uint16_t[4]> Table)
static unsigned getLoadStoreRegOpcode(Register Reg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI, bool Load)
#define VPERM_CASES(Suffix)
#define FROM_TO_SIZE(A, B, S)
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, bool &ClearsOverflowFlag)
Check whether the definition can be converted to remove a comparison against zero.
static MachineInstr * fuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII, int PtrOffset=0)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode)
static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static MachineBasicBlock * getFallThroughMBB(MachineBasicBlock *MBB, MachineBasicBlock *TBB)
static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineInstr &UserMI, const MachineFunction &MF)
Check if LoadMI is a partial register load that we can't fold into MI because the latter uses content...
static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI)
static bool isHReg(Register Reg)
Test if the given register is a physical h register.
static cl::opt< bool > PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" " fuse, but the X86 backend currently can't"), cl::Hidden)
static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx)
static void genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
#define CASE_ND(OP)
static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
This determines which of three possible cases of a three source commute the source indexes correspond...
static unsigned getTruncatedShiftCount(const MachineInstr &MI, unsigned ShiftAmtOperandIdx)
Check whether the shift count for a machine operand is non-zero.
static SmallVector< MachineMemOperand *, 2 > extractStoreMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static unsigned getBroadcastOpcode(const X86FoldTableEntry *I, const TargetRegisterClass *RC, const X86Subtarget &STI)
static unsigned convertALUrr2ALUri(unsigned Opc)
Convert an ALUrr opcode to corresponding ALUri opcode.
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI)
Return true if the register is the PIC base, i.e. defined by X86::MOVPC32r.
static bool isCommutableVPERMV3Instruction(unsigned Opcode)
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:130
size_t size() const
Get the array size.
Definition ArrayRef.h:141
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:754
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:752
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DWARF expression.
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
static LLVM_ABI DIExpression * appendExt(const DIExpression *Expr, unsigned FromSize, unsigned ToSize, bool Signed)
Append a zero- or sign-extension to Expr.
A debug info location.
Definition DebugLoc.h:126
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:688
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:685
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
LiveInterval - This class represents the liveness of a register, or stack slot.
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
A set of physical registers with utility functions to track liveness when walking backward/forward th...
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
static LocationSize precise(uint64_t Value)
bool usesWindowsCFI() const
Definition MCAsmInfo.h:674
static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int64_t Adjustment, SMLoc Loc={})
.cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but Offset is a relative value that is added/subt...
Definition MCDwarf.h:638
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
void setOpcode(unsigned Op)
Definition MCInst.h:201
Describe properties that are true of each instruction in the target description file.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1554
Set of metadata that should be preserved when using BuildMI().
SimpleValueType SimpleTy
MachineInstrBundleIterator< const MachineInstr > const_iterator
void push_back(MachineInstr *MI)
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
This class is a data container for one entry in a MachineConstantPool.
union llvm::MachineConstantPoolEntry::@004270020304201266316354007027341142157160323045 Val
The constant itself.
bool isMachineConstantPoolEntry() const
isMachineConstantPoolEntry - Return true if the MachineConstantPoolEntry is indeed a target specific ...
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
LLVM_ABI unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
bool needsFrameMoves() const
True if this function needs frame moves for debug or exceptions.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
mop_iterator operands_begin()
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isImplicitDef() const
const MachineBasicBlock * getParent() const
void dropDebugNumber()
Drop any variable location debugging information associated with this instruction.
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI unsigned getNumExplicitDefs() const
Returns the number of non-implicit definitions.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
void setFlag(MIFlag Flag)
Set a MI flag.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
unsigned getNumDefs() const
Returns the total number of definitions.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void setDebugLoc(DebugLoc DL)
Replace current source information with new such.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImplicit(bool Val=true)
void setImm(int64_t immVal)
int64_t getImm() const
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
bool isCPI() const
isCPI - Tests if this is a MO_ConstantPoolIndex operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
bool isJTI() const
isJTI - Tests if this is a MO_JumpTableIndex operand.
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateCPI(unsigned Idx, int Offset, unsigned TargetFlags=0)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
iterator_range< def_instr_iterator > def_instructions(Register Reg) const
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
MachineFunction & getMachineFunction() const
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getBaseIndex() const
Returns the base index associated with this index.
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
virtual const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum) const
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
virtual bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const
Returns true iff the routine could find two commutable operands in the given machine instruction.
virtual bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const
Return true when Inst has reassociable operands in the same MBB.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
bool isPositionIndependent() const
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
CodeModel::Model getCodeModel() const
Returns the code model.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
virtual const TargetFrameLowering * getFrameLowering() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getZero()
Definition TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
static LLVM_ABI Type * getFP128Ty(LLVMContext &C)
Definition Type.cpp:291
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:287
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:284
SlotIndex def
The index of the defining instruction.
LLVM Value Representation.
Definition Value.h:75
MCRegister getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition VirtRegMap.h:91
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst, MachineInstr::MIFlag Flag=MachineInstr::NoFlags) const
Wraps up getting a CFI index and building a MachineInstr for it.
void getFrameIndexOperands(SmallVectorImpl< MachineOperand > &Ops, int FI) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
Check if there exists an earlier instruction that operates on the same source operands and sets eflag...
bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
Overrides the isSchedulingBoundary from Codegen/TargetInstrInfo.cpp to make it capable of identifying...
MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, outliner::Candidate &C) const override
void replaceBranchWithTailCall(MachineBasicBlock &MBB, SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex=nullptr) const override
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override
Returns true iff the routine could find two commutable operands in the given machine instruction.
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
X86InstrInfo(const X86Subtarget &STI)
static bool isDataInvariantLoad(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value l...
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override
bool isFunctionSafeToOutlineFrom(MachineFunction &MF, bool OutlineFromLinkOnceODRs) const override
const X86RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
bool hasCommutePreference(MachineInstr &MI, bool &Commute) const override
Returns true if we have preference on the operands order in MI, the commute decision is returned in C...
bool hasLiveCondCodeDef(MachineInstr &MI) const
True if MI has a condition code def, e.g.
std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const override
bool canMakeTailCallConditional(SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl< MachineInstr * > &NewMIs) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
convertToThreeAddress - This method must be implemented by targets that set the M_CONVERTIBLE_TO_3_AD...
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool expandPostRAPseudo(MachineInstr &MI) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isAssociativeAndCommutative(const MachineInstr &Inst, bool Invert) const override
MCInst getNop() const override
Return the noop instruction to use for a noop.
outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI, MachineBasicBlock::iterator &MIT, unsigned Flags) const override
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override
This is used by the pre-regalloc scheduler to determine (in conjunction with areLoadsFromSameBasePt...
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isStoreToStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum) const override
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
bool isUnconditionalTailCall(const MachineInstr &MI) const override
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, LaneBitmask UsedLanes=LaneBitmask::getAll()) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
std::optional< std::unique_ptr< outliner::OutlinedFunction > > getOutliningCandidateInfo(const MachineModuleInfo &MMI, std::vector< outliner::Candidate > &RepeatedSequenceLocs, unsigned MinRepeats) const override
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, Register &NewSrc, unsigned &NewSrcSubReg, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV, LiveIntervals *LIS) const
Given an operand within a MachineInstr, insert preceding code to put it into the right format for a p...
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isLoadFromStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const
int getSPAdjust(const MachineInstr &MI) const override
getSPAdjust - This returns the stack pointer adjustment made by this instruction.
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isReMaterializableImpl(const MachineInstr &MI) const override
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
int getJumpTableIndex(const MachineInstr &MI) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const override
This is an architecture-specific helper function of reassociateOps.
std::pair< uint16_t, uint16_t > getExecutionDomain(const MachineInstr &MI) const override
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
isCoalescableExtInstr - Return true if the instruction is a "coalescable" extension instruction.
void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Opc, Register Reg, int FrameIdx, bool isKill=false) const
void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg) const override
When getMachineCombinerPatterns() finds potential patterns, this function generates the instructions ...
bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify=false) const override
static bool isDataInvariant(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value o...
unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before certain undef register...
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
Fold a load or store of the specified stack slot into the specified machine instruction for the speci...
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
void buildClearRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator Iter, DebugLoc &DL, bool AllowSideEffects=true) const override
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
int64_t getFrameAdjustment(const MachineInstr &I) const
Returns the stack pointer adjustment that happens inside the frame setup..destroy sequence (e....
bool hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr &DefMI, unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const override
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override
uint16_t getExecutionDomainCustom(const MachineInstr &MI) const
bool isHighLatencyDef(int opc) const override
void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const override
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const override
foldImmediate - 'Reg' is known to be defined by a move immediate instruction, try to fold the immedia...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const
Returns an adjusted FMA opcode that must be used in FMA instruction that performs the same computatio...
bool preservesZeroValueInReg(const MachineInstr *MI, const Register NullValueReg, const TargetRegisterInfo *TRI) const override
unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before a partial register upd...
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
const TargetRegisterClass * constrainRegClassToNonRex2(const TargetRegisterClass *RC) const
bool hasAVX512() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
const X86FrameLowering * getFrameLowering() const override
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:52
X86II - This namespace holds all of the target specific flags that instruction info tracks.
bool isKMergeMasked(uint64_t TSFlags)
bool hasNewDataDest(uint64_t TSFlags)
@ MO_GOT_ABSOLUTE_ADDRESS
MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a relocation of: SYMBOL_LABEL + [. - PICBASELABEL]
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ EVEX
EVEX - Specifies that this instruction use EVEX form which provides syntax support up to 32 512-bit r...
@ SSEDomainShift
Execution domain for SSE instructions.
bool canUseApxExtendedReg(const MCInstrDesc &Desc)
bool isPseudo(uint64_t TSFlags)
bool isKMasked(uint64_t TSFlags)
int getMemoryOperandNo(uint64_t TSFlags)
unsigned getOperandBias(const MCInstrDesc &Desc)
Compute whether all of the def operands are repeated in the uses and therefore should be skipped.
Define some predicates that are used for node matching.
CondCode getCondFromBranch(const MachineInstr &MI)
CondCode getCondFromCFCMov(const MachineInstr &MI)
@ LAST_VALID_COND
Definition X86BaseInfo.h:94
CondCode getCondFromMI(const MachineInstr &MI)
Return the condition code of the instruction.
int getFirstAddrOperandIdx(const MachineInstr &MI)
Return the index of the instruction's first address operand, if it has a memory reference,...
@ AddrNumOperands
Definition X86BaseInfo.h:36
unsigned getSwappedVCMPImm(unsigned Imm)
Get the VCMP immediate if the opcodes are swapped.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g. turning COND_E to COND_NE.
unsigned getSwappedVPCOMImm(unsigned Imm)
Get the VPCOM immediate if the opcodes are swapped.
bool isX87Instruction(MachineInstr &MI)
Check if the instruction is X87 instruction.
unsigned getNonNDVariant(unsigned Opc)
unsigned getVPCMPImmForCond(ISD::CondCode CC)
Get the VPCMP immediate for the given condition.
std::pair< CondCode, bool > getX86ConditionCode(CmpInst::Predicate Predicate)
Return a pair of condition code for the given predicate and whether the instruction operands should b...
CondCode getCondFromSETCC(const MachineInstr &MI)
unsigned getSwappedVPCMPImm(unsigned Imm)
Get the VPCMP immediate if the opcodes are swapped.
CondCode getCondFromCCMP(const MachineInstr &MI)
int getCCMPCondFlagsFromCondCode(CondCode CC)
int getCondSrcNoFromDesc(const MCInstrDesc &MCID)
Return the source operand # for condition code by MCID.
const Constant * getConstantFromPool(const MachineInstr &MI, unsigned OpNo)
Find any constant pool entry associated with a specific instruction operand.
unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm)
Return a MOVri opcode for materializing Imm into a 32- or 64-bit GPR.
unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand=false, bool HasNDD=false)
Return a cmov opcode for the given register size in bytes, and operand type.
unsigned getNFVariant(unsigned Opc)
unsigned getVectorRegisterWidth(const MCOperandInfo &Info)
Get the width of the vector register operand.
CondCode getCondFromCMov(const MachineInstr &MI)
initializer< Ty > init(const Ty &Val)
InstrType
Represents how an instruction should be mapped by the outliner.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:573
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
static bool isAddMemInstrWithRelocation(const MachineInstr &MI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
static bool isMem(const MachineInstr &MI, unsigned Op)
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition Alignment.h:134
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
static const MachineInstrBuilder & addRegReg(const MachineInstrBuilder &MIB, Register Reg1, bool isKill1, unsigned SubReg1, Register Reg2, bool isKill2, unsigned SubReg2)
addRegReg - This function is used to add a memory reference of the form: [Reg + Reg].
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
constexpr RegState getDeadRegState(bool B)
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
bool isNonFoldableWithSameMask(unsigned RegOp)
const X86FoldTableEntry * lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
const X86InstrFMA3Group * getFMA3Group(unsigned Opcode, uint64_t TSFlags)
Returns a reference to a group of FMA3 opcodes to where the given Opcode is included.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
const X86FoldTableEntry * lookupTwoAddrFoldTable(unsigned RegOp)
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1970
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
RegState getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
static bool isMemInstrWithGOTPCREL(const MachineInstr &MI)
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2052
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
MaybeAlign getStackAlign(const CallBase &I, unsigned Index)
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
const X86FoldTableEntry * lookupUnfoldTable(unsigned MemOp)
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool matchBroadcastSize(const X86FoldTableEntry &Entry, unsigned BroadcastBits)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const X86FoldTableEntry * lookupFoldTable(unsigned RegOp, unsigned OpNum)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
constexpr RegState getUndefRegState(bool B)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
This represents a simple continuous liveness interval for a value.
std::vector< MachineInstr * > Kills
Kills - List of MachineInstruction's which are the last use of this virtual register (kill it) in the...
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
X86AddressMode - This struct holds a generalized full x86 address mode.
enum llvm::X86AddressMode::@202116273335065351270200035056227005202106004277 BaseType
This class is used to group {132, 213, 231} forms of FMA opcodes together.
unsigned get213Opcode() const
Returns the 213 form of FMA opcode.
unsigned get231Opcode() const
Returns the 231 form of FMA opcode.
bool isIntrinsic() const
Returns true iff the group of FMA opcodes holds intrinsic opcodes.
unsigned get132Opcode() const
Returns the 132 form of FMA opcode.
An individual sequence of instructions to be replaced with a call to an outlined function.
The information necessary to create an outlined function for some class of candidate.