//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs the following peephole optimizations at the MIR level.
//
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
//
//    The mov pseudo instruction may be expanded into multiple mov instructions
//    later. In that case, we try to split the constant operand of the mov
//    instruction into two immediates that can be directly encoded into
//    *Wri/*Xri instructions. This yields two AND/ADD/SUB instructions instead
//    of multiple `mov` + `and/add/sub` instructions.
//
// 4. Remove redundant ORRWrs which is generated by zero-extend.
//
//    %3:gpr32 = ORRWrs $wzr, %2, 0
//    %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
//
//    If AArch64's 32-bit form of an instruction defines the source operand of
//    ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
//    operand are set to zero.
//
// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
//      ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
//
// 6. %intermediate:gpr32 = COPY %src:fpr128
//    %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
//      ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
//
//    In cases where a source FPR is copied to a GPR only to be copied on to a
//    destination FPR, we can copy the value between the FPRs directly,
//    eliminating the use of the integer unit. When we match an INSvi[X]gpr
//    that is preceded by a chain of COPY instructions from an FPR source, we
//    use INSvi[X]lane to replace the COPY & INSvi[X]gpr instructions.
//
// 7. If an MI implicitly sets the high 64 bits of a register to zero, remove
//    the `mov 0` of the high 64 bits. For example,
//
//    %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//    %2:fpr64 = MOVID 0
//    %4:fpr128 = IMPLICIT_DEF
//    %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
//    %6:fpr128 = IMPLICIT_DEF
//    %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//    %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
//    ==>
//    %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
//    %6:fpr128 = IMPLICIT_DEF
//    %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
//===----------------------------------------------------------------------===//

66#include "AArch64ExpandImm.h"
67#include "AArch64InstrInfo.h"
71
72using namespace llvm;
73
74#define DEBUG_TYPE "aarch64-mi-peephole-opt"
75
76namespace {
77
78struct AArch64MIPeepholeOpt : public MachineFunctionPass {
79 static char ID;
80
81 AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
83 }
84
85 const AArch64InstrInfo *TII;
87 MachineLoopInfo *MLI;
89
90 using OpcodePair = std::pair<unsigned, unsigned>;
91 template <typename T>
92 using SplitAndOpcFunc =
93 std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
94 using BuildMIFunc =
95 std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
97
  /// For instructions where an immediate operand could be split into two
  /// separate immediate instructions, use splitTwoPartImm to handle the
  /// optimization.
  ///
  /// To implement, the following function types must be passed to
  /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
  /// splitting the immediate is valid and returns the associated new opcode. A
  /// BuildMIFunc must be implemented to build the two immediate instructions.
  ///
  /// Example Pattern (where IMM would require 2+ MOV instructions):
  ///     %dst = <Instr>rr %src IMM [...]
  /// becomes:
  ///     %tmp = <Instr>ri %src (encode half IMM) [...]
  ///     %dst = <Instr>ri %tmp (encode half IMM) [...]
  template <typename T>
  bool splitTwoPartImm(MachineInstr &MI,
                       SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);

  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
                        MachineInstr *&SubregToRegMI);

  template <typename T>
  bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
  template <typename T>
  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);

  template <typename T>
  bool visitAND(unsigned Opc, MachineInstr &MI);
  bool visitORR(MachineInstr &MI);
  bool visitINSERT(MachineInstr &MI);
  bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
  bool visitINSvi64lane(MachineInstr &MI);
  bool visitFMOVDr(MachineInstr &MI);
  bool visitCopy(MachineInstr &MI);
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64 MI Peephole Optimization pass";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64MIPeepholeOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)

template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
  T UImm = static_cast<T>(Imm);
  if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
    return false;

  // If this immediate can be handled by one instruction, do not split it.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // A bitmask immediate consists of consecutive ones. The constant
  // 0b00000000001000000000010000000000 is not made of consecutive ones, so it
  // is not a valid bitmask immediate, but it can be split into two bitmask
  // immediates, 0b00000000001111111111110000000000 and
  // 0b11111111111000000000011111111111; ANDing with both of these masks
  // reproduces the original constant.
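  //
  // A short worked instance (the assembly and registers are illustrative,
  // not taken from actual output): the constant above is 0x00200400, which
  // needs a MOVZ+MOVK pair, so
  //   mov  w8, #0x0400
  //   movk w8, #0x20, lsl #16
  //   and  w0, w1, w8
  // can instead be emitted as
  //   and  w0, w1, #0x003ffc00
  //   and  w0, w0, #0xffe007ff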
  unsigned LowestBitSet = llvm::countr_zero(UImm);
  unsigned HighestBitSet = Log2_64(UImm);

  // Create a mask which is filled with one from the position of lowest bit set
  // to the position of highest bit set.
  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
              (static_cast<T>(1) << LowestBitSet);
  // Create a mask which is filled with one outside the position of lowest bit
  // set and the position of highest bit set.
  T NewImm2 = UImm | ~NewImm1;

  // If the split value is not a valid bitmask immediate, do not split this
  // constant.
  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
    return false;

  Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
  Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
    unsigned Opc, MachineInstr &MI) {
  // Try the below transformation.
  //
  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
  //
  // The mov pseudo instruction could be expanded into multiple mov
  // instructions later. Let's try to split the constant operand of the mov
  // instruction into two bitmask immediates. That makes only two AND
  // instructions instead of multiple mov + and instructions.

  return splitTwoPartImm<T>(
      MI,
      [Opc](T Imm, unsigned RegSize, T &Imm0,
            T &Imm1) -> std::optional<OpcodePair> {
        if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(Opc, Opc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1);
      });
}

bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
  // Check whether this ORR comes from the zero-extend pattern below.
  //
  // def : Pat<(i64 (zext GPR32:$src)),
  //           (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
  if (MI.getOperand(3).getImm() != 0)
    return false;

  if (MI.getOperand(1).getReg() != AArch64::WZR)
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of an instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that SrcMI's opcode
  // is a real AArch64 instruction; if it is not, conservatively do not process
  // it.
  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
      SrcMI->getOperand(1).getReg().isVirtual()) {
    const TargetRegisterClass *RC =
        MRI->getRegClass(SrcMI->getOperand(1).getReg());

    // A COPY from an FPR will become an FMOVSWr, so do that now so that we
    // know that the upper bits are zero.
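    //
    // A sketch of this rewrite (the MIR shapes here are illustrative):
    //   %1:gpr32 = COPY %0:fpr32
    //   %2:gpr32 = ORRWrs $wzr, %1, 0
    // The COPY becomes "%1:gpr32 = FMOVSWr %0:fpr32", a real 32-bit AArch64
    // instruction, so the zero-extending ORRWrs can then be removed.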
    if (RC != &AArch64::FPR32RegClass &&
        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
      return false;
    Register CpySrc = SrcMI->getOperand(1).getReg();
    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
              TII->get(TargetOpcode::COPY), CpySrc)
          .add(SrcMI->getOperand(1));
    }
    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
        .addReg(CpySrc);
    SrcMI->eraseFromParent();
  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
    return false;

  Register DefReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(2).getReg();
  MRI->replaceRegWith(DefReg, SrcReg);
  MRI->clearKillFlags(SrcReg);
  LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
  // Check whether this INSERT_SUBREG comes from the zero-extend pattern below.
  //
  // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
  // To   %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
  //
  // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
  // COPY would destroy the upper part of the register anyway.
  if (!MI.isRegTiedToDefOperand(1))
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!SrcMI)
    return false;

  // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
  //
  // When you use the 32-bit form of an instruction, the upper 32 bits of the
  // source registers are ignored and the upper 32 bits of the destination
  // register are set to zero.
  //
  // If AArch64's 32-bit form of an instruction defines the source operand of
  // the zero-extend, we do not need the zero-extend. Check that SrcMI's opcode
  // is a real AArch64 instruction; if it is not, conservatively do not process
  // it.
  if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
      !AArch64::GPR64allRegClass.hasSubClassEq(RC))
    return false;

  // Build a SUBREG_TO_REG instruction.
  MachineInstr *SubregMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
              TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
          .addImm(0)
          .add(MI.getOperand(2))
          .add(MI.getOperand(3));
  LLVM_DEBUG(dbgs() << MI << " replaced by:\n  " << *SubregMI << "\n");
  (void)SubregMI;
  MI.eraseFromParent();

  return true;
}

template <typename T>
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
  // imm0 and imm1 are non-zero 12-bit unsigned ints.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;

  // The immediate cannot be composed via a single instruction.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
  if (Insn.size() == 1)
    return false;

  // Split Imm into (Imm0 << 12) + Imm1.
  Imm0 = (Imm >> 12) & 0xfff;
  Imm1 = Imm & 0xfff;
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSUB(
    unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
  // Try the below transformations.
  //
  // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
  // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
  //
  // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
  // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
  //
  // The mov pseudo instruction could be expanded into multiple mov
  // instructions later. Let's try to split the constant operand of the mov
  // instruction into two legal add/sub immediates. That makes only two ADD/SUB
  // instructions instead of multiple `mov` + `add/sub` instructions.
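  //
  // For example (a sketch; the registers shown are illustrative, not taken
  // from actual output), the constant 0x123456 needs a MOVZ+MOVK pair, so
  //   mov  w8, #0x3456
  //   movk w8, #0x12, lsl #16
  //   add  w0, w1, w8
  // can instead be emitted as
  //   add  w0, w1, #0x123, lsl #12
  //   add  w0, w0, #0x456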

  // We can sometimes have ADDWrr WZR, MOVi32imm that has not been constant
  // folded. Make sure that we don't generate invalid instructions that use XZR
  // in those cases.
  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
                       T &Imm1) -> std::optional<OpcodePair> {
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          return std::make_pair(PosOpc, PosOpc);
        if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          return std::make_pair(NegOpc, NegOpc);
        return std::nullopt;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

template <typename T>
bool AArch64MIPeepholeOpt::visitADDSSUBS(
    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
  // Try the same transformation as ADDSUB, but with the additional requirement
  // that the condition code usages are only for Equal and Not Equal. Splitting
  // the operation leaves the result value unchanged, so N and Z stay correct,
  // but C and V reflect only the second instruction.

  if (MI.getOperand(1).getReg() == AArch64::XZR ||
      MI.getOperand(1).getReg() == AArch64::WZR)
    return false;

  return splitTwoPartImm<T>(
      MI,
      [PosOpcs, NegOpcs, &MI, &TRI = TRI,
       &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
                   T &Imm1) -> std::optional<OpcodePair> {
        OpcodePair OP;
        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
          OP = PosOpcs;
        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
          OP = NegOpcs;
        else
          return std::nullopt;
        // Check the conditional uses last, since scanning the instructions
        // that follow MI is expensive.
        MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
        std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
        if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
          return std::nullopt;
        return OP;
      },
      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
                   Register NewDstReg) {
        DebugLoc DL = MI.getDebugLoc();
        MachineBasicBlock *MBB = MI.getParent();
        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
            .addReg(NewTmpReg)
            .addImm(Imm1)
            .addImm(0);
      });
}

// Checks whether the MOV immediate instruction feeding this MI is applicable
// for this peephole optimization.
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
                                            MachineInstr *&MovMI,
                                            MachineInstr *&SubregToRegMI) {
  // Check whether the current MBB is in a loop and whether MI is loop
  // invariant.
  MachineBasicBlock *MBB = MI.getParent();
  MachineLoop *L = MLI->getLoopFor(MBB);
  if (L && !L->isLoopInvariant(MI))
    return false;

  // Check whether the current MI's operand is a MOV with an immediate.
  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
  if (!MovMI)
    return false;

  // If it is SUBREG_TO_REG, check its operand.
  SubregToRegMI = nullptr;
  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
    SubregToRegMI = MovMI;
    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
    if (!MovMI)
      return false;
  }

  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
      MovMI->getOpcode() != AArch64::MOVi64imm)
    return false;

  // If the MOV has multiple uses, do not split the immediate because it would
  // create more instructions.
  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
    return false;
  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
    return false;

  // It is OK to perform this peephole optimization.
  return true;
}

template <typename T>
bool AArch64MIPeepholeOpt::splitTwoPartImm(
    MachineInstr &MI,
    SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
  unsigned RegSize = sizeof(T) * 8;
  assert((RegSize == 32 || RegSize == 64) &&
         "Invalid RegSize for legal immediate peephole optimization");

  // Perform several essential checks against the current MI.
  MachineInstr *MovMI, *SubregToRegMI;
  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
    return false;

  // Split the immediate into Imm0 and Imm1, and calculate the Opcode.
  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
  // For the 32-bit form of an instruction, the upper 32 bits of the
  // destination register are set to zero. If there is a SUBREG_TO_REG, set the
  // upper 32 bits of Imm to zero. This is essential if the immediate value was
  // a negative number, since it was sign-extended when assigned to the 64-bit
  // Imm.
  if (SubregToRegMI)
    Imm &= 0xFFFFFFFF;
  OpcodePair Opcode;
  if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
    Opcode = *R;
  else
    return false;

  // Create new MIs using the first and second opcodes. The opcodes might
  // differ for flag-setting operations that should only set flags on the
  // second instruction.
  // NewTmpReg = Opcode.first SrcReg Imm0
  // NewDstReg = Opcode.second NewTmpReg Imm1

  // Determine register classes for destinations and register operands.
  MachineFunction *MF = MI.getMF();
  const TargetRegisterClass *FirstInstrDstRC =
      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
  const TargetRegisterClass *FirstInstrOperandRC =
      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
  const TargetRegisterClass *SecondInstrDstRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrDstRC
          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
  const TargetRegisterClass *SecondInstrOperandRC =
      (Opcode.first == Opcode.second)
          ? FirstInstrOperandRC
          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);

  // Get the old destination register and create the new destination registers.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
  // If DstReg is not virtual (likely WZR or XZR), reuse that same destination
  // register.
  Register NewDstReg = DstReg.isVirtual()
                           ? MRI->createVirtualRegister(SecondInstrDstRC)
                           : DstReg;

  // Constrain the registers based on their new uses.
  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
  if (DstReg != NewDstReg)
    MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));

  // Call the delegating operation to build the two instructions.
  BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);

  // replaceRegWith changes MI's definition register. Keep it for SSA form
  // until MI is deleted. Only do this if we created a new destination
  // register.
  if (DstReg != NewDstReg) {
    MRI->replaceRegWith(DstReg, NewDstReg);
    MI.getOperand(0).setReg(DstReg);
  }

  // Remove the MIs that are now dead.
  MI.eraseFromParent();
  if (SubregToRegMI)
    SubregToRegMI->eraseFromParent();
  MovMI->eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
  // Check whether this INSvi[X]gpr comes from a COPY of a source FPR128.
  //
  // From
  //   %intermediate1:gpr64 = COPY %src:fpr128
  //   %intermediate2:gpr32 = COPY %intermediate1:gpr64
  //   %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
  // To
  //   %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
  //                 src_index
  // where src_index = 0, X = [8|16|32|64]

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());

  // For a chain of COPY instructions, find the initial source register
  // and check whether it is an FPR128.
  while (true) {
    if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
      return false;

    if (!SrcMI->getOperand(1).getReg().isVirtual())
      return false;

    if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
        &AArch64::FPR128RegClass) {
      break;
    }
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
  }

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = SrcMI->getOperand(1).getReg();
  MachineInstr *INSvilaneMI =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
          .add(MI.getOperand(1))
          .add(MI.getOperand(2))
          .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
          .addImm(0);

  LLVM_DEBUG(dbgs() << MI << " replaced by:\n  " << *INSvilaneMI << "\n");
  (void)INSvilaneMI;
  MI.eraseFromParent();
  return true;
}

// All instructions that set an FPR64 implicitly zero the top bits of the
// register.
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI,
                                        MachineRegisterInfo *MRI) {
  if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
    return false;
  const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
  if (RC != &AArch64::FPR64RegClass)
    return false;
  return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
}

bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
  // Check that the MI defining the low 64 bits implicitly sets the high
  // 64 bits to zero. We expect the case below.
  //
  // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
  // %6:fpr128 = IMPLICIT_DEF
  // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Check that there is a `mov 0` MI for the high 64 bits.
  // We expect the cases below.
  //
  // %2:fpr64 = MOVID 0
  // %4:fpr128 = IMPLICIT_DEF
  // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
  // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
  // or
  // %5:fpr128 = MOVIv2d_ns 0
  // %6:fpr64 = COPY %5.dsub:fpr128
  // %8:fpr128 = IMPLICIT_DEF
  // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
  // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
    return false;
  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
    return false;
  if (High64MI->getOperand(1).getImm() != 0)
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
  // An FMOVDr sets the high 64 bits to zero implicitly, similar to ORR for
  // GPRs.
  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
    return false;

  // Let's remove the MIs for the high 64 bits.
  Register OldDef = MI.getOperand(0).getReg();
  Register NewDef = MI.getOperand(1).getReg();
  LLVM_DEBUG(dbgs() << "Removing: " << MI << "\n");
  MRI->clearKillFlags(OldDef);
  MRI->clearKillFlags(NewDef);
  MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
  MRI->replaceRegWith(OldDef, NewDef);
  MI.eraseFromParent();

  return true;
}

// Across a basic block we might have an i32 extract from a value that only
// operates on the upper bits (for example a sxtw). We can replace the COPY
// with a new version that skips the sxtw.
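//
// A sketch (the MIR shapes here are illustrative):
//   %1:gpr64 = SBFMXri %0:gpr64, 0, 31   // sxtw
//   %2:gpr32 = COPY %1.sub_32
// The low 32 bits of %1 equal the low 32 bits of %0, so the COPY can read
// from %0 directly and the now-dead SBFMXri (and any intermediate full
// copies) can be erased:
//   %2:gpr32 = COPY %0.sub_32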
bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) {
  Register InputReg = MI.getOperand(1).getReg();
  if (MI.getOperand(1).getSubReg() != AArch64::sub_32 ||
      !MRI->hasOneNonDBGUse(InputReg))
    return false;

  MachineInstr *SrcMI = MRI->getUniqueVRegDef(InputReg);
  MachineInstr *CopyMI = SrcMI;
  while (SrcMI && SrcMI->isFullCopy() &&
         MRI->hasOneNonDBGUse(SrcMI->getOperand(1).getReg()))
    SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());

  if (!SrcMI || SrcMI->getOpcode() != AArch64::SBFMXri ||
      SrcMI->getOperand(2).getImm() != 0 || SrcMI->getOperand(3).getImm() != 31)
    return false;

  Register SrcReg = SrcMI->getOperand(1).getReg();
  MRI->constrainRegClass(SrcReg, MRI->getRegClass(InputReg));
  MI.getOperand(1).setReg(SrcReg);
  if (CopyMI != SrcMI)
    CopyMI->eraseFromParent();
  SrcMI->eraseFromParent();
  return true;
}

bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MRI = &MF.getRegInfo();

  assert(MRI->isSSA() && "Expected to be run on SSA form!");

  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      default:
        break;
      case AArch64::INSERT_SUBREG:
        Changed |= visitINSERT(MI);
        break;
      case AArch64::ANDWrr:
        Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
        break;
      case AArch64::ANDXrr:
        Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
        break;
      case AArch64::ORRWrs:
        Changed |= visitORR(MI);
        break;
      case AArch64::ADDWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
        break;
      case AArch64::SUBWrr:
        Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
        break;
      case AArch64::ADDXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
        break;
      case AArch64::SUBXrr:
        Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
        break;
      case AArch64::ADDSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
                                    {AArch64::SUBWri, AArch64::SUBSWri}, MI);
        break;
      case AArch64::SUBSWrr:
        Changed |=
            visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
                                    {AArch64::ADDWri, AArch64::ADDSWri}, MI);
        break;
      case AArch64::ADDSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
                                    {AArch64::SUBXri, AArch64::SUBSXri}, MI);
        break;
      case AArch64::SUBSXrr:
        Changed |=
            visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
                                    {AArch64::ADDXri, AArch64::ADDSXri}, MI);
        break;
      case AArch64::INSvi64gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
        break;
      case AArch64::INSvi32gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
        break;
      case AArch64::INSvi16gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
        break;
      case AArch64::INSvi8gpr:
        Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
      case AArch64::INSvi64lane:
        Changed |= visitINSvi64lane(MI);
        break;
      case AArch64::FMOVDr:
        Changed |= visitFMOVDr(MI);
        break;
      case AArch64::COPY:
        Changed |= visitCopy(MI);
        break;
      }
    }
  }

  return Changed;
}

FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
  return new AArch64MIPeepholeOpt();
}