LLVM 18.0.0git
AArch64MIPeepholeOpt.cpp
Go to the documentation of this file.
1//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass performs below peephole optimizations on MIR level.
10//
11// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
12// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
13//
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
18// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
19//
20// The mov pseudo instruction could be expanded to multiple mov instructions
21// later. In this case, we could try to split the constant operand of mov
22// instruction into two immediates which can be directly encoded into
23// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
24// multiple `mov` + `and/add/sub` instructions.
25//
26// 4. Remove redundant ORRWrs which is generated by zero-extend.
27//
28// %3:gpr32 = ORRWrs $wzr, %2, 0
29// %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
30//
31// If AArch64's 32-bit form of instruction defines the source operand of
32// ORRWrs, we can remove the ORRWrs because the upper 32 bits of the source
33// operand are set to zero.
34//
35// 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
36// ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
37//
38// 6. %intermediate:gpr32 = COPY %src:fpr128
39// %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32
40// ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0
41//
42// In cases where a source FPR is copied to a GPR in order to be copied
43// to a destination FPR, we can directly copy the values between the FPRs,
44// eliminating the use of the Integer unit. When we match a pattern of
45// INSvi[X]gpr that is preceded by a chain of COPY instructions from a FPR
46// source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
47// instructions.
48//
49// 7. If MI sets zero for high 64-bits implicitly, remove `mov 0` for high
50// 64-bits. For example,
51//
52// %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
53// %2:fpr64 = MOVID 0
54// %4:fpr128 = IMPLICIT_DEF
55// %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
56// %6:fpr128 = IMPLICIT_DEF
57// %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
58// %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
59// ==>
60// %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
61// %6:fpr128 = IMPLICIT_DEF
62// %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
63//
64//===----------------------------------------------------------------------===//
65
#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
71
72using namespace llvm;
73
74#define DEBUG_TYPE "aarch64-mi-peephole-opt"
75
76namespace {
77
78struct AArch64MIPeepholeOpt : public MachineFunctionPass {
79 static char ID;
80
81 AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
83 }
84
85 const AArch64InstrInfo *TII;
87 MachineLoopInfo *MLI;
89
90 using OpcodePair = std::pair<unsigned, unsigned>;
91 template <typename T>
92 using SplitAndOpcFunc =
93 std::function<std::optional<OpcodePair>(T, unsigned, T &, T &)>;
94 using BuildMIFunc =
95 std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
97
98 /// For instructions where an immediate operand could be split into two
99 /// separate immediate instructions, use the splitTwoPartImm two handle the
100 /// optimization.
101 ///
102 /// To implement, the following function types must be passed to
103 /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
104 /// splitting the immediate is valid and returns the associated new opcode. A
105 /// BuildMIFunc must be implemented to build the two immediate instructions.
106 ///
107 /// Example Pattern (where IMM would require 2+ MOV instructions):
108 /// %dst = <Instr>rr %src IMM [...]
109 /// becomes:
110 /// %tmp = <Instr>ri %src (encode half IMM) [...]
111 /// %dst = <Instr>ri %tmp (encode half IMM) [...]
112 template <typename T>
113 bool splitTwoPartImm(MachineInstr &MI,
114 SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);
115
116 bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
117 MachineInstr *&SubregToRegMI);
118
119 template <typename T>
120 bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI);
121 template <typename T>
122 bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
123
124 template <typename T>
125 bool visitAND(unsigned Opc, MachineInstr &MI);
126 bool visitORR(MachineInstr &MI);
127 bool visitINSERT(MachineInstr &MI);
128 bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
129 bool visitINSvi64lane(MachineInstr &MI);
130 bool runOnMachineFunction(MachineFunction &MF) override;
131
132 StringRef getPassName() const override {
133 return "AArch64 MI Peephole Optimization pass";
134 }
135
136 void getAnalysisUsage(AnalysisUsage &AU) const override {
137 AU.setPreservesCFG();
140 }
141};
142
143char AArch64MIPeepholeOpt::ID = 0;
144
145} // end anonymous namespace
146
// Registers the pass under the argument string "aarch64-mi-peephole-opt";
// the trailing two flags are the macro's `cfg` and `analysis` parameters.
INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
                "AArch64 MI Peephole Optimization", false, false)
149
150template <typename T>
151static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
152 T UImm = static_cast<T>(Imm);
154 return false;
155
156 // If this immediate can be handled by one instruction, do not split it.
159 if (Insn.size() == 1)
160 return false;
161
162 // The bitmask immediate consists of consecutive ones. Let's say there is
163 // constant 0b00000000001000000000010000000000 which does not consist of
164 // consecutive ones. We can split it in to two bitmask immediate like
165 // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
166 // If we do AND with these two bitmask immediate, we can see original one.
168 unsigned HighestBitSet = Log2_64(UImm);
169
170 // Create a mask which is filled with one from the position of lowest bit set
171 // to the position of highest bit set.
172 T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
173 (static_cast<T>(1) << LowestBitSet);
174 // Create a mask which is filled with one outside the position of lowest bit
175 // set and the position of highest bit set.
176 T NewImm2 = UImm | ~NewImm1;
177
178 // If the split value is not valid bitmask immediate, do not split this
179 // constant.
181 return false;
182
185 return true;
186}
187
188template <typename T>
189bool AArch64MIPeepholeOpt::visitAND(
190 unsigned Opc, MachineInstr &MI) {
191 // Try below transformation.
192 //
193 // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
194 // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
195 //
196 // The mov pseudo instruction could be expanded to multiple mov instructions
197 // later. Let's try to split the constant operand of mov instruction into two
198 // bitmask immediates. It makes only two AND instructions intead of multiple
199 // mov + and instructions.
200
201 return splitTwoPartImm<T>(
202 MI,
203 [Opc](T Imm, unsigned RegSize, T &Imm0,
204 T &Imm1) -> std::optional<OpcodePair> {
205 if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
206 return std::make_pair(Opc, Opc);
207 return std::nullopt;
208 },
209 [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
210 unsigned Imm1, Register SrcReg, Register NewTmpReg,
211 Register NewDstReg) {
212 DebugLoc DL = MI.getDebugLoc();
213 MachineBasicBlock *MBB = MI.getParent();
214 BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
215 .addReg(SrcReg)
216 .addImm(Imm0);
217 BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
218 .addReg(NewTmpReg)
219 .addImm(Imm1);
220 });
221}
222
223bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) {
224 // Check this ORR comes from below zero-extend pattern.
225 //
226 // def : Pat<(i64 (zext GPR32:$src)),
227 // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
228 if (MI.getOperand(3).getImm() != 0)
229 return false;
230
231 if (MI.getOperand(1).getReg() != AArch64::WZR)
232 return false;
233
234 MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
235 if (!SrcMI)
236 return false;
237
238 // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
239 //
240 // When you use the 32-bit form of an instruction, the upper 32 bits of the
241 // source registers are ignored and the upper 32 bits of the destination
242 // register are set to zero.
243 //
244 // If AArch64's 32-bit form of instruction defines the source operand of
245 // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
246 // real AArch64 instruction and if it is not, do not process the opcode
247 // conservatively.
248 if (SrcMI->getOpcode() == TargetOpcode::COPY &&
249 SrcMI->getOperand(1).getReg().isVirtual()) {
250 const TargetRegisterClass *RC =
251 MRI->getRegClass(SrcMI->getOperand(1).getReg());
252
253 // A COPY from an FPR will become a FMOVSWr, so do so now so that we know
254 // that the upper bits are zero.
255 if (RC != &AArch64::FPR32RegClass &&
256 ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
257 SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
258 return false;
259 Register CpySrc = SrcMI->getOperand(1).getReg();
260 if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
261 CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
262 BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
263 TII->get(TargetOpcode::COPY), CpySrc)
264 .add(SrcMI->getOperand(1));
265 }
266 BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
267 TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
268 .addReg(CpySrc);
269 SrcMI->eraseFromParent();
270 }
271 else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
272 return false;
273
274 Register DefReg = MI.getOperand(0).getReg();
275 Register SrcReg = MI.getOperand(2).getReg();
276 MRI->replaceRegWith(DefReg, SrcReg);
277 MRI->clearKillFlags(SrcReg);
278 LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
279 MI.eraseFromParent();
280
281 return true;
282}
283
284bool AArch64MIPeepholeOpt::visitINSERT(MachineInstr &MI) {
285 // Check this INSERT_SUBREG comes from below zero-extend pattern.
286 //
287 // From %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx
288 // To %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx
289 //
290 // We're assuming the first operand to INSERT_SUBREG is irrelevant because a
291 // COPY would destroy the upper part of the register anyway
292 if (!MI.isRegTiedToDefOperand(1))
293 return false;
294
295 Register DstReg = MI.getOperand(0).getReg();
296 const TargetRegisterClass *RC = MRI->getRegClass(DstReg);
297 MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
298 if (!SrcMI)
299 return false;
300
301 // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
302 //
303 // When you use the 32-bit form of an instruction, the upper 32 bits of the
304 // source registers are ignored and the upper 32 bits of the destination
305 // register are set to zero.
306 //
307 // If AArch64's 32-bit form of instruction defines the source operand of
308 // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
309 // real AArch64 instruction and if it is not, do not process the opcode
310 // conservatively.
311 if ((SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) ||
312 !AArch64::GPR64allRegClass.hasSubClassEq(RC))
313 return false;
314
315 // Build a SUBREG_TO_REG instruction
316 MachineInstr *SubregMI =
317 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
318 TII->get(TargetOpcode::SUBREG_TO_REG), DstReg)
319 .addImm(0)
320 .add(MI.getOperand(2))
321 .add(MI.getOperand(3));
322 LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *SubregMI << "\n");
323 (void)SubregMI;
324 MI.eraseFromParent();
325
326 return true;
327}
328
329template <typename T>
330static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
331 // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
332 // imm0 and imm1 are non-zero 12-bit unsigned int.
333 if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
334 (Imm & ~static_cast<T>(0xffffff)) != 0)
335 return false;
336
337 // The immediate can not be composed via a single instruction.
340 if (Insn.size() == 1)
341 return false;
342
343 // Split Imm into (Imm0 << 12) + Imm1;
344 Imm0 = (Imm >> 12) & 0xfff;
345 Imm1 = Imm & 0xfff;
346 return true;
347}
348
349template <typename T>
350bool AArch64MIPeepholeOpt::visitADDSUB(
351 unsigned PosOpc, unsigned NegOpc, MachineInstr &MI) {
352 // Try below transformation.
353 //
354 // ADDWrr X, MOVi32imm ==> ADDWri + ADDWri
355 // ADDXrr X, MOVi64imm ==> ADDXri + ADDXri
356 //
357 // SUBWrr X, MOVi32imm ==> SUBWri + SUBWri
358 // SUBXrr X, MOVi64imm ==> SUBXri + SUBXri
359 //
360 // The mov pseudo instruction could be expanded to multiple mov instructions
361 // later. Let's try to split the constant operand of mov instruction into two
362 // legal add/sub immediates. It makes only two ADD/SUB instructions intead of
363 // multiple `mov` + `and/sub` instructions.
364
365 // We can sometimes have ADDWrr WZR, MULi32imm that have not been constant
366 // folded. Make sure that we don't generate invalid instructions that use XZR
367 // in those cases.
368 if (MI.getOperand(1).getReg() == AArch64::XZR ||
369 MI.getOperand(1).getReg() == AArch64::WZR)
370 return false;
371
372 return splitTwoPartImm<T>(
373 MI,
374 [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
375 T &Imm1) -> std::optional<OpcodePair> {
376 if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
377 return std::make_pair(PosOpc, PosOpc);
378 if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
379 return std::make_pair(NegOpc, NegOpc);
380 return std::nullopt;
381 },
382 [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
383 unsigned Imm1, Register SrcReg, Register NewTmpReg,
384 Register NewDstReg) {
385 DebugLoc DL = MI.getDebugLoc();
386 MachineBasicBlock *MBB = MI.getParent();
387 BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
388 .addReg(SrcReg)
389 .addImm(Imm0)
390 .addImm(12);
391 BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
392 .addReg(NewTmpReg)
393 .addImm(Imm1)
394 .addImm(0);
395 });
396}
397
398template <typename T>
399bool AArch64MIPeepholeOpt::visitADDSSUBS(
400 OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI) {
401 // Try the same transformation as ADDSUB but with additional requirement
402 // that the condition code usages are only for Equal and Not Equal
403
404 if (MI.getOperand(1).getReg() == AArch64::XZR ||
405 MI.getOperand(1).getReg() == AArch64::WZR)
406 return false;
407
408 return splitTwoPartImm<T>(
409 MI,
410 [PosOpcs, NegOpcs, &MI, &TRI = TRI,
411 &MRI = MRI](T Imm, unsigned RegSize, T &Imm0,
412 T &Imm1) -> std::optional<OpcodePair> {
413 OpcodePair OP;
414 if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
415 OP = PosOpcs;
416 else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
417 OP = NegOpcs;
418 else
419 return std::nullopt;
420 // Check conditional uses last since it is expensive for scanning
421 // proceeding instructions
422 MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
423 std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI);
424 if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V)
425 return std::nullopt;
426 return OP;
427 },
428 [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
429 unsigned Imm1, Register SrcReg, Register NewTmpReg,
430 Register NewDstReg) {
431 DebugLoc DL = MI.getDebugLoc();
432 MachineBasicBlock *MBB = MI.getParent();
433 BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
434 .addReg(SrcReg)
435 .addImm(Imm0)
436 .addImm(12);
437 BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
438 .addReg(NewTmpReg)
439 .addImm(Imm1)
440 .addImm(0);
441 });
442}
443
444// Checks if the corresponding MOV immediate instruction is applicable for
445// this peephole optimization.
446bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
447 MachineInstr *&MovMI,
448 MachineInstr *&SubregToRegMI) {
449 // Check whether current MBB is in loop and the AND is loop invariant.
450 MachineBasicBlock *MBB = MI.getParent();
451 MachineLoop *L = MLI->getLoopFor(MBB);
452 if (L && !L->isLoopInvariant(MI))
453 return false;
454
455 // Check whether current MI's operand is MOV with immediate.
456 MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
457 if (!MovMI)
458 return false;
459
460 // If it is SUBREG_TO_REG, check its operand.
461 SubregToRegMI = nullptr;
462 if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
463 SubregToRegMI = MovMI;
464 MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
465 if (!MovMI)
466 return false;
467 }
468
469 if (MovMI->getOpcode() != AArch64::MOVi32imm &&
470 MovMI->getOpcode() != AArch64::MOVi64imm)
471 return false;
472
473 // If the MOV has multiple uses, do not split the immediate because it causes
474 // more instructions.
475 if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
476 return false;
477 if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
478 return false;
479
480 // It is OK to perform this peephole optimization.
481 return true;
482}
483
484template <typename T>
485bool AArch64MIPeepholeOpt::splitTwoPartImm(
487 SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
488 unsigned RegSize = sizeof(T) * 8;
489 assert((RegSize == 32 || RegSize == 64) &&
490 "Invalid RegSize for legal immediate peephole optimization");
491
492 // Perform several essential checks against current MI.
493 MachineInstr *MovMI, *SubregToRegMI;
494 if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
495 return false;
496
497 // Split the immediate to Imm0 and Imm1, and calculate the Opcode.
498 T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
499 // For the 32 bit form of instruction, the upper 32 bits of the destination
500 // register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
501 // of Imm to zero. This is essential if the Immediate value was a negative
502 // number since it was sign extended when we assign to the 64-bit Imm.
503 if (SubregToRegMI)
504 Imm &= 0xFFFFFFFF;
505 OpcodePair Opcode;
506 if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
507 Opcode = *R;
508 else
509 return false;
510
511 // Create new MIs using the first and second opcodes. Opcodes might differ for
512 // flag setting operations that should only set flags on second instruction.
513 // NewTmpReg = Opcode.first SrcReg Imm0
514 // NewDstReg = Opcode.second NewTmpReg Imm1
515
516 // Determine register classes for destinations and register operands
517 MachineFunction *MF = MI.getMF();
518 const TargetRegisterClass *FirstInstrDstRC =
519 TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
520 const TargetRegisterClass *FirstInstrOperandRC =
521 TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
522 const TargetRegisterClass *SecondInstrDstRC =
523 (Opcode.first == Opcode.second)
524 ? FirstInstrDstRC
525 : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
526 const TargetRegisterClass *SecondInstrOperandRC =
527 (Opcode.first == Opcode.second)
528 ? FirstInstrOperandRC
529 : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);
530
531 // Get old registers destinations and new register destinations
532 Register DstReg = MI.getOperand(0).getReg();
533 Register SrcReg = MI.getOperand(1).getReg();
534 Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
535 // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to
536 // reuse that same destination register.
537 Register NewDstReg = DstReg.isVirtual()
538 ? MRI->createVirtualRegister(SecondInstrDstRC)
539 : DstReg;
540
541 // Constrain registers based on their new uses
542 MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
543 MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
544 if (DstReg != NewDstReg)
545 MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
546
547 // Call the delegating operation to build the instruction
548 BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
549
550 // replaceRegWith changes MI's definition register. Keep it for SSA form until
551 // deleting MI. Only if we made a new destination register.
552 if (DstReg != NewDstReg) {
553 MRI->replaceRegWith(DstReg, NewDstReg);
554 MI.getOperand(0).setReg(DstReg);
555 }
556
557 // Record the MIs need to be removed.
558 MI.eraseFromParent();
559 if (SubregToRegMI)
560 SubregToRegMI->eraseFromParent();
561 MovMI->eraseFromParent();
562
563 return true;
564}
565
566bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) {
567 // Check if this INSvi[X]gpr comes from COPY of a source FPR128
568 //
569 // From
570 // %intermediate1:gpr64 = COPY %src:fpr128
571 // %intermediate2:gpr32 = COPY %intermediate1:gpr64
572 // %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32
573 // To
574 // %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128,
575 // src_index
576 // where src_index = 0, X = [8|16|32|64]
577
578 MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
579
580 // For a chain of COPY instructions, find the initial source register
581 // and check if it's an FPR128
582 while (true) {
583 if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY)
584 return false;
585
586 if (!SrcMI->getOperand(1).getReg().isVirtual())
587 return false;
588
589 if (MRI->getRegClass(SrcMI->getOperand(1).getReg()) ==
590 &AArch64::FPR128RegClass) {
591 break;
592 }
593 SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg());
594 }
595
596 Register DstReg = MI.getOperand(0).getReg();
597 Register SrcReg = SrcMI->getOperand(1).getReg();
598 MachineInstr *INSvilaneMI =
599 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opc), DstReg)
600 .add(MI.getOperand(1))
601 .add(MI.getOperand(2))
602 .addUse(SrcReg, getRegState(SrcMI->getOperand(1)))
603 .addImm(0);
604
605 LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n");
606 (void)INSvilaneMI;
607 MI.eraseFromParent();
608 return true;
609}
610
611// All instructions that set a FPR64 will implicitly zero the top bits of the
612// register.
615 if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
616 return false;
617 const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg());
618 if (RC != &AArch64::FPR64RegClass)
619 return false;
620 return MI->getOpcode() > TargetOpcode::GENERIC_OP_END;
621}
622
623bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
624 // Check the MI for low 64-bits sets zero for high 64-bits implicitly.
625 // We are expecting below case.
626 //
627 // %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
628 // %6:fpr128 = IMPLICIT_DEF
629 // %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
630 // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
631 MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
632 if (Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
633 return false;
634 Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
635 if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI))
636 return false;
637
638 // Check there is `mov 0` MI for high 64-bits.
639 // We are expecting below cases.
640 //
641 // %2:fpr64 = MOVID 0
642 // %4:fpr128 = IMPLICIT_DEF
643 // %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
644 // %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
645 // or
646 // %5:fpr128 = MOVIv2d_ns 0
647 // %6:fpr64 = COPY %5.dsub:fpr128
648 // %8:fpr128 = IMPLICIT_DEF
649 // %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
650 // %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
651 MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
652 if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
653 return false;
654 High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
655 if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
656 High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
657 if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
658 High64MI->getOpcode() != AArch64::MOVIv2d_ns))
659 return false;
660 if (High64MI->getOperand(1).getImm() != 0)
661 return false;
662
663 // Let's remove MIs for high 64-bits.
664 Register OldDef = MI.getOperand(0).getReg();
665 Register NewDef = MI.getOperand(1).getReg();
666 MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef));
667 MRI->replaceRegWith(OldDef, NewDef);
668 MI.eraseFromParent();
669
670 return true;
671}
672
673bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
674 if (skipFunction(MF.getFunction()))
675 return false;
676
677 TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
678 TRI = static_cast<const AArch64RegisterInfo *>(
680 MLI = &getAnalysis<MachineLoopInfo>();
681 MRI = &MF.getRegInfo();
682
683 assert(MRI->isSSA() && "Expected to be run on SSA form!");
684
685 bool Changed = false;
686
687 for (MachineBasicBlock &MBB : MF) {
689 switch (MI.getOpcode()) {
690 default:
691 break;
692 case AArch64::INSERT_SUBREG:
693 Changed |= visitINSERT(MI);
694 break;
695 case AArch64::ANDWrr:
696 Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
697 break;
698 case AArch64::ANDXrr:
699 Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
700 break;
701 case AArch64::ORRWrs:
702 Changed |= visitORR(MI);
703 break;
704 case AArch64::ADDWrr:
705 Changed |= visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI);
706 break;
707 case AArch64::SUBWrr:
708 Changed |= visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI);
709 break;
710 case AArch64::ADDXrr:
711 Changed |= visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI);
712 break;
713 case AArch64::SUBXrr:
714 Changed |= visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI);
715 break;
716 case AArch64::ADDSWrr:
717 Changed |=
718 visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
719 {AArch64::SUBWri, AArch64::SUBSWri}, MI);
720 break;
721 case AArch64::SUBSWrr:
722 Changed |=
723 visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
724 {AArch64::ADDWri, AArch64::ADDSWri}, MI);
725 break;
726 case AArch64::ADDSXrr:
727 Changed |=
728 visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
729 {AArch64::SUBXri, AArch64::SUBSXri}, MI);
730 break;
731 case AArch64::SUBSXrr:
732 Changed |=
733 visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
734 {AArch64::ADDXri, AArch64::ADDSXri}, MI);
735 break;
736 case AArch64::INSvi64gpr:
737 Changed |= visitINSviGPR(MI, AArch64::INSvi64lane);
738 break;
739 case AArch64::INSvi32gpr:
740 Changed |= visitINSviGPR(MI, AArch64::INSvi32lane);
741 break;
742 case AArch64::INSvi16gpr:
743 Changed |= visitINSviGPR(MI, AArch64::INSvi16lane);
744 break;
745 case AArch64::INSvi8gpr:
746 Changed |= visitINSviGPR(MI, AArch64::INSvi8lane);
747 break;
748 case AArch64::INSvi64lane:
749 Changed |= visitINSvi64lane(MI);
750 break;
751 }
752 }
753 }
754
755 return Changed;
756}
757
759 return new AArch64MIPeepholeOpt();
760}
unsigned const MachineRegisterInfo * MRI
unsigned HighestBitSet
unsigned T T & Imm2Enc
unsigned T & Imm1Enc
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
unsigned RegSize
static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI, MachineRegisterInfo *MRI)
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1)
unsigned LowestBitSet
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define LLVM_DEBUG(X)
Definition: Debug.h:101
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:269
A debug info location.
Definition: DebugLoc.h:33
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
Representation of each machine instruction.
Definition: MachineInstr.h:68
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:543
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:326
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:472
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:553
unsigned getSubReg() const
int64_t getImm() const
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetInstrInfo * getInstrInfo() const
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionPass * createAArch64MIPeepholeOptPass()
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:666
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:319
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:179
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
void initializeAArch64MIPeepholeOptPass(PassRegistry &)
#define OP(n)
Definition: regex2.h:73