//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
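///
/// Roughly (the register numbers and the literal are illustrative, not taken
/// from a real test):
///
///   %1 = V_MOV_B32 0x11213141      ; literal, not an inline constant
///   %2 = V_ADD_I32 %1, %3          ; only use of %1
///     -->
///   %2 = V_ADD_I32 0x11213141, %3
///
/// The literal is folded into the user and the now-dead mov is erased.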
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    unsigned Reg = Src0.getReg();
    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                               isUInt<32>(MovSrc.getImm()))) {
          // It's possible to have only one component of a super-reg defined by
          // a single mov, so we need to clear any subregister flag.
          Src0.setSubReg(0);
          Src0.ChangeToImmediate(MovSrc.getImm());
          ConstantFolded = true;
        } else if (MovSrc.isFI()) {
          Src0.setSubReg(0);
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
          ConstantFolded = true;
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

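// The isKImm*/isKUImm* helpers below check whether an immediate is a
// candidate for a 16-bit "K" encoding (s_movk_i32, s_addk_i32, s_cmpk_*,
// ...): it has to fit in the signed or unsigned 16-bit field and must not
// already be usable as a free inline constant. The opcode names here are
// only examples of where these checks are used later in this file.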
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
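///
/// For example (illustrative): 0x80000000 needs a 32-bit literal, but its
/// bit-reverse is 1, an inline constant, so the value can instead be
/// materialized with a bit-reverse (V_BFREV_B32 / S_BREV_B32) of 1.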
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

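// Shrink an SOPC compare against a 16-bit immediate into the SOPK form, which
// carries the immediate in the instruction word instead of a trailing literal.
// Schematically (operands illustrative):
//
//   s_cmp_eq_u32 s0, 0x1234  ->  s_cmpk_eq_u32 s0, 0x1234
//
// For eq/lg the signed SOPK variant is used when the immediate fits as a
// signed 16-bit value; otherwise the unsigned form is kept.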
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and is initially selected to the unsigned version.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      // =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
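
      // When the destination is tied to src0 and the other operand is a small
      // 16-bit constant, the SOPK form avoids a 32-bit literal, e.g.
      // (operands illustrative):
      //   s_add_i32 s0, s0, 0x1234  ->  s_addk_i32 s0, 0x1234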
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
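      // For example (illustrative):
      //   s_mov_b32 s0, 0x1234  ->  s_movk_i32 s0, 0x1234
      // Constants whose bit-reverse is an inline immediate are rewritten to
      // s_brev_b32 instead (see isReverseInlineImm above).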
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
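          //
          // After RA, the shrink this enables looks roughly like (illustrative):
          //   v_cmp_lt_f32_e64 vcc, v0, v1  ->  v_cmp_lt_f32_e32 vcc, v0, v1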
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

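      // The 32-bit e32 encodings take the carry implicitly in VCC, so the
      // instruction is only shrunk when the carry-out (sdst) and, when
      // present, the carry-in (src2) already live in VCC; otherwise we only
      // leave a VCC hint for the register allocator (illustrative example:
      // v_addc_u32_e64 with both carries in vcc can become v_addc_u32_e32).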
      if (SDst) {
        if (SDst->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
          continue;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);

          continue;
        }
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}