File: lib/Target/AMDGPU/SIInstrInfo.cpp
Warning: line 4773, column 7: Value stored to 'Not' is never read
1 | //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// SI Implementation of TargetInstrInfo. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "SIInstrInfo.h" |
15 | #include "AMDGPU.h" |
16 | #include "AMDGPUSubtarget.h" |
17 | #include "GCNHazardRecognizer.h" |
18 | #include "SIDefines.h" |
19 | #include "SIMachineFunctionInfo.h" |
20 | #include "SIRegisterInfo.h" |
21 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
22 | #include "Utils/AMDGPUBaseInfo.h" |
23 | #include "llvm/ADT/APInt.h" |
24 | #include "llvm/ADT/ArrayRef.h" |
25 | #include "llvm/ADT/SmallVector.h" |
26 | #include "llvm/ADT/StringRef.h" |
27 | #include "llvm/ADT/iterator_range.h" |
28 | #include "llvm/Analysis/AliasAnalysis.h" |
29 | #include "llvm/Analysis/MemoryLocation.h" |
30 | #include "llvm/Analysis/ValueTracking.h" |
31 | #include "llvm/CodeGen/MachineBasicBlock.h" |
32 | #include "llvm/CodeGen/MachineDominators.h" |
33 | #include "llvm/CodeGen/MachineFrameInfo.h" |
34 | #include "llvm/CodeGen/MachineFunction.h" |
35 | #include "llvm/CodeGen/MachineInstr.h" |
36 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
37 | #include "llvm/CodeGen/MachineInstrBundle.h" |
38 | #include "llvm/CodeGen/MachineMemOperand.h" |
39 | #include "llvm/CodeGen/MachineOperand.h" |
40 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
41 | #include "llvm/CodeGen/RegisterScavenging.h" |
42 | #include "llvm/CodeGen/ScheduleDAG.h" |
43 | #include "llvm/CodeGen/SelectionDAGNodes.h" |
44 | #include "llvm/CodeGen/TargetOpcodes.h" |
45 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
46 | #include "llvm/IR/DebugLoc.h" |
47 | #include "llvm/IR/DiagnosticInfo.h" |
48 | #include "llvm/IR/Function.h" |
49 | #include "llvm/IR/InlineAsm.h" |
50 | #include "llvm/IR/LLVMContext.h" |
51 | #include "llvm/MC/MCInstrDesc.h" |
52 | #include "llvm/Support/Casting.h" |
53 | #include "llvm/Support/CommandLine.h" |
54 | #include "llvm/Support/Compiler.h" |
55 | #include "llvm/Support/ErrorHandling.h" |
56 | #include "llvm/Support/MachineValueType.h" |
57 | #include "llvm/Support/MathExtras.h" |
58 | #include "llvm/Target/TargetMachine.h" |
59 | #include <cassert> |
60 | #include <cstdint> |
61 | #include <iterator> |
62 | #include <utility> |
63 | |
64 | using namespace llvm; |
65 | |
66 | #define GET_INSTRINFO_CTOR_DTOR |
67 | #include "AMDGPUGenInstrInfo.inc" |
68 | |
69 | namespace llvm { |
70 | namespace AMDGPU { |
71 | #define GET_D16ImageDimIntrinsics_IMPL |
72 | #define GET_ImageDimIntrinsicTable_IMPL |
73 | #define GET_RsrcIntrinsics_IMPL |
74 | #include "AMDGPUGenSearchableTables.inc" |
75 | } |
76 | } |
77 | |
78 | |
79 | // Must be at least 4 to be able to branch over minimum unconditional branch |
80 | // code. This is only for making it possible to write reasonably small tests for |
81 | // long branches. |
82 | static cl::opt<unsigned> |
83 | BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), |
84 | cl::desc("Restrict range of branch instructions (DEBUG)")); |
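| // Illustrative usage (an assumption, not part of the original source): the |
| // flag above can be passed to llc, e.g. "llc -march=amdgcn |
| // -amdgpu-s-branch-bits=5", to artificially shrink the branch range and |
| // exercise long-branch expansion in small tests. |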
85 | |
86 | SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) |
87 | : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), |
88 | RI(ST), ST(ST) {} |
89 | |
90 | //===----------------------------------------------------------------------===// |
91 | // TargetInstrInfo callbacks |
92 | //===----------------------------------------------------------------------===// |
93 | |
94 | static unsigned getNumOperandsNoGlue(SDNode *Node) { |
95 | unsigned N = Node->getNumOperands(); |
96 | while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) |
97 | --N; |
98 | return N; |
99 | } |
100 | |
101 | /// Returns true if both nodes have the same value for the given |
102 | /// operand \p OpName, or if both nodes do not have this operand. |
103 | static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { |
104 | unsigned Opc0 = N0->getMachineOpcode(); |
105 | unsigned Opc1 = N1->getMachineOpcode(); |
106 | |
107 | int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); |
108 | int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); |
109 | |
110 | if (Op0Idx == -1 && Op1Idx == -1) |
111 | return true; |
112 | |
113 | |
114 | if ((Op0Idx == -1 && Op1Idx != -1) || |
115 | (Op1Idx == -1 && Op0Idx != -1)) |
116 | return false; |
117 | |
118 | // getNamedOperandIdx returns the index for the MachineInstr's operands, |
119 | // which includes the result as the first operand. We are indexing into the |
120 | // MachineSDNode's operands, so we need to skip the result operand to get |
121 | // the real index. |
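| // For example (hypothetical indices): if the named operand is MachineInstr |
| // operand 3 on an instruction with a single def, the matching MachineSDNode |
| // operand is index 2. |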
122 | --Op0Idx; |
123 | --Op1Idx; |
124 | |
125 | return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); |
126 | } |
127 | |
128 | bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, |
129 | AliasAnalysis *AA) const { |
130 | // TODO: The generic check fails for VALU instructions that should be |
131 | // rematerializable due to implicit reads of exec. We really want all of the |
132 | // generic logic here except for the implicit exec check. |
133 | switch (MI.getOpcode()) { |
134 | case AMDGPU::V_MOV_B32_e32: |
135 | case AMDGPU::V_MOV_B32_e64: |
136 | case AMDGPU::V_MOV_B64_PSEUDO: |
137 | // No implicit operands. |
138 | return MI.getNumOperands() == MI.getDesc().getNumOperands(); |
139 | default: |
140 | return false; |
141 | } |
142 | } |
143 | |
144 | bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, |
145 | int64_t &Offset0, |
146 | int64_t &Offset1) const { |
147 | if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) |
148 | return false; |
149 | |
150 | unsigned Opc0 = Load0->getMachineOpcode(); |
151 | unsigned Opc1 = Load1->getMachineOpcode(); |
152 | |
153 | // Make sure both are actually loads. |
154 | if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) |
155 | return false; |
156 | |
157 | if (isDS(Opc0) && isDS(Opc1)) { |
158 | |
159 | // FIXME: Handle this case: |
160 | if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) |
161 | return false; |
162 | |
163 | // Check base reg. |
164 | if (Load0->getOperand(0) != Load1->getOperand(0)) |
165 | return false; |
166 | |
167 | // Skip read2 / write2 variants for simplicity. |
168 | // TODO: We should report true if the used offsets are adjacent (excluding |
169 | // st64 versions). |
170 | int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); |
171 | int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); |
172 | if (Offset0Idx == -1 || Offset1Idx == -1) |
173 | return false; |
174 | |
175 | // XXX - be careful of dataless loads |
176 | // getNamedOperandIdx returns the index for MachineInstrs. Since they |
177 | // include the output in the operand list, but SDNodes don't, we need to |
178 | // subtract one from the index. |
179 | Offset0Idx -= get(Opc0).NumDefs; |
180 | Offset1Idx -= get(Opc1).NumDefs; |
181 | Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); |
182 | Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); |
183 | return true; |
184 | } |
185 | |
186 | if (isSMRD(Opc0) && isSMRD(Opc1)) { |
187 | // Skip time and cache invalidation instructions. |
188 | if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || |
189 | AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) |
190 | return false; |
191 | |
192 | assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); |
193 | |
194 | // Check base reg. |
195 | if (Load0->getOperand(0) != Load1->getOperand(0)) |
196 | return false; |
197 | |
198 | const ConstantSDNode *Load0Offset = |
199 | dyn_cast<ConstantSDNode>(Load0->getOperand(1)); |
200 | const ConstantSDNode *Load1Offset = |
201 | dyn_cast<ConstantSDNode>(Load1->getOperand(1)); |
202 | |
203 | if (!Load0Offset || !Load1Offset) |
204 | return false; |
205 | |
206 | Offset0 = Load0Offset->getZExtValue(); |
207 | Offset1 = Load1Offset->getZExtValue(); |
208 | return true; |
209 | } |
210 | |
211 | // MUBUF and MTBUF can access the same addresses. |
212 | if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { |
213 | |
214 | // MUBUF and MTBUF have vaddr at different indices. |
215 | if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || |
216 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || |
217 | !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) |
218 | return false; |
219 | |
220 | int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); |
221 | int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); |
222 | |
223 | if (OffIdx0 == -1 || OffIdx1 == -1) |
224 | return false; |
225 | |
226 | // getNamedOperandIdx returns the index for MachineInstrs. Since they |
227 | // include the output in the operand list, but SDNodes don't, we need to |
228 | // subtract one from the index. |
229 | OffIdx0 -= get(Opc0).NumDefs; |
230 | OffIdx1 -= get(Opc1).NumDefs; |
231 | |
232 | SDValue Off0 = Load0->getOperand(OffIdx0); |
233 | SDValue Off1 = Load1->getOperand(OffIdx1); |
234 | |
235 | // The offset might be a FrameIndexSDNode. |
236 | if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) |
237 | return false; |
238 | |
239 | Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); |
240 | Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); |
241 | return true; |
242 | } |
243 | |
244 | return false; |
245 | } |
246 | |
247 | static bool isStride64(unsigned Opc) { |
248 | switch (Opc) { |
249 | case AMDGPU::DS_READ2ST64_B32: |
250 | case AMDGPU::DS_READ2ST64_B64: |
251 | case AMDGPU::DS_WRITE2ST64_B32: |
252 | case AMDGPU::DS_WRITE2ST64_B64: |
253 | return true; |
254 | default: |
255 | return false; |
256 | } |
257 | } |
258 | |
259 | bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, |
260 | const MachineOperand *&BaseOp, |
261 | int64_t &Offset, |
262 | const TargetRegisterInfo *TRI) const { |
263 | unsigned Opc = LdSt.getOpcode(); |
264 | |
265 | if (isDS(LdSt)) { |
266 | const MachineOperand *OffsetImm = |
267 | getNamedOperand(LdSt, AMDGPU::OpName::offset); |
268 | if (OffsetImm) { |
269 | // Normal, single offset LDS instruction. |
270 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); |
271 | // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to |
272 | // report that here? |
273 | if (!BaseOp) |
274 | return false; |
275 | |
276 | Offset = OffsetImm->getImm(); |
277 | assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " |
278 | "operands of type register."); |
279 | return true; |
280 | } |
281 | |
282 | // The 2 offset instructions use offset0 and offset1 instead. We can treat |
283 | // these as a load with a single offset if the 2 offsets are consecutive. We |
284 | // will use this for some partially aligned loads. |
285 | const MachineOperand *Offset0Imm = |
286 | getNamedOperand(LdSt, AMDGPU::OpName::offset0); |
287 | const MachineOperand *Offset1Imm = |
288 | getNamedOperand(LdSt, AMDGPU::OpName::offset1); |
289 | |
290 | uint8_t Offset0 = Offset0Imm->getImm(); |
291 | uint8_t Offset1 = Offset1Imm->getImm(); |
292 | |
293 | if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { |
294 | // Each of these offsets is in element-sized units, so we need to convert |
295 | // them to bytes for the individual accesses. |
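| // Worked example (illustrative values): a ds_read2_b32 with offset0 = 4 and |
| // offset1 = 5 writes a 64-bit destination, so EltSize = 64 / 16 = 4 bytes |
| // and the Offset reported below is 4 * 4 = 16 bytes. |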
296 | |
297 | unsigned EltSize; |
298 | if (LdSt.mayLoad()) |
299 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; |
300 | else { |
301 | assert(LdSt.mayStore()); |
302 | int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); |
303 | EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; |
304 | } |
305 | |
306 | if (isStride64(Opc)) |
307 | EltSize *= 64; |
308 | |
309 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); |
310 | Offset = EltSize * Offset0; |
311 | assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " |
312 | "operands of type register."); |
313 | return true; |
314 | } |
315 | |
316 | return false; |
317 | } |
318 | |
319 | if (isMUBUF(LdSt) || isMTBUF(LdSt)) { |
320 | const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); |
321 | if (SOffset && SOffset->isReg()) |
322 | return false; |
323 | |
324 | const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); |
325 | if (!AddrReg) |
326 | return false; |
327 | |
328 | const MachineOperand *OffsetImm = |
329 | getNamedOperand(LdSt, AMDGPU::OpName::offset); |
330 | BaseOp = AddrReg; |
331 | Offset = OffsetImm->getImm(); |
332 | |
333 | if (SOffset) // soffset can be an inline immediate. |
334 | Offset += SOffset->getImm(); |
335 | |
336 | assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " |
337 | "operands of type register."); |
338 | return true; |
339 | } |
340 | |
341 | if (isSMRD(LdSt)) { |
342 | const MachineOperand *OffsetImm = |
343 | getNamedOperand(LdSt, AMDGPU::OpName::offset); |
344 | if (!OffsetImm) |
345 | return false; |
346 | |
347 | const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); |
348 | BaseOp = SBaseReg; |
349 | Offset = OffsetImm->getImm(); |
350 | assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " |
351 | "operands of type register."); |
352 | return true; |
353 | } |
354 | |
355 | if (isFLAT(LdSt)) { |
356 | const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); |
357 | if (VAddr) { |
358 | // Can't analyze 2 offsets. |
359 | if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) |
360 | return false; |
361 | |
362 | BaseOp = VAddr; |
363 | } else { |
364 | // scratch instructions have either vaddr or saddr. |
365 | BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); |
366 | } |
367 | |
368 | Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); |
369 | assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " |
370 | "operands of type register."); |
371 | return true; |
372 | } |
373 | |
374 | return false; |
375 | } |
376 | |
377 | static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, |
378 | const MachineOperand &BaseOp1, |
379 | const MachineInstr &MI2, |
380 | const MachineOperand &BaseOp2) { |
381 | // Support only base operands with base registers. |
382 | // Note: this could be extended to support FI operands. |
383 | if (!BaseOp1.isReg() || !BaseOp2.isReg()) |
384 | return false; |
385 | |
386 | if (BaseOp1.isIdenticalTo(BaseOp2)) |
387 | return true; |
388 | |
389 | if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) |
390 | return false; |
391 | |
392 | auto MO1 = *MI1.memoperands_begin(); |
393 | auto MO2 = *MI2.memoperands_begin(); |
394 | if (MO1->getAddrSpace() != MO2->getAddrSpace()) |
395 | return false; |
396 | |
397 | auto Base1 = MO1->getValue(); |
398 | auto Base2 = MO2->getValue(); |
399 | if (!Base1 || !Base2) |
400 | return false; |
401 | const MachineFunction &MF = *MI1.getParent()->getParent(); |
402 | const DataLayout &DL = MF.getFunction().getParent()->getDataLayout(); |
403 | Base1 = GetUnderlyingObject(Base1, DL); |
404 | Base2 = GetUnderlyingObject(Base2, DL); |
405 | |
406 | if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) |
407 | return false; |
408 | |
409 | return Base1 == Base2; |
410 | } |
411 | |
412 | bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, |
413 | const MachineOperand &BaseOp2, |
414 | unsigned NumLoads) const { |
415 | const MachineInstr &FirstLdSt = *BaseOp1.getParent(); |
416 | const MachineInstr &SecondLdSt = *BaseOp2.getParent(); |
417 | |
418 | if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) |
419 | return false; |
420 | |
421 | const MachineOperand *FirstDst = nullptr; |
422 | const MachineOperand *SecondDst = nullptr; |
423 | |
424 | if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || |
425 | (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || |
426 | (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { |
427 | const unsigned MaxGlobalLoadCluster = 6; |
428 | if (NumLoads > MaxGlobalLoadCluster) |
429 | return false; |
430 | |
431 | FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); |
432 | if (!FirstDst) |
433 | FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); |
434 | SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); |
435 | if (!SecondDst) |
436 | SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); |
437 | } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { |
438 | FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); |
439 | SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); |
440 | } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { |
441 | FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); |
442 | SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); |
443 | } |
444 | |
445 | if (!FirstDst || !SecondDst) |
446 | return false; |
447 | |
448 | // Try to limit clustering based on the total number of bytes loaded |
449 | // rather than the number of instructions. This is done to help reduce |
450 | // register pressure. The method used is somewhat inexact, though, |
451 | // because it assumes that all loads in the cluster will load the |
452 | // same number of bytes as FirstLdSt. |
453 | |
454 | // The unit of this value is bytes. |
455 | // FIXME: This needs finer tuning. |
456 | unsigned LoadClusterThreshold = 16; |
457 | |
458 | const MachineRegisterInfo &MRI = |
459 | FirstLdSt.getParent()->getParent()->getRegInfo(); |
460 | |
461 | const unsigned Reg = FirstDst->getReg(); |
462 | |
463 | const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg) |
464 | ? MRI.getRegClass(Reg) |
465 | : RI.getPhysRegClass(Reg); |
466 | |
467 | return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; |
468 | } |
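| // Illustrative check of the threshold above (not part of the original |
| // source): four 4-byte (32-bit) destinations still cluster (4 * 4 = 16 <= |
| // 16), while three 8-byte (64-bit) destinations do not (3 * 8 = 24 > 16). |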
469 | |
470 | // FIXME: This behaves strangely. If, for example, you have 32 load + stores, |
471 | // the first 16 loads will be interleaved with the stores, and the next 16 will |
472 | // be clustered as expected. It should really split into two batches of 16. |
473 | // |
474 | // Loads are clustered until this returns false, rather than trying to schedule |
475 | // groups of stores. This also means we end up saying that loads from |
476 | // different address spaces should be clustered, including ones which might |
477 | // cause bank conflicts. |
478 | // |
479 | // This might be deprecated so it might not be worth that much effort to fix. |
480 | bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, |
481 | int64_t Offset0, int64_t Offset1, |
482 | unsigned NumLoads) const { |
483 | assert(Offset1 > Offset0 && |
484 | "Second offset should be larger than first offset!"); |
485 | // If we have fewer than 16 loads in a row, and the offsets are within 64 |
486 | // bytes, then schedule together. |
487 | |
488 | // A cacheline is 64 bytes (for global memory). |
489 | return (NumLoads <= 16 && (Offset1 - Offset0) < 64); |
490 | } |
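| // For example (illustrative): two loads at offsets 0 and 48 satisfy the |
| // check above (48 - 0 < 64) and are scheduled together, while offsets 0 and |
| // 128 are not. |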
491 | |
492 | static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, |
493 | MachineBasicBlock::iterator MI, |
494 | const DebugLoc &DL, unsigned DestReg, |
495 | unsigned SrcReg, bool KillSrc) { |
496 | MachineFunction *MF = MBB.getParent(); |
497 | DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), |
498 | "illegal SGPR to VGPR copy", |
499 | DL, DS_Error); |
500 | LLVMContext &C = MF->getFunction().getContext(); |
501 | C.diagnose(IllegalCopy); |
502 | |
503 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) |
504 | .addReg(SrcReg, getKillRegState(KillSrc)); |
505 | } |
506 | |
507 | void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, |
508 | MachineBasicBlock::iterator MI, |
509 | const DebugLoc &DL, unsigned DestReg, |
510 | unsigned SrcReg, bool KillSrc) const { |
511 | const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); |
512 | |
513 | if (RC == &AMDGPU::VGPR_32RegClass) { |
514 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || |
515 | AMDGPU::SReg_32RegClass.contains(SrcReg)); |
516 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) |
517 | .addReg(SrcReg, getKillRegState(KillSrc)); |
518 | return; |
519 | } |
520 | |
521 | if (RC == &AMDGPU::SReg_32_XM0RegClass || |
522 | RC == &AMDGPU::SReg_32RegClass) { |
523 | if (SrcReg == AMDGPU::SCC) { |
524 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) |
525 | .addImm(-1) |
526 | .addImm(0); |
527 | return; |
528 | } |
529 | |
530 | if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { |
531 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
532 | return; |
533 | } |
534 | |
535 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) |
536 | .addReg(SrcReg, getKillRegState(KillSrc)); |
537 | return; |
538 | } |
539 | |
540 | if (RC == &AMDGPU::SReg_64RegClass) { |
541 | if (DestReg == AMDGPU::VCC) { |
542 | if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { |
543 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) |
544 | .addReg(SrcReg, getKillRegState(KillSrc)); |
545 | } else { |
546 | // FIXME: Hack until VReg_1 removed. |
547 | assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); |
548 | BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) |
549 | .addImm(0) |
550 | .addReg(SrcReg, getKillRegState(KillSrc)); |
551 | } |
552 | |
553 | return; |
554 | } |
555 | |
556 | if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { |
557 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
558 | return; |
559 | } |
560 | |
561 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) |
562 | .addReg(SrcReg, getKillRegState(KillSrc)); |
563 | return; |
564 | } |
565 | |
566 | if (DestReg == AMDGPU::SCC) { |
567 | assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); |
568 | BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) |
569 | .addReg(SrcReg, getKillRegState(KillSrc)) |
570 | .addImm(0); |
571 | return; |
572 | } |
573 | |
574 | unsigned EltSize = 4; |
575 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; |
576 | if (RI.isSGPRClass(RC)) { |
577 | // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32. |
578 | if (!(RI.getRegSizeInBits(*RC) % 64)) { |
579 | Opcode = AMDGPU::S_MOV_B64; |
580 | EltSize = 8; |
581 | } else { |
582 | Opcode = AMDGPU::S_MOV_B32; |
583 | EltSize = 4; |
584 | } |
585 | |
586 | if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { |
587 | reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
588 | return; |
589 | } |
590 | } |
591 | |
592 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); |
593 | bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); |
594 | |
595 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { |
596 | unsigned SubIdx; |
597 | if (Forward) |
598 | SubIdx = SubIndices[Idx]; |
599 | else |
600 | SubIdx = SubIndices[SubIndices.size() - Idx - 1]; |
601 | |
602 | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, |
603 | get(Opcode), RI.getSubReg(DestReg, SubIdx)); |
604 | |
605 | Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); |
606 | |
607 | if (Idx == 0) |
608 | Builder.addReg(DestReg, RegState::Define | RegState::Implicit); |
609 | |
610 | bool UseKill = KillSrc && Idx == SubIndices.size() - 1; |
611 | Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); |
612 | } |
613 | } |
614 | |
615 | int SIInstrInfo::commuteOpcode(unsigned Opcode) const { |
616 | int NewOpc; |
617 | |
618 | // Try to map original to commuted opcode |
619 | NewOpc = AMDGPU::getCommuteRev(Opcode); |
620 | if (NewOpc != -1) |
621 | // Check if the commuted (REV) opcode exists on the target. |
622 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; |
623 | |
624 | // Try to map commuted to original opcode |
625 | NewOpc = AMDGPU::getCommuteOrig(Opcode); |
626 | if (NewOpc != -1) |
627 | // Check if the original (non-REV) opcode exists on the target. |
628 | return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; |
629 | |
630 | return Opcode; |
631 | } |
632 | |
633 | void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, |
634 | MachineBasicBlock::iterator MI, |
635 | const DebugLoc &DL, unsigned DestReg, |
636 | int64_t Value) const { |
637 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
638 | const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); |
639 | if (RegClass == &AMDGPU::SReg_32RegClass || |
640 | RegClass == &AMDGPU::SGPR_32RegClass || |
641 | RegClass == &AMDGPU::SReg_32_XM0RegClass || |
642 | RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { |
643 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) |
644 | .addImm(Value); |
645 | return; |
646 | } |
647 | |
648 | if (RegClass == &AMDGPU::SReg_64RegClass || |
649 | RegClass == &AMDGPU::SGPR_64RegClass || |
650 | RegClass == &AMDGPU::SReg_64_XEXECRegClass) { |
651 | BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) |
652 | .addImm(Value); |
653 | return; |
654 | } |
655 | |
656 | if (RegClass == &AMDGPU::VGPR_32RegClass) { |
657 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) |
658 | .addImm(Value); |
659 | return; |
660 | } |
661 | if (RegClass == &AMDGPU::VReg_64RegClass) { |
662 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) |
663 | .addImm(Value); |
664 | return; |
665 | } |
666 | |
667 | unsigned EltSize = 4; |
668 | unsigned Opcode = AMDGPU::V_MOV_B32_e32; |
669 | if (RI.isSGPRClass(RegClass)) { |
670 | if (RI.getRegSizeInBits(*RegClass) > 32) { |
671 | Opcode = AMDGPU::S_MOV_B64; |
672 | EltSize = 8; |
673 | } else { |
674 | Opcode = AMDGPU::S_MOV_B32; |
675 | EltSize = 4; |
676 | } |
677 | } |
678 | |
679 | ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); |
680 | for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { |
681 | int64_t IdxValue = Idx == 0 ? Value : 0; |
682 | |
683 | MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, |
684 | get(Opcode), RI.getSubReg(DestReg, Idx)); |
685 | Builder.addImm(IdxValue); |
686 | } |
687 | } |
688 | |
689 | const TargetRegisterClass * |
690 | SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { |
691 | return &AMDGPU::VGPR_32RegClass; |
692 | } |
693 | |
694 | void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, |
695 | MachineBasicBlock::iterator I, |
696 | const DebugLoc &DL, unsigned DstReg, |
697 | ArrayRef<MachineOperand> Cond, |
698 | unsigned TrueReg, |
699 | unsigned FalseReg) const { |
700 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
701 | assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && |
702 | "Not a VGPR32 reg"); |
703 | |
704 | if (Cond.size() == 1) { |
705 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
706 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) |
707 | .add(Cond[0]); |
708 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
709 | .addImm(0) |
710 | .addReg(FalseReg) |
711 | .addImm(0) |
712 | .addReg(TrueReg) |
713 | .addReg(SReg); |
714 | } else if (Cond.size() == 2) { |
715 | assert(Cond[0].isImm() && "Cond[0] is not an immediate"); |
716 | switch (Cond[0].getImm()) { |
717 | case SIInstrInfo::SCC_TRUE: { |
718 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
719 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) |
720 | .addImm(-1) |
721 | .addImm(0); |
722 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
723 | .addImm(0) |
724 | .addReg(FalseReg) |
725 | .addImm(0) |
726 | .addReg(TrueReg) |
727 | .addReg(SReg); |
728 | break; |
729 | } |
730 | case SIInstrInfo::SCC_FALSE: { |
731 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
732 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) |
733 | .addImm(0) |
734 | .addImm(-1); |
735 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
736 | .addImm(0) |
737 | .addReg(FalseReg) |
738 | .addImm(0) |
739 | .addReg(TrueReg) |
740 | .addReg(SReg); |
741 | break; |
742 | } |
743 | case SIInstrInfo::VCCNZ: { |
744 | MachineOperand RegOp = Cond[1]; |
745 | RegOp.setImplicit(false); |
746 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
747 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) |
748 | .add(RegOp); |
749 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
750 | .addImm(0) |
751 | .addReg(FalseReg) |
752 | .addImm(0) |
753 | .addReg(TrueReg) |
754 | .addReg(SReg); |
755 | break; |
756 | } |
757 | case SIInstrInfo::VCCZ: { |
758 | MachineOperand RegOp = Cond[1]; |
759 | RegOp.setImplicit(false); |
760 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
761 | BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) |
762 | .add(RegOp); |
763 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
764 | .addImm(0) |
765 | .addReg(TrueReg) |
766 | .addImm(0) |
767 | .addReg(FalseReg) |
768 | .addReg(SReg); |
769 | break; |
770 | } |
771 | case SIInstrInfo::EXECNZ: { |
772 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
773 | unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
774 | BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) |
775 | .addImm(0); |
776 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) |
777 | .addImm(-1) |
778 | .addImm(0); |
779 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
780 | .addImm(0) |
781 | .addReg(FalseReg) |
782 | .addImm(0) |
783 | .addReg(TrueReg) |
784 | .addReg(SReg); |
785 | break; |
786 | } |
787 | case SIInstrInfo::EXECZ: { |
788 | unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
789 | unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
790 | BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) |
791 | .addImm(0); |
792 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) |
793 | .addImm(0) |
794 | .addImm(-1); |
795 | BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
796 | .addImm(0) |
797 | .addReg(FalseReg) |
798 | .addImm(0) |
799 | .addReg(TrueReg) |
800 | .addReg(SReg); |
801 | llvm_unreachable("Unhandled branch predicate EXECZ"); |
802 | break; |
803 | } |
804 | default: |
805 | llvm_unreachable("invalid branch predicate"); |
806 | } |
807 | } else { |
808 | llvm_unreachable("Can only handle Cond size 1 or 2"); |
809 | } |
810 | } |
811 | |
812 | unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, |
813 | MachineBasicBlock::iterator I, |
814 | const DebugLoc &DL, |
815 | unsigned SrcReg, int Value) const { |
816 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
817 | unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
818 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) |
819 | .addImm(Value) |
820 | .addReg(SrcReg); |
821 | |
822 | return Reg; |
823 | } |
824 | |
825 | unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, |
826 | MachineBasicBlock::iterator I, |
827 | const DebugLoc &DL, |
828 | unsigned SrcReg, int Value) const { |
829 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
830 | unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
831 | BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) |
832 | .addImm(Value) |
833 | .addReg(SrcReg); |
834 | |
835 | return Reg; |
836 | } |
837 | |
838 | unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { |
839 | |
840 | if (RI.getRegSizeInBits(*DstRC) == 32) { |
841 | return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; |
842 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { |
843 | return AMDGPU::S_MOV_B64; |
844 | } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { |
845 | return AMDGPU::V_MOV_B64_PSEUDO; |
846 | } |
847 | return AMDGPU::COPY; |
848 | } |
849 | |
850 | static unsigned getSGPRSpillSaveOpcode(unsigned Size) { |
851 | switch (Size) { |
852 | case 4: |
853 | return AMDGPU::SI_SPILL_S32_SAVE; |
854 | case 8: |
855 | return AMDGPU::SI_SPILL_S64_SAVE; |
856 | case 12: |
857 | return AMDGPU::SI_SPILL_S96_SAVE; |
858 | case 16: |
859 | return AMDGPU::SI_SPILL_S128_SAVE; |
860 | case 20: |
861 | return AMDGPU::SI_SPILL_S160_SAVE; |
862 | case 32: |
863 | return AMDGPU::SI_SPILL_S256_SAVE; |
864 | case 64: |
865 | return AMDGPU::SI_SPILL_S512_SAVE; |
866 | default: |
867 | llvm_unreachable("unknown register size"); |
868 | } |
869 | } |
870 | |
871 | static unsigned getVGPRSpillSaveOpcode(unsigned Size) { |
872 | switch (Size) { |
873 | case 4: |
874 | return AMDGPU::SI_SPILL_V32_SAVE; |
875 | case 8: |
876 | return AMDGPU::SI_SPILL_V64_SAVE; |
877 | case 12: |
878 | return AMDGPU::SI_SPILL_V96_SAVE; |
879 | case 16: |
880 | return AMDGPU::SI_SPILL_V128_SAVE; |
881 | case 20: |
882 | return AMDGPU::SI_SPILL_V160_SAVE; |
883 | case 32: |
884 | return AMDGPU::SI_SPILL_V256_SAVE; |
885 | case 64: |
886 | return AMDGPU::SI_SPILL_V512_SAVE; |
887 | default: |
888 | llvm_unreachable("unknown register size"); |
889 | } |
890 | } |
891 | |
892 | void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, |
893 | MachineBasicBlock::iterator MI, |
894 | unsigned SrcReg, bool isKill, |
895 | int FrameIndex, |
896 | const TargetRegisterClass *RC, |
897 | const TargetRegisterInfo *TRI) const { |
898 | MachineFunction *MF = MBB.getParent(); |
899 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
900 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
901 | const DebugLoc &DL = MBB.findDebugLoc(MI); |
902 | |
903 | unsigned Size = FrameInfo.getObjectSize(FrameIndex); |
904 | unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); |
905 | MachinePointerInfo PtrInfo |
906 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); |
907 | MachineMemOperand *MMO |
908 | = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, |
909 | Size, Align); |
910 | unsigned SpillSize = TRI->getSpillSize(*RC); |
911 | |
912 | if (RI.isSGPRClass(RC)) { |
913 | MFI->setHasSpilledSGPRs(); |
914 | |
915 | // We are only allowed to create one new instruction when spilling |
916 | // registers, so we need to use a pseudo instruction for spilling SGPRs. |
917 | const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); |
918 | |
919 | // The SGPR spill/restore instructions only work on numbered SGPRs, so we need |
920 | // to make sure we are using the correct register class. |
921 | if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { |
922 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
923 | MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); |
924 | } |
925 | |
926 | MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) |
927 | .addReg(SrcReg, getKillRegState(isKill)) // data |
928 | .addFrameIndex(FrameIndex) // addr |
929 | .addMemOperand(MMO) |
930 | .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) |
931 | .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); |
932 | // Add the scratch resource registers as implicit uses because we may end up |
933 | // needing them, and need to ensure that the reserved registers are |
934 | // correctly handled. |
935 | |
936 | FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); |
937 | if (ST.hasScalarStores()) { |
938 | // m0 is used for offset to scalar stores if used to spill. |
939 | Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); |
940 | } |
941 | |
942 | return; |
943 | } |
944 | |
945 | assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); |
946 | |
947 | unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); |
948 | MFI->setHasSpilledVGPRs(); |
949 | BuildMI(MBB, MI, DL, get(Opcode)) |
950 | .addReg(SrcReg, getKillRegState(isKill)) // data |
951 | .addFrameIndex(FrameIndex) // addr |
952 | .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc |
953 | .addReg(MFI->getFrameOffsetReg()) // scratch_offset |
954 | .addImm(0) // offset |
955 | .addMemOperand(MMO); |
956 | } |
957 | |
958 | static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { |
959 | switch (Size) { |
960 | case 4: |
961 | return AMDGPU::SI_SPILL_S32_RESTORE; |
962 | case 8: |
963 | return AMDGPU::SI_SPILL_S64_RESTORE; |
964 | case 12: |
965 | return AMDGPU::SI_SPILL_S96_RESTORE; |
966 | case 16: |
967 | return AMDGPU::SI_SPILL_S128_RESTORE; |
968 | case 20: |
969 | return AMDGPU::SI_SPILL_S160_RESTORE; |
970 | case 32: |
971 | return AMDGPU::SI_SPILL_S256_RESTORE; |
972 | case 64: |
973 | return AMDGPU::SI_SPILL_S512_RESTORE; |
974 | default: |
975 | llvm_unreachable("unknown register size"); |
976 | } |
977 | } |
978 | |
979 | static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { |
980 | switch (Size) { |
981 | case 4: |
982 | return AMDGPU::SI_SPILL_V32_RESTORE; |
983 | case 8: |
984 | return AMDGPU::SI_SPILL_V64_RESTORE; |
985 | case 12: |
986 | return AMDGPU::SI_SPILL_V96_RESTORE; |
987 | case 16: |
988 | return AMDGPU::SI_SPILL_V128_RESTORE; |
989 | case 20: |
990 | return AMDGPU::SI_SPILL_V160_RESTORE; |
991 | case 32: |
992 | return AMDGPU::SI_SPILL_V256_RESTORE; |
993 | case 64: |
994 | return AMDGPU::SI_SPILL_V512_RESTORE; |
995 | default: |
996 | llvm_unreachable("unknown register size"); |
997 | } |
998 | } |
999 | |
1000 | void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, |
1001 | MachineBasicBlock::iterator MI, |
1002 | unsigned DestReg, int FrameIndex, |
1003 | const TargetRegisterClass *RC, |
1004 | const TargetRegisterInfo *TRI) const { |
1005 | MachineFunction *MF = MBB.getParent(); |
1006 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
1007 | MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
1008 | const DebugLoc &DL = MBB.findDebugLoc(MI); |
1009 | unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); |
1010 | unsigned Size = FrameInfo.getObjectSize(FrameIndex); |
1011 | unsigned SpillSize = TRI->getSpillSize(*RC); |
1012 | |
1013 | MachinePointerInfo PtrInfo |
1014 | = MachinePointerInfo::getFixedStack(*MF, FrameIndex); |
1015 | |
1016 | MachineMemOperand *MMO = MF->getMachineMemOperand( |
1017 | PtrInfo, MachineMemOperand::MOLoad, Size, Align); |
1018 | |
1019 | if (RI.isSGPRClass(RC)) { |
1020 | MFI->setHasSpilledSGPRs(); |
1021 | |
1022 | // FIXME: Maybe this should not include a memoperand because it will be |
1023 | // lowered to non-memory instructions. |
1024 | const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); |
1025 | if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { |
1026 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
1027 | MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); |
1028 | } |
1029 | |
1030 | FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); |
1031 | MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) |
1032 | .addFrameIndex(FrameIndex) // addr |
1033 | .addMemOperand(MMO) |
1034 | .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) |
1035 | .addReg(MFI->getFrameOffsetReg(), RegState::Implicit); |
1036 | |
1037 | if (ST.hasScalarStores()) { |
1038 | // m0 is used for offset to scalar stores if used to spill. |
1039 | Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); |
1040 | } |
1041 | |
1042 | return; |
1043 | } |
1044 | |
1045 | assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); |
1046 | |
1047 | unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); |
1048 | BuildMI(MBB, MI, DL, get(Opcode), DestReg) |
1049 | .addFrameIndex(FrameIndex) // vaddr |
1050 | .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc |
1051 | .addReg(MFI->getFrameOffsetReg()) // scratch_offset |
1052 | .addImm(0) // offset |
1053 | .addMemOperand(MMO); |
1054 | } |
1055 | |
1056 | /// \param FrameOffset Offset in bytes of the FrameIndex being spilled. |
1057 | unsigned SIInstrInfo::calculateLDSSpillAddress( |
1058 | MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, |
1059 | unsigned FrameOffset, unsigned Size) const { |
1060 | MachineFunction *MF = MBB.getParent(); |
1061 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
1062 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
1063 | const DebugLoc &DL = MBB.findDebugLoc(MI); |
1064 | unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); |
1065 | unsigned WavefrontSize = ST.getWavefrontSize(); |
1066 | |
1067 | unsigned TIDReg = MFI->getTIDReg(); |
1068 | if (!MFI->hasCalculatedTID()) { |
1069 | MachineBasicBlock &Entry = MBB.getParent()->front(); |
1070 | MachineBasicBlock::iterator Insert = Entry.front(); |
1071 | const DebugLoc &DL = Insert->getDebugLoc(); |
1072 | |
1073 | TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, |
1074 | *MF); |
1075 | if (TIDReg == AMDGPU::NoRegister) |
1076 | return TIDReg; |
1077 | |
1078 | if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) && |
1079 | WorkGroupSize > WavefrontSize) { |
1080 | unsigned TIDIGXReg |
1081 | = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); |
1082 | unsigned TIDIGYReg |
1083 | = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); |
1084 | unsigned TIDIGZReg |
1085 | = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); |
1086 | unsigned InputPtrReg = |
1087 | MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); |
1088 | for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { |
1089 | if (!Entry.isLiveIn(Reg)) |
1090 | Entry.addLiveIn(Reg); |
1091 | } |
1092 | |
1093 | RS->enterBasicBlock(Entry); |
1094 | // FIXME: Can we scavenge an SReg_64 and access the subregs? |
1095 | unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); |
1096 | unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); |
1097 | BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) |
1098 | .addReg(InputPtrReg) |
1099 | .addImm(SI::KernelInputOffsets::NGROUPS_Z); |
1100 | BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) |
1101 | .addReg(InputPtrReg) |
1102 | .addImm(SI::KernelInputOffsets::NGROUPS_Y); |
1103 | |
1104 | // NGROUPS.X * NGROUPS.Y |
1105 | BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) |
1106 | .addReg(STmp1) |
1107 | .addReg(STmp0); |
1108 | // (NGROUPS.X * NGROUPS.Y) * TIDIG.X |
1109 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) |
1110 | .addReg(STmp1) |
1111 | .addReg(TIDIGXReg); |
1112 | // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X) |
1113 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) |
1114 | .addReg(STmp0) |
1115 | .addReg(TIDIGYReg) |
1116 | .addReg(TIDReg); |
1117 | // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z |
1118 | getAddNoCarry(Entry, Insert, DL, TIDReg) |
1119 | .addReg(TIDReg) |
1120 | .addReg(TIDIGZReg) |
1121 | .addImm(0); // clamp bit |
1122 | } else { |
1123 | // Get the wave id |
1124 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), |
1125 | TIDReg) |
1126 | .addImm(-1) |
1127 | .addImm(0); |
1128 | |
1129 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), |
1130 | TIDReg) |
1131 | .addImm(-1) |
1132 | .addReg(TIDReg); |
1133 | } |
1134 | |
1135 | BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), |
1136 | TIDReg) |
1137 | .addImm(2) |
1138 | .addReg(TIDReg); |
1139 | MFI->setTIDReg(TIDReg); |
1140 | } |
1141 | |
1142 | // Add FrameIndex to LDS offset |
1143 | unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); |
1144 | getAddNoCarry(MBB, MI, DL, TmpReg) |
1145 | .addImm(LDSOffset) |
1146 | .addReg(TIDReg) |
1147 | .addImm(0); // clamp bit |
1148 | |
1149 | return TmpReg; |
1150 | } |
1151 | |
1152 | void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, |
1153 | MachineBasicBlock::iterator MI, |
1154 | int Count) const { |
1155 | DebugLoc DL = MBB.findDebugLoc(MI); |
1156 | while (Count > 0) { |
1157 | int Arg; |
1158 | if (Count >= 8) |
1159 | Arg = 7; |
1160 | else |
1161 | Arg = Count - 1; |
1162 | Count -= 8; |
1163 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) |
1164 | .addImm(Arg); |
1165 | } |
1166 | } |
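| // For example (illustrative): insertWaitStates(..., 10) emits "s_nop 7" |
| // followed by "s_nop 1", i.e. 8 + 2 = 10 wait states. |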
1167 | |
1168 | void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, |
1169 | MachineBasicBlock::iterator MI) const { |
1170 | insertWaitStates(MBB, MI, 1); |
1171 | } |
1172 | |
1173 | void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { |
1174 | auto MF = MBB.getParent(); |
1175 | SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
1176 | |
1177 | assert(Info->isEntryFunction()); |
1178 | |
1179 | if (MBB.succ_empty()) { |
1180 | bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); |
1181 | if (HasNoTerminator) { |
1182 | if (Info->returnsVoid()) { |
1183 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); |
1184 | } else { |
1185 | BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); |
1186 | } |
1187 | } |
1188 | } |
1189 | } |
1190 | |
1191 | unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { |
1192 | switch (MI.getOpcode()) { |
1193 | default: return 1; // FIXME: Do wait states equal cycles? |
1194 | |
1195 | case AMDGPU::S_NOP: |
1196 | return MI.getOperand(0).getImm() + 1; |
1197 | } |
1198 | } |
1199 | |
1200 | bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { |
1201 | MachineBasicBlock &MBB = *MI.getParent(); |
1202 | DebugLoc DL = MBB.findDebugLoc(MI); |
1203 | switch (MI.getOpcode()) { |
1204 | default: return TargetInstrInfo::expandPostRAPseudo(MI); |
1205 | case AMDGPU::S_MOV_B64_term: |
1206 | // This is only a terminator to get the correct spill code placement during |
1207 | // register allocation. |
1208 | MI.setDesc(get(AMDGPU::S_MOV_B64)); |
1209 | break; |
1210 | |
1211 | case AMDGPU::S_XOR_B64_term: |
1212 | // This is only a terminator to get the correct spill code placement during |
1213 | // register allocation. |
1214 | MI.setDesc(get(AMDGPU::S_XOR_B64)); |
1215 | break; |
1216 | |
1217 | case AMDGPU::S_ANDN2_B64_term: |
1218 | // This is only a terminator to get the correct spill code placement during |
1219 | // register allocation. |
1220 | MI.setDesc(get(AMDGPU::S_ANDN2_B64)); |
1221 | break; |
1222 | |
1223 | case AMDGPU::V_MOV_B64_PSEUDO: { |
1224 | unsigned Dst = MI.getOperand(0).getReg(); |
1225 | unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); |
1226 | unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); |
1227 | |
1228 | const MachineOperand &SrcOp = MI.getOperand(1); |
1229 | // FIXME: Will this work for 64-bit floating point immediates? |
1230 | assert(!SrcOp.isFPImm()); |
1231 | if (SrcOp.isImm()) { |
1232 | APInt Imm(64, SrcOp.getImm()); |
1233 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) |
1234 | .addImm(Imm.getLoBits(32).getZExtValue()) |
1235 | .addReg(Dst, RegState::Implicit | RegState::Define); |
1236 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) |
1237 | .addImm(Imm.getHiBits(32).getZExtValue()) |
1238 | .addReg(Dst, RegState::Implicit | RegState::Define); |
1239 | } else { |
1240 | assert(SrcOp.isReg()); |
1241 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) |
1242 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) |
1243 | .addReg(Dst, RegState::Implicit | RegState::Define); |
1244 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) |
1245 | .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) |
1246 | .addReg(Dst, RegState::Implicit | RegState::Define); |
1247 | } |
1248 | MI.eraseFromParent(); |
1249 | break; |
1250 | } |
1251 | case AMDGPU::V_SET_INACTIVE_B32: { |
1252 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) |
1253 | .addReg(AMDGPU::EXEC); |
1254 | BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) |
1255 | .add(MI.getOperand(2)); |
1256 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) |
1257 | .addReg(AMDGPU::EXEC); |
1258 | MI.eraseFromParent(); |
1259 | break; |
1260 | } |
1261 | case AMDGPU::V_SET_INACTIVE_B64: { |
1262 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) |
1263 | .addReg(AMDGPU::EXEC); |
1264 | MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), |
1265 | MI.getOperand(0).getReg()) |
1266 | .add(MI.getOperand(2)); |
1267 | expandPostRAPseudo(*Copy); |
1268 | BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) |
1269 | .addReg(AMDGPU::EXEC); |
1270 | MI.eraseFromParent(); |
1271 | break; |
1272 | } |
1273 | case AMDGPU::V_MOVRELD_B32_V1: |
1274 | case AMDGPU::V_MOVRELD_B32_V2: |
1275 | case AMDGPU::V_MOVRELD_B32_V4: |
1276 | case AMDGPU::V_MOVRELD_B32_V8: |
1277 | case AMDGPU::V_MOVRELD_B32_V16: { |
1278 | const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); |
1279 | unsigned VecReg = MI.getOperand(0).getReg(); |
1280 | bool IsUndef = MI.getOperand(1).isUndef(); |
1281 | unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); |
1282 | assert(VecReg == MI.getOperand(1).getReg()); |
1283 | |
1284 | MachineInstr *MovRel = |
1285 | BuildMI(MBB, MI, DL, MovRelDesc) |
1286 | .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) |
1287 | .add(MI.getOperand(2)) |
1288 | .addReg(VecReg, RegState::ImplicitDefine) |
1289 | .addReg(VecReg, |
1290 | RegState::Implicit | (IsUndef ? RegState::Undef : 0)); |
1291 | |
1292 | const int ImpDefIdx = |
1293 | MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); |
1294 | const int ImpUseIdx = ImpDefIdx + 1; |
1295 | MovRel->tieOperands(ImpDefIdx, ImpUseIdx); |
1296 | |
1297 | MI.eraseFromParent(); |
1298 | break; |
1299 | } |
1300 | case AMDGPU::SI_PC_ADD_REL_OFFSET: { |
1301 | MachineFunction &MF = *MBB.getParent(); |
1302 | unsigned Reg = MI.getOperand(0).getReg(); |
1303 | unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); |
1304 | unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); |
1305 | |
1306 | // Create a bundle so these instructions won't be re-ordered by the |
1307 | // post-RA scheduler. |
1308 | MIBundleBuilder Bundler(MBB, MI); |
1309 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); |
1310 | |
1311 | // Add 32-bit offset from this instruction to the start of the |
1312 | // constant data. |
1313 | Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) |
1314 | .addReg(RegLo) |
1315 | .add(MI.getOperand(1))); |
1316 | |
1317 | MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) |
1318 | .addReg(RegHi); |
1319 | if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) |
1320 | MIB.addImm(0); |
1321 | else |
1322 | MIB.add(MI.getOperand(2)); |
1323 | |
1324 | Bundler.append(MIB); |
1325 | finalizeBundle(MBB, Bundler.begin()); |
1326 | |
1327 | MI.eraseFromParent(); |
1328 | break; |
1329 | } |
1330 | case AMDGPU::ENTER_WWM: { |
1331 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when |
1332 | // WWM is entered. |
1333 | MI.setDesc(get(AMDGPU::S_OR_SAVEEXEC_B64)); |
1334 | break; |
1335 | } |
1336 | case AMDGPU::EXIT_WWM: { |
1337 | // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when |
1338 | // WWM is exited. |
1339 | MI.setDesc(get(AMDGPU::S_MOV_B64)); |
1340 | break; |
1341 | } |
1342 | case TargetOpcode::BUNDLE: { |
1343 | if (!MI.mayLoad()) |
1344 | return false; |
1345 | |
1346 | // If it is a load, it must be a memory clause. |
1347 | for (MachineBasicBlock::instr_iterator I = MI.getIterator(); |
1348 | I->isBundledWithSucc(); ++I) { |
1349 | I->unbundleFromSucc(); |
1350 | for (MachineOperand &MO : I->operands()) |
1351 | if (MO.isReg()) |
1352 | MO.setIsInternalRead(false); |
1353 | } |
1354 | |
1355 | MI.eraseFromParent(); |
1356 | break; |
1357 | } |
1358 | } |
1359 | return true; |
1360 | } |
1361 | |
1362 | bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, |
1363 | MachineOperand &Src0, |
1364 | unsigned Src0OpName, |
1365 | MachineOperand &Src1, |
1366 | unsigned Src1OpName) const { |
1367 | MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); |
1368 | if (!Src0Mods) |
1369 | return false; |
1370 | |
1371 | MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); |
1372 | assert(Src1Mods && |
1373 | "All commutable instructions have both src0 and src1 modifiers"); |
1374 | |
1375 | int Src0ModsVal = Src0Mods->getImm(); |
1376 | int Src1ModsVal = Src1Mods->getImm(); |
1377 | |
1378 | Src1Mods->setImm(Src0ModsVal); |
1379 | Src0Mods->setImm(Src1ModsVal); |
1380 | return true; |
1381 | } |
1382 | |
1383 | static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, |
1384 | MachineOperand &RegOp, |
1385 | MachineOperand &NonRegOp) { |
1386 | unsigned Reg = RegOp.getReg(); |
1387 | unsigned SubReg = RegOp.getSubReg(); |
1388 | bool IsKill = RegOp.isKill(); |
1389 | bool IsDead = RegOp.isDead(); |
1390 | bool IsUndef = RegOp.isUndef(); |
1391 | bool IsDebug = RegOp.isDebug(); |
1392 | |
1393 | if (NonRegOp.isImm()) |
1394 | RegOp.ChangeToImmediate(NonRegOp.getImm()); |
1395 | else if (NonRegOp.isFI()) |
1396 | RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); |
1397 | else |
1398 | return nullptr; |
1399 | |
1400 | NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); |
1401 | NonRegOp.setSubReg(SubReg); |
1402 | |
1403 | return &MI; |
1404 | } |
1405 | |
1406 | MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, |
1407 | unsigned Src0Idx, |
1408 | unsigned Src1Idx) const { |
1409 | assert(!NewMI && "this should never be used"); |
1410 | |
1411 | unsigned Opc = MI.getOpcode(); |
1412 | int CommutedOpcode = commuteOpcode(Opc); |
1413 | if (CommutedOpcode == -1) |
1414 | return nullptr; |
1415 | |
1416 | assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == |
1417 | static_cast<int>(Src0Idx) && |
1418 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == |
1419 | static_cast<int>(Src1Idx) && |
1420 | "inconsistency with findCommutedOpIndices"); |
1421 | |
1422 | MachineOperand &Src0 = MI.getOperand(Src0Idx); |
1423 | MachineOperand &Src1 = MI.getOperand(Src1Idx); |
1424 | |
1425 | MachineInstr *CommutedMI = nullptr; |
1426 | if (Src0.isReg() && Src1.isReg()) { |
1427 | if (isOperandLegal(MI, Src1Idx, &Src0)) { |
1428 | // Be sure to copy the source modifiers to the right place. |
1429 | CommutedMI |
1430 | = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); |
1431 | } |
1432 | |
1433 | } else if (Src0.isReg() && !Src1.isReg()) { |
1434 | // src0 should always be able to support any operand type, so no need to |
1435 | // check operand legality. |
1436 | CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); |
1437 | } else if (!Src0.isReg() && Src1.isReg()) { |
1438 | if (isOperandLegal(MI, Src1Idx, &Src0)) |
1439 | CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); |
1440 | } else { |
1441 | // FIXME: Found two non-register operands to commute. This does happen. |
1442 | return nullptr; |
1443 | } |
1444 | |
1445 | if (CommutedMI) { |
1446 | swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, |
1447 | Src1, AMDGPU::OpName::src1_modifiers); |
1448 | |
1449 | CommutedMI->setDesc(get(CommutedOpcode)); |
1450 | } |
1451 | |
1452 | return CommutedMI; |
1453 | } |
1454 | |
1455 | // This needs to be implemented because the source modifiers may be inserted |
1456 | // between the true commutable operands, and the base |
1457 | // TargetInstrInfo::commuteInstruction uses it. |
1458 | bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, |
1459 | unsigned &SrcOpIdx1) const { |
1460 | return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); |
1461 | } |
1462 | |
1463 | bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, |
1464 | unsigned &SrcOpIdx1) const { |
1465 | if (!Desc.isCommutable()) |
1466 | return false; |
1467 | |
1468 | unsigned Opc = Desc.getOpcode(); |
1469 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
1470 | if (Src0Idx == -1) |
1471 | return false; |
1472 | |
1473 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); |
1474 | if (Src1Idx == -1) |
1475 | return false; |
1476 | |
1477 | return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); |
1478 | } |
1479 | |
1480 | bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, |
1481 | int64_t BrOffset) const { |
1482 | // BranchRelaxation should never have to check s_setpc_b64 because its dest |
1483 | // block is unanalyzable. |
1484 | assert(BranchOp != AMDGPU::S_SETPC_B64); |
1485 | |
1486 | // Convert to dwords. |
1487 | BrOffset /= 4; |
1488 | |
1489 | // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is |
1490 | // from the next instruction. |
1491 | BrOffset -= 1; |
1492 | |
1493 | return isIntN(BranchOffsetBits, BrOffset); |
1494 | } |
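
To make the range concrete, here is a minimal standalone sketch of the same check (illustration only, not part of the file; the helper name is made up). The byte offset is converted to dwords and biased by one because the branch adds 4 to the PC before applying the immediate, and the result must then fit in the signed branch immediate. With the default of 16 offset bits this allows roughly 128 KiB of displacement in either direction.

    #include <cstdint>

    // Mirrors SIInstrInfo::isBranchOffsetInRange for a given immediate width.
    static bool fitsInBranchImm(unsigned OffsetBits, int64_t ByteOffset) {
      int64_t DWords = ByteOffset / 4 - 1; // offset is measured from the next instruction
      int64_t Min = -(int64_t(1) << (OffsetBits - 1));
      int64_t Max = (int64_t(1) << (OffsetBits - 1)) - 1;
      return DWords >= Min && DWords <= Max;
    }
    // fitsInBranchImm(16, 131072) is true; fitsInBranchImm(16, 131076) is false.
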
1495 | |
1496 | MachineBasicBlock *SIInstrInfo::getBranchDestBlock( |
1497 | const MachineInstr &MI) const { |
1498 | if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { |
1499 | // This would be a difficult analysis to perform, but can always be legal so |
1500 | // there's no need to analyze it. |
1501 | return nullptr; |
1502 | } |
1503 | |
1504 | return MI.getOperand(0).getMBB(); |
1505 | } |
1506 | |
1507 | unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, |
1508 | MachineBasicBlock &DestBB, |
1509 | const DebugLoc &DL, |
1510 | int64_t BrOffset, |
1511 | RegScavenger *RS) const { |
1512 | assert(RS && "RegScavenger required for long branching"); |
1513 | assert(MBB.empty() && |
1514 | "new block should be inserted for expanding unconditional branch"); |
1515 | assert(MBB.pred_size() == 1); |
1516 | |
1517 | MachineFunction *MF = MBB.getParent(); |
1518 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
1519 | |
1520 | // FIXME: Virtual register workaround for RegScavenger not working with empty |
1521 | // blocks. |
1522 | unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
1523 | |
1524 | auto I = MBB.end(); |
1525 | |
1526 | // We need to compute the offset relative to the instruction immediately after |
1527 | // s_getpc_b64. Insert pc arithmetic code before last terminator. |
1528 | MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); |
1529 | |
1530 | // TODO: Handle > 32-bit block address. |
1531 | if (BrOffset >= 0) { |
1532 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) |
1533 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) |
1534 | .addReg(PCReg, 0, AMDGPU::sub0) |
1535 | .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); |
1536 | BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) |
1537 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) |
1538 | .addReg(PCReg, 0, AMDGPU::sub1) |
1539 | .addImm(0); |
1540 | } else { |
1541 | // Backwards branch. |
1542 | BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) |
1543 | .addReg(PCReg, RegState::Define, AMDGPU::sub0) |
1544 | .addReg(PCReg, 0, AMDGPU::sub0) |
1545 | .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); |
1546 | BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) |
1547 | .addReg(PCReg, RegState::Define, AMDGPU::sub1) |
1548 | .addReg(PCReg, 0, AMDGPU::sub1) |
1549 | .addImm(0); |
1550 | } |
1551 | |
1552 | // Insert the indirect branch after the other terminator. |
1553 | BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) |
1554 | .addReg(PCReg); |
1555 | |
1556 | // FIXME: If spilling is necessary, this will fail because this scavenger has |
1557 | // no emergency stack slots. It is non-trivial to spill in this situation, |
1558 | // because the restore code needs to be specially placed after the |
1559 | // jump. BranchRelaxation then needs to be made aware of the newly inserted |
1560 | // block. |
1561 | // |
1562 | // If a spill is needed for the pc register pair, we need to insert a spill |
1563 | // restore block right before the destination block, and insert a short branch |
1564 | // into the old destination block's fallthrough predecessor. |
1565 | // e.g.: |
1566 | // |
1567 | // s_cbranch_scc0 skip_long_branch: |
1568 | // |
1569 | // long_branch_bb: |
1570 | // spill s[8:9] |
1571 | // s_getpc_b64 s[8:9] |
1572 | // s_add_u32 s8, s8, restore_bb |
1573 | // s_addc_u32 s9, s9, 0 |
1574 | // s_setpc_b64 s[8:9] |
1575 | // |
1576 | // skip_long_branch: |
1577 | // foo; |
1578 | // |
1579 | // ..... |
1580 | // |
1581 | // dest_bb_fallthrough_predecessor: |
1582 | // bar; |
1583 | // s_branch dest_bb |
1584 | // |
1585 | // restore_bb: |
1586 | // restore s[8:9] |
1587 | // fallthrough dest_bb |
1588 | // |
1589 | // dest_bb: |
1590 | // buzz; |
1591 | |
1592 | RS->enterBasicBlockEnd(MBB); |
1593 | unsigned Scav = RS->scavengeRegisterBackwards( |
1594 | AMDGPU::SReg_64RegClass, |
1595 | MachineBasicBlock::iterator(GetPC), false, 0); |
1596 | MRI.replaceRegWith(PCReg, Scav); |
1597 | MRI.clearVirtRegs(); |
1598 | RS->setRegUsed(Scav); |
1599 | |
1600 | return 4 + 8 + 4 + 4; |
1601 | } |
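
The value returned above is the size in bytes of the emitted long-branch sequence. A plausible reading of the 4 + 8 + 4 + 4 breakdown (the per-term attribution is an assumption; only the total of 20 bytes is what the caller sees): s_getpc_b64 takes 4 bytes, the s_add_u32/s_sub_u32 carries a 32-bit literal for the destination block and so takes 8, and the carry op and the final s_setpc_b64 take 4 bytes each.
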
1602 | |
1603 | unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { |
1604 | switch (Cond) { |
1605 | case SIInstrInfo::SCC_TRUE: |
1606 | return AMDGPU::S_CBRANCH_SCC1; |
1607 | case SIInstrInfo::SCC_FALSE: |
1608 | return AMDGPU::S_CBRANCH_SCC0; |
1609 | case SIInstrInfo::VCCNZ: |
1610 | return AMDGPU::S_CBRANCH_VCCNZ; |
1611 | case SIInstrInfo::VCCZ: |
1612 | return AMDGPU::S_CBRANCH_VCCZ; |
1613 | case SIInstrInfo::EXECNZ: |
1614 | return AMDGPU::S_CBRANCH_EXECNZ; |
1615 | case SIInstrInfo::EXECZ: |
1616 | return AMDGPU::S_CBRANCH_EXECZ; |
1617 | default: |
1618 | llvm_unreachable("invalid branch predicate"); |
1619 | } |
1620 | } |
1621 | |
1622 | SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { |
1623 | switch (Opcode) { |
1624 | case AMDGPU::S_CBRANCH_SCC0: |
1625 | return SCC_FALSE; |
1626 | case AMDGPU::S_CBRANCH_SCC1: |
1627 | return SCC_TRUE; |
1628 | case AMDGPU::S_CBRANCH_VCCNZ: |
1629 | return VCCNZ; |
1630 | case AMDGPU::S_CBRANCH_VCCZ: |
1631 | return VCCZ; |
1632 | case AMDGPU::S_CBRANCH_EXECNZ: |
1633 | return EXECNZ; |
1634 | case AMDGPU::S_CBRANCH_EXECZ: |
1635 | return EXECZ; |
1636 | default: |
1637 | return INVALID_BR; |
1638 | } |
1639 | } |
1640 | |
1641 | bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, |
1642 | MachineBasicBlock::iterator I, |
1643 | MachineBasicBlock *&TBB, |
1644 | MachineBasicBlock *&FBB, |
1645 | SmallVectorImpl<MachineOperand> &Cond, |
1646 | bool AllowModify) const { |
1647 | if (I->getOpcode() == AMDGPU::S_BRANCH) { |
1648 | // Unconditional Branch |
1649 | TBB = I->getOperand(0).getMBB(); |
1650 | return false; |
1651 | } |
1652 | |
1653 | MachineBasicBlock *CondBB = nullptr; |
1654 | |
1655 | if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { |
1656 | CondBB = I->getOperand(1).getMBB(); |
1657 | Cond.push_back(I->getOperand(0)); |
1658 | } else { |
1659 | BranchPredicate Pred = getBranchPredicate(I->getOpcode()); |
1660 | if (Pred == INVALID_BR) |
1661 | return true; |
1662 | |
1663 | CondBB = I->getOperand(0).getMBB(); |
1664 | Cond.push_back(MachineOperand::CreateImm(Pred)); |
1665 | Cond.push_back(I->getOperand(1)); // Save the branch register. |
1666 | } |
1667 | ++I; |
1668 | |
1669 | if (I == MBB.end()) { |
1670 | // Conditional branch followed by fall-through. |
1671 | TBB = CondBB; |
1672 | return false; |
1673 | } |
1674 | |
1675 | if (I->getOpcode() == AMDGPU::S_BRANCH) { |
1676 | TBB = CondBB; |
1677 | FBB = I->getOperand(0).getMBB(); |
1678 | return false; |
1679 | } |
1680 | |
1681 | return true; |
1682 | } |
1683 | |
1684 | bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, |
1685 | MachineBasicBlock *&FBB, |
1686 | SmallVectorImpl<MachineOperand> &Cond, |
1687 | bool AllowModify) const { |
1688 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); |
1689 | auto E = MBB.end(); |
1690 | if (I == E) |
1691 | return false; |
1692 | |
1693 | // Skip over the instructions that are artificially terminators for special |
1694 | // exec management. |
1695 | while (I != E && !I->isBranch() && !I->isReturn() && |
1696 | I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { |
1697 | switch (I->getOpcode()) { |
1698 | case AMDGPU::SI_MASK_BRANCH: |
1699 | case AMDGPU::S_MOV_B64_term: |
1700 | case AMDGPU::S_XOR_B64_term: |
1701 | case AMDGPU::S_ANDN2_B64_term: |
1702 | break; |
1703 | case AMDGPU::SI_IF: |
1704 | case AMDGPU::SI_ELSE: |
1705 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
1706 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
1707 | // FIXME: It's messy that these need to be considered here at all. |
1708 | return true; |
1709 | default: |
1710 | llvm_unreachable("unexpected non-branch terminator inst"); |
1711 | } |
1712 | |
1713 | ++I; |
1714 | } |
1715 | |
1716 | if (I == E) |
1717 | return false; |
1718 | |
1719 | if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) |
1720 | return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); |
1721 | |
1722 | ++I; |
1723 | |
1724 | // TODO: Should be able to treat as fallthrough? |
1725 | if (I == MBB.end()) |
1726 | return true; |
1727 | |
1728 | if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) |
1729 | return true; |
1730 | |
1731 | MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); |
1732 | |
1733 | // Specifically handle the case where the conditional branch is to the same |
1734 | // destination as the mask branch. e.g. |
1735 | // |
1736 | // si_mask_branch BB8 |
1737 | // s_cbranch_execz BB8 |
1738 | // s_cbranch BB9 |
1739 | // |
1740 | // This is required to understand divergent loops which may need the branches |
1741 | // to be relaxed. |
1742 | if (TBB != MaskBrDest || Cond.empty()) |
1743 | return true; |
1744 | |
1745 | auto Pred = Cond[0].getImm(); |
1746 | return (Pred != EXECZ && Pred != EXECNZ); |
1747 | } |
1748 | |
1749 | unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, |
1750 | int *BytesRemoved) const { |
1751 | MachineBasicBlock::iterator I = MBB.getFirstTerminator(); |
1752 | |
1753 | unsigned Count = 0; |
1754 | unsigned RemovedSize = 0; |
1755 | while (I != MBB.end()) { |
1756 | MachineBasicBlock::iterator Next = std::next(I); |
1757 | if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { |
1758 | I = Next; |
1759 | continue; |
1760 | } |
1761 | |
1762 | RemovedSize += getInstSizeInBytes(*I); |
1763 | I->eraseFromParent(); |
1764 | ++Count; |
1765 | I = Next; |
1766 | } |
1767 | |
1768 | if (BytesRemoved) |
1769 | *BytesRemoved = RemovedSize; |
1770 | |
1771 | return Count; |
1772 | } |
1773 | |
1774 | // Copy the flags onto the implicit condition register operand. |
1775 | static void preserveCondRegFlags(MachineOperand &CondReg, |
1776 | const MachineOperand &OrigCond) { |
1777 | CondReg.setIsUndef(OrigCond.isUndef()); |
1778 | CondReg.setIsKill(OrigCond.isKill()); |
1779 | } |
1780 | |
1781 | unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, |
1782 | MachineBasicBlock *TBB, |
1783 | MachineBasicBlock *FBB, |
1784 | ArrayRef<MachineOperand> Cond, |
1785 | const DebugLoc &DL, |
1786 | int *BytesAdded) const { |
1787 | if (!FBB && Cond.empty()) { |
1788 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) |
1789 | .addMBB(TBB); |
1790 | if (BytesAdded) |
1791 | *BytesAdded = 4; |
1792 | return 1; |
1793 | } |
1794 | |
1795 | if (Cond.size() == 1 && Cond[0].isReg()) { |
1796 | BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) |
1797 | .add(Cond[0]) |
1798 | .addMBB(TBB); |
1799 | return 1; |
1800 | } |
1801 | |
1802 | assert(TBB && Cond[0].isImm()); |
1803 | |
1804 | unsigned Opcode |
1805 | = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); |
1806 | |
1807 | if (!FBB) { |
1808 | Cond[1].isUndef(); |
1809 | MachineInstr *CondBr = |
1810 | BuildMI(&MBB, DL, get(Opcode)) |
1811 | .addMBB(TBB); |
1812 | |
1813 | // Copy the flags onto the implicit condition register operand. |
1814 | preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); |
1815 | |
1816 | if (BytesAdded) |
1817 | *BytesAdded = 4; |
1818 | return 1; |
1819 | } |
1820 | |
1821 | assert(TBB && FBB); |
1822 | |
1823 | MachineInstr *CondBr = |
1824 | BuildMI(&MBB, DL, get(Opcode)) |
1825 | .addMBB(TBB); |
1826 | BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) |
1827 | .addMBB(FBB); |
1828 | |
1829 | MachineOperand &CondReg = CondBr->getOperand(1); |
1830 | CondReg.setIsUndef(Cond[1].isUndef()); |
1831 | CondReg.setIsKill(Cond[1].isKill()); |
1832 | |
1833 | if (BytesAdded) |
1834 | *BytesAdded = 8; |
1835 | |
1836 | return 2; |
1837 | } |
1838 | |
1839 | bool SIInstrInfo::reverseBranchCondition( |
1840 | SmallVectorImpl<MachineOperand> &Cond) const { |
1841 | if (Cond.size() != 2) { |
1842 | return true; |
1843 | } |
1844 | |
1845 | if (Cond[0].isImm()) { |
1846 | Cond[0].setImm(-Cond[0].getImm()); |
1847 | return false; |
1848 | } |
1849 | |
1850 | return true; |
1851 | } |
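
Negating the immediate works because the branch predicates are declared so that each condition and its inverse are arithmetic negations of one another; insertSelect below relies on the same property when it flips Pred. A minimal sketch of the pairing this assumes, with the enumerator values quoted from SIInstrInfo.h for illustration:

    enum BranchPredicate {
      INVALID_BR = 0,
      SCC_TRUE = 1, SCC_FALSE = -1,
      VCCNZ = 2,    VCCZ = -2,
      EXECNZ = -3,  EXECZ = 3
    };
    // -SCC_TRUE == SCC_FALSE, -VCCNZ == VCCZ, -EXECNZ == EXECZ, so reversing a
    // condition only requires flipping the sign of Cond[0].
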
1852 | |
1853 | bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, |
1854 | ArrayRef<MachineOperand> Cond, |
1855 | unsigned TrueReg, unsigned FalseReg, |
1856 | int &CondCycles, |
1857 | int &TrueCycles, int &FalseCycles) const { |
1858 | switch (Cond[0].getImm()) { |
1859 | case VCCNZ: |
1860 | case VCCZ: { |
1861 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
1862 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); |
1863 | assert(MRI.getRegClass(FalseReg) == RC); |
1864 | |
1865 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; |
1866 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? |
1867 | |
1868 | // Limit to equal cost for branch vs. N v_cndmask_b32s. |
1869 | return !RI.isSGPRClass(RC) && NumInsts <= 6; |
1870 | } |
1871 | case SCC_TRUE: |
1872 | case SCC_FALSE: { |
1873 | // FIXME: We could insert for VGPRs if we could replace the original compare |
1874 | // with a vector one. |
1875 | const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
1876 | const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); |
1877 | assert(MRI.getRegClass(FalseReg) == RC); |
1878 | |
1879 | int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; |
1880 | |
1881 | // An even number of 32-bit pieces can be handled with s_cselect_b64. |
1882 | if (NumInsts % 2 == 0) |
1883 | NumInsts /= 2; |
1884 | |
1885 | CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? |
1886 | return RI.isSGPRClass(RC); |
1887 | } |
1888 | default: |
1889 | return false; |
1890 | } |
1891 | } |
1892 | |
1893 | void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, |
1894 | MachineBasicBlock::iterator I, const DebugLoc &DL, |
1895 | unsigned DstReg, ArrayRef<MachineOperand> Cond, |
1896 | unsigned TrueReg, unsigned FalseReg) const { |
1897 | BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); |
1898 | if (Pred == VCCZ || Pred == SCC_FALSE) { |
1899 | Pred = static_cast<BranchPredicate>(-Pred); |
1900 | std::swap(TrueReg, FalseReg); |
1901 | } |
1902 | |
1903 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
1904 | const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); |
1905 | unsigned DstSize = RI.getRegSizeInBits(*DstRC); |
1906 | |
1907 | if (DstSize == 32) { |
1908 | unsigned SelOp = Pred == SCC_TRUE ? |
1909 | AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; |
1910 | |
1911 | // Instruction's operands are backwards from what is expected. |
1912 | MachineInstr *Select = |
1913 | BuildMI(MBB, I, DL, get(SelOp), DstReg) |
1914 | .addReg(FalseReg) |
1915 | .addReg(TrueReg); |
1916 | |
1917 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); |
1918 | return; |
1919 | } |
1920 | |
1921 | if (DstSize == 64 && Pred == SCC_TRUE) { |
1922 | MachineInstr *Select = |
1923 | BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) |
1924 | .addReg(FalseReg) |
1925 | .addReg(TrueReg); |
1926 | |
1927 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); |
1928 | return; |
1929 | } |
1930 | |
1931 | static const int16_t Sub0_15[] = { |
1932 | AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, |
1933 | AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, |
1934 | AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, |
1935 | AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, |
1936 | }; |
1937 | |
1938 | static const int16_t Sub0_15_64[] = { |
1939 | AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, |
1940 | AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, |
1941 | AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, |
1942 | AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, |
1943 | }; |
1944 | |
1945 | unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; |
1946 | const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; |
1947 | const int16_t *SubIndices = Sub0_15; |
1948 | int NElts = DstSize / 32; |
1949 | |
1950 | // 64-bit select is only available for SALU. |
1951 | // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. |
1952 | if (Pred == SCC_TRUE) { |
1953 | if (NElts % 2) { |
1954 | SelOp = AMDGPU::S_CSELECT_B32; |
1955 | EltRC = &AMDGPU::SGPR_32RegClass; |
1956 | } else { |
1957 | SelOp = AMDGPU::S_CSELECT_B64; |
1958 | EltRC = &AMDGPU::SGPR_64RegClass; |
1959 | SubIndices = Sub0_15_64; |
1960 | NElts /= 2; |
1961 | } |
1962 | } |
1963 | |
1964 | MachineInstrBuilder MIB = BuildMI( |
1965 | MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); |
1966 | |
1967 | I = MIB->getIterator(); |
1968 | |
1969 | SmallVector<unsigned, 8> Regs; |
1970 | for (int Idx = 0; Idx != NElts; ++Idx) { |
1971 | unsigned DstElt = MRI.createVirtualRegister(EltRC); |
1972 | Regs.push_back(DstElt); |
1973 | |
1974 | unsigned SubIdx = SubIndices[Idx]; |
1975 | |
1976 | MachineInstr *Select = |
1977 | BuildMI(MBB, I, DL, get(SelOp), DstElt) |
1978 | .addReg(FalseReg, 0, SubIdx) |
1979 | .addReg(TrueReg, 0, SubIdx); |
1980 | preserveCondRegFlags(Select->getOperand(3), Cond[1]); |
1981 | |
1982 | MIB.addReg(DstElt) |
1983 | .addImm(SubIdx); |
1984 | } |
1985 | } |
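
For example (register numbers illustrative), selecting a 64-bit VGPR value on VCCNZ is expanded into one v_cndmask_b32 per 32-bit half, each reading the false value in src0 and the true value in src1 with VCC as the implicit condition, and the halves are glued back together with a REG_SEQUENCE; a 128-bit SGPR select under SCC_TRUE instead becomes two s_cselect_b64 results combined the same way.
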
1986 | |
1987 | bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { |
1988 | switch (MI.getOpcode()) { |
1989 | case AMDGPU::V_MOV_B32_e32: |
1990 | case AMDGPU::V_MOV_B32_e64: |
1991 | case AMDGPU::V_MOV_B64_PSEUDO: { |
1992 | // If there are additional implicit register operands, this may be used for |
1993 | // register indexing, so the source register operand isn't simply copied. |
1994 | unsigned NumOps = MI.getDesc().getNumOperands() + |
1995 | MI.getDesc().getNumImplicitUses(); |
1996 | |
1997 | return MI.getNumOperands() == NumOps; |
1998 | } |
1999 | case AMDGPU::S_MOV_B32: |
2000 | case AMDGPU::S_MOV_B64: |
2001 | case AMDGPU::COPY: |
2002 | return true; |
2003 | default: |
2004 | return false; |
2005 | } |
2006 | } |
2007 | |
2008 | unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( |
2009 | unsigned Kind) const { |
2010 | switch(Kind) { |
2011 | case PseudoSourceValue::Stack: |
2012 | case PseudoSourceValue::FixedStack: |
2013 | return AMDGPUAS::PRIVATE_ADDRESS; |
2014 | case PseudoSourceValue::ConstantPool: |
2015 | case PseudoSourceValue::GOT: |
2016 | case PseudoSourceValue::JumpTable: |
2017 | case PseudoSourceValue::GlobalValueCallEntry: |
2018 | case PseudoSourceValue::ExternalSymbolCallEntry: |
2019 | case PseudoSourceValue::TargetCustom: |
2020 | return AMDGPUAS::CONSTANT_ADDRESS; |
2021 | } |
2022 | return AMDGPUAS::FLAT_ADDRESS; |
2023 | } |
2024 | |
2025 | static void removeModOperands(MachineInstr &MI) { |
2026 | unsigned Opc = MI.getOpcode(); |
2027 | int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, |
2028 | AMDGPU::OpName::src0_modifiers); |
2029 | int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, |
2030 | AMDGPU::OpName::src1_modifiers); |
2031 | int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, |
2032 | AMDGPU::OpName::src2_modifiers); |
2033 | |
2034 | MI.RemoveOperand(Src2ModIdx); |
2035 | MI.RemoveOperand(Src1ModIdx); |
2036 | MI.RemoveOperand(Src0ModIdx); |
2037 | } |
2038 | |
2039 | bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, |
2040 | unsigned Reg, MachineRegisterInfo *MRI) const { |
2041 | if (!MRI->hasOneNonDBGUse(Reg)) |
2042 | return false; |
2043 | |
2044 | switch (DefMI.getOpcode()) { |
2045 | default: |
2046 | return false; |
2047 | case AMDGPU::S_MOV_B64: |
2048 | // TODO: We could fold 64-bit immediates, but this gets complicated |
2049 | // when there are sub-registers. |
2050 | return false; |
2051 | |
2052 | case AMDGPU::V_MOV_B32_e32: |
2053 | case AMDGPU::S_MOV_B32: |
2054 | break; |
2055 | } |
2056 | |
2057 | const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); |
2058 | assert(ImmOp); |
2059 | // FIXME: We could handle FrameIndex values here. |
2060 | if (!ImmOp->isImm()) |
2061 | return false; |
2062 | |
2063 | unsigned Opc = UseMI.getOpcode(); |
2064 | if (Opc == AMDGPU::COPY) { |
2065 | bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); |
2066 | unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; |
2067 | UseMI.setDesc(get(NewOpc)); |
2068 | UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); |
2069 | UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); |
2070 | return true; |
2071 | } |
2072 | |
2073 | if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || |
2074 | Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || |
2075 | Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || |
2076 | Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { |
2077 | // Don't fold if we are using source or output modifiers. The new VOP2 |
2078 | // instructions don't have them. |
2079 | if (hasAnyModifiersSet(UseMI)) |
2080 | return false; |
2081 | |
2082 | // If this is a free constant, there's no reason to do this. |
2083 | // TODO: We could fold this here instead of letting SIFoldOperands do it |
2084 | // later. |
2085 | MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); |
2086 | |
2087 | // Any src operand can be used for the legality check. |
2088 | if (isInlineConstant(UseMI, *Src0, *ImmOp)) |
2089 | return false; |
2090 | |
2091 | bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || |
2092 | Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; |
2093 | bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || |
2094 | Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; |
2095 | MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); |
2096 | MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); |
2097 | |
2098 | // Multiplied part is the constant: Use v_madmk_{f16, f32}. |
2099 | // We should only expect these to be on src0 due to canonicalizations. |
2100 | if (Src0->isReg() && Src0->getReg() == Reg) { |
2101 | if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) |
2102 | return false; |
2103 | |
2104 | if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) |
2105 | return false; |
2106 | |
2107 | unsigned NewOpc = |
2108 | IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) |
2109 | : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); |
2110 | if (pseudoToMCOpcode(NewOpc) == -1) |
2111 | return false; |
2112 | |
2113 | // We need to swap operands 0 and 1 since madmk constant is at operand 1. |
2114 | |
2115 | const int64_t Imm = ImmOp->getImm(); |
2116 | |
2117 | // FIXME: This would be a lot easier if we could return a new instruction |
2118 | // instead of having to modify in place. |
2119 | |
2120 | // Remove these first since they are at the end. |
2121 | UseMI.RemoveOperand( |
2122 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); |
2123 | UseMI.RemoveOperand( |
2124 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); |
2125 | |
2126 | unsigned Src1Reg = Src1->getReg(); |
2127 | unsigned Src1SubReg = Src1->getSubReg(); |
2128 | Src0->setReg(Src1Reg); |
2129 | Src0->setSubReg(Src1SubReg); |
2130 | Src0->setIsKill(Src1->isKill()); |
2131 | |
2132 | if (Opc == AMDGPU::V_MAC_F32_e64 || |
2133 | Opc == AMDGPU::V_MAC_F16_e64 || |
2134 | Opc == AMDGPU::V_FMAC_F32_e64 || |
2135 | Opc == AMDGPU::V_FMAC_F16_e64) |
2136 | UseMI.untieRegOperand( |
2137 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); |
2138 | |
2139 | Src1->ChangeToImmediate(Imm); |
2140 | |
2141 | removeModOperands(UseMI); |
2142 | UseMI.setDesc(get(NewOpc)); |
2143 | |
2144 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); |
2145 | if (DeleteDef) |
2146 | DefMI.eraseFromParent(); |
2147 | |
2148 | return true; |
2149 | } |
2150 | |
2151 | // Added part is the constant: Use v_madak_{f16, f32}. |
2152 | if (Src2->isReg() && Src2->getReg() == Reg) { |
2153 | // Not allowed to use constant bus for another operand. |
2154 | // We can, however, allow an inline immediate as src0. |
2155 | bool Src0Inlined = false; |
2156 | if (Src0->isReg()) { |
2157 | // Try to inline constant if possible. |
2158 | // If the def is a move-immediate and the register has a single use, |
2159 | // we save a VGPR here. |
2160 | MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); |
2161 | if (Def && Def->isMoveImmediate() && |
2162 | isInlineConstant(Def->getOperand(1)) && |
2163 | MRI->hasOneUse(Src0->getReg())) { |
2164 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); |
2165 | Src0Inlined = true; |
2166 | } else if ((RI.isPhysicalRegister(Src0->getReg()) && |
2167 | (ST.getConstantBusLimit(Opc) <= 1 && |
2168 | RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || |
2169 | (RI.isVirtualRegister(Src0->getReg()) && |
2170 | (ST.getConstantBusLimit(Opc) <= 1 && |
2171 | RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) |
2172 | return false; |
2173 | // VGPR is okay as Src0 - fallthrough |
2174 | } |
2175 | |
2176 | if (Src1->isReg() && !Src0Inlined ) { |
2177 | // We have one slot for inlinable constant so far - try to fill it |
2178 | MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); |
2179 | if (Def && Def->isMoveImmediate() && |
2180 | isInlineConstant(Def->getOperand(1)) && |
2181 | MRI->hasOneUse(Src1->getReg()) && |
2182 | commuteInstruction(UseMI)) { |
2183 | Src0->ChangeToImmediate(Def->getOperand(1).getImm()); |
2184 | } else if ((RI.isPhysicalRegister(Src1->getReg()) && |
2185 | RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || |
2186 | (RI.isVirtualRegister(Src1->getReg()) && |
2187 | RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) |
2188 | return false; |
2189 | // VGPR is okay as Src1 - fallthrough |
2190 | } |
2191 | |
2192 | unsigned NewOpc = |
2193 | IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) |
2194 | : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); |
2195 | if (pseudoToMCOpcode(NewOpc) == -1) |
2196 | return false; |
2197 | |
2198 | const int64_t Imm = ImmOp->getImm(); |
2199 | |
2200 | // FIXME: This would be a lot easier if we could return a new instruction |
2201 | // instead of having to modify in place. |
2202 | |
2203 | // Remove these first since they are at the end. |
2204 | UseMI.RemoveOperand( |
2205 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); |
2206 | UseMI.RemoveOperand( |
2207 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); |
2208 | |
2209 | if (Opc == AMDGPU::V_MAC_F32_e64 || |
2210 | Opc == AMDGPU::V_MAC_F16_e64 || |
2211 | Opc == AMDGPU::V_FMAC_F32_e64 || |
2212 | Opc == AMDGPU::V_FMAC_F16_e64) |
2213 | UseMI.untieRegOperand( |
2214 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); |
2215 | |
2216 | // ChangeToImmediate adds Src2 back to the instruction. |
2217 | Src2->ChangeToImmediate(Imm); |
2218 | |
2219 | // These come before src2. |
2220 | removeModOperands(UseMI); |
2221 | UseMI.setDesc(get(NewOpc)); |
2222 | // It might happen that UseMI was commuted and we now have an SGPR as src1. |
2223 | // If so, the inlined constant together with the SGPR can be illegal, |
2224 | // so re-legalize the operands. |
2225 | legalizeOperands(UseMI); |
2226 | |
2227 | bool DeleteDef = MRI->hasOneNonDBGUse(Reg); |
2228 | if (DeleteDef) |
2229 | DefMI.eraseFromParent(); |
2230 | |
2231 | return true; |
2232 | } |
2233 | } |
2234 | |
2235 | return false; |
2236 | } |
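
The two rewrites above fold the moved literal into the multiply-add itself: a constant feeding src0 (or src1, after commuting) selects the *MADMK*/*FMAMK* path, and a constant feeding src2 selects the *MADAK*/*FMAAK* path. A minimal sketch of what the folded forms compute, in plain C++ (float semantics only; the function names here are descriptive, not LLVM APIs):

    // dst = src0 * K + src1   (the literal K is the multiplied operand)
    static float madmk(float Src0, float K, float Src1) { return Src0 * K + Src1; }
    // dst = src0 * src1 + K   (the literal K is the added operand)
    static float madak(float Src0, float Src1, float K) { return Src0 * Src1 + K; }
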
2237 | |
2238 | static bool offsetsDoNotOverlap(int WidthA, int OffsetA, |
2239 | int WidthB, int OffsetB) { |
2240 | int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; |
2241 | int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; |
2242 | int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; |
2243 | return LowOffset + LowWidth <= HighOffset; |
2244 | } |
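
As a quick check of the interval test: a 4-byte access at offset 0 and a 4-byte access at offset 4 are disjoint because 0 + 4 <= 4, while an 8-byte access at offset 0 and a 4-byte access at offset 4 are not, because 0 + 8 > 4.
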
2245 | |
2246 | bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, |
2247 | const MachineInstr &MIb) const { |
2248 | const MachineOperand *BaseOp0, *BaseOp1; |
2249 | int64_t Offset0, Offset1; |
2250 | |
2251 | if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && |
2252 | getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) { |
2253 | if (!BaseOp0->isIdenticalTo(*BaseOp1)) |
2254 | return false; |
2255 | |
2256 | if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { |
2257 | // FIXME: Handle ds_read2 / ds_write2. |
2258 | return false; |
2259 | } |
2260 | unsigned Width0 = (*MIa.memoperands_begin())->getSize(); |
2261 | unsigned Width1 = (*MIb.memoperands_begin())->getSize(); |
2262 | if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { |
2263 | return true; |
2264 | } |
2265 | } |
2266 | |
2267 | return false; |
2268 | } |
2269 | |
2270 | bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, |
2271 | const MachineInstr &MIb, |
2272 | AliasAnalysis *AA) const { |
2273 | assert((MIa.mayLoad() || MIa.mayStore()) && |
2274 | "MIa must load from or modify a memory location"); |
2275 | assert((MIb.mayLoad() || MIb.mayStore()) && |
2276 | "MIb must load from or modify a memory location"); |
2277 | |
2278 | if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) |
2279 | return false; |
2280 | |
2281 | // XXX - Can we relax this between address spaces? |
2282 | if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) |
2283 | return false; |
2284 | |
2285 | // TODO: Should we check the address space from the MachineMemOperand? That |
2286 | // would allow us to distinguish objects we know don't alias based on the |
2287 | // underlying address space, even if it was lowered to a different one, |
2288 | // e.g. private accesses lowered to use MUBUF instructions on a scratch |
2289 | // buffer. |
2290 | if (isDS(MIa)) { |
2291 | if (isDS(MIb)) |
2292 | return checkInstOffsetsDoNotOverlap(MIa, MIb); |
2293 | |
2294 | return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); |
2295 | } |
2296 | |
2297 | if (isMUBUF(MIa) || isMTBUF(MIa)) { |
2298 | if (isMUBUF(MIb) || isMTBUF(MIb)) |
2299 | return checkInstOffsetsDoNotOverlap(MIa, MIb); |
2300 | |
2301 | return !isFLAT(MIb) && !isSMRD(MIb); |
2302 | } |
2303 | |
2304 | if (isSMRD(MIa)) { |
2305 | if (isSMRD(MIb)) |
2306 | return checkInstOffsetsDoNotOverlap(MIa, MIb); |
2307 | |
2308 | return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); |
2309 | } |
2310 | |
2311 | if (isFLAT(MIa)) { |
2312 | if (isFLAT(MIb)) |
2313 | return checkInstOffsetsDoNotOverlap(MIa, MIb); |
2314 | |
2315 | return false; |
2316 | } |
2317 | |
2318 | return false; |
2319 | } |
2320 | |
2321 | static int64_t getFoldableImm(const MachineOperand* MO) { |
2322 | if (!MO->isReg()) |
2323 | return false; |
2324 | const MachineFunction *MF = MO->getParent()->getParent()->getParent(); |
2325 | const MachineRegisterInfo &MRI = MF->getRegInfo(); |
2326 | auto Def = MRI.getUniqueVRegDef(MO->getReg()); |
2327 | if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && |
2328 | Def->getOperand(1).isImm()) |
2329 | return Def->getOperand(1).getImm(); |
2330 | return AMDGPU::NoRegister; |
2331 | } |
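
Note that both not-foldable results here are zero (false and AMDGPU::NoRegister convert to 0 for the int64_t return type), and the callers below test the result with "if (auto Imm = ...)", so an operand whose defining v_mov_b32 carries the literal 0 is treated as not foldable and the fold is simply skipped in that case.
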
2332 | |
2333 | MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, |
2334 | MachineInstr &MI, |
2335 | LiveVariables *LV) const { |
2336 | unsigned Opc = MI.getOpcode(); |
2337 | bool IsF16 = false; |
2338 | bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || |
2339 | Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; |
2340 | |
2341 | switch (Opc) { |
2342 | default: |
2343 | return nullptr; |
2344 | case AMDGPU::V_MAC_F16_e64: |
2345 | case AMDGPU::V_FMAC_F16_e64: |
2346 | IsF16 = true; |
2347 | LLVM_FALLTHROUGH; |
2348 | case AMDGPU::V_MAC_F32_e64: |
2349 | case AMDGPU::V_FMAC_F32_e64: |
2350 | break; |
2351 | case AMDGPU::V_MAC_F16_e32: |
2352 | case AMDGPU::V_FMAC_F16_e32: |
2353 | IsF16 = true; |
2354 | LLVM_FALLTHROUGH; |
2355 | case AMDGPU::V_MAC_F32_e32: |
2356 | case AMDGPU::V_FMAC_F32_e32: { |
2357 | int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), |
2358 | AMDGPU::OpName::src0); |
2359 | const MachineOperand *Src0 = &MI.getOperand(Src0Idx); |
2360 | if (!Src0->isReg() && !Src0->isImm()) |
2361 | return nullptr; |
2362 | |
2363 | if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) |
2364 | return nullptr; |
2365 | |
2366 | break; |
2367 | } |
2368 | } |
2369 | |
2370 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); |
2371 | const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); |
2372 | const MachineOperand *Src0Mods = |
2373 | getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); |
2374 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); |
2375 | const MachineOperand *Src1Mods = |
2376 | getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); |
2377 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); |
2378 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); |
2379 | const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); |
2380 | |
2381 | if (!Src0Mods && !Src1Mods && !Clamp && !Omod && |
2382 | // If we have an SGPR input, we will violate the constant bus restriction. |
2383 | (ST.getConstantBusLimit(Opc) > 1 || |
2384 | !Src0->isReg() || |
2385 | !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { |
2386 | if (auto Imm = getFoldableImm(Src2)) { |
2387 | unsigned NewOpc = |
2388 | IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) |
2389 | : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); |
2390 | if (pseudoToMCOpcode(NewOpc) != -1) |
2391 | return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) |
2392 | .add(*Dst) |
2393 | .add(*Src0) |
2394 | .add(*Src1) |
2395 | .addImm(Imm); |
2396 | } |
2397 | unsigned NewOpc = |
2398 | IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) |
2399 | : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); |
2400 | if (auto Imm = getFoldableImm(Src1)) { |
2401 | if (pseudoToMCOpcode(NewOpc) != -1) |
2402 | return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) |
2403 | .add(*Dst) |
2404 | .add(*Src0) |
2405 | .addImm(Imm) |
2406 | .add(*Src2); |
2407 | } |
2408 | if (auto Imm = getFoldableImm(Src0)) { |
2409 | if (pseudoToMCOpcode(NewOpc) != -1 && |
2410 | isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc, |
2411 | AMDGPU::OpName::src0), Src1)) |
2412 | return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) |
2413 | .add(*Dst) |
2414 | .add(*Src1) |
2415 | .addImm(Imm) |
2416 | .add(*Src2); |
2417 | } |
2418 | } |
2419 | |
2420 | unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) |
2421 | : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); |
2422 | if (pseudoToMCOpcode(NewOpc) == -1) |
2423 | return nullptr; |
2424 | |
2425 | return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) |
2426 | .add(*Dst) |
2427 | .addImm(Src0Mods ? Src0Mods->getImm() : 0) |
2428 | .add(*Src0) |
2429 | .addImm(Src1Mods ? Src1Mods->getImm() : 0) |
2430 | .add(*Src1) |
2431 | .addImm(0) // src2 modifiers |
2432 | .add(*Src2) |
2433 | .addImm(Clamp ? Clamp->getImm() : 0) |
2434 | .addImm(Omod ? Omod->getImm() : 0); |
2435 | } |
2436 | } |
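
In the simplest case (register numbers illustrative), a two-address v_mac_f32 v0, v1, v2, where v0 is both the accumulator and the destination, becomes the three-address v_mad_f32 v0, v1, v2, v0 built at the end of this function, now with explicit slots for source modifiers, clamp and omod; when one of the inputs is defined by a foldable v_mov_b32 immediate, the earlier branches instead emit the shorter v_madak/v_madmk (or v_fmaak/v_fmamk) forms.
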
2437 | // It's not generally safe to move VALU instructions across these since it will |
2438 | // start using the register as a base index rather than directly. |
2439 | // XXX - Why isn't hasSideEffects sufficient for these? |
2440 | static bool changesVGPRIndexingMode(const MachineInstr &MI) { |
2441 | switch (MI.getOpcode()) { |
2442 | case AMDGPU::S_SET_GPR_IDX_ON: |
2443 | case AMDGPU::S_SET_GPR_IDX_MODE: |
2444 | case AMDGPU::S_SET_GPR_IDX_OFF: |
2445 | return true; |
2446 | default: |
2447 | return false; |
2448 | } |
2449 | } |
2450 | |
2451 | bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, |
2452 | const MachineBasicBlock *MBB, |
2453 | const MachineFunction &MF) const { |
2454 | // XXX - Do we want the SP check in the base implementation? |
2455 | |
2456 | // Target-independent instructions do not have an implicit-use of EXEC, even |
2457 | // when they operate on VGPRs. Treating EXEC modifications as scheduling |
2458 | // boundaries prevents incorrect movements of such instructions. |
2459 | return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || |
2460 | MI.modifiesRegister(AMDGPU::EXEC, &RI) || |
2461 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || |
2462 | MI.getOpcode() == AMDGPU::S_SETREG_B32 || |
2463 | changesVGPRIndexingMode(MI); |
2464 | } |
2465 | |
2466 | bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { |
2467 | return Opcode == AMDGPU::DS_ORDERED_COUNT || |
2468 | Opcode == AMDGPU::DS_GWS_INIT || |
2469 | Opcode == AMDGPU::DS_GWS_SEMA_V || |
2470 | Opcode == AMDGPU::DS_GWS_SEMA_BR || |
2471 | Opcode == AMDGPU::DS_GWS_SEMA_P || |
2472 | Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || |
2473 | Opcode == AMDGPU::DS_GWS_BARRIER; |
2474 | } |
2475 | |
2476 | bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { |
2477 | unsigned Opcode = MI.getOpcode(); |
2478 | |
2479 | if (MI.mayStore() && isSMRD(MI)) |
2480 | return true; // scalar store or atomic |
2481 | |
2482 | // These instructions cause shader I/O that may cause hardware lockups |
2483 | // when executed with an empty EXEC mask. |
2484 | // |
2485 | // Note: exp with VM = DONE = 0 is automatically skipped by hardware when |
2486 | // EXEC = 0, but checking for that case here seems not worth it |
2487 | // given the typical code patterns. |
2488 | if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || |
2489 | Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || |
2490 | Opcode == AMDGPU::DS_ORDERED_COUNT) |
2491 | return true; |
2492 | |
2493 | if (MI.isCall() || MI.isInlineAsm()) |
2494 | return true; // conservative assumption |
2495 | |
2496 | // These are like SALU instructions in terms of effects, so it's questionable |
2497 | // whether we should return true for those. |
2498 | // |
2499 | // However, executing them with EXEC = 0 causes them to operate on undefined |
2500 | // data, which we avoid by returning true here. |
2501 | if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32) |
2502 | return true; |
2503 | |
2504 | return false; |
2505 | } |
2506 | |
2507 | bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, |
2508 | const MachineInstr &MI) const { |
2509 | if (MI.isMetaInstruction()) |
2510 | return false; |
2511 | |
2512 | // This won't read exec if this is an SGPR->SGPR copy. |
2513 | if (MI.isCopyLike()) { |
2514 | if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) |
2515 | return true; |
2516 | |
2517 | // Make sure this isn't copying exec as a normal operand |
2518 | return MI.readsRegister(AMDGPU::EXEC, &RI); |
2519 | } |
2520 | |
2521 | // Make a conservative assumption about the callee. |
2522 | if (MI.isCall()) |
2523 | return true; |
2524 | |
2525 | // Be conservative with any unhandled generic opcodes. |
2526 | if (!isTargetSpecificOpcode(MI.getOpcode())) |
2527 | return true; |
2528 | |
2529 | return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); |
2530 | } |
2531 | |
2532 | bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { |
2533 | switch (Imm.getBitWidth()) { |
2534 | case 1: // This likely will be a condition code mask. |
2535 | return true; |
2536 | |
2537 | case 32: |
2538 | return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), |
2539 | ST.hasInv2PiInlineImm()); |
2540 | case 64: |
2541 | return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), |
2542 | ST.hasInv2PiInlineImm()); |
2543 | case 16: |
2544 | return ST.has16BitInsts() && |
2545 | AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), |
2546 | ST.hasInv2PiInlineImm()); |
2547 | default: |
2548 | llvm_unreachable("invalid bitwidth"); |
2549 | } |
2550 | } |
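
For reference, a minimal sketch of the 32-bit case, i.e. of what AMDGPU::isInlinableLiteral32 accepts as this code uses it (the list below is an approximation for illustration, not the authoritative implementation in AMDGPUBaseInfo): small signed integers and a handful of float bit patterns encode directly in the instruction word and therefore never require a trailing literal dword.

    #include <cstdint>

    // Illustrative approximation of the 32-bit inline-constant test.
    static bool looksInlinable32(int32_t Imm, bool HasInv2Pi) {
      if (Imm >= -16 && Imm <= 64)        // small signed integers
        return true;
      switch (static_cast<uint32_t>(Imm)) {
      case 0x3f000000: case 0xbf000000:   // +/-0.5f
      case 0x3f800000: case 0xbf800000:   // +/-1.0f
      case 0x40000000: case 0xc0000000:   // +/-2.0f
      case 0x40800000: case 0xc0800000:   // +/-4.0f
        return true;
      case 0x3e22f983:                    // 1/(2*pi), only with hasInv2PiInlineImm()
        return HasInv2Pi;
      default:
        return false;
      }
    }
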
2551 | |
2552 | bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, |
2553 | uint8_t OperandType) const { |
2554 | if (!MO.isImm() || |
2555 | OperandType < AMDGPU::OPERAND_SRC_FIRST || |
2556 | OperandType > AMDGPU::OPERAND_SRC_LAST) |
2557 | return false; |
2558 | |
2559 | // MachineOperand provides no way to tell the true operand size, since it only |
2560 | // records a 64-bit value. We need to know the size to determine if a 32-bit |
2561 | // floating point immediate bit pattern is legal for an integer immediate. It |
2562 | // would be for any 32-bit integer operand, but would not be for a 64-bit one. |
2563 | |
2564 | int64_t Imm = MO.getImm(); |
2565 | switch (OperandType) { |
2566 | case AMDGPU::OPERAND_REG_IMM_INT32: |
2567 | case AMDGPU::OPERAND_REG_IMM_FP32: |
2568 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: |
2569 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: { |
2570 | int32_t Trunc = static_cast<int32_t>(Imm); |
2571 | return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); |
2572 | } |
2573 | case AMDGPU::OPERAND_REG_IMM_INT64: |
2574 | case AMDGPU::OPERAND_REG_IMM_FP64: |
2575 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: |
2576 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: |
2577 | return AMDGPU::isInlinableLiteral64(MO.getImm(), |
2578 | ST.hasInv2PiInlineImm()); |
2579 | case AMDGPU::OPERAND_REG_IMM_INT16: |
2580 | case AMDGPU::OPERAND_REG_IMM_FP16: |
2581 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: |
2582 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: { |
2583 | if (isInt<16>(Imm) || isUInt<16>(Imm)) { |
2584 | // A few special case instructions have 16-bit operands on subtargets |
2585 | // where 16-bit instructions are not legal. |
2586 | // TODO: Do the 32-bit immediates work? We shouldn't really need to handle |
2587 | // constants in these cases |
2588 | int16_t Trunc = static_cast<int16_t>(Imm); |
2589 | return ST.has16BitInsts() && |
2590 | AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); |
2591 | } |
2592 | |
2593 | return false; |
2594 | } |
2595 | case AMDGPU::OPERAND_REG_IMM_V2INT16: |
2596 | case AMDGPU::OPERAND_REG_IMM_V2FP16: |
2597 | case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: |
2598 | case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { |
2599 | uint32_t Trunc = static_cast<uint32_t>(Imm); |
2600 | return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); |
2601 | } |
2602 | default: |
2603 | llvm_unreachable("invalid bitwidth");
2604 | } |
2605 | } |
2606 | |
2607 | bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, |
2608 | const MCOperandInfo &OpInfo) const { |
2609 | switch (MO.getType()) { |
2610 | case MachineOperand::MO_Register: |
2611 | return false; |
2612 | case MachineOperand::MO_Immediate: |
2613 | return !isInlineConstant(MO, OpInfo); |
2614 | case MachineOperand::MO_FrameIndex: |
2615 | case MachineOperand::MO_MachineBasicBlock: |
2616 | case MachineOperand::MO_ExternalSymbol: |
2617 | case MachineOperand::MO_GlobalAddress: |
2618 | case MachineOperand::MO_MCSymbol: |
2619 | return true; |
2620 | default: |
2621 | llvm_unreachable("unexpected operand type");
2622 | } |
2623 | } |
2624 | |
2625 | static bool compareMachineOp(const MachineOperand &Op0, |
2626 | const MachineOperand &Op1) { |
2627 | if (Op0.getType() != Op1.getType()) |
2628 | return false; |
2629 | |
2630 | switch (Op0.getType()) { |
2631 | case MachineOperand::MO_Register: |
2632 | return Op0.getReg() == Op1.getReg(); |
2633 | case MachineOperand::MO_Immediate: |
2634 | return Op0.getImm() == Op1.getImm(); |
2635 | default: |
2636 | llvm_unreachable("Didn't expect to be comparing these operand types");
2637 | } |
2638 | } |
2639 | |
2640 | bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, |
2641 | const MachineOperand &MO) const { |
2642 | const MCInstrDesc &InstDesc = MI.getDesc(); |
2643 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; |
2644 | |
2645 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2646 | |
2647 | if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) |
2648 | return true; |
2649 | |
2650 | if (OpInfo.RegClass < 0) |
2651 | return false; |
2652 | |
2653 | if (MO.isImm() && isInlineConstant(MO, OpInfo)) |
2654 | return RI.opCanUseInlineConstant(OpInfo.OperandType); |
2655 | |
2656 | if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) |
2657 | return false; |
2658 | |
2659 | if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) |
2660 | return true; |
2661 | |
2662 | const MachineFunction *MF = MI.getParent()->getParent(); |
2663 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
2664 | return ST.hasVOP3Literal(); |
2665 | } |
2666 | |
2667 | bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { |
2668 | int Op32 = AMDGPU::getVOPe32(Opcode); |
2669 | if (Op32 == -1) |
2670 | return false; |
2671 | |
2672 | return pseudoToMCOpcode(Op32) != -1; |
2673 | } |
2674 | |
2675 | bool SIInstrInfo::hasModifiers(unsigned Opcode) const { |
2676 | // The src0_modifiers operand is present on all instructions
2677 | // that have modifiers. |
2678 | |
2679 | return AMDGPU::getNamedOperandIdx(Opcode, |
2680 | AMDGPU::OpName::src0_modifiers) != -1; |
2681 | } |
2682 | |
2683 | bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, |
2684 | unsigned OpName) const { |
2685 | const MachineOperand *Mods = getNamedOperand(MI, OpName); |
2686 | return Mods && Mods->getImm(); |
2687 | } |
2688 | |
2689 | bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { |
2690 | return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || |
2691 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || |
2692 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || |
2693 | hasModifiersSet(MI, AMDGPU::OpName::clamp) || |
2694 | hasModifiersSet(MI, AMDGPU::OpName::omod); |
2695 | } |
2696 | |
2697 | bool SIInstrInfo::canShrink(const MachineInstr &MI, |
2698 | const MachineRegisterInfo &MRI) const { |
2699 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); |
2700 | // Can't shrink instruction with three operands. |
2701 | // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add |
2702 | // a special case for it. It can only be shrunk if the third operand |
2703 | // is vcc, and src0_modifiers and src1_modifiers are not set. |
2704 | // We should handle this the same way we handle vopc, by adding
2705 | // a register allocation hint pre-regalloc and then doing the shrinking
2706 | // post-regalloc. |
2707 | if (Src2) { |
2708 | switch (MI.getOpcode()) { |
2709 | default: return false; |
2710 | |
2711 | case AMDGPU::V_ADDC_U32_e64: |
2712 | case AMDGPU::V_SUBB_U32_e64: |
2713 | case AMDGPU::V_SUBBREV_U32_e64: { |
2714 | const MachineOperand *Src1 |
2715 | = getNamedOperand(MI, AMDGPU::OpName::src1); |
2716 | if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) |
2717 | return false; |
2718 | // Additional verification is needed for sdst/src2. |
2719 | return true; |
2720 | } |
2721 | case AMDGPU::V_MAC_F32_e64: |
2722 | case AMDGPU::V_MAC_F16_e64: |
2723 | case AMDGPU::V_FMAC_F32_e64: |
2724 | case AMDGPU::V_FMAC_F16_e64: |
2725 | if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || |
2726 | hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) |
2727 | return false; |
2728 | break; |
2729 | |
2730 | case AMDGPU::V_CNDMASK_B32_e64: |
2731 | break; |
2732 | } |
2733 | } |
2734 | |
2735 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); |
2736 | if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || |
2737 | hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) |
2738 | return false; |
2739 | |
2740 | // We don't need to check src0 because all input types are legal for it, so
2741 | // just make sure src0 isn't using any modifiers.
2742 | if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) |
2743 | return false; |
2744 | |
2745 | // Can it be shrunk to a valid 32-bit opcode?
2746 | if (!hasVALU32BitEncoding(MI.getOpcode())) |
2747 | return false; |
2748 | |
2749 | // Check output modifiers |
2750 | return !hasModifiersSet(MI, AMDGPU::OpName::omod) && |
2751 | !hasModifiersSet(MI, AMDGPU::OpName::clamp); |
2752 | } |
2753 | |
2754 | // Set VCC operand with all flags from \p Orig, except for setting it as |
2755 | // implicit. |
2756 | static void copyFlagsToImplicitVCC(MachineInstr &MI, |
2757 | const MachineOperand &Orig) { |
2758 | |
2759 | for (MachineOperand &Use : MI.implicit_operands()) { |
2760 | if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { |
2761 | Use.setIsUndef(Orig.isUndef()); |
2762 | Use.setIsKill(Orig.isKill()); |
2763 | return; |
2764 | } |
2765 | } |
2766 | } |
2767 | |
2768 | MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, |
2769 | unsigned Op32) const { |
2770 | MachineBasicBlock *MBB = MI.getParent();
2771 | MachineInstrBuilder Inst32 = |
2772 | BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)); |
2773 | |
2774 | // Add the dst operand if the 32-bit encoding also has an explicit $vdst. |
2775 | // For VOPC instructions, this is replaced by an implicit def of vcc. |
2776 | int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); |
2777 | if (Op32DstIdx != -1) { |
2778 | // dst |
2779 | Inst32.add(MI.getOperand(0)); |
2780 | } else { |
2781 | assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
2782 | "Unexpected case");
2783 | } |
2784 | |
2785 | Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); |
2786 | |
2787 | const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); |
2788 | if (Src1) |
2789 | Inst32.add(*Src1); |
2790 | |
2791 | const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); |
2792 | |
2793 | if (Src2) { |
2794 | int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); |
2795 | if (Op32Src2Idx != -1) { |
2796 | Inst32.add(*Src2); |
2797 | } else { |
2798 | // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is |
2799 | // replaced with an implicit read of vcc. This was already added |
2800 | // during the initial BuildMI, so find it to preserve the flags. |
2801 | copyFlagsToImplicitVCC(*Inst32, *Src2); |
2802 | } |
2803 | } |
2804 | |
2805 | return Inst32; |
2806 | } |
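// A hedged sketch of how a shrinking pass (e.g. SIShrinkInstructions) is
// expected to drive canShrink/buildShrunkInst; the surrounding bookkeeping
// is simplified here:
//
//   if (TII->canShrink(MI, MRI)) {
//     int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
//     MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
//     // ...copy remaining implicit operands, then MI.eraseFromParent()...
//   }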
2807 | |
2808 | bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, |
2809 | const MachineOperand &MO, |
2810 | const MCOperandInfo &OpInfo) const { |
2811 | // Literal constants use the constant bus. |
2812 | //if (isLiteralConstantLike(MO, OpInfo)) |
2813 | // return true; |
2814 | if (MO.isImm()) |
2815 | return !isInlineConstant(MO, OpInfo); |
2816 | |
2817 | if (!MO.isReg()) |
2818 | return true; // Misc other operands like FrameIndex |
2819 | |
2820 | if (!MO.isUse()) |
2821 | return false; |
2822 | |
2823 | if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) |
2824 | return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); |
2825 | |
2826 | // Null is free |
2827 | if (MO.getReg() == AMDGPU::SGPR_NULL) |
2828 | return false; |
2829 | |
2830 | // SGPRs use the constant bus |
2831 | if (MO.isImplicit()) { |
2832 | return MO.getReg() == AMDGPU::M0 || |
2833 | MO.getReg() == AMDGPU::VCC || |
2834 | MO.getReg() == AMDGPU::VCC_LO; |
2835 | } else { |
2836 | return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || |
2837 | AMDGPU::SReg_64RegClass.contains(MO.getReg()); |
2838 | } |
2839 | } |
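// In practice this means SGPR sources, literal constants, and implicit reads
// of m0/vcc all compete for the constant bus of a VALU instruction (a single
// slot before GFX10, typically two from GFX10 onwards), while VGPR sources,
// inline constants, and SGPR_NULL never count against it.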
2840 | |
2841 | static unsigned findImplicitSGPRRead(const MachineInstr &MI) { |
2842 | for (const MachineOperand &MO : MI.implicit_operands()) { |
2843 | // We only care about reads. |
2844 | if (MO.isDef()) |
2845 | continue; |
2846 | |
2847 | switch (MO.getReg()) { |
2848 | case AMDGPU::VCC: |
2849 | case AMDGPU::M0: |
2850 | case AMDGPU::FLAT_SCR: |
2851 | return MO.getReg(); |
2852 | |
2853 | default: |
2854 | break; |
2855 | } |
2856 | } |
2857 | |
2858 | return AMDGPU::NoRegister; |
2859 | } |
2860 | |
2861 | static bool shouldReadExec(const MachineInstr &MI) { |
2862 | if (SIInstrInfo::isVALU(MI)) { |
2863 | switch (MI.getOpcode()) { |
2864 | case AMDGPU::V_READLANE_B32: |
2865 | case AMDGPU::V_READLANE_B32_gfx6_gfx7: |
2866 | case AMDGPU::V_READLANE_B32_gfx10: |
2867 | case AMDGPU::V_READLANE_B32_vi: |
2868 | case AMDGPU::V_WRITELANE_B32: |
2869 | case AMDGPU::V_WRITELANE_B32_gfx6_gfx7: |
2870 | case AMDGPU::V_WRITELANE_B32_gfx10: |
2871 | case AMDGPU::V_WRITELANE_B32_vi: |
2872 | return false; |
2873 | } |
2874 | |
2875 | return true; |
2876 | } |
2877 | |
2878 | if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || |
2879 | SIInstrInfo::isSALU(MI) || |
2880 | SIInstrInfo::isSMRD(MI)) |
2881 | return false; |
2882 | |
2883 | return true; |
2884 | } |
2885 | |
2886 | static bool isSubRegOf(const SIRegisterInfo &TRI, |
2887 | const MachineOperand &SuperVec, |
2888 | const MachineOperand &SubReg) { |
2889 | if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) |
2890 | return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); |
2891 | |
2892 | return SubReg.getSubReg() != AMDGPU::NoSubRegister && |
2893 | SubReg.getReg() == SuperVec.getReg(); |
2894 | } |
2895 | |
2896 | bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, |
2897 | StringRef &ErrInfo) const { |
2898 | uint16_t Opcode = MI.getOpcode(); |
2899 | if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) |
2900 | return true; |
2901 | |
2902 | const MachineFunction *MF = MI.getParent()->getParent(); |
2903 | const MachineRegisterInfo &MRI = MF->getRegInfo(); |
2904 | |
2905 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); |
2906 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); |
2907 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); |
2908 | |
2909 | // Make sure the number of operands is correct. |
2910 | const MCInstrDesc &Desc = get(Opcode); |
2911 | if (!Desc.isVariadic() && |
2912 | Desc.getNumOperands() != MI.getNumExplicitOperands()) { |
2913 | ErrInfo = "Instruction has wrong number of operands."; |
2914 | return false; |
2915 | } |
2916 | |
2917 | if (MI.isInlineAsm()) { |
2918 | // Verify register classes for inlineasm constraints. |
2919 | for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); |
2920 | I != E; ++I) { |
2921 | const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); |
2922 | if (!RC) |
2923 | continue; |
2924 | |
2925 | const MachineOperand &Op = MI.getOperand(I); |
2926 | if (!Op.isReg()) |
2927 | continue; |
2928 | |
2929 | unsigned Reg = Op.getReg(); |
2930 | if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { |
2931 | ErrInfo = "inlineasm operand has incorrect register class."; |
2932 | return false; |
2933 | } |
2934 | } |
2935 | |
2936 | return true; |
2937 | } |
2938 | |
2939 | // Make sure the register classes are correct. |
2940 | for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { |
2941 | if (MI.getOperand(i).isFPImm()) { |
2942 | ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " |
2943 | "all fp values to integers."; |
2944 | return false; |
2945 | } |
2946 | |
2947 | int RegClass = Desc.OpInfo[i].RegClass; |
2948 | |
2949 | switch (Desc.OpInfo[i].OperandType) { |
2950 | case MCOI::OPERAND_REGISTER: |
2951 | if (MI.getOperand(i).isImm()) { |
2952 | ErrInfo = "Illegal immediate value for operand."; |
2953 | return false; |
2954 | } |
2955 | break; |
2956 | case AMDGPU::OPERAND_REG_IMM_INT32: |
2957 | case AMDGPU::OPERAND_REG_IMM_FP32: |
2958 | break; |
2959 | case AMDGPU::OPERAND_REG_INLINE_C_INT32: |
2960 | case AMDGPU::OPERAND_REG_INLINE_C_FP32: |
2961 | case AMDGPU::OPERAND_REG_INLINE_C_INT64: |
2962 | case AMDGPU::OPERAND_REG_INLINE_C_FP64: |
2963 | case AMDGPU::OPERAND_REG_INLINE_C_INT16: |
2964 | case AMDGPU::OPERAND_REG_INLINE_C_FP16: { |
2965 | const MachineOperand &MO = MI.getOperand(i); |
2966 | if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { |
2967 | ErrInfo = "Illegal immediate value for operand."; |
2968 | return false; |
2969 | } |
2970 | break; |
2971 | } |
2972 | case MCOI::OPERAND_IMMEDIATE: |
2973 | case AMDGPU::OPERAND_KIMM32: |
2974 | // Check if this operand is an immediate. |
2975 | // FrameIndex operands will be replaced by immediates, so they are |
2976 | // allowed. |
2977 | if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { |
2978 | ErrInfo = "Expected immediate, but got non-immediate"; |
2979 | return false; |
2980 | } |
2981 | LLVM_FALLTHROUGH;
2982 | default: |
2983 | continue; |
2984 | } |
2985 | |
2986 | if (!MI.getOperand(i).isReg()) |
2987 | continue; |
2988 | |
2989 | if (RegClass != -1) { |
2990 | unsigned Reg = MI.getOperand(i).getReg(); |
2991 | if (Reg == AMDGPU::NoRegister || |
2992 | TargetRegisterInfo::isVirtualRegister(Reg)) |
2993 | continue; |
2994 | |
2995 | const TargetRegisterClass *RC = RI.getRegClass(RegClass); |
2996 | if (!RC->contains(Reg)) { |
2997 | ErrInfo = "Operand has incorrect register class."; |
2998 | return false; |
2999 | } |
3000 | } |
3001 | } |
3002 | |
3003 | // Verify SDWA |
3004 | if (isSDWA(MI)) { |
3005 | if (!ST.hasSDWA()) { |
3006 | ErrInfo = "SDWA is not supported on this target"; |
3007 | return false; |
3008 | } |
3009 | |
3010 | int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); |
3011 | |
3012 | const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
3013 | |
3014 | for (int OpIdx : OpIndices) {
3015 | if (OpIdx == -1) |
3016 | continue; |
3017 | const MachineOperand &MO = MI.getOperand(OpIdx); |
3018 | |
3019 | if (!ST.hasSDWAScalar()) { |
3020 | // Only VGPRs on VI
3021 | if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { |
3022 | ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; |
3023 | return false; |
3024 | } |
3025 | } else { |
3026 | // No immediates on GFX9 |
3027 | if (!MO.isReg()) { |
3028 | ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; |
3029 | return false; |
3030 | } |
3031 | } |
3032 | } |
3033 | |
3034 | if (!ST.hasSDWAOmod()) { |
3035 | // No omod allowed on VI |
3036 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); |
3037 | if (OMod != nullptr && |
3038 | (!OMod->isImm() || OMod->getImm() != 0)) { |
3039 | ErrInfo = "OMod not allowed in SDWA instructions on VI"; |
3040 | return false; |
3041 | } |
3042 | } |
3043 | |
3044 | uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); |
3045 | if (isVOPC(BasicOpcode)) { |
3046 | if (!ST.hasSDWASdst() && DstIdx != -1) { |
3047 | // Only vcc allowed as dst on VI for VOPC |
3048 | const MachineOperand &Dst = MI.getOperand(DstIdx); |
3049 | if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { |
3050 | ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; |
3051 | return false; |
3052 | } |
3053 | } else if (!ST.hasSDWAOutModsVOPC()) { |
3054 | // No clamp allowed on GFX9 for VOPC |
3055 | const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); |
3056 | if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { |
3057 | ErrInfo = "Clamp not allowed in VOPC SDWA instructions on GFX9";
3058 | return false; |
3059 | } |
3060 | |
3061 | // No omod allowed on GFX9 for VOPC |
3062 | const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); |
3063 | if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { |
3064 | ErrInfo = "OMod not allowed in VOPC SDWA instructions on GFX9";
3065 | return false; |
3066 | } |
3067 | } |
3068 | } |
3069 | |
3070 | const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); |
3071 | if (DstUnused && DstUnused->isImm() && |
3072 | DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { |
3073 | const MachineOperand &Dst = MI.getOperand(DstIdx); |
3074 | if (!Dst.isReg() || !Dst.isTied()) { |
3075 | ErrInfo = "Dst register should have tied register"; |
3076 | return false; |
3077 | } |
3078 | |
3079 | const MachineOperand &TiedMO = |
3080 | MI.getOperand(MI.findTiedOperandIdx(DstIdx)); |
3081 | if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { |
3082 | ErrInfo = |
3083 | "Dst register should be tied to implicit use of preserved register"; |
3084 | return false; |
3085 | } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) && |
3086 | Dst.getReg() != TiedMO.getReg()) { |
3087 | ErrInfo = "Dst register should use same physical register as preserved"; |
3088 | return false; |
3089 | } |
3090 | } |
3091 | } |
3092 | |
3093 | // Verify MIMG |
3094 | if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { |
3095 | // Ensure that the return type used is large enough for all the options
3096 | // being used. TFE/LWE require an extra result register.
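// Worked example (illustrative numbers): dmask = 0xf enables four channels,
// so RegCount starts at 4; packed D16 halves that to 2, and setting TFE or
// LWE adds one more register for the status word, giving 3. The vdata
// register class then has to span at least three 32-bit registers or the
// check below fails.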
3097 | const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); |
3098 | if (DMask) { |
3099 | uint64_t DMaskImm = DMask->getImm(); |
3100 | uint32_t RegCount = |
3101 | isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); |
3102 | const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); |
3103 | const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); |
3104 | const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); |
3105 | |
3106 | // Adjust for packed 16 bit values |
3107 | if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) |
3108 | RegCount >>= 1; |
3109 | |
3110 | // Adjust if using LWE or TFE |
3111 | if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) |
3112 | RegCount += 1; |
3113 | |
3114 | const uint32_t DstIdx = |
3115 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); |
3116 | const MachineOperand &Dst = MI.getOperand(DstIdx); |
3117 | if (Dst.isReg()) { |
3118 | const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); |
3119 | uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; |
3120 | if (RegCount > DstSize) { |
3121 | ErrInfo = "MIMG instruction returns too many registers for dst " |
3122 | "register class"; |
3123 | return false; |
3124 | } |
3125 | } |
3126 | } |
3127 | } |
3128 | |
3129 | // Verify VOP*. Ignore multiple sgpr operands on writelane. |
3130 | if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 |
3131 | && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { |
3132 | // Only look at the true operands. Only a real operand can use the constant |
3133 | // bus, and we don't want to check pseudo-operands like the source modifier |
3134 | // flags. |
3135 | const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; |
3136 | |
3137 | unsigned ConstantBusCount = 0; |
3138 | unsigned LiteralCount = 0; |
3139 | |
3140 | if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) |
3141 | ++ConstantBusCount; |
3142 | |
3143 | SmallVector<unsigned, 2> SGPRsUsed; |
3144 | unsigned SGPRUsed = findImplicitSGPRRead(MI); |
3145 | if (SGPRUsed != AMDGPU::NoRegister) { |
3146 | ++ConstantBusCount; |
3147 | SGPRsUsed.push_back(SGPRUsed); |
3148 | } |
3149 | |
3150 | for (int OpIdx : OpIndices) { |
3151 | if (OpIdx == -1) |
3152 | break; |
3153 | const MachineOperand &MO = MI.getOperand(OpIdx); |
3154 | if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { |
3155 | if (MO.isReg()) { |
3156 | SGPRUsed = MO.getReg(); |
3157 | if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { |
3158 | return !RI.regsOverlap(SGPRUsed, SGPR); |
3159 | })) { |
3160 | ++ConstantBusCount; |
3161 | SGPRsUsed.push_back(SGPRUsed); |
3162 | } |
3163 | } else { |
3164 | ++ConstantBusCount; |
3165 | ++LiteralCount; |
3166 | } |
3167 | } |
3168 | } |
3169 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
3170 | // v_writelane_b32 is an exception to the constant bus restriction: vsrc0 can
3171 | // be an SGPR, a constant, or m0, and the lane select can be an SGPR, m0, or an inline constant.
3172 | if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && |
3173 | Opcode != AMDGPU::V_WRITELANE_B32) { |
3174 | ErrInfo = "VOP* instruction violates constant bus restriction"; |
3175 | return false; |
3176 | } |
3177 | |
3178 | if (isVOP3(MI) && LiteralCount) { |
3179 | if (LiteralCount && !ST.hasVOP3Literal()) { |
3180 | ErrInfo = "VOP3 instruction uses literal"; |
3181 | return false; |
3182 | } |
3183 | if (LiteralCount > 1) { |
3184 | ErrInfo = "VOP3 instruction uses more than one literal"; |
3185 | return false; |
3186 | } |
3187 | } |
3188 | } |
3189 | |
3190 | // Verify misc. restrictions on specific instructions. |
3191 | if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || |
3192 | Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { |
3193 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); |
3194 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); |
3195 | const MachineOperand &Src2 = MI.getOperand(Src2Idx); |
3196 | if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { |
3197 | if (!compareMachineOp(Src0, Src1) && |
3198 | !compareMachineOp(Src0, Src2)) { |
3199 | ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; |
3200 | return false; |
3201 | } |
3202 | } |
3203 | } |
3204 | |
3205 | if (isSOP2(MI) || isSOPC(MI)) { |
3206 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); |
3207 | const MachineOperand &Src1 = MI.getOperand(Src1Idx); |
3208 | unsigned Immediates = 0; |
3209 | |
3210 | if (!Src0.isReg() && |
3211 | !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) |
3212 | Immediates++; |
3213 | if (!Src1.isReg() && |
3214 | !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) |
3215 | Immediates++; |
3216 | |
3217 | if (Immediates > 1) { |
3218 | ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; |
3219 | return false; |
3220 | } |
3221 | } |
3222 | |
3223 | if (isSOPK(MI)) { |
3224 | auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); |
3225 | if (Desc.isBranch()) { |
3226 | if (!Op->isMBB()) { |
3227 | ErrInfo = "invalid branch target for SOPK instruction"; |
3228 | return false; |
3229 | } |
3230 | } else { |
3231 | uint64_t Imm = Op->getImm(); |
3232 | if (sopkIsZext(MI)) { |
3233 | if (!isUInt<16>(Imm)) { |
3234 | ErrInfo = "invalid immediate for SOPK instruction"; |
3235 | return false; |
3236 | } |
3237 | } else { |
3238 | if (!isInt<16>(Imm)) { |
3239 | ErrInfo = "invalid immediate for SOPK instruction"; |
3240 | return false; |
3241 | } |
3242 | } |
3243 | } |
3244 | } |
3245 | |
3246 | if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || |
3247 | Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || |
3248 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || |
3249 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { |
3250 | const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || |
3251 | Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; |
3252 | |
3253 | const unsigned StaticNumOps = Desc.getNumOperands() + |
3254 | Desc.getNumImplicitUses(); |
3255 | const unsigned NumImplicitOps = IsDst ? 2 : 1; |
3256 | |
3257 | // Allow additional implicit operands. This allows a fixup done by the post |
3258 | // RA scheduler where the main implicit operand is killed and implicit-defs |
3259 | // are added for sub-registers that remain live after this instruction. |
3260 | if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { |
3261 | ErrInfo = "missing implicit register operands"; |
3262 | return false; |
3263 | } |
3264 | |
3265 | const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); |
3266 | if (IsDst) { |
3267 | if (!Dst->isUse()) { |
3268 | ErrInfo = "v_movreld_b32 vdst should be a use operand"; |
3269 | return false; |
3270 | } |
3271 | |
3272 | unsigned UseOpIdx; |
3273 | if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || |
3274 | UseOpIdx != StaticNumOps + 1) { |
3275 | ErrInfo = "movrel implicit operands should be tied"; |
3276 | return false; |
3277 | } |
3278 | } |
3279 | |
3280 | const MachineOperand &Src0 = MI.getOperand(Src0Idx); |
3281 | const MachineOperand &ImpUse |
3282 | = MI.getOperand(StaticNumOps + NumImplicitOps - 1); |
3283 | if (!ImpUse.isReg() || !ImpUse.isUse() || |
3284 | !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { |
3285 | ErrInfo = "src0 should be subreg of implicit vector use"; |
3286 | return false; |
3287 | } |
3288 | } |
3289 | |
3290 | // Make sure we aren't losing exec uses in the td files. This mostly requires |
3291 | // being careful when using let Uses to try to add other use registers. |
3292 | if (shouldReadExec(MI)) { |
3293 | if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { |
3294 | ErrInfo = "VALU instruction does not implicitly read exec mask"; |
3295 | return false; |
3296 | } |
3297 | } |
3298 | |
3299 | if (isSMRD(MI)) { |
3300 | if (MI.mayStore()) { |
3301 | // The register offset form of scalar stores may only use m0 as the |
3302 | // soffset register. |
3303 | const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); |
3304 | if (Soff && Soff->getReg() != AMDGPU::M0) { |
3305 | ErrInfo = "scalar stores must use m0 as offset register"; |
3306 | return false; |
3307 | } |
3308 | } |
3309 | } |
3310 | |
3311 | if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) { |
3312 | const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); |
3313 | if (Offset->getImm() != 0) { |
3314 | ErrInfo = "subtarget does not support offsets in flat instructions"; |
3315 | return false; |
3316 | } |
3317 | } |
3318 | |
3319 | if (isMIMG(MI)) { |
3320 | const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); |
3321 | if (DimOp) { |
3322 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, |
3323 | AMDGPU::OpName::vaddr0); |
3324 | int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); |
3325 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); |
3326 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = |
3327 | AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); |
3328 | const AMDGPU::MIMGDimInfo *Dim = |
3329 | AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); |
3330 | |
3331 | if (!Dim) { |
3332 | ErrInfo = "dim is out of range"; |
3333 | return false; |
3334 | } |
3335 | |
3336 | bool IsNSA = SRsrcIdx - VAddr0Idx > 1; |
3337 | unsigned AddrWords = BaseOpcode->NumExtraArgs + |
3338 | (BaseOpcode->Gradients ? Dim->NumGradients : 0) + |
3339 | (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + |
3340 | (BaseOpcode->LodOrClampOrMip ? 1 : 0); |
3341 | |
3342 | unsigned VAddrWords; |
3343 | if (IsNSA) { |
3344 | VAddrWords = SRsrcIdx - VAddr0Idx; |
3345 | } else { |
3346 | const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); |
3347 | VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; |
3348 | if (AddrWords > 8) |
3349 | AddrWords = 16; |
3350 | else if (AddrWords > 4) |
3351 | AddrWords = 8; |
3352 | else if (AddrWords == 3 && VAddrWords == 4) { |
3353 | // CodeGen uses the V4 variant of instructions for three addresses, |
3354 | // because the selection DAG does not support non-power-of-two types. |
3355 | AddrWords = 4; |
3356 | } |
3357 | } |
3358 | |
3359 | if (VAddrWords != AddrWords) { |
3360 | ErrInfo = "bad vaddr size"; |
3361 | return false; |
3362 | } |
3363 | } |
3364 | } |
3365 | |
3366 | const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); |
3367 | if (DppCt) { |
3368 | using namespace AMDGPU::DPP; |
3369 | |
3370 | unsigned DC = DppCt->getImm(); |
3371 | if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || |
3372 | DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || |
3373 | (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || |
3374 | (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || |
3375 | (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || |
3376 | (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) { |
3377 | ErrInfo = "Invalid dpp_ctrl value"; |
3378 | return false; |
3379 | } |
3380 | } |
3381 | |
3382 | return true; |
3383 | } |
3384 | |
3385 | unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { |
3386 | switch (MI.getOpcode()) { |
3387 | default: return AMDGPU::INSTRUCTION_LIST_END; |
3388 | case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; |
3389 | case AMDGPU::COPY: return AMDGPU::COPY; |
3390 | case AMDGPU::PHI: return AMDGPU::PHI; |
3391 | case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; |
3392 | case AMDGPU::WQM: return AMDGPU::WQM; |
3393 | case AMDGPU::WWM: return AMDGPU::WWM; |
3394 | case AMDGPU::S_MOV_B32: |
3395 | return MI.getOperand(1).isReg() ? |
3396 | AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; |
3397 | case AMDGPU::S_ADD_I32: |
3398 | return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32; |
3399 | case AMDGPU::S_ADDC_U32: |
3400 | return AMDGPU::V_ADDC_U32_e32; |
3401 | case AMDGPU::S_SUB_I32: |
3402 | return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; |
3403 | // FIXME: These are not consistently handled, and selected when the carry is |
3404 | // used. |
3405 | case AMDGPU::S_ADD_U32: |
3406 | return AMDGPU::V_ADD_I32_e32; |
3407 | case AMDGPU::S_SUB_U32: |
3408 | return AMDGPU::V_SUB_I32_e32; |
3409 | case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; |
3410 | case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32; |
3411 | case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32; |
3412 | case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32; |
3413 | case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; |
3414 | case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; |
3415 | case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; |
3416 | case AMDGPU::S_XNOR_B32: |
3417 | return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; |
3418 | case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; |
3419 | case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; |
3420 | case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; |
3421 | case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; |
3422 | case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; |
3423 | case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; |
3424 | case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; |
3425 | case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; |
3426 | case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; |
3427 | case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; |
3428 | case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; |
3429 | case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; |
3430 | case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; |
3431 | case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; |
3432 | case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; |
3433 | case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; |
3434 | case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; |
3435 | case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; |
3436 | case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; |
3437 | case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; |
3438 | case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; |
3439 | case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; |
3440 | case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; |
3441 | case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; |
3442 | case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; |
3443 | case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; |
3444 | case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; |
3445 | case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; |
3446 | case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; |
3447 | case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; |
3448 | case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; |
3449 | case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; |
3450 | case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; |
3451 | case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; |
3452 | case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; |
3453 | case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; |
3454 | case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; |
3455 | case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; |
3456 | } |
3457 | llvm_unreachable(
3458 | "Unexpected scalar opcode without corresponding vector one!");
3459 | } |
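// This mapping is what the SALU-to-VALU lowering (moveToVALU) consults when
// a scalar result turns out to depend on VGPRs and must be recomputed per
// lane; e.g. S_AND_B32 becomes V_AND_B32_e64 and S_CMP_EQ_I32 becomes
// V_CMP_EQ_I32_e32 writing vcc. INSTRUCTION_LIST_END means there is no
// direct VALU equivalent and the caller must expand the operation itself.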
3460 | |
3461 | const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, |
3462 | unsigned OpNo) const { |
3463 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
3464 | const MCInstrDesc &Desc = get(MI.getOpcode()); |
3465 | if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || |
3466 | Desc.OpInfo[OpNo].RegClass == -1) { |
3467 | unsigned Reg = MI.getOperand(OpNo).getReg(); |
3468 | |
3469 | if (TargetRegisterInfo::isVirtualRegister(Reg)) |
3470 | return MRI.getRegClass(Reg); |
3471 | return RI.getPhysRegClass(Reg); |
3472 | } |
3473 | |
3474 | unsigned RCID = Desc.OpInfo[OpNo].RegClass; |
3475 | return RI.getRegClass(RCID); |
3476 | } |
3477 | |
3478 | void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { |
3479 | MachineBasicBlock::iterator I = MI; |
3480 | MachineBasicBlock *MBB = MI.getParent(); |
3481 | MachineOperand &MO = MI.getOperand(OpIdx); |
3482 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
3483 | const SIRegisterInfo *TRI = |
3484 | static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); |
3485 | unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; |
3486 | const TargetRegisterClass *RC = RI.getRegClass(RCID); |
3487 | unsigned Size = TRI->getRegSizeInBits(*RC); |
3488 | unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; |
3489 | if (MO.isReg()) |
3490 | Opcode = AMDGPU::COPY; |
3491 | else if (RI.isSGPRClass(RC)) |
3492 | Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; |
3493 | |
3494 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); |
3495 | if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) |
3496 | VRC = &AMDGPU::VReg_64RegClass; |
3497 | else |
3498 | VRC = &AMDGPU::VGPR_32RegClass; |
3499 | |
3500 | unsigned Reg = MRI.createVirtualRegister(VRC); |
3501 | DebugLoc DL = MBB->findDebugLoc(I); |
3502 | BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); |
3503 | MO.ChangeToRegister(Reg, false); |
3504 | } |
3505 | |
3506 | unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, |
3507 | MachineRegisterInfo &MRI, |
3508 | MachineOperand &SuperReg, |
3509 | const TargetRegisterClass *SuperRC, |
3510 | unsigned SubIdx, |
3511 | const TargetRegisterClass *SubRC) |
3512 | const { |
3513 | MachineBasicBlock *MBB = MI->getParent(); |
3514 | DebugLoc DL = MI->getDebugLoc(); |
3515 | unsigned SubReg = MRI.createVirtualRegister(SubRC); |
3516 | |
3517 | if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { |
3518 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) |
3519 | .addReg(SuperReg.getReg(), 0, SubIdx); |
3520 | return SubReg; |
3521 | } |
3522 | |
3523 | // Just in case the super register is itself a sub-register, copy it to a new |
3524 | // value so we don't need to worry about merging its subreg index with the |
3525 | // SubIdx passed to this function. The register coalescer should be able to |
3526 | // eliminate this extra copy. |
3527 | unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); |
3528 | |
3529 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) |
3530 | .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); |
3531 | |
3532 | BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) |
3533 | .addReg(NewSuperReg, 0, SubIdx); |
3534 | |
3535 | return SubReg; |
3536 | } |
3537 | |
3538 | MachineOperand SIInstrInfo::buildExtractSubRegOrImm( |
3539 | MachineBasicBlock::iterator MII, |
3540 | MachineRegisterInfo &MRI, |
3541 | MachineOperand &Op, |
3542 | const TargetRegisterClass *SuperRC, |
3543 | unsigned SubIdx, |
3544 | const TargetRegisterClass *SubRC) const { |
3545 | if (Op.isImm()) { |
3546 | if (SubIdx == AMDGPU::sub0) |
3547 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); |
3548 | if (SubIdx == AMDGPU::sub1) |
3549 | return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); |
3550 | |
3551 | llvm_unreachable("Unhandled register index for immediate");
3552 | } |
3553 | |
3554 | unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, |
3555 | SubIdx, SubRC); |
3556 | return MachineOperand::CreateReg(SubReg, false); |
3557 | } |
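// Sketch of the common caller pattern (simplified): when splitting a 64-bit
// operation into two 32-bit halves, extract sub0 and sub1 of each source
// with this helper, emit the 32-bit instruction on each half, and recombine
// the results with REG_SEQUENCE. For an immediate such as 0x100000001 this
// yields the constants 1 (sub0) and 1 (sub1) without creating any copies.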
3558 | |
3559 | // Change the order of operands from (0, 1, 2) to (0, 2, 1) |
3560 | void SIInstrInfo::swapOperands(MachineInstr &Inst) const { |
3561 | assert(Inst.getNumExplicitOperands() == 3);
3562 | MachineOperand Op1 = Inst.getOperand(1); |
3563 | Inst.RemoveOperand(1); |
3564 | Inst.addOperand(Op1); |
3565 | } |
3566 | |
3567 | bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, |
3568 | const MCOperandInfo &OpInfo, |
3569 | const MachineOperand &MO) const { |
3570 | if (!MO.isReg()) |
3571 | return false; |
3572 | |
3573 | unsigned Reg = MO.getReg(); |
3574 | const TargetRegisterClass *RC = |
3575 | TargetRegisterInfo::isVirtualRegister(Reg) ? |
3576 | MRI.getRegClass(Reg) : |
3577 | RI.getPhysRegClass(Reg); |
3578 | |
3579 | const SIRegisterInfo *TRI = |
3580 | static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); |
3581 | RC = TRI->getSubRegClass(RC, MO.getSubReg()); |
3582 | |
3583 | // In order to be legal, the common sub-class must be equal to the |
3584 | // class of the current operand. For example: |
3585 | // |
3586 | // v_mov_b32 s0 ; Operand defined as vsrc_b32 |
3587 | // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL |
3588 | // |
3589 | // s_sendmsg 0, s0 ; Operand defined as m0reg |
3590 | // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL |
3591 | |
3592 | return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; |
3593 | } |
3594 | |
3595 | bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, |
3596 | const MCOperandInfo &OpInfo, |
3597 | const MachineOperand &MO) const { |
3598 | if (MO.isReg()) |
3599 | return isLegalRegOperand(MRI, OpInfo, MO); |
3600 | |
3601 | // Handle non-register types that are treated like immediates. |
3602 | assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3603 | return true; |
3604 | } |
3605 | |
3606 | bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, |
3607 | const MachineOperand *MO) const { |
3608 | const MachineFunction &MF = *MI.getParent()->getParent(); |
3609 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3610 | const MCInstrDesc &InstDesc = MI.getDesc(); |
3611 | const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; |
3612 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
3613 | const TargetRegisterClass *DefinedRC = |
3614 | OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; |
3615 | if (!MO) |
3616 | MO = &MI.getOperand(OpIdx); |
3617 | |
3618 | int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); |
3619 | int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; |
3620 | if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { |
3621 | if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) |
3622 | return false; |
3623 | |
3624 | SmallDenseSet<RegSubRegPair> SGPRsUsed; |
3625 | if (MO->isReg()) |
3626 | SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); |
3627 | |
3628 | for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { |
3629 | if (i == OpIdx) |
3630 | continue; |
3631 | const MachineOperand &Op = MI.getOperand(i); |
3632 | if (Op.isReg()) { |
3633 | RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); |
3634 | if (!SGPRsUsed.count(SGPR) && |
3635 | usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { |
3636 | if (--ConstantBusLimit <= 0) |
3637 | return false; |
3638 | SGPRsUsed.insert(SGPR); |
3639 | } |
3640 | } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { |
3641 | if (--ConstantBusLimit <= 0) |
3642 | return false; |
3643 | } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && |
3644 | isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { |
3645 | if (!VOP3LiteralLimit--) |
3646 | return false; |
3647 | if (--ConstantBusLimit <= 0) |
3648 | return false; |
3649 | } |
3650 | } |
3651 | } |
3652 | |
3653 | if (MO->isReg()) { |
3654 | assert(DefinedRC);
3655 | return isLegalRegOperand(MRI, OpInfo, *MO); |
3656 | } |
3657 | |
3658 | // Handle non-register types that are treated like immediates. |
3659 | assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3660 | |
3661 | if (!DefinedRC) { |
3662 | // This operand expects an immediate. |
3663 | return true; |
3664 | } |
3665 | |
3666 | return isImmOperandLegal(MI, OpIdx, *MO); |
3667 | } |
3668 | |
3669 | void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, |
3670 | MachineInstr &MI) const { |
3671 | unsigned Opc = MI.getOpcode(); |
3672 | const MCInstrDesc &InstrDesc = get(Opc); |
3673 | |
3674 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); |
3675 | MachineOperand &Src1 = MI.getOperand(Src1Idx); |
3676 | |
3677 | // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 |
3678 | // we need to only have one constant bus use before GFX10. |
3679 | bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; |
3680 | if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1) { |
3681 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
3682 | MachineOperand &Src0 = MI.getOperand(Src0Idx); |
3683 | |
3684 | if (Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || |
3685 | isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) |
3686 | legalizeOpWithMove(MI, Src0Idx); |
3687 | } |
3688 | |
3689 | // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for |
3690 | // both the value to write (src0) and lane select (src1). Fix up non-SGPR |
3691 | // src0/src1 with V_READFIRSTLANE. |
3692 | if (Opc == AMDGPU::V_WRITELANE_B32) { |
3693 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
3694 | MachineOperand &Src0 = MI.getOperand(Src0Idx); |
3695 | const DebugLoc &DL = MI.getDebugLoc(); |
3696 | if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { |
3697 | unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
3698 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) |
3699 | .add(Src0); |
3700 | Src0.ChangeToRegister(Reg, false); |
3701 | } |
3702 | if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { |
3703 | unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
3704 | const DebugLoc &DL = MI.getDebugLoc(); |
3705 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) |
3706 | .add(Src1); |
3707 | Src1.ChangeToRegister(Reg, false); |
3708 | } |
3709 | return; |
3710 | } |
3711 | |
3712 | // src0 of VOP2 instructions accepts all operand types, so we don't need to
3713 | // check its legality. If src1 is already legal, we don't need to do anything.
3714 | if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) |
3715 | return; |
3716 | |
3717 | // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for |
3718 | // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane |
3719 | // select is uniform. |
3720 | if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && |
3721 | RI.isVGPR(MRI, Src1.getReg())) { |
3722 | unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
3723 | const DebugLoc &DL = MI.getDebugLoc(); |
3724 | BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) |
3725 | .add(Src1); |
3726 | Src1.ChangeToRegister(Reg, false); |
3727 | return; |
3728 | } |
3729 | |
3730 | // We do not use commuteInstruction here because it is too aggressive and will |
3731 | // commute if it is possible. We only want to commute here if it improves |
3732 | // legality. This can be called a fairly large number of times so don't waste |
3733 | // compile time pointlessly swapping and checking legality again. |
3734 | if (HasImplicitSGPR || !MI.isCommutable()) { |
3735 | legalizeOpWithMove(MI, Src1Idx); |
3736 | return; |
3737 | } |
3738 | |
3739 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
3740 | MachineOperand &Src0 = MI.getOperand(Src0Idx); |
3741 | |
3742 | // If src0 can be used as src1, commuting will make the operands legal. |
3743 | // Otherwise we have to give up and insert a move. |
3744 | // |
3745 | // TODO: Other immediate-like operand kinds could be commuted if there was a |
3746 | // MachineOperand::ChangeTo* for them. |
3747 | if ((!Src1.isImm() && !Src1.isReg()) || |
3748 | !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { |
3749 | legalizeOpWithMove(MI, Src1Idx); |
3750 | return; |
3751 | } |
3752 | |
3753 | int CommutedOpc = commuteOpcode(MI); |
3754 | if (CommutedOpc == -1) { |
3755 | legalizeOpWithMove(MI, Src1Idx); |
3756 | return; |
3757 | } |
3758 | |
3759 | MI.setDesc(get(CommutedOpc)); |
3760 | |
3761 | unsigned Src0Reg = Src0.getReg(); |
3762 | unsigned Src0SubReg = Src0.getSubReg(); |
3763 | bool Src0Kill = Src0.isKill(); |
3764 | |
3765 | if (Src1.isImm()) |
3766 | Src0.ChangeToImmediate(Src1.getImm()); |
3767 | else if (Src1.isReg()) { |
3768 | Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); |
3769 | Src0.setSubReg(Src1.getSubReg()); |
3770 | } else |
3771 | llvm_unreachable("Should only have register or immediate operands");
3772 | |
3773 | Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); |
3774 | Src1.setSubReg(Src0SubReg); |
3775 | } |
3776 | |
3777 | // Legalize VOP3 operands. All operand types are supported for any operand,
3778 | // but only one literal constant may be used, and only starting from GFX10.
3779 | void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, |
3780 | MachineInstr &MI) const { |
3781 | unsigned Opc = MI.getOpcode(); |
3782 | |
3783 | int VOP3Idx[3] = { |
3784 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), |
3785 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), |
3786 | AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) |
3787 | }; |
3788 | |
3789 | // Find the one SGPR operand we are allowed to use. |
3790 | int ConstantBusLimit = ST.getConstantBusLimit(Opc); |
3791 | int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; |
3792 | SmallDenseSet<unsigned> SGPRsUsed; |
3793 | unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); |
3794 | if (SGPRReg != AMDGPU::NoRegister) { |
3795 | SGPRsUsed.insert(SGPRReg); |
3796 | --ConstantBusLimit; |
3797 | } |
3798 | |
3799 | for (unsigned i = 0; i < 3; ++i) { |
3800 | int Idx = VOP3Idx[i]; |
3801 | if (Idx == -1) |
3802 | break; |
3803 | MachineOperand &MO = MI.getOperand(Idx); |
3804 | |
3805 | if (!MO.isReg()) { |
3806 | if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) |
3807 | continue; |
3808 | |
3809 | if (LiteralLimit > 0 && ConstantBusLimit > 0) { |
3810 | --LiteralLimit; |
3811 | --ConstantBusLimit; |
3812 | continue; |
3813 | } |
3814 | |
3815 | --LiteralLimit; |
3816 | --ConstantBusLimit; |
3817 | legalizeOpWithMove(MI, Idx); |
3818 | continue; |
3819 | } |
3820 | |
3821 | if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) |
3822 | continue; // VGPRs are legal |
3823 | |
3824 | // We can use one SGPR in each VOP3 instruction prior to GFX10 |
3825 | // and two starting from GFX10. |
3826 | if (SGPRsUsed.count(MO.getReg())) |
3827 | continue; |
3828 | if (ConstantBusLimit > 0) { |
3829 | SGPRsUsed.insert(MO.getReg()); |
3830 | --ConstantBusLimit; |
3831 | continue; |
3832 | } |
3833 | |
3834 | // If we make it this far, then the operand is not legal and we must |
3835 | // legalize it. |
3836 | legalizeOpWithMove(MI, Idx); |
3837 | } |
3838 | } |
3839 | |
3840 | unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, |
3841 | MachineRegisterInfo &MRI) const { |
3842 | const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); |
3843 | const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); |
3844 | unsigned DstReg = MRI.createVirtualRegister(SRC); |
3845 | unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; |
3846 | |
3847 | if (SubRegs == 1) { |
3848 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), |
3849 | get(AMDGPU::V_READFIRSTLANE_B32), DstReg) |
3850 | .addReg(SrcReg); |
3851 | return DstReg; |
3852 | } |
3853 | |
3854 | SmallVector<unsigned, 8> SRegs; |
3855 | for (unsigned i = 0; i < SubRegs; ++i) { |
3856 | unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
3857 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), |
3858 | get(AMDGPU::V_READFIRSTLANE_B32), SGPR) |
3859 | .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); |
3860 | SRegs.push_back(SGPR); |
3861 | } |
3862 | |
3863 | MachineInstrBuilder MIB = |
3864 | BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), |
3865 | get(AMDGPU::REG_SEQUENCE), DstReg); |
3866 | for (unsigned i = 0; i < SubRegs; ++i) { |
3867 | MIB.addReg(SRegs[i]); |
3868 | MIB.addImm(RI.getSubRegFromChannel(i)); |
3869 | } |
3870 | return DstReg; |
3871 | } |
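// For a 64-bit VGPR source this expands to roughly the following MIR
// (illustrative register names):
//   %lo:sgpr_32 = V_READFIRSTLANE_B32 %src.sub0
//   %hi:sgpr_32 = V_READFIRSTLANE_B32 %src.sub1
//   %dst:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// This is only correct because callers guarantee the value is uniform across
// the wave.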
3872 | |
3873 | void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, |
3874 | MachineInstr &MI) const { |
3875 | |
3876 | // If the pointer is stored in VGPRs, then we need to move it to
3877 | // SGPRs using v_readfirstlane. This is safe because we only select
3878 | // loads with uniform pointers to SMRD instructions, so we know the
3879 | // pointer value is uniform.
3880 | MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); |
3881 | if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { |
3882 | unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); |
3883 | SBase->setReg(SGPR); |
3884 | } |
3885 | MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); |
3886 | if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { |
3887 | unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); |
3888 | SOff->setReg(SGPR); |
3889 | } |
3890 | } |
3891 | |
3892 | void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, |
3893 | MachineBasicBlock::iterator I, |
3894 | const TargetRegisterClass *DstRC, |
3895 | MachineOperand &Op, |
3896 | MachineRegisterInfo &MRI, |
3897 | const DebugLoc &DL) const { |
3898 | unsigned OpReg = Op.getReg(); |
3899 | unsigned OpSubReg = Op.getSubReg(); |
3900 | |
3901 | const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( |
3902 | RI.getRegClassForReg(MRI, OpReg), OpSubReg); |
3903 | |
3904 | // Check if operand is already the correct register class. |
3905 | if (DstRC == OpRC) |
3906 | return; |
3907 | |
3908 | unsigned DstReg = MRI.createVirtualRegister(DstRC); |
3909 | MachineInstr *Copy = |
3910 | BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); |
3911 | |
3912 | Op.setReg(DstReg); |
3913 | Op.setSubReg(0); |
3914 | |
3915 | MachineInstr *Def = MRI.getVRegDef(OpReg); |
3916 | if (!Def) |
3917 | return; |
3918 | |
3919 | // Try to eliminate the copy if it is copying an immediate value. |
3920 | if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) |
3921 | FoldImmediate(*Copy, *Def, OpReg, &MRI); |
3922 | } |
3923 | |
3924 | // Emit the actual waterfall loop, executing the wrapped instruction for each |
3925 | // unique value of \p Rsrc across all lanes. In the best case we execute 1 |
3926 | // iteration, in the worst case we execute 64 (once per lane). |
3927 | static void |
3928 | emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, |
3929 | MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, |
3930 | const DebugLoc &DL, MachineOperand &Rsrc) { |
3931 | MachineBasicBlock::iterator I = LoopBB.begin(); |
3932 | |
3933 | unsigned VRsrc = Rsrc.getReg(); |
3934 | unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); |
3935 | |
3936 | unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
3937 | unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
3938 | unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
3939 | unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
3940 | unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
3941 | unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
3942 | unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
3943 | unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
3944 | unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); |
3945 | |
3946 | // Beginning of the loop, read the next Rsrc variant. |
3947 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0) |
3948 | .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0); |
3949 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1) |
3950 | .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1); |
3951 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2) |
3952 | .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2); |
3953 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3) |
3954 | .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3); |
3955 | |
3956 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc) |
3957 | .addReg(SRsrcSub0) |
3958 | .addImm(AMDGPU::sub0) |
3959 | .addReg(SRsrcSub1) |
3960 | .addImm(AMDGPU::sub1) |
3961 | .addReg(SRsrcSub2) |
3962 | .addImm(AMDGPU::sub2) |
3963 | .addReg(SRsrcSub3) |
3964 | .addImm(AMDGPU::sub3); |
3965 | |
3966 | // Update Rsrc operand to use the SGPR Rsrc. |
3967 | Rsrc.setReg(SRsrc); |
3968 | Rsrc.setIsKill(true); |
3969 | |
3970 | // Identify all lanes with identical Rsrc operands in their VGPRs. |
3971 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0) |
3972 | .addReg(SRsrc, 0, AMDGPU::sub0_sub1) |
3973 | .addReg(VRsrc, 0, AMDGPU::sub0_sub1); |
3974 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1) |
3975 | .addReg(SRsrc, 0, AMDGPU::sub2_sub3) |
3976 | .addReg(VRsrc, 0, AMDGPU::sub2_sub3); |
3977 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond) |
3978 | .addReg(CondReg0) |
3979 | .addReg(CondReg1); |
3980 | |
3981 | MRI.setSimpleHint(SaveExec, AndCond); |
3982 | |
3983 | // Update EXEC to matching lanes, saving original to SaveExec. |
3984 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec) |
3985 | .addReg(AndCond, RegState::Kill); |
3986 | |
3987 | // The original instruction is here; we insert the terminators after it. |
3988 | I = LoopBB.end(); |
3989 | |
3990 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. |
3991 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC) |
3992 | .addReg(AMDGPU::EXEC) |
3993 | .addReg(SaveExec); |
3994 | BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); |
3995 | } |
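// One loop iteration built above, summarised as the rough instruction
// sequence it produces (illustrative only):
//
//   s_rsrc   = v_readfirstlane_b32 of each of the four 32-bit Rsrc channels
//   cond     = v_cmp_eq_u64(s_rsrc[0:1], v_rsrc[0:1])
//              & v_cmp_eq_u64(s_rsrc[2:3], v_rsrc[2:3])
//   saveexec = exec; exec &= cond           ; S_AND_SAVEEXEC_B64
//   ... the wrapped instruction executes with the now-uniform SGPR Rsrc ...
//   exec    ^= saveexec                     ; S_XOR_B64_term, retire done lanes
//   s_cbranch_execnz LoopBB                 ; loop while any lane is left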
3996 | |
3997 | // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register |
3998 | // with SGPRs by iterating over all unique values across all lanes. |
3999 | static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, |
4000 | MachineOperand &Rsrc, MachineDominatorTree *MDT) { |
4001 | MachineBasicBlock &MBB = *MI.getParent(); |
4002 | MachineFunction &MF = *MBB.getParent(); |
4003 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
4004 | MachineBasicBlock::iterator I(&MI); |
4005 | const DebugLoc &DL = MI.getDebugLoc(); |
4006 | |
4007 | unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
4008 | |
4009 | // Save the EXEC mask |
4010 | BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec) |
4011 | .addReg(AMDGPU::EXEC); |
4012 | |
4013 | // Killed uses in the instruction we are waterfalling around will be |
4014 | // incorrect due to the added control-flow. |
4015 | for (auto &MO : MI.uses()) { |
4016 | if (MO.isReg() && MO.isUse()) { |
4017 | MRI.clearKillFlags(MO.getReg()); |
4018 | } |
4019 | } |
4020 | |
4021 | // To insert the loop we need to split the block. Move everything after this |
4022 | // point to a new block, and insert a new empty block between the two. |
4023 | MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); |
4024 | MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); |
4025 | MachineFunction::iterator MBBI(MBB); |
4026 | ++MBBI; |
4027 | |
4028 | MF.insert(MBBI, LoopBB); |
4029 | MF.insert(MBBI, RemainderBB); |
4030 | |
4031 | LoopBB->addSuccessor(LoopBB); |
4032 | LoopBB->addSuccessor(RemainderBB); |
4033 | |
4034 | // Move MI to the LoopBB, and the remainder of the block to RemainderBB. |
4035 | MachineBasicBlock::iterator J = I++; |
4036 | RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); |
4037 | RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); |
4038 | LoopBB->splice(LoopBB->begin(), &MBB, J); |
4039 | |
4040 | MBB.addSuccessor(LoopBB); |
4041 | |
4042 | // Update dominators. We know that MBB immediately dominates LoopBB, that |
4043 | // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately |
4044 | // dominates all of the successors transferred to it from MBB that MBB used |
4045 | // to dominate. |
4046 | if (MDT) { |
4047 | MDT->addNewBlock(LoopBB, &MBB); |
4048 | MDT->addNewBlock(RemainderBB, LoopBB); |
4049 | for (auto &Succ : RemainderBB->successors()) { |
4050 | if (MDT->dominates(&MBB, Succ)) { |
4051 | MDT->changeImmediateDominator(Succ, RemainderBB); |
4052 | } |
4053 | } |
4054 | } |
4055 | |
4056 | emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); |
4057 | |
4058 | // Restore the EXEC mask |
4059 | MachineBasicBlock::iterator First = RemainderBB->begin(); |
4060 | BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) |
4061 | .addReg(SaveExec); |
4062 | } |
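// Resulting block structure, sketched (illustrative only):
//
//   MBB:         ... original code up to MI ..., saves EXEC, falls through
//   LoopBB:      MI plus the waterfall code from emitLoadSRsrcFromVGPRLoop;
//                successors are LoopBB (more lanes to do) and RemainderBB
//   RemainderBB: restores EXEC, then everything that followed MI in MBB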
4063 | |
4064 | // Extract pointer from Rsrc and return a zero-value Rsrc replacement. |
4065 | static std::tuple<unsigned, unsigned> |
4066 | extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { |
4067 | MachineBasicBlock &MBB = *MI.getParent(); |
4068 | MachineFunction &MF = *MBB.getParent(); |
4069 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
4070 | |
4071 | // Extract the ptr from the resource descriptor. |
4072 | unsigned RsrcPtr = |
4073 | TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, |
4074 | AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); |
4075 | |
4076 | // Create an empty resource descriptor |
4077 | unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
4078 | unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
4079 | unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); |
4080 | unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); |
4081 | uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); |
4082 | |
4083 | // Zero64 = 0 |
4084 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) |
4085 | .addImm(0); |
4086 | |
4087 | // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} |
4088 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) |
4089 | .addImm(RsrcDataFormat & 0xFFFFFFFF); |
4090 | |
4091 | // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} |
4092 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) |
4093 | .addImm(RsrcDataFormat >> 32); |
4094 | |
4095 | // NewSRsrc = {Zero64, SRsrcFormat} |
4096 | BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) |
4097 | .addReg(Zero64) |
4098 | .addImm(AMDGPU::sub0_sub1) |
4099 | .addReg(SRsrcFormatLo) |
4100 | .addImm(AMDGPU::sub2) |
4101 | .addReg(SRsrcFormatHi) |
4102 | .addImm(AMDGPU::sub3); |
4103 | |
4104 | return std::make_tuple(RsrcPtr, NewSRsrc); |
4105 | } |
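// The replacement descriptor built above is { 64-bit base = 0, default data
// format }. A small standalone sketch of the 64-bit format split feeding
// SRsrcFormatLo/SRsrcFormatHi (illustrative helper, not part of this file):
#include <cstdint>
static void splitRsrcFormat(uint64_t RsrcDataFormat, uint32_t &Lo,
                            uint32_t &Hi) {
  Lo = uint32_t(RsrcDataFormat & 0xFFFFFFFF); // bits 31..0  -> sub2
  Hi = uint32_t(RsrcDataFormat >> 32);        // bits 63..32 -> sub3
}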
4106 | |
4107 | void SIInstrInfo::legalizeOperands(MachineInstr &MI, |
4108 | MachineDominatorTree *MDT) const { |
4109 | MachineFunction &MF = *MI.getParent()->getParent(); |
4110 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
4111 | |
4112 | // Legalize VOP2 |
4113 | if (isVOP2(MI) || isVOPC(MI)) { |
4114 | legalizeOperandsVOP2(MRI, MI); |
4115 | return; |
4116 | } |
4117 | |
4118 | // Legalize VOP3 |
4119 | if (isVOP3(MI)) { |
4120 | legalizeOperandsVOP3(MRI, MI); |
4121 | return; |
4122 | } |
4123 | |
4124 | // Legalize SMRD |
4125 | if (isSMRD(MI)) { |
4126 | legalizeOperandsSMRD(MRI, MI); |
4127 | return; |
4128 | } |
4129 | |
4130 | // Legalize REG_SEQUENCE and PHI |
4131 |   // The register class of the operands must be the same type as the register
4132 | // class of the output. |
4133 | if (MI.getOpcode() == AMDGPU::PHI) { |
4134 | const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; |
4135 | for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { |
4136 | if (!MI.getOperand(i).isReg() || |
4137 | !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) |
4138 | continue; |
4139 | const TargetRegisterClass *OpRC = |
4140 | MRI.getRegClass(MI.getOperand(i).getReg()); |
4141 | if (RI.hasVGPRs(OpRC)) { |
4142 | VRC = OpRC; |
4143 | } else { |
4144 | SRC = OpRC; |
4145 | } |
4146 | } |
4147 | |
4148 |     // If any of the operands are VGPR registers, then they all must be
4149 |     // VGPRs, otherwise we will create illegal VGPR->SGPR copies when
4150 |     // legalizing them.
4151 | if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { |
4152 | if (!VRC) { |
4153 |         assert(SRC);
4154 | if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { |
4155 | VRC = &AMDGPU::VReg_1RegClass; |
4156 | } else |
4157 | VRC = RI.getEquivalentVGPRClass(SRC); |
4158 | } |
4159 | RC = VRC; |
4160 | } else { |
4161 | RC = SRC; |
4162 | } |
4163 | |
4164 | // Update all the operands so they have the same type. |
4165 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
4166 | MachineOperand &Op = MI.getOperand(I); |
4167 | if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) |
4168 | continue; |
4169 | |
4170 | // MI is a PHI instruction. |
4171 | MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); |
4172 | MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); |
4173 | |
4174 | // Avoid creating no-op copies with the same src and dst reg class. These |
4175 | // confuse some of the machine passes. |
4176 | legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); |
4177 | } |
4178 | } |
4179 | |
4180 | // REG_SEQUENCE doesn't really require operand legalization, but if one has a |
4181 | // VGPR dest type and SGPR sources, insert copies so all operands are |
4182 | // VGPRs. This seems to help operand folding / the register coalescer. |
4183 | if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { |
4184 | MachineBasicBlock *MBB = MI.getParent(); |
4185 | const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); |
4186 | if (RI.hasVGPRs(DstRC)) { |
4187 | // Update all the operands so they are VGPR register classes. These may |
4188 | // not be the same register class because REG_SEQUENCE supports mixing |
4189 | // subregister index types e.g. sub0_sub1 + sub2 + sub3 |
4190 | for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { |
4191 | MachineOperand &Op = MI.getOperand(I); |
4192 | if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) |
4193 | continue; |
4194 | |
4195 | const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); |
4196 | const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); |
4197 | if (VRC == OpRC) |
4198 | continue; |
4199 | |
4200 | legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); |
4201 | Op.setIsKill(); |
4202 | } |
4203 | } |
4204 | |
4205 | return; |
4206 | } |
4207 | |
4208 | // Legalize INSERT_SUBREG |
4209 | // src0 must have the same register class as dst |
4210 | if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { |
4211 | unsigned Dst = MI.getOperand(0).getReg(); |
4212 | unsigned Src0 = MI.getOperand(1).getReg(); |
4213 | const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); |
4214 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); |
4215 | if (DstRC != Src0RC) { |
4216 | MachineBasicBlock *MBB = MI.getParent(); |
4217 | MachineOperand &Op = MI.getOperand(1); |
4218 | legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); |
4219 | } |
4220 | return; |
4221 | } |
4222 | |
4223 | // Legalize SI_INIT_M0 |
4224 | if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { |
4225 | MachineOperand &Src = MI.getOperand(0); |
4226 | if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg()))) |
4227 | Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); |
4228 | return; |
4229 | } |
4230 | |
4231 | // Legalize MIMG and MUBUF/MTBUF for shaders. |
4232 | // |
4233 | // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via |
4234 | // scratch memory access. In both cases, the legalization never involves |
4235 | // conversion to the addr64 form. |
4236 | if (isMIMG(MI) || |
4237 | (AMDGPU::isShader(MF.getFunction().getCallingConv()) && |
4238 | (isMUBUF(MI) || isMTBUF(MI)))) { |
4239 | MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); |
4240 | if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { |
4241 | unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); |
4242 | SRsrc->setReg(SGPR); |
4243 | } |
4244 | |
4245 | MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); |
4246 | if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { |
4247 | unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); |
4248 | SSamp->setReg(SGPR); |
4249 | } |
4250 | return; |
4251 | } |
4252 | |
4253 | // Legalize MUBUF* instructions. |
4254 | int RsrcIdx = |
4255 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); |
4256 | if (RsrcIdx != -1) { |
4257 | // We have an MUBUF instruction |
4258 | MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); |
4259 | unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; |
4260 | if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), |
4261 | RI.getRegClass(RsrcRC))) { |
4262 | // The operands are legal. |
4263 |       // FIXME: We may need to legalize operands besides srsrc.
4264 | return; |
4265 | } |
4266 | |
4267 | // Legalize a VGPR Rsrc. |
4268 | // |
4269 | // If the instruction is _ADDR64, we can avoid a waterfall by extracting |
4270 | // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using |
4271 | // a zero-value SRsrc. |
4272 | // |
4273 | // If the instruction is _OFFSET (both idxen and offen disabled), and we |
4274 | // support ADDR64 instructions, we can convert to ADDR64 and do the same as |
4275 | // above. |
4276 | // |
4277 | // Otherwise we are on non-ADDR64 hardware, and/or we have |
4278 | // idxen/offen/bothen and we fall back to a waterfall loop. |
4279 | |
4280 | MachineBasicBlock &MBB = *MI.getParent(); |
4281 | |
4282 | MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); |
4283 | if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { |
4284 | // This is already an ADDR64 instruction so we need to add the pointer |
4285 | // extracted from the resource descriptor to the current value of VAddr. |
4286 | unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4287 | unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4288 | unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
4289 | |
4290 | unsigned RsrcPtr, NewSRsrc; |
4291 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); |
4292 | |
4293 | // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 |
4294 | DebugLoc DL = MI.getDebugLoc(); |
4295 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) |
4296 | .addReg(RsrcPtr, 0, AMDGPU::sub0) |
4297 | .addReg(VAddr->getReg(), 0, AMDGPU::sub0); |
4298 | |
4299 | // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 |
4300 | BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) |
4301 | .addReg(RsrcPtr, 0, AMDGPU::sub1) |
4302 | .addReg(VAddr->getReg(), 0, AMDGPU::sub1); |
4303 | |
4304 | // NewVaddr = {NewVaddrHi, NewVaddrLo} |
4305 | BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) |
4306 | .addReg(NewVAddrLo) |
4307 | .addImm(AMDGPU::sub0) |
4308 | .addReg(NewVAddrHi) |
4309 | .addImm(AMDGPU::sub1); |
4310 | |
4311 | VAddr->setReg(NewVAddr); |
4312 | Rsrc->setReg(NewSRsrc); |
4313 | } else if (!VAddr && ST.hasAddr64()) { |
4314 |       // This instruction is the _OFFSET variant, so we need to convert it to
4315 | // ADDR64. |
4316 |       assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4317 |                  < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
4318 |              "FIXME: Need to emit flat atomics here");
4319 | |
4320 | unsigned RsrcPtr, NewSRsrc; |
4321 | std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); |
4322 | |
4323 | unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
4324 | MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); |
4325 | MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); |
4326 | MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); |
4327 | unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); |
4328 | |
4329 |       // Atomics with return have an additional tied operand and are
4330 |       // missing some of the special bits.
4331 | MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); |
4332 | MachineInstr *Addr64; |
4333 | |
4334 | if (!VDataIn) { |
4335 | // Regular buffer load / store. |
4336 | MachineInstrBuilder MIB = |
4337 | BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) |
4338 | .add(*VData) |
4339 | .addReg(NewVAddr) |
4340 | .addReg(NewSRsrc) |
4341 | .add(*SOffset) |
4342 | .add(*Offset); |
4343 | |
4344 | // Atomics do not have this operand. |
4345 | if (const MachineOperand *GLC = |
4346 | getNamedOperand(MI, AMDGPU::OpName::glc)) { |
4347 | MIB.addImm(GLC->getImm()); |
4348 | } |
4349 | if (const MachineOperand *DLC = |
4350 | getNamedOperand(MI, AMDGPU::OpName::dlc)) { |
4351 | MIB.addImm(DLC->getImm()); |
4352 | } |
4353 | |
4354 | MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); |
4355 | |
4356 | if (const MachineOperand *TFE = |
4357 | getNamedOperand(MI, AMDGPU::OpName::tfe)) { |
4358 | MIB.addImm(TFE->getImm()); |
4359 | } |
4360 | |
4361 | MIB.cloneMemRefs(MI); |
4362 | Addr64 = MIB; |
4363 | } else { |
4364 | // Atomics with return. |
4365 | Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) |
4366 | .add(*VData) |
4367 | .add(*VDataIn) |
4368 | .addReg(NewVAddr) |
4369 | .addReg(NewSRsrc) |
4370 | .add(*SOffset) |
4371 | .add(*Offset) |
4372 | .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) |
4373 | .cloneMemRefs(MI); |
4374 | } |
4375 | |
4376 | MI.removeFromParent(); |
4377 | |
4378 | // NewVaddr = {NewVaddrHi, NewVaddrLo} |
4379 | BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), |
4380 | NewVAddr) |
4381 | .addReg(RsrcPtr, 0, AMDGPU::sub0) |
4382 | .addImm(AMDGPU::sub0) |
4383 | .addReg(RsrcPtr, 0, AMDGPU::sub1) |
4384 | .addImm(AMDGPU::sub1); |
4385 | } else { |
4386 | // This is another variant; legalize Rsrc with waterfall loop from VGPRs |
4387 | // to SGPRs. |
4388 | loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); |
4389 | } |
4390 | } |
4391 | } |
4392 | |
4393 | void SIInstrInfo::moveToVALU(MachineInstr &TopInst, |
4394 | MachineDominatorTree *MDT) const { |
4395 | SetVectorType Worklist; |
4396 | Worklist.insert(&TopInst); |
4397 | |
4398 | while (!Worklist.empty()) { |
4399 | MachineInstr &Inst = *Worklist.pop_back_val(); |
4400 | MachineBasicBlock *MBB = Inst.getParent(); |
4401 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
4402 | |
4403 | unsigned Opcode = Inst.getOpcode(); |
4404 | unsigned NewOpcode = getVALUOp(Inst); |
4405 | |
4406 | // Handle some special cases |
4407 | switch (Opcode) { |
4408 | default: |
4409 | break; |
4410 | case AMDGPU::S_ADD_U64_PSEUDO: |
4411 | case AMDGPU::S_SUB_U64_PSEUDO: |
4412 | splitScalar64BitAddSub(Worklist, Inst, MDT); |
4413 | Inst.eraseFromParent(); |
4414 | continue; |
4415 | case AMDGPU::S_ADD_I32: |
4416 | case AMDGPU::S_SUB_I32: |
4417 | // FIXME: The u32 versions currently selected use the carry. |
4418 | if (moveScalarAddSub(Worklist, Inst, MDT)) |
4419 | continue; |
4420 | |
4421 | // Default handling |
4422 | break; |
4423 | case AMDGPU::S_AND_B64: |
4424 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); |
4425 | Inst.eraseFromParent(); |
4426 | continue; |
4427 | |
4428 | case AMDGPU::S_OR_B64: |
4429 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); |
4430 | Inst.eraseFromParent(); |
4431 | continue; |
4432 | |
4433 | case AMDGPU::S_XOR_B64: |
4434 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); |
4435 | Inst.eraseFromParent(); |
4436 | continue; |
4437 | |
4438 | case AMDGPU::S_NAND_B64: |
4439 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); |
4440 | Inst.eraseFromParent(); |
4441 | continue; |
4442 | |
4443 | case AMDGPU::S_NOR_B64: |
4444 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); |
4445 | Inst.eraseFromParent(); |
4446 | continue; |
4447 | |
4448 | case AMDGPU::S_XNOR_B64: |
4449 | if (ST.hasDLInsts()) |
4450 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); |
4451 | else |
4452 | splitScalar64BitXnor(Worklist, Inst, MDT); |
4453 | Inst.eraseFromParent(); |
4454 | continue; |
4455 | |
4456 | case AMDGPU::S_ANDN2_B64: |
4457 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); |
4458 | Inst.eraseFromParent(); |
4459 | continue; |
4460 | |
4461 | case AMDGPU::S_ORN2_B64: |
4462 | splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); |
4463 | Inst.eraseFromParent(); |
4464 | continue; |
4465 | |
4466 | case AMDGPU::S_NOT_B64: |
4467 | splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); |
4468 | Inst.eraseFromParent(); |
4469 | continue; |
4470 | |
4471 | case AMDGPU::S_BCNT1_I32_B64: |
4472 | splitScalar64BitBCNT(Worklist, Inst); |
4473 | Inst.eraseFromParent(); |
4474 | continue; |
4475 | |
4476 | case AMDGPU::S_BFE_I64: |
4477 | splitScalar64BitBFE(Worklist, Inst); |
4478 | Inst.eraseFromParent(); |
4479 | continue; |
4480 | |
4481 | case AMDGPU::S_LSHL_B32: |
4482 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { |
4483 | NewOpcode = AMDGPU::V_LSHLREV_B32_e64; |
4484 | swapOperands(Inst); |
4485 | } |
4486 | break; |
4487 | case AMDGPU::S_ASHR_I32: |
4488 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { |
4489 | NewOpcode = AMDGPU::V_ASHRREV_I32_e64; |
4490 | swapOperands(Inst); |
4491 | } |
4492 | break; |
4493 | case AMDGPU::S_LSHR_B32: |
4494 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { |
4495 | NewOpcode = AMDGPU::V_LSHRREV_B32_e64; |
4496 | swapOperands(Inst); |
4497 | } |
4498 | break; |
4499 | case AMDGPU::S_LSHL_B64: |
4500 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { |
4501 | NewOpcode = AMDGPU::V_LSHLREV_B64; |
4502 | swapOperands(Inst); |
4503 | } |
4504 | break; |
4505 | case AMDGPU::S_ASHR_I64: |
4506 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { |
4507 | NewOpcode = AMDGPU::V_ASHRREV_I64; |
4508 | swapOperands(Inst); |
4509 | } |
4510 | break; |
4511 | case AMDGPU::S_LSHR_B64: |
4512 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { |
4513 | NewOpcode = AMDGPU::V_LSHRREV_B64; |
4514 | swapOperands(Inst); |
4515 | } |
4516 | break; |
4517 | |
4518 | case AMDGPU::S_ABS_I32: |
4519 | lowerScalarAbs(Worklist, Inst); |
4520 | Inst.eraseFromParent(); |
4521 | continue; |
4522 | |
4523 | case AMDGPU::S_CBRANCH_SCC0: |
4524 | case AMDGPU::S_CBRANCH_SCC1: |
4525 | // Clear unused bits of vcc |
4526 | BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), |
4527 | AMDGPU::VCC) |
4528 | .addReg(AMDGPU::EXEC) |
4529 | .addReg(AMDGPU::VCC); |
4530 | break; |
4531 | |
4532 | case AMDGPU::S_BFE_U64: |
4533 | case AMDGPU::S_BFM_B64: |
4534 | llvm_unreachable("Moving this op to VALU not implemented")::llvm::llvm_unreachable_internal("Moving this op to VALU not implemented" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIInstrInfo.cpp" , 4534); |
4535 | |
4536 | case AMDGPU::S_PACK_LL_B32_B16: |
4537 | case AMDGPU::S_PACK_LH_B32_B16: |
4538 | case AMDGPU::S_PACK_HH_B32_B16: |
4539 | movePackToVALU(Worklist, MRI, Inst); |
4540 | Inst.eraseFromParent(); |
4541 | continue; |
4542 | |
4543 | case AMDGPU::S_XNOR_B32: |
4544 | lowerScalarXnor(Worklist, Inst); |
4545 | Inst.eraseFromParent(); |
4546 | continue; |
4547 | |
4548 | case AMDGPU::S_NAND_B32: |
4549 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); |
4550 | Inst.eraseFromParent(); |
4551 | continue; |
4552 | |
4553 | case AMDGPU::S_NOR_B32: |
4554 | splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); |
4555 | Inst.eraseFromParent(); |
4556 | continue; |
4557 | |
4558 | case AMDGPU::S_ANDN2_B32: |
4559 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); |
4560 | Inst.eraseFromParent(); |
4561 | continue; |
4562 | |
4563 | case AMDGPU::S_ORN2_B32: |
4564 | splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); |
4565 | Inst.eraseFromParent(); |
4566 | continue; |
4567 | } |
4568 | |
4569 | if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { |
4570 | // We cannot move this instruction to the VALU, so we should try to |
4571 | // legalize its operands instead. |
4572 | legalizeOperands(Inst, MDT); |
4573 | continue; |
4574 | } |
4575 | |
4576 | // Use the new VALU Opcode. |
4577 | const MCInstrDesc &NewDesc = get(NewOpcode); |
4578 | Inst.setDesc(NewDesc); |
4579 | |
4580 |     // Remove any references to SCC. Vector instructions can't read from it, and
4581 |     // we're just about to add the implicit use / defs of VCC; we don't want
4582 |     // both.
4583 | for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { |
4584 | MachineOperand &Op = Inst.getOperand(i); |
4585 | if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { |
4586 | // Only propagate through live-def of SCC. |
4587 | if (Op.isDef() && !Op.isDead()) |
4588 | addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); |
4589 | Inst.RemoveOperand(i); |
4590 | } |
4591 | } |
4592 | |
4593 | if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { |
4594 | // We are converting these to a BFE, so we need to add the missing |
4595 | // operands for the size and offset. |
4596 | unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; |
4597 | Inst.addOperand(MachineOperand::CreateImm(0)); |
4598 | Inst.addOperand(MachineOperand::CreateImm(Size)); |
4599 | |
4600 | } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { |
4601 | // The VALU version adds the second operand to the result, so insert an |
4602 | // extra 0 operand. |
4603 | Inst.addOperand(MachineOperand::CreateImm(0)); |
4604 | } |
4605 | |
4606 | Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); |
4607 | |
4608 | if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { |
4609 | const MachineOperand &OffsetWidthOp = Inst.getOperand(2); |
4610 | // If we need to move this to VGPRs, we need to unpack the second operand |
4611 | // back into the 2 separate ones for bit offset and width. |
4612 |       assert(OffsetWidthOp.isImm() &&
4613 |              "Scalar BFE is only implemented for constant width and offset");
4614 | uint32_t Imm = OffsetWidthOp.getImm(); |
4615 | |
4616 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. |
4617 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. |
4618 | Inst.RemoveOperand(2); // Remove old immediate. |
4619 | Inst.addOperand(MachineOperand::CreateImm(Offset)); |
4620 | Inst.addOperand(MachineOperand::CreateImm(BitWidth)); |
4621 | } |
4622 | |
4623 | bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); |
4624 | unsigned NewDstReg = AMDGPU::NoRegister; |
4625 | if (HasDst) { |
4626 | unsigned DstReg = Inst.getOperand(0).getReg(); |
4627 | if (TargetRegisterInfo::isPhysicalRegister(DstReg)) |
4628 | continue; |
4629 | |
4630 | // Update the destination register class. |
4631 | const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); |
4632 | if (!NewDstRC) |
4633 | continue; |
4634 | |
4635 | if (Inst.isCopy() && |
4636 | TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && |
4637 | NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { |
4638 | // Instead of creating a copy where src and dst are the same register |
4639 | // class, we just replace all uses of dst with src. These kinds of |
4640 | // copies interfere with the heuristics MachineSink uses to decide |
4641 |         // whether or not to split a critical edge, since the pass assumes
4642 | // that copies will end up as machine instructions and not be |
4643 | // eliminated. |
4644 | addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); |
4645 | MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); |
4646 | MRI.clearKillFlags(Inst.getOperand(1).getReg()); |
4647 | Inst.getOperand(0).setReg(DstReg); |
4648 | |
4649 | // Make sure we don't leave around a dead VGPR->SGPR copy. Normally |
4650 | // these are deleted later, but at -O0 it would leave a suspicious |
4651 | // looking illegal copy of an undef register. |
4652 | for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) |
4653 | Inst.RemoveOperand(I); |
4654 | Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); |
4655 | continue; |
4656 | } |
4657 | |
4658 | NewDstReg = MRI.createVirtualRegister(NewDstRC); |
4659 | MRI.replaceRegWith(DstReg, NewDstReg); |
4660 | } |
4661 | |
4662 | // Legalize the operands |
4663 | legalizeOperands(Inst, MDT); |
4664 | |
4665 | if (HasDst) |
4666 | addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); |
4667 | } |
4668 | } |
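// The driver above is a worklist fixpoint; a hedged, generic sketch of the
// same shape (hypothetical pseudocode, not the SIInstrInfo API):
//
//   Worklist = { TopInst }
//   while Worklist not empty:
//     Inst = pop()
//     rewrite Inst to a VALU form, or just legalize its operands
//     push every user of Inst's (new) result so it is re-checked as well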
4669 | |
4670 | // Add/sub require special handling to deal with carry outs. |
4671 | bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, |
4672 | MachineDominatorTree *MDT) const { |
4673 | if (ST.hasAddNoCarry()) { |
4674 | // Assume there is no user of scc since we don't select this in that case. |
4675 | // Since scc isn't used, it doesn't really matter if the i32 or u32 variant |
4676 | // is used. |
4677 | |
4678 | MachineBasicBlock &MBB = *Inst.getParent(); |
4679 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4680 | |
4681 | unsigned OldDstReg = Inst.getOperand(0).getReg(); |
4682 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4683 | |
4684 | unsigned Opc = Inst.getOpcode(); |
4685 |     assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4686 | |
4687 | unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? |
4688 | AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; |
4689 | |
4690 |     assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4691 | Inst.RemoveOperand(3); |
4692 | |
4693 | Inst.setDesc(get(NewOpc)); |
4694 | Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit |
4695 | Inst.addImplicitDefUseOperands(*MBB.getParent()); |
4696 | MRI.replaceRegWith(OldDstReg, ResultReg); |
4697 | legalizeOperands(Inst, MDT); |
4698 | |
4699 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
4700 | return true; |
4701 | } |
4702 | |
4703 | return false; |
4704 | } |
4705 | |
4706 | void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, |
4707 | MachineInstr &Inst) const { |
4708 | MachineBasicBlock &MBB = *Inst.getParent(); |
4709 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4710 | MachineBasicBlock::iterator MII = Inst; |
4711 | DebugLoc DL = Inst.getDebugLoc(); |
4712 | |
4713 | MachineOperand &Dest = Inst.getOperand(0); |
4714 | MachineOperand &Src = Inst.getOperand(1); |
4715 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4716 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4717 | |
4718 | unsigned SubOp = ST.hasAddNoCarry() ? |
4719 | AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32; |
4720 | |
4721 | BuildMI(MBB, MII, DL, get(SubOp), TmpReg) |
4722 | .addImm(0) |
4723 | .addReg(Src.getReg()); |
4724 | |
4725 | BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) |
4726 | .addReg(Src.getReg()) |
4727 | .addReg(TmpReg); |
4728 | |
4729 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
4730 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
4731 | } |
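// Scalar model of the lowering above (illustrative sketch, not part of the
// backend): abs(x) is computed as max(x, 0 - x), exactly the V_SUB +
// V_MAX_I32 pair emitted; e.g. x = -5 gives max(-5, 5) = 5.
#include <algorithm>
#include <cstdint>
static int32_t absViaMax(int32_t X) {
  int32_t Neg = int32_t(0u - uint32_t(X)); // wraps like the 32-bit V_SUB
  return std::max(X, Neg);                 // INT32_MIN maps to itself
}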
4732 | |
4733 | void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, |
4734 | MachineInstr &Inst) const { |
4735 | MachineBasicBlock &MBB = *Inst.getParent(); |
4736 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4737 | MachineBasicBlock::iterator MII = Inst; |
4738 | const DebugLoc &DL = Inst.getDebugLoc(); |
4739 | |
4740 | MachineOperand &Dest = Inst.getOperand(0); |
4741 | MachineOperand &Src0 = Inst.getOperand(1); |
4742 | MachineOperand &Src1 = Inst.getOperand(2); |
4743 | |
4744 | if (ST.hasDLInsts()) { |
4745 | unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4746 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); |
4747 | legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); |
4748 | |
4749 | BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) |
4750 | .add(Src0) |
4751 | .add(Src1); |
4752 | |
4753 | MRI.replaceRegWith(Dest.getReg(), NewDest); |
4754 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); |
4755 | } else { |
4756 | // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can |
4757 | // invert either source and then perform the XOR. If either source is a |
4758 | // scalar register, then we can leave the inversion on the scalar unit to |
4759 |     // achieve a better distribution of scalar and vector instructions.
4760 | bool Src0IsSGPR = Src0.isReg() && |
4761 | RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); |
4762 | bool Src1IsSGPR = Src1.isReg() && |
4763 | RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); |
4764 | MachineInstr *Not = nullptr; |
4765 | MachineInstr *Xor = nullptr; |
4766 | unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
4767 | unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
4768 | |
4769 | // Build a pair of scalar instructions and add them to the work list. |
4770 | // The next iteration over the work list will lower these to the vector |
4771 | // unit as necessary. |
4772 | if (Src0IsSGPR) { |
4773 |       BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4774 | .add(Src0); |
4775 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) |
4776 | .addReg(Temp) |
4777 | .add(Src1); |
4778 | } else if (Src1IsSGPR) { |
4779 |       BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4780 | .add(Src1); |
4781 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) |
4782 | .add(Src0) |
4783 | .addReg(Temp); |
4784 | } else { |
4785 | Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) |
4786 | .add(Src0) |
4787 | .add(Src1); |
4788 | Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) |
4789 | .addReg(Temp); |
4790 | Worklist.insert(Not); |
4791 | } |
4792 | |
4793 | MRI.replaceRegWith(Dest.getReg(), NewDest); |
4794 | |
4795 | Worklist.insert(Xor); |
4796 | |
4797 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); |
4798 | } |
4799 | } |
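// Compile-time check of the identity relied on above (illustrative only):
// ~(x ^ y) == (~x) ^ y == x ^ (~y), so the NOT may be applied to whichever
// source is already scalar.
static_assert(~(0xF0F0F0F0u ^ 0x0FF00FF0u) == ((~0xF0F0F0F0u) ^ 0x0FF00FF0u),
              "xnor: invert the first source");
static_assert(~(0xF0F0F0F0u ^ 0x0FF00FF0u) == (0xF0F0F0F0u ^ (~0x0FF00FF0u)),
              "xnor: invert the second source");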
4800 | |
4801 | void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, |
4802 | MachineInstr &Inst, |
4803 | unsigned Opcode) const { |
4804 | MachineBasicBlock &MBB = *Inst.getParent(); |
4805 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4806 | MachineBasicBlock::iterator MII = Inst; |
4807 | const DebugLoc &DL = Inst.getDebugLoc(); |
4808 | |
4809 | MachineOperand &Dest = Inst.getOperand(0); |
4810 | MachineOperand &Src0 = Inst.getOperand(1); |
4811 | MachineOperand &Src1 = Inst.getOperand(2); |
4812 | |
4813 | unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
4814 | unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
4815 | |
4816 | MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) |
4817 | .add(Src0) |
4818 | .add(Src1); |
4819 | |
4820 | MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) |
4821 | .addReg(Interm); |
4822 | |
4823 | Worklist.insert(&Op); |
4824 | Worklist.insert(&Not); |
4825 | |
4826 | MRI.replaceRegWith(Dest.getReg(), NewDest); |
4827 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); |
4828 | } |
4829 | |
4830 | void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, |
4831 | MachineInstr &Inst, |
4832 | unsigned Opcode) const { |
4833 | MachineBasicBlock &MBB = *Inst.getParent(); |
4834 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4835 | MachineBasicBlock::iterator MII = Inst; |
4836 | const DebugLoc &DL = Inst.getDebugLoc(); |
4837 | |
4838 | MachineOperand &Dest = Inst.getOperand(0); |
4839 | MachineOperand &Src0 = Inst.getOperand(1); |
4840 | MachineOperand &Src1 = Inst.getOperand(2); |
4841 | |
4842 | unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
4843 | unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); |
4844 | |
4845 | MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) |
4846 | .add(Src1); |
4847 | |
4848 | MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) |
4849 | .add(Src0) |
4850 | .addReg(Interm); |
4851 | |
4852 | Worklist.insert(&Not); |
4853 | Worklist.insert(&Op); |
4854 | |
4855 | MRI.replaceRegWith(Dest.getReg(), NewDest); |
4856 | addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); |
4857 | } |
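// The two helpers above implement simple identities, checked here at compile
// time (illustrative sketch; nand32/andn2_32 are local example names):
//   S_NAND/S_NOR   -> the binary op, then S_NOT of the result
//   S_ANDN2/S_ORN2 -> S_NOT of src1, then the binary op
constexpr unsigned nand32(unsigned X, unsigned Y) { return ~(X & Y); }
constexpr unsigned andn2_32(unsigned X, unsigned Y) { return X & ~Y; }
static_assert(nand32(0xAAAA5555u, 0x00FFFF00u) == 0xFF55AAFFu,
              "NAND == NOT(AND)");
static_assert(andn2_32(0xAAAA5555u, 0x00FFFF00u) == 0xAA000055u,
              "ANDN2 == AND with inverted src1");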
4858 | |
4859 | void SIInstrInfo::splitScalar64BitUnaryOp( |
4860 | SetVectorType &Worklist, MachineInstr &Inst, |
4861 | unsigned Opcode) const { |
4862 | MachineBasicBlock &MBB = *Inst.getParent(); |
4863 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4864 | |
4865 | MachineOperand &Dest = Inst.getOperand(0); |
4866 | MachineOperand &Src0 = Inst.getOperand(1); |
4867 | DebugLoc DL = Inst.getDebugLoc(); |
4868 | |
4869 | MachineBasicBlock::iterator MII = Inst; |
4870 | |
4871 | const MCInstrDesc &InstDesc = get(Opcode); |
4872 | const TargetRegisterClass *Src0RC = Src0.isReg() ? |
4873 | MRI.getRegClass(Src0.getReg()) : |
4874 | &AMDGPU::SGPR_32RegClass; |
4875 | |
4876 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); |
4877 | |
4878 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
4879 | AMDGPU::sub0, Src0SubRC); |
4880 | |
4881 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); |
4882 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); |
4883 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); |
4884 | |
4885 | unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); |
4886 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); |
4887 | |
4888 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
4889 | AMDGPU::sub1, Src0SubRC); |
4890 | |
4891 | unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); |
4892 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); |
4893 | |
4894 | unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); |
4895 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) |
4896 | .addReg(DestSub0) |
4897 | .addImm(AMDGPU::sub0) |
4898 | .addReg(DestSub1) |
4899 | .addImm(AMDGPU::sub1); |
4900 | |
4901 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); |
4902 | |
4903 | Worklist.insert(&LoHalf); |
4904 | Worklist.insert(&HiHalf); |
4905 | |
4906 | // We don't need to legalizeOperands here because for a single operand, src0 |
4907 | // will support any kind of input. |
4908 | |
4909 | // Move all users of this moved value. |
4910 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); |
4911 | } |
4912 | |
4913 | void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, |
4914 | MachineInstr &Inst, |
4915 | MachineDominatorTree *MDT) const { |
4916 | bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); |
4917 | |
4918 | MachineBasicBlock &MBB = *Inst.getParent(); |
4919 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4920 | |
4921 | unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
4922 | unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4923 | unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
4924 | |
4925 | unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
4926 | unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); |
4927 | |
4928 | MachineOperand &Dest = Inst.getOperand(0); |
4929 | MachineOperand &Src0 = Inst.getOperand(1); |
4930 | MachineOperand &Src1 = Inst.getOperand(2); |
4931 | const DebugLoc &DL = Inst.getDebugLoc(); |
4932 | MachineBasicBlock::iterator MII = Inst; |
4933 | |
4934 | const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); |
4935 | const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); |
4936 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); |
4937 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); |
4938 | |
4939 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
4940 | AMDGPU::sub0, Src0SubRC); |
4941 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, |
4942 | AMDGPU::sub0, Src1SubRC); |
4943 | |
4944 | |
4945 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
4946 | AMDGPU::sub1, Src0SubRC); |
4947 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, |
4948 | AMDGPU::sub1, Src1SubRC); |
4949 | |
4950 | unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; |
4951 | MachineInstr *LoHalf = |
4952 | BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) |
4953 | .addReg(CarryReg, RegState::Define) |
4954 | .add(SrcReg0Sub0) |
4955 | .add(SrcReg1Sub0) |
4956 | .addImm(0); // clamp bit |
4957 | |
4958 | unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; |
4959 | MachineInstr *HiHalf = |
4960 | BuildMI(MBB, MII, DL, get(HiOpc), DestSub1) |
4961 | .addReg(DeadCarryReg, RegState::Define | RegState::Dead) |
4962 | .add(SrcReg0Sub1) |
4963 | .add(SrcReg1Sub1) |
4964 | .addReg(CarryReg, RegState::Kill) |
4965 | .addImm(0); // clamp bit |
4966 | |
4967 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) |
4968 | .addReg(DestSub0) |
4969 | .addImm(AMDGPU::sub0) |
4970 | .addReg(DestSub1) |
4971 | .addImm(AMDGPU::sub1); |
4972 | |
4973 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); |
4974 | |
4975 | // Try to legalize the operands in case we need to swap the order to keep it |
4976 | // valid. |
4977 | legalizeOperands(*LoHalf, MDT); |
4978 | legalizeOperands(*HiHalf, MDT); |
4979 | |
4980 |   // Move all users of this moved value.
4981 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); |
4982 | } |
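// Scalar model of the split performed above (illustrative sketch, not part
// of the backend): a 64-bit add built from 32-bit halves with an explicit
// carry, mirroring the V_ADD_I32_e64 / V_ADDC_U32_e64 pair.
#include <cstdint>
static uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = uint32_t(A) + uint32_t(B);
  uint32_t Carry = Lo < uint32_t(A);                       // CarryReg
  uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry;
  return (uint64_t(Hi) << 32) | Lo;                        // REG_SEQUENCE
}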
4983 | |
4984 | void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, |
4985 | MachineInstr &Inst, unsigned Opcode, |
4986 | MachineDominatorTree *MDT) const { |
4987 | MachineBasicBlock &MBB = *Inst.getParent(); |
4988 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
4989 | |
4990 | MachineOperand &Dest = Inst.getOperand(0); |
4991 | MachineOperand &Src0 = Inst.getOperand(1); |
4992 | MachineOperand &Src1 = Inst.getOperand(2); |
4993 | DebugLoc DL = Inst.getDebugLoc(); |
4994 | |
4995 | MachineBasicBlock::iterator MII = Inst; |
4996 | |
4997 | const MCInstrDesc &InstDesc = get(Opcode); |
4998 | const TargetRegisterClass *Src0RC = Src0.isReg() ? |
4999 | MRI.getRegClass(Src0.getReg()) : |
5000 | &AMDGPU::SGPR_32RegClass; |
5001 | |
5002 | const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); |
5003 | const TargetRegisterClass *Src1RC = Src1.isReg() ? |
5004 | MRI.getRegClass(Src1.getReg()) : |
5005 | &AMDGPU::SGPR_32RegClass; |
5006 | |
5007 | const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); |
5008 | |
5009 | MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
5010 | AMDGPU::sub0, Src0SubRC); |
5011 | MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, |
5012 | AMDGPU::sub0, Src1SubRC); |
5013 | MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, |
5014 | AMDGPU::sub1, Src0SubRC); |
5015 | MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, |
5016 | AMDGPU::sub1, Src1SubRC); |
5017 | |
5018 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); |
5019 | const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); |
5020 | const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); |
5021 | |
5022 | unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); |
5023 | MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) |
5024 | .add(SrcReg0Sub0) |
5025 | .add(SrcReg1Sub0); |
5026 | |
5027 | unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); |
5028 | MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) |
5029 | .add(SrcReg0Sub1) |
5030 | .add(SrcReg1Sub1); |
5031 | |
5032 | unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); |
5033 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) |
5034 | .addReg(DestSub0) |
5035 | .addImm(AMDGPU::sub0) |
5036 | .addReg(DestSub1) |
5037 | .addImm(AMDGPU::sub1); |
5038 | |
5039 | MRI.replaceRegWith(Dest.getReg(), FullDestReg); |
5040 | |
5041 | Worklist.insert(&LoHalf); |
5042 | Worklist.insert(&HiHalf); |
5043 | |
5044 |   // Move all users of this moved value.
5045 | addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); |
5046 | } |
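// Unlike add/sub there is no cross-half carry for the bitwise ops handled
// here, so the two halves are computed independently; a compile-time check
// of that property (illustrative only):
static_assert(((0x12345678FF00FF00ull & 0x0F0F0F0F0F0F0F0Full) >> 32) ==
                  (0x12345678ull & 0x0F0F0F0Full),
              "the high half of a 64-bit AND depends only on the high halves");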
5047 | |
5048 | void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, |
5049 | MachineInstr &Inst, |
5050 | MachineDominatorTree *MDT) const { |
5051 | MachineBasicBlock &MBB = *Inst.getParent(); |
5052 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
5053 | |
5054 | MachineOperand &Dest = Inst.getOperand(0); |
5055 | MachineOperand &Src0 = Inst.getOperand(1); |
5056 | MachineOperand &Src1 = Inst.getOperand(2); |
5057 | const DebugLoc &DL = Inst.getDebugLoc(); |
5058 | |
5059 | MachineBasicBlock::iterator MII = Inst; |
5060 | |
5061 | const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); |
5062 | |
5063 | unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
5064 | |
5065 | MachineOperand* Op0; |
5066 | MachineOperand* Op1; |
5067 | |
5068 | if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { |
5069 | Op0 = &Src0; |
5070 | Op1 = &Src1; |
5071 | } else { |
5072 | Op0 = &Src1; |
5073 | Op1 = &Src0; |
5074 | } |
5075 | |
5076 | BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) |
5077 | .add(*Op0); |
5078 | |
5079 | unsigned NewDest = MRI.createVirtualRegister(DestRC); |
5080 | |
5081 | MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) |
5082 | .addReg(Interm) |
5083 | .add(*Op1); |
5084 | |
5085 | MRI.replaceRegWith(Dest.getReg(), NewDest); |
5086 | |
5087 | Worklist.insert(&Xor); |
5088 | } |
5089 | |
5090 | void SIInstrInfo::splitScalar64BitBCNT( |
5091 | SetVectorType &Worklist, MachineInstr &Inst) const { |
5092 | MachineBasicBlock &MBB = *Inst.getParent(); |
5093 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
5094 | |
5095 | MachineBasicBlock::iterator MII = Inst; |
5096 | const DebugLoc &DL = Inst.getDebugLoc(); |
5097 | |
5098 | MachineOperand &Dest = Inst.getOperand(0); |
5099 | MachineOperand &Src = Inst.getOperand(1); |
5100 | |
5101 | const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); |
5102 | const TargetRegisterClass *SrcRC = Src.isReg() ? |
5103 | MRI.getRegClass(Src.getReg()) : |
5104 | &AMDGPU::SGPR_32RegClass; |
5105 | |
5106 | unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5107 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5108 | |
5109 | const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); |
5110 | |
5111 | MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, |
5112 | AMDGPU::sub0, SrcSubRC); |
5113 | MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, |
5114 | AMDGPU::sub1, SrcSubRC); |
5115 | |
5116 | BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); |
5117 | |
5118 | BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); |
5119 | |
5120 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
5121 | |
5122 |   // We don't need to legalize operands here. src0 for either instruction can be
5123 | // an SGPR, and the second input is unused or determined here. |
5124 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
5125 | } |
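// Scalar model of the BCNT split above (illustrative sketch): a 64-bit
// population count as two chained 32-bit counts, the second one taking the
// first as its accumulator, which is the role of V_BCNT_U32_B32's second
// operand.
#include <bitset>
#include <cstdint>
static unsigned bcnt64ViaHalves(uint64_t V) {
  unsigned Mid = std::bitset<32>(uint32_t(V)).count();        // MidReg
  return std::bitset<32>(uint32_t(V >> 32)).count() + Mid;    // ResultReg
}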
5126 | |
5127 | void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, |
5128 | MachineInstr &Inst) const { |
5129 | MachineBasicBlock &MBB = *Inst.getParent(); |
5130 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
5131 | MachineBasicBlock::iterator MII = Inst; |
5132 | const DebugLoc &DL = Inst.getDebugLoc(); |
5133 | |
5134 | MachineOperand &Dest = Inst.getOperand(0); |
5135 | uint32_t Imm = Inst.getOperand(2).getImm(); |
5136 | uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. |
5137 | uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. |
5138 | |
5139 | (void) Offset; |
5140 | |
5141 | // Only sext_inreg cases handled. |
5142 |   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
5143 |          Offset == 0 && "Not implemented");
5144 | |
5145 | if (BitWidth < 32) { |
5146 | unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5147 | unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5148 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
5149 | |
5150 | BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) |
5151 | .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) |
5152 | .addImm(0) |
5153 | .addImm(BitWidth); |
5154 | |
5155 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) |
5156 | .addImm(31) |
5157 | .addReg(MidRegLo); |
5158 | |
5159 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) |
5160 | .addReg(MidRegLo) |
5161 | .addImm(AMDGPU::sub0) |
5162 | .addReg(MidRegHi) |
5163 | .addImm(AMDGPU::sub1); |
5164 | |
5165 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
5166 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
5167 | return; |
5168 | } |
5169 | |
5170 | MachineOperand &Src = Inst.getOperand(1); |
5171 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5172 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); |
5173 | |
5174 | BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) |
5175 | .addImm(31) |
5176 | .addReg(Src.getReg(), 0, AMDGPU::sub0); |
5177 | |
5178 | BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) |
5179 | .addReg(Src.getReg(), 0, AMDGPU::sub0) |
5180 | .addImm(AMDGPU::sub0) |
5181 | .addReg(TmpReg) |
5182 | .addImm(AMDGPU::sub1); |
5183 | |
5184 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
5185 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
5186 | } |
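// The S_BFE_I64 immediate packs the field offset in bits [5:0] and the field
// width in bits [22:16]; one example encoding, checked at compile time
// (0x100000 encodes offset 0, width 16; illustrative only):
static_assert((0x100000u & 0x3f) == 0, "offset field, bits [5:0]");
static_assert(((0x100000u & 0x7f0000u) >> 16) == 16, "width field, bits [22:16]");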
5187 | |
5188 | void SIInstrInfo::addUsersToMoveToVALUWorklist( |
5189 | unsigned DstReg, |
5190 | MachineRegisterInfo &MRI, |
5191 | SetVectorType &Worklist) const { |
5192 | for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), |
5193 | E = MRI.use_end(); I != E;) { |
5194 | MachineInstr &UseMI = *I->getParent(); |
5195 | |
5196 | unsigned OpNo = 0; |
5197 | |
5198 | switch (UseMI.getOpcode()) { |
5199 | case AMDGPU::COPY: |
5200 | case AMDGPU::WQM: |
5201 | case AMDGPU::WWM: |
5202 | case AMDGPU::REG_SEQUENCE: |
5203 | case AMDGPU::PHI: |
5204 | case AMDGPU::INSERT_SUBREG: |
5205 | break; |
5206 | default: |
5207 | OpNo = I.getOperandNo(); |
5208 | break; |
5209 | } |
5210 | |
5211 | if (!RI.hasVGPRs(getOpRegClass(UseMI, OpNo))) { |
5212 | Worklist.insert(&UseMI); |
5213 | |
5214 | do { |
5215 | ++I; |
5216 | } while (I != E && I->getParent() == &UseMI); |
5217 | } else { |
5218 | ++I; |
5219 | } |
5220 | } |
5221 | } |
5222 | |
5223 | void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, |
5224 | MachineRegisterInfo &MRI, |
5225 | MachineInstr &Inst) const { |
5226 | unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5227 | MachineBasicBlock *MBB = Inst.getParent(); |
5228 | MachineOperand &Src0 = Inst.getOperand(1); |
5229 | MachineOperand &Src1 = Inst.getOperand(2); |
5230 | const DebugLoc &DL = Inst.getDebugLoc(); |
5231 | |
5232 | switch (Inst.getOpcode()) { |
5233 | case AMDGPU::S_PACK_LL_B32_B16: { |
5234 | unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5235 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5236 | |
5237 | // FIXME: Can do a lot better if we know the high bits of src0 or src1 are |
5238 | // 0. |
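     | // Result = (Src1 << 16) | (Src0 & 0xffff).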
5239 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) |
5240 | .addImm(0xffff); |
5241 | |
5242 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) |
5243 | .addReg(ImmReg, RegState::Kill) |
5244 | .add(Src0); |
5245 | |
5246 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg) |
5247 | .add(Src1) |
5248 | .addImm(16) |
5249 | .addReg(TmpReg, RegState::Kill); |
5250 | break; |
5251 | } |
5252 | case AMDGPU::S_PACK_LH_B32_B16: { |
5253 | unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5254 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) |
5255 | .addImm(0xffff); |
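     | // V_BFI_B32 with a 0xffff mask keeps the low 16 bits of Src0 and the high
     | // 16 bits of Src1: Result = (Src0 & 0xffff) | (Src1 & 0xffff0000).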
5256 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg) |
5257 | .addReg(ImmReg, RegState::Kill) |
5258 | .add(Src0) |
5259 | .add(Src1); |
5260 | break; |
5261 | } |
5262 | case AMDGPU::S_PACK_HH_B32_B16: { |
5263 | unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
5264 | unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
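     | // Result = (Src0 >> 16) | (Src1 & 0xffff0000).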
5265 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) |
5266 | .addImm(16) |
5267 | .add(Src0); |
5268 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) |
5269 | .addImm(0xffff0000); |
5270 | BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) |
5271 | .add(Src1) |
5272 | .addReg(ImmReg, RegState::Kill) |
5273 | .addReg(TmpReg, RegState::Kill); |
5274 | break; |
5275 | } |
5276 | default: |
5277 | llvm_unreachable("unhandled s_pack_* instruction");
5278 | } |
5279 | |
5280 | MachineOperand &Dest = Inst.getOperand(0); |
5281 | MRI.replaceRegWith(Dest.getReg(), ResultReg); |
5282 | addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); |
5283 | } |
5284 | |
5285 | void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, |
5286 | MachineInstr &SCCDefInst, |
5287 | SetVectorType &Worklist) const { |
5288 | // Ensure that def inst defines SCC, which is still live. |
5289 | assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
5290 | !Op.isDead() && Op.getParent() == &SCCDefInst);
5291 | // This assumes that all the users of SCC are in the same block |
5292 | // as the SCC def. |
5293 | for (MachineInstr &MI : // Skip the def inst itself. |
5294 | make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), |
5295 | SCCDefInst.getParent()->end())) { |
5296 | // Check if SCC is used first. |
5297 | if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) |
5298 | Worklist.insert(&MI); |
5299 | // Exit if we find another SCC def. |
5300 | if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) |
5301 | return; |
5302 | } |
5303 | } |
5304 | |
5305 | const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( |
5306 | const MachineInstr &Inst) const { |
5307 | const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); |
5308 | |
5309 | switch (Inst.getOpcode()) { |
5310 | // For target instructions, getOpRegClass just returns the virtual register |
5311 | // class associated with the operand, so we need to find an equivalent VGPR |
5312 | // register class in order to move the instruction to the VALU. |
5313 | case AMDGPU::COPY: |
5314 | case AMDGPU::PHI: |
5315 | case AMDGPU::REG_SEQUENCE: |
5316 | case AMDGPU::INSERT_SUBREG: |
5317 | case AMDGPU::WQM: |
5318 | case AMDGPU::WWM: |
5319 | if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) |
5320 | return nullptr; |
5321 | |
5322 | NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); |
5323 | if (!NewDstRC) |
5324 | return nullptr; |
5325 | return NewDstRC; |
5326 | default: |
5327 | return NewDstRC; |
5328 | } |
5329 | } |
5330 | |
5331 | // Find the one SGPR operand we are allowed to use. |
5332 | unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, |
5333 | int OpIndices[3]) const { |
5334 | const MCInstrDesc &Desc = MI.getDesc(); |
5335 | |
5336 | // Find the one SGPR operand we are allowed to use. |
5337 | // |
5338 | // First we need to consider the instruction's operand requirements before |
5339 | // legalizing. Some operands are required to be SGPRs, such as implicit uses |
5340 | // of VCC, but we are still bound by the constant bus requirement to only use |
5341 | // one. |
5342 | // |
5343 | // If the operand's class is an SGPR, we can never move it. |
5344 | |
5345 | unsigned SGPRReg = findImplicitSGPRRead(MI); |
5346 | if (SGPRReg != AMDGPU::NoRegister) |
5347 | return SGPRReg; |
5348 | |
5349 | unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; |
5350 | const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); |
5351 | |
5352 | for (unsigned i = 0; i < 3; ++i) { |
5353 | int Idx = OpIndices[i]; |
5354 | if (Idx == -1) |
5355 | break; |
5356 | |
5357 | const MachineOperand &MO = MI.getOperand(Idx); |
5358 | if (!MO.isReg()) |
5359 | continue; |
5360 | |
5361 | // Is this operand statically required to be an SGPR based on the operand |
5362 | // constraints? |
5363 | const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); |
5364 | bool IsRequiredSGPR = RI.isSGPRClass(OpRC); |
5365 | if (IsRequiredSGPR) |
5366 | return MO.getReg(); |
5367 | |
5368 | // If this could be a VGPR or an SGPR, check the dynamic register class.
5369 | unsigned Reg = MO.getReg(); |
5370 | const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); |
5371 | if (RI.isSGPRClass(RegRC)) |
5372 | UsedSGPRs[i] = Reg; |
5373 | } |
5374 | |
5375 | // We don't have a required SGPR operand, so we have a bit more freedom in |
5376 | // selecting operands to move. |
5377 | |
5378 | // Try to select the most used SGPR. If an SGPR is equal to one of the |
5379 | // others, we choose that. |
5380 | // |
5381 | // e.g. |
5382 | // V_FMA_F32 v0, s0, s0, s0 -> No moves |
5383 | // V_FMA_F32 v0, s0, s1, s0 -> Move s1 |
5384 | |
5385 | // TODO: If some of the operands are 64-bit SGPRs and some 32, we should |
5386 | // prefer those. |
5387 | |
5388 | if (UsedSGPRs[0] != AMDGPU::NoRegister) { |
5389 | if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) |
5390 | SGPRReg = UsedSGPRs[0]; |
5391 | } |
5392 | |
5393 | if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { |
5394 | if (UsedSGPRs[1] == UsedSGPRs[2]) |
5395 | SGPRReg = UsedSGPRs[1]; |
5396 | } |
5397 | |
5398 | return SGPRReg; |
5399 | } |
5400 | |
5401 | MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, |
5402 | unsigned OperandName) const { |
5403 | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); |
5404 | if (Idx == -1) |
5405 | return nullptr; |
5406 | |
5407 | return &MI.getOperand(Idx); |
5408 | } |
5409 | |
5410 | uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { |
5411 | if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { |
5412 | return (16ULL << 44) | // IMG_FORMAT_32_FLOAT |
5413 | (1ULL << 56) | // RESOURCE_LEVEL = 1 |
5414 | (3ULL << 60); // OOB_SELECT = 3 |
5415 | } |
5416 | |
5417 | uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; |
5418 | if (ST.isAmdHsaOS()) { |
5419 | // Set ATC = 1. GFX9 doesn't have this bit. |
5420 | if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) |
5421 | RsrcDataFormat |= (1ULL << 56); |
5422 | |
5423 | // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. |
5424 | // BTW, it disables TC L2 and therefore decreases performance. |
5425 | if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) |
5426 | RsrcDataFormat |= (2ULL << 59); |
5427 | } |
5428 | |
5429 | return RsrcDataFormat; |
5430 | } |
5431 | |
5432 | uint64_t SIInstrInfo::getScratchRsrcWords23() const { |
5433 | uint64_t Rsrc23 = getDefaultRsrcDataFormat() | |
5434 | AMDGPU::RSRC_TID_ENABLE | |
5435 | 0xffffffff; // Size; |
5436 | |
5437 | // GFX9 doesn't have ELEMENT_SIZE. |
5438 | if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { |
5439 | uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; |
5440 | Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; |
5441 | } |
5442 | |
5443 | // IndexStride = 64 on GFX9 and earlier, 32 on GFX10.
5444 | uint64_t IndexStride = ST.getGeneration() <= AMDGPUSubtarget::GFX9 ? 3 : 2; |
5445 | Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; |
5446 | |
5447 | // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. |
5448 | // Clear them unless we want a huge stride. |
5449 | if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && |
5450 | ST.getGeneration() <= AMDGPUSubtarget::GFX9) |
5451 | Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; |
5452 | |
5453 | return Rsrc23; |
5454 | } |
5455 | |
5456 | bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { |
5457 | unsigned Opc = MI.getOpcode(); |
5458 | |
5459 | return isSMRD(Opc); |
5460 | } |
5461 | |
5462 | bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { |
5463 | unsigned Opc = MI.getOpcode(); |
5464 | |
5465 | return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); |
5466 | } |
5467 | |
5468 | unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, |
5469 | int &FrameIndex) const { |
5470 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); |
5471 | if (!Addr || !Addr->isFI()) |
5472 | return AMDGPU::NoRegister; |
5473 | |
5474 | assert(!MI.memoperands_empty() &&
5475 | (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
5476 | |
5477 | FrameIndex = Addr->getIndex(); |
5478 | return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); |
5479 | } |
5480 | |
5481 | unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, |
5482 | int &FrameIndex) const { |
5483 | const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); |
5484 | assert(Addr && Addr->isFI());
5485 | FrameIndex = Addr->getIndex(); |
5486 | return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); |
5487 | } |
5488 | |
5489 | unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, |
5490 | int &FrameIndex) const { |
5491 | if (!MI.mayLoad()) |
5492 | return AMDGPU::NoRegister; |
5493 | |
5494 | if (isMUBUF(MI) || isVGPRSpill(MI)) |
5495 | return isStackAccess(MI, FrameIndex); |
5496 | |
5497 | if (isSGPRSpill(MI)) |
5498 | return isSGPRStackAccess(MI, FrameIndex); |
5499 | |
5500 | return AMDGPU::NoRegister; |
5501 | } |
5502 | |
5503 | unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, |
5504 | int &FrameIndex) const { |
5505 | if (!MI.mayStore()) |
5506 | return AMDGPU::NoRegister; |
5507 | |
5508 | if (isMUBUF(MI) || isVGPRSpill(MI)) |
5509 | return isStackAccess(MI, FrameIndex); |
5510 | |
5511 | if (isSGPRSpill(MI)) |
5512 | return isSGPRStackAccess(MI, FrameIndex); |
5513 | |
5514 | return AMDGPU::NoRegister; |
5515 | } |
5516 | |
5517 | unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { |
5518 | unsigned Size = 0; |
5519 | MachineBasicBlock::const_instr_iterator I = MI.getIterator(); |
5520 | MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); |
5521 | while (++I != E && I->isInsideBundle()) { |
5522 | assert(!I->isBundle() && "No nested bundle!");
5523 | Size += getInstSizeInBytes(*I); |
5524 | } |
5525 | |
5526 | return Size; |
5527 | } |
5528 | |
5529 | unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { |
5530 | unsigned Opc = MI.getOpcode(); |
5531 | const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); |
5532 | unsigned DescSize = Desc.getSize(); |
5533 | |
5534 | // If we have a definitive size, we can use it. Otherwise we need to inspect |
5535 | // the operands to know the size. |
5536 | if (isFixedSize(MI)) |
5537 | return DescSize; |
5538 | |
5539 | // 4-byte instructions may have a 32-bit literal encoded after them. Check |
5540 | // operands that could ever be literals.
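     | // A VOP3 encoding is 8 bytes, so a VOP3 with a literal is 12 bytes; for the
     | // other encodings the 4-byte literal is simply appended to the base size.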
5541 | if (isVALU(MI) || isSALU(MI)) { |
5542 | int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
5543 | if (Src0Idx == -1) |
5544 | return DescSize; // No operands. |
5545 | |
5546 | if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) |
5547 | return isVOP3(MI) ? 12 : (DescSize + 4); |
5548 | |
5549 | int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); |
5550 | if (Src1Idx == -1) |
5551 | return DescSize; |
5552 | |
5553 | if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) |
5554 | return isVOP3(MI) ? 12 : (DescSize + 4); |
5555 | |
5556 | int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); |
5557 | if (Src2Idx == -1) |
5558 | return DescSize; |
5559 | |
5560 | if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) |
5561 | return isVOP3(MI) ? 12 : (DescSize + 4); |
5562 | |
5563 | return DescSize; |
5564 | } |
5565 | |
5566 | // Check whether we have extra NSA words. |
5567 | if (isMIMG(MI)) { |
5568 | int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); |
5569 | if (VAddr0Idx < 0) |
5570 | return 8; |
5571 | |
5572 | int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); |
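     | // The operands between vaddr0 and srsrc are the address registers: 8 bytes
     | // for the base encoding plus one extra dword per group of up to four
     | // addresses after the first.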
5573 | return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); |
5574 | } |
5575 | |
5576 | switch (Opc) { |
5577 | case TargetOpcode::IMPLICIT_DEF: |
5578 | case TargetOpcode::KILL: |
5579 | case TargetOpcode::DBG_VALUE: |
5580 | case TargetOpcode::EH_LABEL: |
5581 | return 0; |
5582 | case TargetOpcode::BUNDLE: |
5583 | return getInstBundleSize(MI); |
5584 | case TargetOpcode::INLINEASM: |
5585 | case TargetOpcode::INLINEASM_BR: { |
5586 | const MachineFunction *MF = MI.getParent()->getParent(); |
5587 | const char *AsmStr = MI.getOperand(0).getSymbolName(); |
5588 | return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), |
5589 | &MF->getSubtarget()); |
5590 | } |
5591 | default: |
5592 | return DescSize; |
5593 | } |
5594 | } |
5595 | |
5596 | bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { |
5597 | if (!isFLAT(MI)) |
5598 | return false; |
5599 | |
5600 | if (MI.memoperands_empty()) |
5601 | return true; |
5602 | |
5603 | for (const MachineMemOperand *MMO : MI.memoperands()) { |
5604 | if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) |
5605 | return true; |
5606 | } |
5607 | return false; |
5608 | } |
5609 | |
5610 | bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { |
5611 | return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; |
5612 | } |
5613 | |
5614 | void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, |
5615 | MachineBasicBlock *IfEnd) const { |
5616 | MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); |
5617 | assert(TI != IfEntry->end());
5618 | |
5619 | MachineInstr *Branch = &(*TI); |
5620 | MachineFunction *MF = IfEntry->getParent(); |
5621 | MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); |
5622 | |
5623 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { |
5624 | unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
5625 | MachineInstr *SIIF = |
5626 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) |
5627 | .add(Branch->getOperand(0)) |
5628 | .add(Branch->getOperand(1)); |
5629 | MachineInstr *SIEND = |
5630 | BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) |
5631 | .addReg(DstReg); |
5632 | |
5633 | IfEntry->erase(TI); |
5634 | IfEntry->insert(IfEntry->end(), SIIF); |
5635 | IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); |
5636 | } |
5637 | } |
5638 | |
5639 | void SIInstrInfo::convertNonUniformLoopRegion( |
5640 | MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { |
5641 | MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); |
5642 | // We expect 2 terminators, one conditional and one unconditional. |
5643 | assert(TI != LoopEnd->end());
5644 | |
5645 | MachineInstr *Branch = &(*TI); |
5646 | MachineFunction *MF = LoopEnd->getParent(); |
5647 | MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); |
5648 | |
5649 | if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { |
5650 | |
5651 | unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
5652 | unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
5653 | MachineInstrBuilder HeaderPHIBuilder = |
5654 | BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); |
5655 | for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), |
5656 | E = LoopEntry->pred_end(); |
5657 | PI != E; ++PI) { |
5658 | if (*PI == LoopEnd) { |
5659 | HeaderPHIBuilder.addReg(BackEdgeReg); |
5660 | } else { |
5661 | MachineBasicBlock *PMBB = *PI; |
5662 | unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
5663 | materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), |
5664 | ZeroReg, 0); |
5665 | HeaderPHIBuilder.addReg(ZeroReg); |
5666 | } |
5667 | HeaderPHIBuilder.addMBB(*PI); |
5668 | } |
5669 | MachineInstr *HeaderPhi = HeaderPHIBuilder; |
5670 | MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), |
5671 | get(AMDGPU::SI_IF_BREAK), BackEdgeReg) |
5672 | .addReg(DstReg) |
5673 | .add(Branch->getOperand(0)); |
5674 | MachineInstr *SILOOP = |
5675 | BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) |
5676 | .addReg(BackEdgeReg) |
5677 | .addMBB(LoopEntry); |
5678 | |
5679 | LoopEntry->insert(LoopEntry->begin(), HeaderPhi); |
5680 | LoopEnd->erase(TI); |
5681 | LoopEnd->insert(LoopEnd->end(), SIIFBREAK); |
5682 | LoopEnd->insert(LoopEnd->end(), SILOOP); |
5683 | } |
5684 | } |
5685 | |
5686 | ArrayRef<std::pair<int, const char *>> |
5687 | SIInstrInfo::getSerializableTargetIndices() const { |
5688 | static const std::pair<int, const char *> TargetIndices[] = { |
5689 | {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, |
5690 | {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, |
5691 | {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, |
5692 | {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, |
5693 | {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; |
5694 | return makeArrayRef(TargetIndices); |
5695 | } |
5696 | |
5697 | /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
5698 | /// post-RA version of misched uses CreateTargetMIHazardRecognizer. |
5699 | ScheduleHazardRecognizer * |
5700 | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, |
5701 | const ScheduleDAG *DAG) const { |
5702 | return new GCNHazardRecognizer(DAG->MF); |
5703 | } |
5704 | |
5705 | /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer |
5706 | /// pass. |
5707 | ScheduleHazardRecognizer * |
5708 | SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { |
5709 | return new GCNHazardRecognizer(MF); |
5710 | } |
5711 | |
5712 | std::pair<unsigned, unsigned> |
5713 | SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { |
5714 | return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); |
5715 | } |
5716 | |
5717 | ArrayRef<std::pair<unsigned, const char *>> |
5718 | SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { |
5719 | static const std::pair<unsigned, const char *> TargetFlags[] = { |
5720 | { MO_GOTPCREL, "amdgpu-gotprel" }, |
5721 | { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, |
5722 | { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, |
5723 | { MO_REL32_LO, "amdgpu-rel32-lo" }, |
5724 | { MO_REL32_HI, "amdgpu-rel32-hi" } |
5725 | }; |
5726 | |
5727 | return makeArrayRef(TargetFlags); |
5728 | } |
5729 | |
5730 | bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { |
5731 | return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && |
5732 | MI.modifiesRegister(AMDGPU::EXEC, &RI); |
5733 | } |
5734 | |
5735 | MachineInstrBuilder |
5736 | SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, |
5737 | MachineBasicBlock::iterator I, |
5738 | const DebugLoc &DL, |
5739 | unsigned DestReg) const { |
5740 | if (ST.hasAddNoCarry()) |
5741 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); |
5742 | |
5743 | MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
5744 | unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
5745 | MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC); |
5746 | |
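     | // Without a carry-less add, use V_ADD_I32_e64 with a dead scratch SGPR pair
     | // for the carry-out, hinted towards VCC.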
5747 | return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) |
5748 | .addReg(UnusedCarry, RegState::Define | RegState::Dead); |
5749 | } |
5750 | |
5751 | bool SIInstrInfo::isKillTerminator(unsigned Opcode) { |
5752 | switch (Opcode) { |
5753 | case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
5754 | case AMDGPU::SI_KILL_I1_TERMINATOR: |
5755 | return true; |
5756 | default: |
5757 | return false; |
5758 | } |
5759 | } |
5760 | |
5761 | const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { |
5762 | switch (Opcode) { |
5763 | case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: |
5764 | return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); |
5765 | case AMDGPU::SI_KILL_I1_PSEUDO: |
5766 | return get(AMDGPU::SI_KILL_I1_TERMINATOR); |
5767 | default: |
5768 | llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
5769 | } |
5770 | } |
5771 | |
5772 | bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { |
5773 | if (!isSMRD(MI)) |
5774 | return false; |
5775 | |
5776 | // Check that it is using a buffer resource. |
5777 | int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); |
5778 | if (Idx == -1) // e.g. s_memtime |
5779 | return false; |
5780 | |
5781 | const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; |
5782 | return RCID == AMDGPU::SReg_128RegClassID; |
5783 | } |
5784 | |
5785 | // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td |
5786 | enum SIEncodingFamily { |
5787 | SI = 0, |
5788 | VI = 1, |
5789 | SDWA = 2, |
5790 | SDWA9 = 3, |
5791 | GFX80 = 4, |
5792 | GFX9 = 5, |
5793 | GFX10 = 6, |
5794 | SDWA10 = 7 |
5795 | }; |
5796 | |
5797 | static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { |
5798 | switch (ST.getGeneration()) { |
5799 | default: |
5800 | break; |
5801 | case AMDGPUSubtarget::SOUTHERN_ISLANDS: |
5802 | case AMDGPUSubtarget::SEA_ISLANDS: |
5803 | return SIEncodingFamily::SI; |
5804 | case AMDGPUSubtarget::VOLCANIC_ISLANDS: |
5805 | case AMDGPUSubtarget::GFX9: |
5806 | return SIEncodingFamily::VI; |
5807 | case AMDGPUSubtarget::GFX10: |
5808 | return SIEncodingFamily::GFX10; |
5809 | } |
5810 | llvm_unreachable("Unknown subtarget generation!");
5811 | } |
5812 | |
5813 | int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { |
5814 | SIEncodingFamily Gen = subtargetEncodingFamily(ST); |
5815 | |
5816 | if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && |
5817 | ST.getGeneration() == AMDGPUSubtarget::GFX9) |
5818 | Gen = SIEncodingFamily::GFX9; |
5819 | |
5820 | // Adjust the encoding family to GFX80 for D16 buffer instructions when the |
5821 | // subtarget has UnpackedD16VMem feature. |
5822 | // TODO: remove this when we discard GFX80 encoding. |
5823 | if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) |
5824 | Gen = SIEncodingFamily::GFX80; |
5825 | |
5826 | if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { |
5827 | switch (ST.getGeneration()) { |
5828 | default: |
5829 | Gen = SIEncodingFamily::SDWA; |
5830 | break; |
5831 | case AMDGPUSubtarget::GFX9: |
5832 | Gen = SIEncodingFamily::SDWA9; |
5833 | break; |
5834 | case AMDGPUSubtarget::GFX10: |
5835 | Gen = SIEncodingFamily::SDWA10; |
5836 | break; |
5837 | } |
5838 | } |
5839 | |
5840 | int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); |
5841 | |
5842 | // -1 means that Opcode is already a native instruction. |
5843 | if (MCOp == -1) |
5844 | return Opcode; |
5845 | |
5846 | // (uint16_t)-1 means that Opcode is a pseudo instruction that has |
5847 | // no encoding in the given subtarget generation. |
5848 | if (MCOp == (uint16_t)-1) |
5849 | return -1; |
5850 | |
5851 | return MCOp; |
5852 | } |
5853 | |
5854 | static |
5855 | TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { |
5856 | assert(RegOpnd.isReg());
5857 | return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : |
5858 | getRegSubRegPair(RegOpnd); |
5859 | } |
5860 | |
5861 | TargetInstrInfo::RegSubRegPair |
5862 | llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { |
5863 | assert(MI.isRegSequence());
5864 | for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) |
5865 | if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { |
5866 | auto &RegOp = MI.getOperand(1 + 2 * I); |
5867 | return getRegOrUndef(RegOp); |
5868 | } |
5869 | return TargetInstrInfo::RegSubRegPair(); |
5870 | } |
5871 | |
5872 | // Try to find the definition of reg:subreg in subreg-manipulation pseudos |
5873 | // Following a subreg of reg:subreg isn't supported |
5874 | static bool followSubRegDef(MachineInstr &MI, |
5875 | TargetInstrInfo::RegSubRegPair &RSR) { |
5876 | if (!RSR.SubReg) |
5877 | return false; |
5878 | switch (MI.getOpcode()) { |
5879 | default: break; |
5880 | case AMDGPU::REG_SEQUENCE: |
5881 | RSR = getRegSequenceSubReg(MI, RSR.SubReg); |
5882 | return true; |
5883 | // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
5884 | case AMDGPU::INSERT_SUBREG: |
5885 | if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) |
5886 | // inserted the subreg we're looking for |
5887 | RSR = getRegOrUndef(MI.getOperand(2)); |
5888 | else { // the subreg in the rest of the reg |
5889 | auto R1 = getRegOrUndef(MI.getOperand(1)); |
5890 | if (R1.SubReg) // subreg of subreg isn't supported |
5891 | return false; |
5892 | RSR.Reg = R1.Reg; |
5893 | } |
5894 | return true; |
5895 | } |
5896 | return false; |
5897 | } |
5898 | |
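     | // Walk the SSA def chain through COPY/V_MOV_B32 and the subreg pseudos
     | // handled by followSubRegDef to find the instruction that defines
     | // P.Reg:P.SubReg, or return nullptr if the chain cannot be followed.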
5899 | MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, |
5900 | MachineRegisterInfo &MRI) { |
5901 | assert(MRI.isSSA());
5902 | if (!TargetRegisterInfo::isVirtualRegister(P.Reg)) |
5903 | return nullptr; |
5904 | |
5905 | auto RSR = P; |
5906 | auto *DefInst = MRI.getVRegDef(RSR.Reg); |
5907 | while (auto *MI = DefInst) { |
5908 | DefInst = nullptr; |
5909 | switch (MI->getOpcode()) { |
5910 | case AMDGPU::COPY: |
5911 | case AMDGPU::V_MOV_B32_e32: { |
5912 | auto &Op1 = MI->getOperand(1); |
5913 | if (Op1.isReg() && |
5914 | TargetRegisterInfo::isVirtualRegister(Op1.getReg())) { |
5915 | if (Op1.isUndef()) |
5916 | return nullptr; |
5917 | RSR = getRegSubRegPair(Op1); |
5918 | DefInst = MRI.getVRegDef(RSR.Reg); |
5919 | } |
5920 | break; |
5921 | } |
5922 | default: |
5923 | if (followSubRegDef(*MI, RSR)) { |
5924 | if (!RSR.Reg) |
5925 | return nullptr; |
5926 | DefInst = MRI.getVRegDef(RSR.Reg); |
5927 | } |
5928 | } |
5929 | if (!DefInst) |
5930 | return MI; |
5931 | } |
5932 | return nullptr; |
5933 | } |
5934 | |
5935 | bool llvm::isEXECMaskConstantBetweenDefAndUses(unsigned VReg, |
5936 | MachineRegisterInfo &MRI) { |
5937 | assert(MRI.isSSA() && "Must be run on SSA");
5938 | auto *TRI = MRI.getTargetRegisterInfo(); |
5939 | |
5940 | auto *DefI = MRI.getVRegDef(VReg); |
5941 | auto *BB = DefI->getParent(); |
5942 | |
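     | // All non-debug uses must be in the def's block; then scan forward from the
     | // def and report EXEC as constant only if nothing writes EXEC before the
     | // last use is reached.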
5943 | DenseSet<MachineInstr*> Uses; |
5944 | for (auto &Use : MRI.use_nodbg_operands(VReg)) { |
5945 | auto *I = Use.getParent(); |
5946 | if (I->getParent() != BB) |
5947 | return false; |
5948 | Uses.insert(I); |
5949 | } |
5950 | |
5951 | auto E = BB->end(); |
5952 | for (auto I = std::next(DefI->getIterator()); I != E; ++I) { |
5953 | Uses.erase(&*I); |
5954 | // don't check the last use |
5955 | if (Uses.empty() || I->modifiesRegister(AMDGPU::EXEC, TRI)) |
5956 | break; |
5957 | } |
5958 | return Uses.empty(); |
5959 | } |